Previous changeset 0:dbff53e6f75f (2022-07-11) Next changeset 2:2336fbff8866 (2022-12-12) |
Commit message:
planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit 43e7a43b545c24b2dc33d039198551c032aa79be |
modified:
MaxQuantProcessingScript.R macros.xml mqppep_anova.R mqppep_anova.xml mqppep_anova_script.Rmd mqppep_mrgfltr.py search_ppep.py |
added:
KSEA_impl_flowchart.dia KSEA_impl_flowchart.pdf kinase_name_uniprot_lut.tabular.bz2 kinase_uniprot_description_lut.tabular.bz2 mqppep_anova_preamble.tex perpage.tex |
b |
diff -r dbff53e6f75f -r 08678c931f5d KSEA_impl_flowchart.dia |
b |
Binary file KSEA_impl_flowchart.dia has changed |
b |
diff -r dbff53e6f75f -r 08678c931f5d KSEA_impl_flowchart.pdf |
b |
Binary file KSEA_impl_flowchart.pdf has changed |
b |
diff -r dbff53e6f75f -r 08678c931f5d MaxQuantProcessingScript.R --- a/MaxQuantProcessingScript.R Mon Jul 11 19:22:25 2022 +0000 +++ b/MaxQuantProcessingScript.R Fri Oct 28 18:27:21 2022 +0000 |
[ |
@@ -220,7 +220,6 @@ type = "character", help = "pY or pST enriched samples (ie, 'Y' or 'ST')" ) - # default = "^Number of Phospho [(]STY[)]$", , make_option( c("-p", "--phosphoCol"), @@ -229,7 +228,6 @@ help = paste0("PERL-compatible regular expression matching", " header of column having number of 'Phospho (STY)'") ) - # default = "^Intensity[^_]", , make_option( c("-s", "--startCol"), @@ -238,7 +236,6 @@ help = paste0("PERL-compatible regular expression matching", " header of column having first sample intensity") ) - # default = 1, , make_option( c("-I", "--intervalCol"), @@ -247,7 +244,6 @@ help = paste0("Column interval between the Intensities of samples", " (eg, 1 if subsequent column; 2 if every other column") ) - # default = 0.75, , make_option( c("-l", "--localProbCutoff"), @@ -255,7 +251,6 @@ type = "double", help = "Localization Probability Cutoff" ) - # default = "sum", , make_option( c("-f", "--collapse_func"), @@ -264,7 +259,6 @@ help = paste0("merge identical phosphopeptides", " by ('sum' or 'average') the intensities") ) - # default = "filtered_data.txt", , make_option( c("-r", "--filtered_data"), @@ -272,7 +266,6 @@ type = "character", help = "filtered_data.txt" ) - # default = "quantData.txt", , make_option( c("-q", "--quant_data"), |
b |
diff -r dbff53e6f75f -r 08678c931f5d kinase_name_uniprot_lut.tabular.bz2 |
b |
Binary file kinase_name_uniprot_lut.tabular.bz2 has changed |
b |
diff -r dbff53e6f75f -r 08678c931f5d kinase_uniprot_description_lut.tabular.bz2 |
b |
Binary file kinase_uniprot_description_lut.tabular.bz2 has changed |
b |
diff -r dbff53e6f75f -r 08678c931f5d macros.xml --- a/macros.xml Mon Jul 11 19:22:25 2022 +0000 +++ b/macros.xml Fri Oct 28 18:27:21 2022 +0000 |
b |
@@ -1,89 +1,47 @@ <macros> - <token name="@TOOL_VERSION@">0.1.13</token> + <token name="@TOOL_VERSION@">0.1.15</token> <token name="@VERSION_SUFFIX@">0</token> <xml name="requirements"> <requirements> <requirement type="package" version="1.56.0" >bioconductor-preprocesscore</requirement> - <requirement type="package" version="1.22.2" >numpy</requirement> + <requirement type="package" version="6.2.1" >gmp</requirement> + <requirement type="package" version="1.23.4" >numpy</requirement> <requirement type="package" version="0.3.3" >openblas</requirement> - <requirement type="package" version="1.4.1" >pandas</requirement> - <requirement type="package" version="1.64" >perl-dbd-sqlite</requirement> - <requirement type="package" version="5.26.2" >perl</requirement> - <requirement type="package" version="1.4.0" >pyahocorasick</requirement> - <requirement type="package" version="3.9.10" >python</requirement> - <requirement type="package" version="1.14.2" >r-data.table</requirement> - <requirement type="package" version="1.1.2" >r-dbi</requirement> - <requirement type="package" version="3.3.5" >r-ggplot2</requirement> + <requirement type="package" version="1.5.1" >pandas</requirement> + <requirement type="package" version="1.70" >perl-dbd-sqlite</requirement> + <requirement type="package" version="5.32.1" >perl</requirement> + <requirement type="package" version="1.4.4" >pyahocorasick</requirement> + <requirement type="package" version="3.10.6" >python</requirement> + <requirement type="package" version="4.1.3" >r-base</requirement> + <requirement type="package" version="6.0_93" >r-caret</requirement> + <requirement type="package" version="1.14.4" >r-data.table</requirement> + <requirement type="package" version="1.1.3" >r-dbi</requirement> + <requirement type="package" version="3.3.6" >r-ggplot2</requirement> <requirement type="package" version="3.1.3" >r-gplots</requirement> - <requirement type="package" version="0.9.4" >r-latex2exp</requirement> - <requirement 
type="package" version="1.7.1" >r-optparse</requirement> + <requirement type="package" version="0.9.5" >r-latex2exp</requirement> + <requirement type="package" version="1.7.3" >r-optparse</requirement> <requirement type="package" version="1.4.4" >r-reshape2</requirement> - <requirement type="package" version="2.11" >r-rmarkdown</requirement> - <requirement type="package" version="2.2.8" >r-rsqlite</requirement> - <requirement type="package" version="0.4.0" >r-sass</requirement> + <requirement type="package" version="2.17" >r-rmarkdown</requirement> + <!-- + <requirement type="package" version="2.2.18" >r-rsqlite</requirement> + <requirement type="package" version="0.4.2" >r-sass</requirement> + --> + <requirement type="package" version="1.2.2" >r-sessioninfo</requirement> <requirement type="package" version="0.4_11" >r-sqldf</requirement> - <requirement type="package" version="1.4.0" >r-stringr</requirement> - <requirement type="package" version="0.37" >r-tinytex</requirement> + <requirement type="package" version="1.4.1" >r-stringr</requirement> + <requirement type="package" version="0.42" >r-tinytex</requirement> <requirement type="package" version="0.3.7" >r-vioplot</requirement> <!-- It would be nice to use conda-forge/texlive-core rather than r-tinytex because the - former installs texlive when the package is built, but issue 23 blocked PDF-creation. + former installs texlive when the package is built, but issue 19/61 blocked PDF-creation. Also, texlive-core also gave pango font errors (output had missing symbols replaced with boxes) unless I specified the build as well as the version when building a conda environment, e.g.: texlive-core=20210325=h97429d4_0 --> </requirements> - <!-- I specified the versions above because it takes a VERY long time to search for package versions when they are not omitted; also, version numbers should lead to reproducible behavior. 
Contrast execution times of this (about 18 seconds): - echo n | time conda create -n mqppep_ver -c conda-forge -c bioconda \ - bioconductor-preprocesscore=1.56.0 \ - numpy=1.22.2 \ - openblas=0.3.3 \ - pandas=1.4.1 \ - perl-dbd-sqlite=1.64 \ - perl-dbd-sqlite=1.64 \ - perl=5.26.2 \ - pyahocorasick=1.4.0 \ - python=3.9.10 \ - r-data.table=1.14.2 \ - r-dbi=1.1.2 \ - r-ggplot2=3.3.5 \ - r-gplots=3.1.3 \ - r-latex2exp=0.9.4 \ - r-optparse=1.7.1 \ - r-reshape2=1.4.4 \ - r-rmarkdown=2.11 \ - r-rsqlite=2.2.8 \ - r-sass=0.4.0 \ - r-sqldf=0.4_11 \ - r-stringr=1.4.0 \ - r-tinytex=0.37 \ - r-vioplot=0.3.7 - with this (42 or more seconds): - echo n | time conda create -n mqppep_nover -c conda-forge -c bioconda \ - bioconductor-preprocesscore= \ - numpy \ - openblas=0.3.3 \ - pandas \ - perl \ - perl-dbd-sqlite \ - perl-dbd-sqlite \ - pyahocorasick \ - python \ - r-data.table \ - r-dbi \ - r-ggplot2 \ - r-gplots \ - r-latex2exp \ - r-optparse \ - r-reshape2 \ - r-rmarkdown \ - r-rsqlite \ - r-sass \ - r-sqldf \ - r-stringr \ - r-tinytex \ - r-vioplot - + <!-- I specified the versions above because it takes a VERY long time + to search for package versions when they are omitted; also, + locking version numbers might lead to more-reproducible behavior. --> </xml> </macros> |
b |
diff -r dbff53e6f75f -r 08678c931f5d mqppep_anova.R --- a/mqppep_anova.R Mon Jul 11 19:22:25 2022 +0000 +++ b/mqppep_anova.R Fri Oct 28 18:27:21 2022 +0000 |
[ |
b'@@ -1,20 +1,15 @@\n #!/usr/bin/env Rscript\n # libraries\n library(optparse)\n-library(data.table)\n library(stringr)\n+library(tinytex)\n \n # ref for parameterizing Rmd document: https://stackoverflow.com/a/37940285\n \n # parse options\n option_list <- list(\n- make_option(\n- c("-i", "--inputFile"),\n- action = "store",\n- default = NA,\n- type = "character",\n- help = "Phosphopeptide Intensities sparse input file path"\n- ),\n+\n+ # files\n make_option(\n c("-a", "--alphaFile"),\n action = "store",\n@@ -24,64 +19,11 @@\n " path to text file having one column and no header")\n ),\n make_option(\n- c("-S", "--preproc_sqlite"),\n- action = "store",\n- default = NA,\n- type = "character",\n- help = "Path to \'preproc_sqlite\' produced by `mqppep_mrgfltr.py`"\n- ),\n- make_option(\n- c("-K", "--ksea_sqlite"),\n+ c("-M", "--anova_ksea_metadata"),\n action = "store",\n- default = NA,\n- type = "character",\n- help = "Path to \'ksea_sqlite\' output produced by this tool"\n- ),\n- make_option(\n- c("-f", "--firstDataColumn"),\n- action = "store",\n- default = "^Intensity[^_]",\n- type = "character",\n- help = "First column of intensity values"\n- ),\n- make_option(\n- c("-m", "--imputationMethod"),\n- action = "store",\n- default = "random",\n+ default = "anova_ksea_metadata.tsv",\n type = "character",\n- help = paste0("Method for missing-value imputation,",\n- " one of c(\'group-median\',\'median\',\'mean\',\'random\')")\n- ),\n- make_option(\n- c("-p", "--meanPercentile"),\n- action = "store",\n- default = 3,\n- type = "integer",\n- help = paste0("Mean percentile for randomly generated imputed values;",\n- ", range [1,99]")\n- ),\n- make_option(\n- c("-d", "--sdPercentile"),\n- action = "store",\n- default = 3,\n- type = "double",\n- help = paste0("Adjustment value for standard deviation of",\n- " randomly generated imputed values; real")\n- ),\n- make_option(\n- c("-s", "--regexSampleNames"),\n- action = "store",\n- default = "\\\\.(\\\\d+)[A-Z]$",\n- type = 
"character",\n- help = "Regular expression extracting sample-names"\n- ),\n- make_option(\n- c("-g", "--regexSampleGrouping"),\n- action = "store",\n- default = "(\\\\d+)",\n- type = "character",\n- help = paste0("Regular expression extracting sample-group",\n- " from an extracted sample-name")\n+ help = "Phosphopeptide metadata, ANOVA FDR, and KSEA enribhments"\n ),\n make_option(\n c("-o", "--imputedDataFile"),\n@@ -102,11 +44,56 @@\n )\n ),\n make_option(\n+ c("-i", "--inputFile"),\n+ action = "store",\n+ default = NA,\n+ type = "character",\n+ help = "Phosphopeptide Intensities sparse input file path"\n+ ),\n+ make_option(\n+ c("-K", "--ksea_sqlite"),\n+ action = "store",\n+ default = NA,\n+ type = "character",\n+ help = "Path to \'ksea_sqlite\' output produced by this tool"\n+ ),\n+ make_option(\n+ c("-S", "--preproc_sqlite"),\n+ action = "store",\n+ default = NA,\n+ type = "character",\n+ help = "Path to \'preproc_sqlite\' produced by `mqppep_mrgfltr.py`"\n+ ),\n+ make_option(\n c("-r", "--reportFile"),\n action = "store",\n- default = "QuantDataProcessingScript.html",\n+ default = "mqppep_anova.pdf",\n+ type = "character",\n+ help = "PDF report file path"\n+ ),\n+\n+ # parameters\n+ make_option(\n+ c("-f", "--firstDataColumn"),\n+ action = "store",\n+ default = "^Intensity[^_]",\n type = "character",\n- help = "HTML report file path"\n+ help = "First column of intensity values"\n+ ),\n+ make_option(\n+ c("-m", "--imputationMethod"),\n+ action = "store",\n+ default = "random",\n+ type = "character",\n+ help = paste0("Method for missing-value imputation,",\n+ " one of c(\'group-median\',\'median\',\'mean\',\'random\')")\n+ ),\n+ make_option(\n+ c("-C", "--intensityMinValuesPerClass"),\n+ action = "store",\n+ default = "0",\n+ type = "integer",\n+ hel'..b'_config_file_string(args$regexSampleGrouping, nc)\n+cat(paste0("regex_sample_grouping: ", regex_sample_grouping, "\\n"))\n+\n+cat(paste0("regex_sample_names file: ", args$regexSampleNames, 
"\\n"))\n+regex_sample_names <- read_config_file_string(args$regexSampleNames, nc)\n cat(paste0("regex_sample_names: ", regex_sample_names, "\\n"))\n-cat(paste0("regex_sample_grouping: ", regex_sample_grouping, "\\n"))\n+\n+if (group_filter != "none") {\n+ cat(paste0("group_filter_patterns file: \'", args$sampleGroupFilterPatterns, "\'\\n"))\n+ group_filter_patterns <- read_config_file_string(args$sampleGroupFilterPatterns, nc)\n+} else {\n+ group_filter_patterns <- ".*"\n+}\n+cat(paste0("group_filter_patterns: ", group_filter_patterns, "\\n"))\n+\n+sink()\n+\n \n # from: https://github.com/molgenis/molgenis-pipelines/wiki/\n # How-to-source-another_file.R-from-within-your-R-script\n@@ -253,45 +391,72 @@\n return(NULL)\n }\n \n-script_dir <- location_of_this_script()\n+# validation of input parameters is complete; it is now justifiable to\n+# install LaTeX tools to render markdown as PDF; this involves a big\n+# download from GitHub\n+if (!tinytex::is_tinytex()) tinytex::install_tinytex()\n \n rmarkdown_params <- list(\n- inputFile = input_file\n- , alphaFile = alpha_file\n- , preprocDb = preproc_sqlite\n+\n+ # files\n+ alphaFile = alpha_file\n+ , anovaKseaMetadata = anova_ksea_metadata_file\n+ , imputedDataFilename = imputed_data_file\n+ , imputedQNLTDataFile = imp_qn_lt_data_file\n+ , inputFile = input_file\n+ , kseaAppPrepDb = ksea_sqlite_file\n+ , preprocDb = preproc_sqlite_file\n+\n+ # parameters\n , firstDataColumn = first_data_column\n+ , groupFilter = group_filter\n+ , groupFilterMode = group_filter_mode # arg sampleGroupFilterMode\n+ , groupFilterPatterns = group_filter_patterns # arg sampleGroupFilterPatterns\n , imputationMethod = imputation_method\n+ , intensityMinValuesPerGroup = intensity_min_values_per_class\n+ , kseaCutoffStatistic = ksea_cutoff_statistic\n+ , kseaCutoffThreshold = ksea_cutoff_threshold\n+ , kseaMinSubstrateCount = ksea_min_substrate_count\n+ , kseaUseAbsoluteLog2FC = ksea_use_absolute_log2_fc # add\n , meanPercentile = 
mean_percentile\n- , sdPercentile = sd_percentile\n+ , minQuality = min_quality # add\n+ , regexSampleGrouping = regex_sample_grouping\n , regexSampleNames = regex_sample_names\n- , regexSampleGrouping = regex_sample_grouping\n- , imputedDataFilename = imputed_data_file_name\n- , imputedQNLTDataFile = imp_qn_lt_data_filenm\n- , anovaKseaMetadata = anova_ksea_metadata\n- , kseaAppPrepDb = ksea_sqlite\n- , kseaCutoffThreshold = ksea_cutoff_threshold\n- , kseaCutoffStatistic = ksea_cutoff_statistic\n+ , sdPercentile = sd_percentile\n )\n \n print("rmarkdown_params")\n-str(rmarkdown_params)\n+print(rmarkdown_params)\n+print(\n+ lapply(\n+ X = rmarkdown_params,\n+ FUN = function(x) {\n+ paste0(\n+ nchar(as.character(x)),\n+ ": \'",\n+ as.character(x),\n+ "\'"\n+ )\n+ }\n+ )\n+)\n+\n \n # freeze the random number generator so the same results will be produced\n # from run to run\n set.seed(28571)\n \n-# BUG (or "opportunity")\n-# To render as PDF for the time being requires installing the conda\n-# package `r-texlive` until this issue in `texlive-core` is resolved:\n-# https://github.com/conda-forge/texlive-core-feedstock/issues/19\n-# This workaround is detailed in the fourth comment of:\n-# https://github.com/conda-forge/texlive-core-feedstock/issues/61\n+script_dir <- location_of_this_script()\n \n-library(tinytex)\n-tinytex::install_tinytex()\n rmarkdown::render(\n input = paste(script_dir, "mqppep_anova_script.Rmd", sep = "/")\n-, output_format = rmarkdown::pdf_document(toc = TRUE)\n , output_file = report_file_name\n , params = rmarkdown_params\n+, output_format = rmarkdown::pdf_document(\n+ includes = rmarkdown::includes(in_header = "mqppep_anova_preamble.tex")\n+ , dev = "pdf"\n+ , toc = TRUE\n+ , toc_depth = 2\n+ , number_sections = FALSE\n+ )\n )\n' |
b |
diff -r dbff53e6f75f -r 08678c931f5d mqppep_anova.xml --- a/mqppep_anova.xml Mon Jul 11 19:22:25 2022 +0000 +++ b/mqppep_anova.xml Fri Oct 28 18:27:21 2022 +0000 |
[ |
b'@@ -7,6 +7,28 @@\n <description>Runs ANOVA and KSEA for phosphopeptides.</description>\n <macros>\n <import>macros.xml</import>\n+ <xml name="group_matching_parm">\n+ <param name="group_filter_mode" type="select"\n+ help="Regular expression matching mode \'fixed\', \'perl\', or \'grep\' with option for case insensitivity. See https://rdrr.io/r/base/grep.html"\n+ label="Sample-group matching mode"\n+ >\n+ <option value="r">ERE ("extended regular expressions")</option>\n+ <option value="ri"> - ERE, case insensitive</option>\n+ <option value="p" selected="true">PCRE ("PERL-compatible regular expressions")</option>\n+ <option value="pi"> - PCRE, case insensitive</option>\n+ <option value="f">fixed strings ("no regular expressions")</option>\n+ <option value="fi"> - fixed strings, case insensitive</option>\n+ </param>\n+ <param name="group_filter_patterns" type="text" value=".+"\n+ help="Comma-separated list of regular expressions matching group-names"\n+ label="Sample-group matching pattern">\n+ <sanitizer>\n+ <valid initial="string.printable">\n+ <remove value="'"/>\n+ </valid>\n+ </sanitizer>\n+ </param>\n+ </xml>\n </macros>\n <edam_topics>\n <edam_topic>topic_0121</edam_topic><!-- proteomics -->\n@@ -27,29 +49,58 @@\n both need access to a writeable directory, but most directories in a\n biocontainer are read-only, so this builds a pseudo-home under /tmp\n -->\n+ <required_files>\n+ <include path="KSEA_impl_flowchart.pdf" />\n+ <include path="kinase_name_uniprot_lut.tabular.bz2" />\n+ <include path="kinase_uniprot_description_lut.tabular.bz2" />\n+ <include path="kinase_uniprot_description_lut.tabular.bz2" />\n+ <include path="mqppep_anova.R" />\n+ <include path="mqppep_anova_preamble.tex" />\n+ <include path="mqppep_anova_script.Rmd" />\n+ <include path="perpage.tex" />\n+ </required_files>\n <command detect_errors="exit_code"><![CDATA[\n+ (printenv | sort) &&\n cp \'$__tool_directory__/mqppep_anova_script.Rmd\' . 
&&\n- cp \'$__tool_directory__/mqppep_anova.R\' . &&\n+ cp \'$__tool_directory__/mqppep_anova.R\' . &&\n+ cp \'$__tool_directory__/kinase_name_uniprot_lut.tabular.bz2\' . &&\n+ cp \'$__tool_directory__/kinase_uniprot_description_lut.tabular.bz2\' . &&\n+ cp \'$__tool_directory__/mqppep_anova_preamble.tex\' . &&\n+ cp \'$__tool_directory__/perpage.tex\' . &&\n+ cp \'$__tool_directory__/KSEA_impl_flowchart.pdf\' . &&\n Rscript mqppep_anova.R\n --inputFile \'$input_file\'\n --alphaFile \'$alpha_file\'\n --preproc_sqlite \'$preproc_sqlite\'\n- --firstDataColumn $intensity_column_regex_f\n+ --firstDataColumn \'$intensity_column_regex_f\'\n --imputationMethod $imputation.imputation_method\n #if $imputation.imputation_method == "random"\n --meanPercentile \'$imputation.meanPercentile\'\n --sdPercentile \'$imputation.sdPercentile\'\n #end if\n- --regexSampleNames $sample_names_regex_f\n- --regexSampleGrouping $sample_grouping_regex_f\n- --imputedDataFile $imputed_data_file\n+ --regexSampleNames \'$sample_names_regex_f\'\n+ --regexSampleGrouping \'$sample_grouping_regex_f\'\n+ #if $group_filter.group_filter_method == "none"\n+ --sampleGroupFilter \'none\'\n+ #else\n+ --sampleGroupFilter \'$group_filter.group_filter_method\'\n+ --sampleGroupFilterPatterns \'$group_filter_patterns_f\'\n+ --sampleGroupFilterMode \'$group_filter.group_filter_mode\'\n+ #end if\n+ --intensityMinValuesPerClass \'$intnsty_min_vals_per_smpl_grp\'\n+ --imputedDataFile \'$imputed_data_file\'\n --imputedQNLTDataFile \'$im'..b'nitude of the differences across the contrast for all of the substrates when aggregating them to assess the enrichment of a given kinase\'s substrates. When FALSE, also consider the direction. Surprisingly, setting this to TRUE may decrease the enriched kinases. \n+\n+``Minimum quality of substrates for KSEA``\n+ An arbitrary "quality score" is assigned to each substrate, as described in the PDF report produced by the tool. 
This score takes into account both FDR-adjusted p-value and the number of missing values for each substrate. Setting the minimum to zero retains all substrates, which may be a large number.\n \n **Outputs**\n+===========\n \n-``imputed_intensities (input_file.imputation_method-imputed_intensities)``\n- Phosphopeptide MS intensities where missing values have been **imputed** by the chosen method, in tabular format.\n+Report dataset\n+ *[input file].[imputation method]*-``imputed_report``\n+\n+ Summary report for normalization, imputation, and **ANOVA**, in PDF format.\n \n-``imputed_QN_LT_intensities (input_file.imputation_method-imputed_QN_LT_intensities)``\n- Phosphopeptide MS intensities where missing values have been **imputed** by the chosen method, quantile-normalized (**QN**), and log10-transformed (**LT**), in tabular format.\n+Imputed intensities\n+ *[input file].[imputation method]*-``imputed_intensities``\n+\n+ Phosphopeptide MS intensities where missing values have been **imputed** by the chosen method, in tabular format.\n \n-``report_file (input_file.imputation_method-imputed_report)``\n- Summary report for normalization, imputation, and **ANOVA**, in PDF format.\n+Imputed quantile-normalized log-transformed intensities\n+ *[input file].[imputation method]*-``imputed_QN_LT_intensities``\n+\n+ Phosphopeptide MS intensities where missing values have been **imputed** by the chosen method, quantile-normalized (**QN**), and log10-transformed (**LT**), in tabular format.\n \n-``anova_ksea_metadata (input_file.imputation_method-imputed_anova_ksea_metadata)``\n- Phosphopeptide metadata including ANOVA significance and KSEA enrichments.\n+ANOVA KSEA metadata\n+ *[input file].[imputation method]*-``imputed_anova_ksea_metadata``\n+ Phosphopeptide metadata including ANOVA significance and KSEA enrichments.\n \n-``ksea_sqlite (input_file.imputation_method-imputed_ksea_sqlite)``\n- SQLite database for ad-hoc report creation.\n+KSEA SQLite database\n+ *[input 
file].[imputation method]*-``imputed_ksea_sqlite``\n+ An SQLite database that is usable for *ad hoc* report creation.\n \n **Algorithm**\n+=============\n \n-The KSEA algorithm used here is as in the KSEAapp package as reported in [Wiredja 2017].\n-The code is adapted from "Danica D. Wiredja (2017). KSEAapp: Kinase-Substrate Enrichment Analysis. R package version 0.99.0." to work with output from the "MaxQuant Phosphopeptide Preprocessing" Galaxy tool.\n+The KSEA algorithm used here is as in the KSEAapp package as reported in `[Wiredja 2017] <https://doi.org/10.1093/bioinformatics/btx415>`_.\n+The code is adapted from `"Danica D. Wiredja (2017). KSEAapp: Kinase-Substrate Enrichment Analysis. R package version 0.99.0." <https://cran.r-project.org/package=KSEAapp>`_ to work with output from the "MaxQuant Phosphopeptide Preprocessing" Galaxy tool and the multiple kinase-substrate databases that the latter tool searches.\n \n **Authors**\n+===========\n \n ``Larry C. Cheng``\n (`ORCiD 0000-0002-6922-6433 <https://orcid.org/0000-0002-6922-6433>`_) wrote the original script.\n@@ -337,5 +525,11 @@\n <citation type="doi">10.3791/57996</citation>\n <!-- Wiredja_2017 "The KSEA App ..." PMID: 28655153 -->\n <citation type="doi">10.1093/bioinformatics/btx415</citation>\n+ <citation type="bibtex">@Manual{,\n+ title = {KSEAapp: Kinase-Substrate Enrichment Analysis},\n+ author = {Danica D. Wiredja},\n+ year = {2017},\n+ note = {R package version 0.99.0},\n+ }</citation>\n </citations>\n </tool>\n' |
b |
diff -r dbff53e6f75f -r 08678c931f5d mqppep_anova_preamble.tex --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/mqppep_anova_preamble.tex Fri Oct 28 18:27:21 2022 +0000 |
[ |
@@ -0,0 +1,90 @@ +% ----------------------------------------------------------------------------- +% preamble includes BEGIN +% ----------------------------------------------------------------------------- +\usepackage{longtable, lscape, ifthen} + +% ----------------------------------------------------------------------------- +% put \T or \B at the ends of lines to add space for super- or sub- +% scripts above or below, respectively +% ref: ? +\newcommand\T{\rule{0pt}{2.6ex}} % Top strut +\newcommand\B{\rule[-1.2ex]{0pt}{0pt}} % Bottom strut + +% ----------------------------------------------------------------------------- +% horizontal line commands; ideally, these would compute the width rather than +% hardcoding it +% ref: ? +\def\hlinport{\makebox[6.5in]{\hrulefill} \\} % hline outside tabular, port +\def\hlinlscp{\makebox[9in]{\hrulefill} \\} % hline outside tabular, lndscp +%ref: https://stackoverflow.com/a/67335722 +\def\hlinnotab{\\makebox[1.0\linewidth]{\hrulefill}\\[1ex]} + +% ----------------------------------------------------------------------------- +% ref: https://latex.org/forum/viewtopic.php?p=23257#p23257 +\newcommand{\nonemptyline}[1]{% + %\ifthenelse{\equal{#1}{}}{do when empty}{do when not empty} + \ifthenelse{\equal{#1}{}}{}{#1}% +} + +% ----------------------------------------------------------------------------- +% For RMarkdown, I needed to put this into a preamble.tex file and include it +% via `output: pdf_document: includes: in_header: preamble.tex` because +% Markdown was expanding the \tabfill command before writing the tex file +% ref: https://tex.stackexchange.com/a/119477 in reply to +% https://tex.stackexchange.com/questions/119473/tabbing-and-line-wrapping +\makeatletter +\newlength\tdima +\newcommand\tabfill[1]{\setlength\tdima{\linewidth}% + \addtolength\tdima{\@totalleftmargin}% + \addtolength\tdima{-\dimen\@curtab}% + \parbox[t]{\tdima}{#1\ifhmode\strut\fi}} + %\parbox[t]{\tdima}{\nonemptyline{#1}\ifhmode\strut\fi}} 
+\makeatother +% +% Create a tabbing environment in which to use tabfill +% param #1 specifies the tabstops (as expected by the tabbing +% environment) and is provided in braces after invocation, e.g.: +% \begin{tabwrap}{\hspace{1.25in}\=} +% param #2 is the contents of the environment +\newenvironment{tabwrap}[2]{% + \begin{tabbing}#1\kill\ignorespaces% + #2}% + {\end{tabbing}% +} + +% ----------------------------------------------------------------------------- +% Make a caption for a non-floating figure or table, e.g., +% ref: https://github.com/rf-latex/capt-of/blob/main/capt-of.dtx +% https://texfaq.org/FAQ-figurehere +% +% Usage: \captionof{*type*}[*move*]{*caption*} +% *type* is `figure` or `table` (or some type you've +% defined with the `float` package) +% *move* is the optional moving argument *caption* (the thing +% that goes to the list of tables/figures) +% *caption* is the text of the caption +\makeatletter +\newcommand\captionof[1]{\def\@captype{#1}\caption} +\makeatother +% +%%ACE \captionof{table}{Hello world from line 210} +% To circumvent mis-numbering of interleaved float and non-float table +% and figure captions, it is necessary to include the `perpage` package and +% "make them sorted" (FFI see https://texfaq.org/FAQ-figurehere) +% I (ACE) don't know how to get this package to include: +% \usepackage{bigfoot} +% so I included the source instead: +\makeatletter +\input{perpage.tex} +\makeatother +% +% Ensure that table numbers are sorted +\MakeSorted{table} +% Ensure that figure numbers are sorted +\MakeSorted{figure} + +% ----------------------------------------------------------------------------- + +% ----------------------------------------------------------------------------- +% preamble includes END +% ----------------------------------------------------------------------------- |
b |
diff -r dbff53e6f75f -r 08678c931f5d mqppep_anova_script.Rmd --- a/mqppep_anova_script.Rmd Mon Jul 11 19:22:25 2022 +0000 +++ b/mqppep_anova_script.Rmd Fri Oct 28 18:27:21 2022 +0000 |
[ |
b'@@ -7,81 +7,153 @@\n date:\n - "May 28, 2018"\n - "; revised June 23, 2022"\n+lot: true\n output:\n pdf_document:\n toc: true\n- toc_depth: 3\n+ toc_depth: 2\n keep_tex: true\n-header-includes:\n- - \\usepackage{longtable}\n- - \\newcommand\\T{\\rule{0pt}{2.6ex}} % Top strut\n- - \\newcommand\\B{\\rule[-1.2ex]{0pt}{0pt}} % Bottom strut\n+ dev: pdf\n+ includes:\n+ in_header: mqppep_anova_preamble.tex\n+latex_macros: false\n+raw_tex: true\n+urlcolor: blue\n params:\n alphaFile: "test-data/alpha_levels.tabular"\n inputFile: "test-data/test_input_for_anova.tabular"\n preprocDb: "test-data/test_input_for_anova.sqlite"\n kseaAppPrepDb: !r c(":memory:", "test-data/mqppep.sqlite")[2]\n- show_toc: true\n- firstDataColumn: "^Intensity[^_]"\n- imputationMethod: !r c("group-median", "median", "mean", "random")[1]\n- meanPercentile: 1\n- sdPercentile: 1.0\n regexSampleNames: "\\\\.\\\\d+[A-Z]$"\n regexSampleGrouping: "\\\\d+"\n+ groupFilterPatterns: ".+"\n+ groupFilter: !r c("none", "exclude", "include")[1]\n+ imputationMethod: !r c("group-median", "median", "mean", "random")[4]\n+ kseaCutoffThreshold: !r c(0.05, 0.1, 0.25, 0.5, 0.9)[5]\n+ #imputationMethod: !r c("group-median", "median", "mean", "random")[1]\n+\n+ # how should sample groups be interpreted?\n+ # - "f": fixed patterns (like `grep -F`)\n+ # - "p": PERL-compatible (like `grep -P`)\n+ # - "r": extended grep patterns (like `grep -E`)\n+ # use what case sensitivity?\n+ # - "i": case insensitive matching (like `grep -i`)\n+ groupFilterMode: !r c("r", "ri", "p", "pi", "f", "fi")[1]\n+ # what pattern should be used for the first column\n+ # (extended grep pattern, case sensitive)\n+ firstDataColumn: "^Intensity[^_]"\n+ # for small random value imputation, what percentile should be center?\n+ meanPercentile: 50\n+ #meanPercentile: 1\n+ # for small random value imputation, what should `s / mean(x)` ratio be?\n+ sdPercentile: 1.0\n+ # output path for imputed data file\n imputedDataFilename: 
"test-data/limbo/imputedDataFilename.txt"\n+ # output path for imputed/quantile-normalized/log-transformed data file\n imputedQNLTDataFile: "test-data/limbo/imputedQNLTDataFile.txt"\n+ # output path for contents of `stats_metadata_v` table\n anovaKseaMetadata: "test-data/limbo/anovaKseaMetadata.txt"\n+ # how to test one variable with > 2 categories (e.g., aov or kruskal.test)\n oneWayManyCategories: !r c("aov", "kruskal.test", "oneway.test")[1]\n+ # how to test one variable with 2 categories (e.g., oneway.test)\n oneWayTwoCategories: !r c("aov", "kruskal.test", "oneway.test")[3]\n- kseaCutoffStatistic: !r c("p.value", "FDR")[2]\n- kseaCutoffThreshold: !r c( 0.1, 0.05)[2]\n- kseaMinKinaseCount: 1\n- intensityHeatmapRows: 75\n+ # what should be the minimum quality for consideration in both\n+ minQuality: 0\n+ # correct KSEA with FDR (recommended) or raw p-value\n+ kseaCutoffStatistic: !r c("FDR", "p.value")[1]\n+ # correct KSEA threshold 0.05 (conventional) or higher (perhaps better)\n+ # "perhaps better" meaning that KSEA is an hypothesis-generator, not -test\n+ #kseaCutoffThreshold: !r c(0.05, 0.1, 0.25, 0.5)[1]\n+ # minimum number of substrates required for a kinase to be considered in KSEA\n+ kseaMinSubstrateCount: 1\n+ # Should KSEA be performed aggregating signed log2FC or absolute?\n+ # FALSE use raw log2FC for KSEA as for KSEAapp::KSEA.Scores\n+ # TRUE use abs(log2FC) for KSEA as Justin Drake requested; this is a\n+ # justifiable deviation from the KSEAapp::KSEA.Scores algorithm.\n+ kseaUseAbsoluteLog2FC: TRUE\n+ #kseaUseAbsoluteLog2FC: FALSE\n+ # minimum number of observed values per sample-group\n+ intensityMinValuesPerGroup: 1\n+ # maximum number of heatmap rows (result are poor when > 50)\n+ intensityHeatmapRows: 50\n+ # what should be the primary criterion to eliminate excessive heatmap rows\n+ intensityHeatmapCriteria: '..b'nb_messages) nbe("Output quantile normalized data tabular file\\n")\n+ write.table(\n+ data_table_imputed,\n+ file = 
imp_qn_lt_data_filenm,\n+ sep = "\\t",\n+ col.names = TRUE,\n+ row.names = FALSE,\n+ quote = FALSE\n )\n \n-\n-#output quantile normalized data\n-impish <- cbind(rownames(quant_data_imp_qn_log), quant_data_imp_qn_log)\n-colnames(impish)[1] <- "Phosphopeptide"\n-data_table_imputed <- sqldf(data_table_imputed_sql)\n-# Zap the duplicated \'Phosphopeptide\' column named \'ppep\'\n-data_table_imputed <-\n- data_table_imputed[, c(1:12, 14:ncol(data_table_imputed))]\n-write.table(\n- data_table_imputed,\n- file = imp_qn_lt_data_filenm,\n- sep = "\\t",\n- col.names = TRUE,\n- row.names = FALSE,\n- quote = FALSE\n-)\n-\n-ppep_kinase <- sqldf("\n- SELECT DISTINCT k.ppep, k.kinase\n- FROM (\n- SELECT DISTINCT gene AS kinase, SUB_MOD_RSD AS ppep\n- FROM pseudo_ksdata\n- WHERE GENE IN (SELECT kinase FROM enriched_kinases)\n- ) k\n- ORDER BY k.ppep, k.kinase\n- ")\n-\n-RSQLite::dbWriteTable(\n- conn = db,\n- name = "ksea_enriched_ks",\n- value = ppep_kinase,\n- append = FALSE\n- )\n+ ppep_kinase <- sqldf("\n+ SELECT DISTINCT k.ppep, k.kinase\n+ FROM (\n+ SELECT DISTINCT gene AS kinase, SUB_MOD_RSD AS ppep\n+ FROM pseudo_ksdata\n+ WHERE GENE IN (SELECT kinase FROM enriched_kinases)\n+ ) k\n+ ORDER BY k.ppep, k.kinase\n+ ")\n+\n+ RSQLite::dbWriteTable(\n+ conn = db,\n+ name = "ksea_enriched_ks",\n+ value = ppep_kinase,\n+ append = FALSE\n+ )\n+}\n+\n+if (print_nb_messages) nb("RSQLite::dbWriteTable anova_signif\\n")\n \n RSQLite::dbWriteTable(\n conn = db,\n@@ -3453,6 +6293,8 @@\n "\n )\n \n+if (print_nb_messages) nb("Output contents of `stats_metadata_v` table to tabular file\\n")\n+if (print_nb_messages) nbe("Output contents of `stats_metadata_v` table to tabular file\\n")\n write.table(\n dbReadTable(db, "stats_metadata_v"),\n file = anova_ksea_mtdt_file,\n@@ -3462,75 +6304,21 @@\n quote = FALSE\n )\n \n+cat("\\n\\\\clearpage\\n")\n \n ```\n \n+# Data-processing summary flowchart\n+\n+![Flowchart showing ANOVA and KSEA data-processing steps](KSEA_impl_flowchart.pdf)\n+\n ```{r 
parmlist, echo = FALSE, fig.dim = c(9, 10), results = \'asis\'}\n cat("\\\\leavevmode\\n\\n\\n")\n \n-# write parameters to report\n-\n-param_unlist <- unlist(as.list(params))\n-param_df <- data.frame(\n- parameter = paste0("\\\\verb@", names(param_unlist), "@"),\n- value = paste0("\\\\verb@", gsub("$", "\\\\$", param_unlist, fixed = TRUE), "@")\n- )\n-\n-data_frame_latex(\n- x = param_df,\n- justification = "p{0.35\\\\linewidth} p{0.6\\\\linewidth}",\n- centered = TRUE,\n- caption = "Input parameters",\n- anchor = const_table_anchor_bp,\n- underscore_whack = FALSE\n- )\n-\n-# write parameters to SQLite output\n-\n-mqppep_anova_script_param_df <- data.frame(\n- script = "mqppep_anova_script.Rmd",\n- parameter = names(param_unlist),\n- value = param_unlist\n- )\n-ddl_exec(db, "\n- DROP TABLE IF EXISTS script_parameter;\n- "\n-)\n-ddl_exec(db, "\n- CREATE TABLE IF NOT EXISTS script_parameter(\n- script TEXT,\n- parameter TEXT,\n- value ANY,\n- UNIQUE (script, parameter) ON CONFLICT REPLACE\n- )\n- ;\n- "\n-)\n-RSQLite::dbWriteTable(\n- conn = db,\n- name = "script_parameter",\n- value = mqppep_anova_script_param_df,\n- append = TRUE\n-)\n-\n+write_params(db)\n # We are done with output\n RSQLite::dbDisconnect(db)\n+\n+cat("\\\\clearpage\\n\\\\section{R package versions}\\n")\n+utils::toLatex(utils::sessionInfo())\n ```\n-<!--\n-There\'s gotta be a better way...\n-\n-loaded_packages_df <- sessioninfo::package_info("loaded")\n-loaded_packages_df[, "library"] <- as.character(loaded_packages_df$library)\n-loaded_packages_df <- data.frame(\n- package = loaded_packages_df$package,\n- version = loaded_packages_df$loadedversion,\n- date = loaded_packages_df$date\n- )\n-data_frame_latex(\n- x = loaded_packages_df,\n- justification = "l | l l",\n- centered = FALSE,\n- caption = "Loaded R packages",\n- anchor = const_table_anchor_bp\n- )\n--->\n' |
b |
diff -r dbff53e6f75f -r 08678c931f5d mqppep_mrgfltr.py --- a/mqppep_mrgfltr.py Mon Jul 11 19:22:25 2022 +0000 +++ b/mqppep_mrgfltr.py Fri Oct 28 18:27:21 2022 +0000 |
[ |
b'@@ -87,7 +87,10 @@\n nargs=1,\n required=True,\n dest="phosphopeptides",\n- help="Phosphopeptide data for experimental results, including the intensities and the mapping to kinase domains, in tabular format",\n+ help=" ".join([\n+ "Phosphopeptide data for experimental results, including the",\n+ "intensities and the mapping to kinase domains, in tabular format"\n+ ]),\n )\n # UniProtKB/SwissProt DB input, SQLite\n parser.add_argument(\n@@ -106,7 +109,10 @@\n required=False,\n default=[],\n dest="species",\n- help="limit PhosphoSitePlus records to indicated species (field may be empty)",\n+ help=" ".join([\n+ "limit PhosphoSitePlus records to indicated species",\n+ "(field may be empty)"\n+ ]),\n )\n \n # outputs:\n@@ -174,7 +180,7 @@\n # determine species to limit records from PSP_Regulatory_Sites\n if options.species is None:\n exit(\n- \'Argument "species" is required (and may be empty) but not supplied\'\n+ \'Argument "species" is required (& may be empty) but not supplied\'\n )\n try:\n if len(options.species) > 0:\n@@ -216,20 +222,25 @@\n FUNCTION_PHOSPHORESIDUE = (\n "Function Phosphoresidue(PSP=PhosphoSitePlus.org)"\n )\n- GENE_NAME = "Gene_Name" # Gene Name from UniProtKB\n- ON_FUNCTION = (\n- "ON_FUNCTION" # ON_FUNCTION column from PSP_Regulatory_Sites\n- )\n- ON_NOTES = "NOTES" # NOTES column from PSP_Regulatory_Sites\n- ON_OTHER_INTERACT = "ON_OTHER_INTERACT" # ON_OTHER_INTERACT column from PSP_Regulatory_Sites\n- ON_PROCESS = (\n- "ON_PROCESS" # ON_PROCESS column from PSP_Regulatory_Sites\n- )\n- ON_PROT_INTERACT = "ON_PROT_INTERACT" # ON_PROT_INTERACT column from PSP_Regulatory_Sites\n+ # Gene Name from UniProtKB\n+ GENE_NAME = "Gene_Name"\n+ # ON_FUNCTION column from PSP_Regulatory_Sites\n+ ON_FUNCTION = ("ON_FUNCTION")\n+ # NOTES column from PSP_Regulatory_Sites\n+ ON_NOTES = "NOTES"\n+ # ON_OTHER_INTERACT column from PSP_Regulatory_Sites\n+ ON_OTHER_INTERACT = "ON_OTHER_INTERACT"\n+ # ON_PROCESS column from PSP_Regulatory_Sites\n+ ON_PROCESS = 
("ON_PROCESS")\n+ # ON_PROT_INTERACT column from PSP_Regulatory_Sites\n+ ON_PROT_INTERACT = "ON_PROT_INTERACT"\n PHOSPHOPEPTIDE = "Phosphopeptide"\n PHOSPHOPEPTIDE_MATCH = "Phosphopeptide_match"\n PHOSPHORESIDUE = "Phosphoresidue"\n- PUTATIVE_UPSTREAM_DOMAINS = "Putative Upstream Kinases(PSP=PhosphoSitePlus.org)/Phosphatases/Binding Domains"\n+ PUTATIVE_UPSTREAM_DOMAINS = " ".join([\n+ "Putative Upstream Kinases(PSP=PhosphoSitePlus.org)/",\n+ "Phosphatases/Binding Domains"\n+ ])\n SEQUENCE = "Sequence"\n SEQUENCE10 = "Sequence10"\n SEQUENCE7 = "Sequence7"\n@@ -328,8 +339,26 @@\n CitationData\n ) VALUES (?,?)\n """\n- CITATION_INSERT_PSP = \'PhosphoSitePlus(R) (PSP) was created by Cell Signaling Technology Inc. It is licensed under a Creative Commons Attribution-NonCommercial-ShareAlike 3.0 Unported License. When using PSP data or analyses in printed publications or in online resources, the following acknowledgements must be included: (a) the words "PhosphoSitePlus(R), www.phosphosite.org" must be included at appropriate places in the text or webpage, and (b) the following citation must be included in the bibliography: "Hornbeck PV, Zhang B, Murray B, Kornhauser JM, Latham V, Skrzypek E PhosphoSitePlus, 2014: mutations, PTMs and recalibrations. Nucleic Acids Res. 2015 43:D512-20. PMID: 25514926."\'\n- CITATION_INSERT_PSP_REF = \'Hornbeck, 2014, "PhosphoSitePlus, 2014: mutations, PTMs and recalibrations.", https://pubmed.ncbi.nlm.nih.gov/22135298, https://doi.org/10.1093/nar/gkr1122\'\n+ CITATION_INSERT_PSP = " '..b'- end read upstream_data_melt --------------------------------------\n+ # ... 
end read upstream_data_melt ---------------------------------\n \n end_time = time.process_time() # timer\n print(\n@@ -1332,10 +1312,13 @@\n if p_peptide in melt_dict:\n melt_dict[p_peptide].append(characterization)\n else:\n- exit(\n- \'Phosphopeptide %s not found in ppep_mapping_db: "phopsphopeptides" and "ppep_mapping_db" must both originate from the same run of mqppep_kinase_mapping\'\n- % (p_peptide)\n- )\n+ los = [\n+ "Phosphopeptide %s" % p_peptide,\n+ "not found in ppep_mapping_db:",\n+ \'"phopsphopeptides" and "ppep_mapping_db" must both\',\n+ "originate from the same run of mqppep_kinase_mapping"\n+ ]\n+ exit(" ".join(los))\n \n end_time = time.process_time() # timer\n print(\n@@ -1397,29 +1380,12 @@\n ]\n ]\n \n- # cols_output_prelim = output_df.columns.tolist()\n- #\n- # print("cols_output_prelim")\n- # print(cols_output_prelim)\n- #\n- # cols_output = cols_output_prelim[:8]+[cols_output_prelim[9]]+[cols_output_prelim[10]]\n- #\n- # print("cols_output with p-peptide")\n- # print(cols_output)\n- #\n- # cols_output = [col for col in cols_output if not col == "p-peptide"]\n- #\n- # print("cols_output")\n- # print(cols_output)\n- #\n- # output_df = output_df[cols_output]\n-\n # join output_df back to quantitative columns in data_in df\n quant_cols = data_in.columns.tolist()\n quant_cols = quant_cols[1:]\n quant_data = data_in[quant_cols]\n \n- # ----------- Write merge/filter metadata to SQLite database (start) -----------\n+ # ---- Write merge/filter metadata to SQLite database (start) ----\n # Open SwissProt SQLite database\n conn = sql.connect(output_sqlite)\n cur = conn.cursor()\n@@ -1467,7 +1433,7 @@\n \n # Close SwissProt SQLite database\n conn.close()\n- # ----------- Write merge/filter metadata to SQLite database (finish) -----------\n+ # ---- Write merge/filter metadata to SQLite database (finish) ----\n \n output_df = output_df.merge(\n quant_data,\n@@ -1480,15 +1446,18 @@\n output_df = output_df[output_cols]\n \n # cosmetic changes to Upstream 
column\n+ # fill the NaN with "" for those Phosphopeptides that got a\n+ # "WARNING: Failed match for " in the upstream mapping\n output_df[PUTATIVE_UPSTREAM_DOMAINS] = output_df[\n PUTATIVE_UPSTREAM_DOMAINS\n ].fillna(\n ""\n- ) # fill the NaN with "" for those Phosphopeptides that got a "WARNING: Failed match for " in the upstream mapping\n+ )\n us_series = pandas.Series(output_df[PUTATIVE_UPSTREAM_DOMAINS])\n i = 0\n while i < len(us_series):\n- # turn blanks into N_A to signify the info was searched for but cannot be found\n+ # turn blanks into N_A to signify the info\n+ # that was searched for but cannot be found\n if us_series[i] == "":\n us_series[i] = N_A\n i += 1\n@@ -1530,8 +1499,9 @@\n # Rev. 7/1/2016\n # Rev. 7/3/2016 : fill NaN in Upstream column to replace to N/A\'s\n # Rev. 7/3/2016: renamed Upstream column to PUTATIVE_UPSTREAM_DOMAINS\n- # Rev. 12/2/2021: Converted to Python from ipynb; use fast Aho-Corasick searching; \\\n- # read from SwissProt SQLite database\n+ # Rev. 12/2/2021: Converted to Python from ipynb; use fast \\\n+ # Aho-Corasick searching; \\\n+ # read from SwissProt SQLite database\n # Rev. 12/9/2021: Transfer code to Galaxy tool wrapper\n \n #\n' |
b |
diff -r dbff53e6f75f -r 08678c931f5d perpage.tex --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/perpage.tex Fri Oct 28 18:27:21 2022 +0000 |
[ |
b"@@ -0,0 +1,547 @@\n+% \\iffalse\n+%%\n+%% perpage is part of the bigfoot bundle for critical typesetting\n+%% Copyright 2002--2014 David Kastrup <dak@gnu.org>\n+%%\n+%% The license notice and corresponding source code for this file are\n+%% contained in perpage.dtx.\n+%%\n+% This program is free software; you can redistribute it and/or modify\n+% it under the terms of the GNU General Public License as published by\n+% the Free Software Foundation; either version 2 of the License, or\n+% (at your option) any later version.\n+%\n+% This program is distributed in the hope that it will be useful,\n+% but WITHOUT ANY WARRANTY; without even the implied warranty of\n+% MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the\n+% GNU General Public License for more details.\n+%\n+% You should have received a copy of the GNU General Public License\n+% along with this program; if not, write to the Free Software\n+% Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA\n+% \\fi\n+% \\CheckSum{396}\n+% \\GetFileInfo{perpage.sty}\n+% \\date{\\filedate}\n+% \\author{David Kastrup\\thanks{\\texttt{dak@gnu.org}}}\n+% \\title{The \\texttt{perpage} package\\\\Version \\fileversion}\n+% \\maketitle\n+% \\section{Description}\n+%\n+% The \\texttt{perpage} package adds the ability to reset counters per\n+% page and/or keep their occurences sorted in order of appearance on\n+% the page.\n+%\n+% It works by attaching itself to the code for \\cmd{\\stepcounter} and\n+% will then modify the given counter according to information written\n+% to the |.aux| file, which means that multiple passes may be needed.\n+% Since it uses the internals of the \\cmd{\\label} mechanism, the need\n+% for additional passes will get announced by \\LaTeX\\ as ``labels may\n+% have changed''.\n+%\n+% \\DescribeMacro{\\MakePerPage}\n+% \\begin{quote}\n+% |\\MakePerPage[2]{footnote}|\n+% \\end{quote}\n+% will start footnote numbers with~2 on each page (the optional\n+% argument defaults 
to~1). 2~might be a strange number, unless you\n+% have used something like\n+% \\begin{quote}\n+% |\\renewcommand\\thefootnote{\\fnsymbol{footnote}}|\n+% \\end{quote}\n+% and want to start off with a dagger. The starting value must not be\n+% less than~1 so that the counter logic can detect the reset of a\n+% counter\n+% reliably.\\footnote{This unfortunately means that you can't just use\n+% \\cmd{\\alph} in order to get figures on page~10 numbered as ``10'',\n+% ``10a'', ``10b''.}\n+% It could be a good idea to redefine |\\@cnterr| if you use a format\n+% with limited range: at the first pass, footnotes are not reset\n+% across pages and things like |\\fnsymbol| will quickly run out of\n+% characters to use.\n+%\n+% \\DescribeMacro{\\theperpage}\n+% If you want to label things also on a per page base, for example\n+% with\n+% \\begin{quote}\n+% |\\renewcommand{\\thefigure}{\\thepage-\\arabic{figure}}|\n+% \\end{quote}\n+% you'll have the problem that \\cmd{\\thepage} is updated\n+% asynchronously with the real page, since \\TeX\\ does not know which\n+% page the figure will end up. If you have used the |perpage| package\n+% for modifying the figure counter, however, at the point where the\n+% counter is incremented, the macro \\cmd{\\theperpage} will be set to\n+% the correct value corresponding to the actual page location. Note\n+% that this macro is shared between all counters, so advancing a\n+% different counter under control of |perpage| will render\n+% \\cmd{\\thefigure} incorrect.\n+%\n+% \\DescribeMacro{\\MakeSorted}\n+% \\begin{quote}\n+% |\\MakeSorted{figure}|\n+% \\end{quote}\n+% will make the |figure| counter get `sorted': this means that counter\n+% values will be assigned in order of appearance in the output, not in\n+% order of appearance in the source code. For example, the order of\n+% interspersed one- and two-column figures might get mixed up by\n+% \\LaTeX\\ in the output. 
Making the counter sorted will fix the order\n+% to match the order of appearance. A similar problem is when\n+% ordinary footnotes are present in floating material (this does not\n+% work in standard "..b'@\n+ \\penalty \\ifnum\\count@<\\@M \\@M \\else \\count@ \\fi\n+ \\else \\kern\\dimen@\\fi\n+ \\pp@cl@end}\n+% \\end{macrocode}\n+% \\end{macro}\n+% \\begin{macro}{\\pp@labeldef}\n+% This is a helper macro.\n+% \\begin{macrocode}\n+\\def\\pp@labeldef#1#2#3#4#5{\\@newl@bel{pp@r@#2}{#3}{{#1}{#4}{#5}}}\n+% \\end{macrocode}\n+% \\end{macro}\n+% \n+% \\begin{macro}{\\pp@pagectr}\n+% This is the workhorse for normal per page counters. It is called\n+% whenever the |.aux| file is read in and establishes the\n+% appropriate information for each counter advancement in a\n+% pseudolabel.\n+% \\begin{macrocode}\n+\\def\\pp@pagectr#1#2#3#4{\\@ifundefined{c@pp@a@#1}{}{%\n+ \\addtocounter{pp@a@#1}\\@ne\n+ \\expandafter\\pp@labeldef\\expandafter\n+ {\\number\\value{pp@a@#1}}{#1}{#2}{#3}{#4}}}\n+% \\end{macrocode}\n+% \\end{macro}\n+% \\begin{macro}{\\c@schk@}\n+% This is called for implementing sorted counters. Sorted counters\n+% maintain a ``count group\'\', and the values in each count group are\n+% numbered independently from that of other count groups. Whenever\n+% a counter is found to have been reset, it will start a new count\n+% group. 
At the end of document, the count group counters need to\n+% get reset, too, so that the check for changed |.aux| files will\n+% still work.\n+% \\begin{macrocode}\n+\\def\\c@schk@#1{\\pp@cl@begin\n+ \\addtocounter{pp@a@#1}\\@ne\n+ \\ifnum\\value{#1}=\\@ne\n+ \\expandafter\\xdef\\csname pp@g@#1\\endcsname{\\number\\value{pp@a@#1}}%\n+ \\edef\\next{\\noexpand\\AtEndDocument{\\global\\let\n+ \\expandafter\\noexpand\\csname pp@g@#1@\\number\\value{pp@a@#1}\\endcsname\n+ \\relax}}\\next\n+ \\fi\n+ \\pp@fetchctr{#1}%\n+ \\ifx\\pp@page\\@empty\n+ \\else \\setcounter{#1}{\\pp@label}\\fi\n+ \\pp@writectr\\pp@spagectr{#1}{\\csname pp@g@#1\\endcsname}}%\n+% \\end{macrocode}\n+% \\end{macro}\n+% \\begin{macro}{\\pp@spagectr}\n+% This is the code advancing the respective value of the appropriate\n+% count group and assigning the label.\n+% \\begin{macrocode}\n+\\def\\pp@spagectr#1#2#3#4{\\@ifundefined{c@pp@a@#1}{}{%\n+ \\count@0\\csname pp@g@#1@#3\\endcsname\n+ \\advance\\count@\\@ne\n+ \\expandafter\\xdef\\csname pp@g@#1@#3\\endcsname{\\number\\count@}%\n+ \\expandafter\\pp@labeldef\\expandafter\n+ {\\number\\count@}{#1}{#2}{#3}{#4}}}\n+% \\end{macrocode}\n+% \\end{macro}\n+% \\begin{macro}{\\c@spchk@}\n+% And this finally is the counter advance code for sorted counters\n+% per page. 
Basically, we just use one count group per page.\n+% Resetting a counter manually will not introduce a new count group,\n+% and it would be hard to decide what to do in case count groups and\n+% page positions overlap.\n+% \\begin{macrocode}\n+\\def\\c@spchk@#1{\\pp@cl@begin\n+ \\addtocounter{pp@a@#1}\\@ne\n+ \\pp@fetchctr{#1}%\n+ \\ifx\\pp@page\\@empty\n+ \\else \\setcounter{#1}{\\pp@label}\\fi\n+ \\pp@writectr\\pp@ppagectr{#1}{\\noexpand\\theabspage}}\n+% \\end{macrocode}\n+% \\end{macro}\n+% \\begin{macro}{\\pp@ppagectr}\n+% \\begin{macrocode}\n+\\def\\pp@ppagectr#1#2#3#4{\\@ifundefined{c@pp@a@#1}{}{%\n+ \\def\\next{#3}%\n+ \\expandafter\\ifx\\csname pp@page@#1\\endcsname\\next\n+ \\addtocounter{pp@a@#1}\\@ne\n+ \\else\n+ \\setcounter{pp@a@#1}{\\value{pp@r@#1}}%\n+ \\fi\n+ \\global\\expandafter\\let\\csname pp@page@#1\\endcsname\\next\n+ \\expandafter\\pp@labeldef\\expandafter\n+ {\\number\\value{pp@a@#1}}{#1}{#2}{#3}{#4}}}\n+% \\end{macrocode}\n+% \\end{macro}\n+% \\begin{macro}{\\@testdef}\n+% \\LaTeX\'s current (2007) definition of this macro causes save stack\n+% overflow. We fix this by an additional grouping. Delay to the\n+% beginning of document to keep Babel happy.\n+% \\begin{macrocode}\n+\\AtBeginDocument{%\n+ \\begingroup\n+ \\@testdef{}{undefined}{}%\n+ \\expandafter\n+ \\endgroup\n+ \\ifx\\@undefined\\relax\n+ \\let\\pp@@testdef\\@testdef\n+ \\def\\@testdef#1#2#3{{\\pp@@testdef{#1}{#2}{#3}%\n+ \\if@tempswa\\aftergroup\\@tempswatrue\\fi}}%\n+ \\fi}\n+%</style>\n+% \\end{macrocode}\n+% \\end{macro}\n+% \n+% \\Finale\n+% \\endinput\n+% Local Variables: \n+% mode: doctex\n+% TeX-master: "perpage.drv"\n+% End: \n' |
b |
diff -r dbff53e6f75f -r 08678c931f5d search_ppep.py --- a/search_ppep.py Mon Jul 11 19:22:25 2022 +0000 +++ b/search_ppep.py Fri Oct 28 18:27:21 2022 +0000 |
[ |
@@ -237,7 +237,10 @@ # Parse Command Line parser = argparse.ArgumentParser( - description="Phopsphoproteomic Enrichment phosphopeptide SwissProt search (in place in SQLite DB)." + description=" ".join([ + "Phopsphoproteomic Enrichment", + "phosphopeptide SwissProt search (in place in SQLite DB)." + ]) ) # inputs: @@ -249,7 +252,11 @@ nargs=1, required=True, dest="phosphopeptides", - help="Phosphopeptide data for experimental results, generated by the Phopsphoproteomic Enrichment Localization Filter tool", + help=" ".join([ + "Phosphopeptide data for experimental results,", + "generated by the Phopsphoproteomic Enrichment Localization", + "Filter tool" + ]), ) parser.add_argument( "--uniprotkb", @@ -257,7 +264,10 @@ nargs=1, required=True, dest="uniprotkb", - help="UniProtKB/Swiss-Prot data, converted from FASTA format by the Phopsphoproteomic Enrichment Kinase Mapping tool", + help=" ".join([ + "UniProtKB/Swiss-Prot data, converted from FASTA format by the", + "Phopsphoproteomic Enrichment Kinase Mapping tool" + ]), ) parser.add_argument( "--schema", @@ -310,7 +320,8 @@ cur.executescript(DROP_TABLES_SQL) # if options.db_schema: - # print("\nAfter dropping tables/views that are to be created, schema is:") + # print("\nAfter dropping tables/views that are to be created," + # + schema is:") # cur.execute("SELECT * FROM sqlite_schema") # for row in cur.fetchall(): # if row[4] is not None: @@ -403,7 +414,11 @@ deppep_count = row[0] cur.execute( - "SELECT count(*) FROM (SELECT Sequence FROM UniProtKB GROUP BY Sequence)" + """ + SELECT count(*) FROM ( + SELECT Sequence FROM UniProtKB GROUP BY Sequence + ) + """ ) for row in cur.fetchall(): sequence_count = row[0] @@ -431,9 +446,11 @@ old_seq = "" for row in cur.fetchall(): if duplicate_count == 0: - print( - "\nEach of the following sequences is associated with several accession IDs (which are listed in the first column) but the same gene ID (which is listed in the second column)." 
- ) + print(" ".join([ + "\nEach of the following sequences is associated with several", + "accession IDs (which are listed in the first column) but", + "the same gene ID (which is listed in the second column)." + ])) if row[2] != old_seq: old_seq = row[2] duplicate_count += 1 @@ -480,13 +497,19 @@ ) else: raise ValueError( - "UniProtKB_id %s, but Sequence is None: Check whether SwissProt file is missing sequence for this ID" - % (UniProtKB_id,) + "UniProtKB_id %s, but Sequence is None: %s %s" + % ( + UniProtKB_id, + "Check whether SwissProt file is missing", + "the sequence for this ID") ) ker.execute( """ - SELECT count(*) || ' accession-peptide-phosphopeptide combinations were found' - FROM uniprotkb_pep_ppep_view + SELECT + count(*) || + ' accession-peptide-phosphopeptide combinations were found' + FROM + uniprotkb_pep_ppep_view """ ) for row in ker.fetchall(): @@ -494,7 +517,9 @@ ker.execute( """ - SELECT count(*) || ' accession matches were found', count(*) AS accession_count + SELECT + count(*) || ' accession matches were found', + count(*) AS accession_count FROM ( SELECT accession FROM uniprotkb_pep_ppep_view @@ -520,7 +545,9 @@ ker.execute( """ - SELECT count(*) || ' phosphopeptide matches were found', count(*) AS phosphopeptide_count + SELECT + count(*) || ' phosphopeptide matches were found', + count(*) AS phosphopeptide_count FROM ( SELECT phosphopeptide FROM uniprotkb_pep_ppep_view |