mqppep_preproc: mqppep_anova.R comparison

comparison mqppep_anova.R @ 1:b76c75521d91 draft

planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit 43e7a43b545c24b2dc33d039198551c032aa79be

author	galaxyp
date	Fri, 28 Oct 2022 18:26:42 +0000
parents	8dfd5d2b5903
children	bae3a23461c9

comparison

equal deleted inserted replaced

-:8dfd5d2b5903
+:b76c75521d91
 #!/usr/bin/env Rscript
 # libraries
 library(optparse)
-library(data.table)
 library(stringr)
+library(tinytex)
 # ref for parameterizing Rmd document: https://stackoverflow.com/a/37940285
 # parse options
 option_list <- list(
-make_option(
-c("-i", "--inputFile"),
+# files
-action = "store",
-default = NA,
-type = "character",
-help = "Phosphopeptide Intensities sparse input file path"
-),
 make_option(
 c("-a", "--alphaFile"),
 action = "store",
 default = NA,
 type = "character",
 help = paste0("List of alpha cutoff values for significance testing;",
 " path to text file having one column and no header")
 ),
 make_option(
-c("-S", "--preproc_sqlite"),
+c("-M", "--anova_ksea_metadata"),
 action = "store",
-default = NA,
+default = "anova_ksea_metadata.tsv",
 type = "character",
-help = "Path to 'preproc_sqlite' produced by `mqppep_mrgfltr.py`"
+help = "Phosphopeptide metadata, ANOVA FDR, and KSEA enribhments"
-),
-make_option(
-c("-K", "--ksea_sqlite"),
-action = "store",
-default = NA,
-type = "character",
-help = "Path to 'ksea_sqlite' output produced by this tool"
-),
-make_option(
-c("-f", "--firstDataColumn"),
-action = "store",
-default = "^Intensity[^_]",
-type = "character",
-help = "First column of intensity values"
-),
-make_option(
-c("-m", "--imputationMethod"),
-action = "store",
-default = "random",
-type = "character",
-help = paste0("Method for missing-value imputation,",
-" one of c('group-median','median','mean','random')")
-),
-make_option(
-c("-p", "--meanPercentile"),
-action = "store",
-default = 3,
-type = "integer",
-help = paste0("Mean percentile for randomly generated imputed values;",
-", range [1,99]")
-),
-make_option(
-c("-d", "--sdPercentile"),
-action = "store",
-default = 3,
-type = "double",
-help = paste0("Adjustment value for standard deviation of",
-" randomly generated imputed values; real")
-),
-make_option(
-c("-s", "--regexSampleNames"),
-action = "store",
-default = "\\.(\\d+)[A-Z]$",
-type = "character",
-help = "Regular expression extracting sample-names"
-),
-make_option(
-c("-g", "--regexSampleGrouping"),
-action = "store",
-default = "(\\d+)",
-type = "character",
-help = paste0("Regular expression extracting sample-group",
-" from an extracted sample-name")
 ),
 make_option(
 c("-o", "--imputedDataFile"),
 action = "store",
 default = "output_imputed.tsv",
 "Imputed, Quantile-Normalized Log-Transformed Phosphopeptide",
 "Intensities output file path"
 )
 ),
 make_option(
+c("-i", "--inputFile"),
+action = "store",
+default = NA,
+type = "character",
+help = "Phosphopeptide Intensities sparse input file path"
+),
+make_option(
+c("-K", "--ksea_sqlite"),
+action = "store",
+default = NA,
+type = "character",
+help = "Path to 'ksea_sqlite' output produced by this tool"
+),
+make_option(
+c("-S", "--preproc_sqlite"),
+action = "store",
+default = NA,
+type = "character",
+help = "Path to 'preproc_sqlite' produced by `mqppep_mrgfltr.py`"
+),
+make_option(
 c("-r", "--reportFile"),
 action = "store",
-default = "QuantDataProcessingScript.html",
+default = "mqppep_anova.pdf",
 type = "character",
-help = "HTML report file path"
+help = "PDF report file path"
+),
+# parameters
+make_option(
+c("-f", "--firstDataColumn"),
+action = "store",
+default = "^Intensity[^_]",
+type = "character",
+help = "First column of intensity values"
+),
+make_option(
+c("-m", "--imputationMethod"),
+action = "store",
+default = "random",
+type = "character",
+help = paste0("Method for missing-value imputation,",
+" one of c('group-median','median','mean','random')")
+),
+# minimum count of observed values per class; the default is written as an
+# integer literal so that it agrees with the declared option type
+# (NOTE(review): assumes optparse does not coerce defaults - confirm)
+make_option(
+c("-C", "--intensityMinValuesPerClass"),
+action = "store",
+default = 0L,
+type = "integer",
+help = "Minimum number of observed values per class"
 ),
 make_option(
 c("-k", "--ksea_cutoff_statistic"),
 action = "store",
 default = "FDR",
 default = 0.05,
 type = "double",
 help = paste0("Maximum score to be used to score a kinase enrichment as significant")
 ),
 make_option(
-c("-M", "--anova_ksea_metadata"),
+c("-c", "--kseaMinSubstrateCount"),
 action = "store",
-default = "anova_ksea_metadata.tsv",
+default = "1",
-type = "character",
+type = "integer",
-help = "Phosphopeptide metadata, ANOVA FDR, and KSEA enribhments"
+help = "Minimum number of substrates to consider any kinase for KSEA"
+),
+# boolean flag: present on the command line means TRUE; the default is a
+# logical constant (not the string "FALSE") to agree with the declared type
+make_option(
+c("--kseaUseAbsoluteLog2FC"),
+action = "store_true",
+default = FALSE,
+type = "logical",
+help = paste0("Should abs(log2(fold-change)) be used for KSEA?",
+" (TRUE may alter number of hits.)")
+),
+# percentile used for the mean of randomly generated imputed values; the help
+# fragments are joined by paste0, so the second one supplies only a space
+make_option(
+c("-p", "--meanPercentile"),
+action = "store",
+default = 3,
+type = "integer",
+help = paste0("Mean percentile for randomly generated imputed values;",
+" range [1,99]")
+),
+make_option(
+c("--minQuality"),
+action = "store",
+default = 0,
+type = "integer",
+help = paste0("Minimum quality (higher value reduces number of substrates",
+" accepted; you may want to keep below 100), range [0,infinity]")
+),
+make_option(
+c("--oneWayManyCategories"),
+action = "store",
+default = "aov",
+type = "character",
+help = "Name of R function for one-way tests among more than two categories"
+),
+make_option(
+c("--oneWayTwoCategories"),
+action = "store",
+default = "two.way",
+type = "character",
+help = "Name of R function for one-way tests between two categories"
+),
+make_option(
+c("-s", "--regexSampleNames"),
+action = "store",
+default = "\\.(\\d+)[A-Z]$",
+type = "character",
+help = "Regular expression extracting sample-names"
+),
+make_option(
+c("-g", "--regexSampleGrouping"),
+action = "store",
+default = "(\\d+)",
+type = "character",
+help = paste0("Regular expression extracting sample-group",
+" from an extracted sample-name")
+),
+make_option(
+c("-d", "--sdPercentile"),
+action = "store",
+default = 3,
+type = "double",
+help = paste0("Adjustment value for standard deviation of",
+" randomly generated imputed values; real")
+),
+make_option(
+c("-F", "--sampleGroupFilter"),
+action = "store",
+default = "none",
+type = "character",
+help = paste0("Should no filter be applied to sample group names (none)",
+" or should the filter specify samples to include or exclude?")
+),
+# matching mode for the sample-group filter; the help fragments are joined
+# by paste0 with no separator, so each continuation starts with a space
+make_option(
+c("--sampleGroupFilterMode"),
+action = "store",
+default = "r",
+type = "character",
+help = paste0("First character ('f', 'p', or 'r') indicating regular",
+" expression matching mode ('fixed', 'perl', or 'grep'; ",
+"see https://rdrr.io/r/base/grep.html).  Second character may be 'i'",
+" to make search ignore case.")
+),
+# pattern(s) for the sample-group filter; this script reads the value only
+# when --sampleGroupFilter is not "none" and otherwise substitutes ".*"
+make_option(
+c("-G", "--sampleGroupFilterPatterns"),
+action = "store",
+default = ".*",
+type = "character",
+help = paste0("Pattern(s) matched against sample-group names",
+" when --sampleGroupFilter is not 'none'")
 )
 )
-args <- parse_args(OptionParser(option_list = option_list))
+tryCatch(
+args <- parse_args(
+OptionParser(
+option_list = option_list,
+add_help_option = TRUE
+),
+print_help_and_exit = TRUE
+),
+error = function(e) {
+parse_args(
+OptionParser(
+option_list = option_list,
+add_help_option = TRUE
+),
+print_help_and_exit = TRUE
+)
+stop(as.character(e))
+}
+)
 print("args is:")
 cat(str(args))
 # Check parameter values
 if (! file.exists(args$inputFile)) {
 stop((paste("Input file", args$inputFile, "does not exist")))
 }
-input_file             <- args$inputFile
-alpha_file             <- args$alphaFile
+# files
-preproc_sqlite         <- args$preproc_sqlite
+alpha_file                     <- args$alphaFile
-imputed_data_file_name <- args$imputedDataFile
+anova_ksea_metadata_file       <- args$anova_ksea_metadata
-imp_qn_lt_data_filenm  <- args$imputedQNLTDataFile
+imp_qn_lt_data_file            <- args$imputedQNLTDataFile
-anova_ksea_metadata    <- args$anova_ksea_metadata
+imputed_data_file              <- args$imputedDataFile
-report_file_name       <- args$reportFile
+input_file                     <- args$inputFile
-ksea_sqlite            <- args$ksea_sqlite
+ksea_sqlite_file               <- args$ksea_sqlite
-ksea_cutoff_statistic  <- args$ksea_cutoff_statistic
+preproc_sqlite_file            <- args$preproc_sqlite
-ksea_cutoff_threshold  <- args$ksea_cutoff_threshold
+report_file_name               <- args$reportFile
+# parameters
+# firstDataColumn - see below
+group_filter                   <- args$sampleGroupFilter
+group_filter_mode              <- args$sampleGroupFilterMode
+# imputationMethod - see below
+intensity_min_values_per_class <- args$intensityMinValuesPerClass
+ksea_cutoff_statistic          <- args$ksea_cutoff_statistic
+ksea_cutoff_threshold          <- args$ksea_cutoff_threshold
+ksea_min_substrate_count       <- args$kseaMinSubstrateCount
+ksea_use_absolute_log2_fc      <- args$kseaUseAbsoluteLog2FC
+# mean_percentile - see below
+min_quality                    <- args$minQuality
+# regexSampleNames - see below
+# regexSampleGrouping - see below
+# sampleGroupFilterPatterns - see below (becomes group_filter_patterns)
+# sd_percentile - see below
 if (
 sum(
 grepl(
 pattern = ksea_cutoff_statistic,
 x = c("FDR", "p.value")
 # convert string parameters that are passed in via config files:
 #  - firstDataColumn
 #  - regexSampleNames
 #  - regexSampleGrouping
 # read_config_file_string(fname, limit): produce a cleaned parameter string.
 #   fname - path of a file holding the string; in the inserted ('+') lines,
 #           a value that is not an existing file is used as the string itself
 #           (the deleted ('-') lines always passed fname to readChar)
 #   limit - forwarded to readChar as the number of characters to read
 # The function's value is that of its last assignment to `result`, i.e. the
 # string after whitespace trimming and Galaxy escape substitution.
 read_config_file_string <- function(fname, limit) {
 # diagnostic trace of the argument received
+cat(sprintf("read_config_file_string: fname = '%s'\n", fname))
+cat(sprintf("length(fname) = '%s'\n", length(fname)))
 # take the raw text from the file when it exists, else from fname itself
+result <-
+if (file.exists(fname)) {
+cat(sprintf("reading '%s' ...\n", fname))
+readChar(fname, limit)
+} else {
+cat(sprintf("not a file: '%s'\n", fname))
+fname
+}
+#AC print(paste0("read_config_file_string: opening file '", as.character(fname), "'"))
 # eliminate any leading whitespace
-result    <- gsub("^[ \t\n]*", "", readChar(fname, limit))
+result <- gsub("^[ \t\n]*", "",   result)
 # eliminate any trailing whitespace
-result    <- gsub("[ \t\n]*$", "", result)
+result <- gsub("[ \t\n]*$", "",   result)
 # substitute characters escaped by Galaxy sanitizer
 #   (each __xx__ token is replaced by the operator, quote, or bracket it
 #   stands for; only column alignment differs between '-' and '+' lines)
-result <- gsub("__lt__", "<",  result)
+result <- gsub("__lt__",    "<",  result)
-result <- gsub("__le__", "<=", result)
+result <- gsub("__le__",    "<=", result)
-result <- gsub("__eq__", "==", result)
+result <- gsub("__eq__",    "==", result)
-result <- gsub("__ne__", "!=", result)
+result <- gsub("__ne__",    "!=", result)
-result <- gsub("__gt__", ">",  result)
+result <- gsub("__gt__",    ">",  result)
-result <- gsub("__ge__", ">=", result)
+result <- gsub("__ge__",    ">=", result)
-result <- gsub("__sq__", "'",  result)
+result <- gsub("__sq__",    "'",  result)
-result <- gsub("__dq__", '"',  result)
+result <- gsub("__dq__",    '"',  result)
-result <- gsub("__ob__", "[",  result)
+result <- gsub("__ob__",    "[",  result)
-result <- gsub("__cb__", "]",  result)
+result <- gsub("__cb__",    "]",  result)
 }
+nc <- 1000
+sink(stderr())
 cat(paste0("first_data_column file: ", args$firstDataColumn, "\n"))
-cat(paste0("regex_sample_names file: ", args$regexSampleNames, "\n"))
-cat(paste0("regex_sample_grouping file: ", args$regexSampleGrouping, "\n"))
-nc <- 1000
-regex_sample_names <- read_config_file_string(args$regexSampleNames, nc)
-regex_sample_grouping <- read_config_file_string(args$regexSampleGrouping, nc)
 first_data_column <- read_config_file_string(args$firstDataColumn,  nc)
 cat(paste0("first_data_column: ",     first_data_column,     "\n"))
+cat(paste0("regex_sample_grouping file: ", args$regexSampleGrouping, "\n"))
+regex_sample_grouping <- read_config_file_string(args$regexSampleGrouping, nc)
+cat(paste0("regex_sample_grouping: ", regex_sample_grouping, "\n"))
+cat(paste0("regex_sample_names file: ", args$regexSampleNames, "\n"))
+regex_sample_names <- read_config_file_string(args$regexSampleNames, nc)
 cat(paste0("regex_sample_names: ",    regex_sample_names,    "\n"))
-cat(paste0("regex_sample_grouping: ", regex_sample_grouping, "\n"))
+if (group_filter != "none") {
+cat(paste0("group_filter_patterns file: '", args$sampleGroupFilterPatterns, "'\n"))
+group_filter_patterns <- read_config_file_string(args$sampleGroupFilterPatterns, nc)
+} else {
+group_filter_patterns <- ".*"
+}
+cat(paste0("group_filter_patterns: ", group_filter_patterns, "\n"))
+sink()
 # from: https://github.com/molgenis/molgenis-pipelines/wiki/
 #   How-to-source-another_file.R-from-within-your-R-script
 # Function location_of_this_script returns the location of this .R script
 #   (may be needed to source other files in same dir)
 # Both are not the case. Maybe we are in an R GUI?
 return(NULL)
 }
-script_dir <-  location_of_this_script()
+# validation of input parameters is complete; it is now justifiable to
+#   install LaTeX tools to render markdown as PDF; this involves a big
+#   download from GitHub
+if (!tinytex::is_tinytex()) tinytex::install_tinytex()
 rmarkdown_params <- list(
-inputFile = input_file
-, alphaFile = alpha_file
+# files
-, preprocDb = preproc_sqlite
+alphaFile = alpha_file
+, anovaKseaMetadata = anova_ksea_metadata_file
+, imputedDataFilename = imputed_data_file
+, imputedQNLTDataFile = imp_qn_lt_data_file
+, inputFile = input_file
+, kseaAppPrepDb = ksea_sqlite_file
+, preprocDb = preproc_sqlite_file
+# parameters
 , firstDataColumn = first_data_column
+, groupFilter = group_filter
+, groupFilterMode = group_filter_mode         # arg sampleGroupFilterMode
+, groupFilterPatterns = group_filter_patterns # arg sampleGroupFilterPatterns
 , imputationMethod = imputation_method
+, intensityMinValuesPerGroup = intensity_min_values_per_class
+, kseaCutoffStatistic = ksea_cutoff_statistic
+, kseaCutoffThreshold = ksea_cutoff_threshold
+, kseaMinSubstrateCount = ksea_min_substrate_count
+, kseaUseAbsoluteLog2FC = ksea_use_absolute_log2_fc # add
 , meanPercentile = mean_percentile
+, minQuality = min_quality                          # add
+, regexSampleGrouping = regex_sample_grouping
+, regexSampleNames = regex_sample_names
 , sdPercentile = sd_percentile
-, regexSampleNames = regex_sample_names
-, regexSampleGrouping = regex_sample_grouping
-, imputedDataFilename = imputed_data_file_name
-, imputedQNLTDataFile = imp_qn_lt_data_filenm
-, anovaKseaMetadata = anova_ksea_metadata
-, kseaAppPrepDb = ksea_sqlite
-, kseaCutoffThreshold = ksea_cutoff_threshold
-, kseaCutoffStatistic = ksea_cutoff_statistic
 )
 print("rmarkdown_params")
-str(rmarkdown_params)
+print(rmarkdown_params)
+print(
+lapply(
+X = rmarkdown_params,
+FUN = function(x) {
+paste0(
+nchar(as.character(x)),
+": '",
+as.character(x),
+"'"
+)
+}
+)
+)
 # freeze the random number generator so the same results will be produced
 #  from run to run
 set.seed(28571)
-# BUG (or "opportunity")
+script_dir <-  location_of_this_script()
-# To render as PDF for the time being requires installing the conda
-# package `r-texlive` until this issue in `texlive-core` is resolved:
-#   https://github.com/conda-forge/texlive-core-feedstock/issues/19
-# This workaround is detailed in the fourth comment of:
-#   https://github.com/conda-forge/texlive-core-feedstock/issues/61
-library(tinytex)
-tinytex::install_tinytex()
 rmarkdown::render(
 input = paste(script_dir, "mqppep_anova_script.Rmd", sep = "/")
-, output_format = rmarkdown::pdf_document(toc = TRUE)
 , output_file = report_file_name
 , params = rmarkdown_params
+, output_format = rmarkdown::pdf_document(
+includes = rmarkdown::includes(in_header = "mqppep_anova_preamble.tex")
+, dev = "pdf"
+, toc = TRUE
+, toc_depth = 2
+, number_sections = FALSE
+)
 )

Mercurial > repos > galaxyp > mqppep_preproc

comparison mqppep_anova.R @ 1:b76c75521d91 draft