Mercurial > repos > galaxyp > mqppep_preproc
diff mqppep_anova.R @ 1:b76c75521d91 draft
planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit 43e7a43b545c24b2dc33d039198551c032aa79be
author | galaxyp |
---|---|
date | Fri, 28 Oct 2022 18:26:42 +0000 |
parents | 8dfd5d2b5903 |
children | bae3a23461c9 |
line wrap: on
line diff
--- a/mqppep_anova.R Mon Jul 11 19:22:54 2022 +0000 +++ b/mqppep_anova.R Fri Oct 28 18:26:42 2022 +0000 @@ -1,20 +1,15 @@ #!/usr/bin/env Rscript # libraries library(optparse) -library(data.table) library(stringr) +library(tinytex) # ref for parameterizing Rmd document: https://stackoverflow.com/a/37940285 # parse options option_list <- list( - make_option( - c("-i", "--inputFile"), - action = "store", - default = NA, - type = "character", - help = "Phosphopeptide Intensities sparse input file path" - ), + + # files make_option( c("-a", "--alphaFile"), action = "store", @@ -24,64 +19,11 @@ " path to text file having one column and no header") ), make_option( - c("-S", "--preproc_sqlite"), - action = "store", - default = NA, - type = "character", - help = "Path to 'preproc_sqlite' produced by `mqppep_mrgfltr.py`" - ), - make_option( - c("-K", "--ksea_sqlite"), + c("-M", "--anova_ksea_metadata"), action = "store", - default = NA, - type = "character", - help = "Path to 'ksea_sqlite' output produced by this tool" - ), - make_option( - c("-f", "--firstDataColumn"), - action = "store", - default = "^Intensity[^_]", - type = "character", - help = "First column of intensity values" - ), - make_option( - c("-m", "--imputationMethod"), - action = "store", - default = "random", + default = "anova_ksea_metadata.tsv", type = "character", - help = paste0("Method for missing-value imputation,", - " one of c('group-median','median','mean','random')") - ), - make_option( - c("-p", "--meanPercentile"), - action = "store", - default = 3, - type = "integer", - help = paste0("Mean percentile for randomly generated imputed values;", - ", range [1,99]") - ), - make_option( - c("-d", "--sdPercentile"), - action = "store", - default = 3, - type = "double", - help = paste0("Adjustment value for standard deviation of", - " randomly generated imputed values; real") - ), - make_option( - c("-s", "--regexSampleNames"), - action = "store", - default = "\\.(\\d+)[A-Z]$", - type = "character", - help = "Regular expression extracting sample-names" - ), - make_option( - c("-g", "--regexSampleGrouping"), - action = "store", - default = "(\\d+)", - type = "character", - help = paste0("Regular expression extracting sample-group", - " from an extracted sample-name") + help = "Phosphopeptide metadata, ANOVA FDR, and KSEA enribhments" ), make_option( c("-o", "--imputedDataFile"), @@ -102,11 +44,56 @@ ) ), make_option( + c("-i", "--inputFile"), + action = "store", + default = NA, + type = "character", + help = "Phosphopeptide Intensities sparse input file path" + ), + make_option( + c("-K", "--ksea_sqlite"), + action = "store", + default = NA, + type = "character", + help = "Path to 'ksea_sqlite' output produced by this tool" + ), + make_option( + c("-S", "--preproc_sqlite"), + action = "store", + default = NA, + type = "character", + help = "Path to 'preproc_sqlite' produced by `mqppep_mrgfltr.py`" + ), + make_option( c("-r", "--reportFile"), action = "store", - default = "QuantDataProcessingScript.html", + default = "mqppep_anova.pdf", + type = "character", + help = "PDF report file path" + ), + + # parameters + make_option( + c("-f", "--firstDataColumn"), + action = "store", + default = "^Intensity[^_]", type = "character", - help = "HTML report file path" + help = "First column of intensity values" + ), + make_option( + c("-m", "--imputationMethod"), + action = "store", + default = "random", + type = "character", + help = paste0("Method for missing-value imputation,", + " one of c('group-median','median','mean','random')") + ), + make_option( + c("-C", "--intensityMinValuesPerClass"), + action = "store", + default = "0", + type = "integer", + help = "Minimum number of observed values per class" ), make_option( c("-k", "--ksea_cutoff_statistic"), @@ -124,14 +111,120 @@ help = paste0("Maximum score to be used to score a kinase enrichment as significant") ), make_option( - c("-M", "--anova_ksea_metadata"), + c("-c", "--kseaMinSubstrateCount"), + action = "store", + default = "1", + type = "integer", + help = "Minimum number of substrates to consider any kinase for KSEA" + ), + make_option( + c("--kseaUseAbsoluteLog2FC"), + action = "store_true", + default = "FALSE", + type = "logical", + help = paste0("Should abs(log2(fold-change)) be used for KSEA?", + " (TRUE may alter number of hits.)") + ), + make_option( + c("-p", "--meanPercentile"), + action = "store", + default = 3, + type = "integer", + help = paste0("Mean percentile for randomly generated imputed values;", + ", range [1,99]") + ), + make_option( + c("--minQuality"), + action = "store", + default = 0, + type = "integer", + help = paste0("Minimum quality (higher value reduces number of substrates", + " accepted; you may want to keep below 100), range [0,infinity]") + ), + make_option( + c("--oneWayManyCategories"), + action = "store", + default = "aov", + type = "character", + help = "Name of R function for one-way tests among more than two categories" + ), + make_option( + c("--oneWayTwoCategories"), action = "store", - default = "anova_ksea_metadata.tsv", + default = "two.way", + type = "character", + help = "Name of R function for one-way tests between two categories" + ), + make_option( + c("-s", "--regexSampleNames"), + action = "store", + default = "\\.(\\d+)[A-Z]$", + type = "character", + help = "Regular expression extracting sample-names" + ), + make_option( + c("-g", "--regexSampleGrouping"), + action = "store", + default = "(\\d+)", type = "character", - help = "Phosphopeptide metadata, ANOVA FDR, and KSEA enribhments" + help = paste0("Regular expression extracting sample-group", + " from an extracted sample-name") + ), + make_option( + c("-d", "--sdPercentile"), + action = "store", + default = 3, + type = "double", + help = paste0("Adjustment value for standard deviation of", + " randomly generated imputed values; real") + ), + make_option( + c("-F", "--sampleGroupFilter"), + action = "store", + default = "none", + type = "character", + help = paste0("Should no filter be applied to sample group names (none)", + " or should the filter specify samples to include or exclude?") + ), + make_option( + c("--sampleGroupFilterMode"), + action = "store", + default = "r", + type = "character", + help = paste0("First character ('f', 'p', or 'r') indicating regular", + "expression matching mode ('fixed', 'perl', or 'grep'; ", + "see https://rdrr.io/r/base/grep.html). Second character may be 'i;", + "to make search ignore case.") + ), + make_option( + c("-G", "--sampleGroupFilterPatterns"), + action = "store", + default = ".*", + type = "character", + help = paste0("Regular expression extracting sample-group", + " from an extracted sample-name") ) ) -args <- parse_args(OptionParser(option_list = option_list)) + +tryCatch( + args <- parse_args( + OptionParser( + option_list = option_list, + add_help_option = TRUE + ), + print_help_and_exit = TRUE + ), + error = function(e) { + parse_args( + OptionParser( + option_list = option_list, + add_help_option = TRUE + ), + print_help_and_exit = TRUE + ) + stop(as.character(e)) + } +) print("args is:") cat(str(args)) @@ -140,16 +233,34 @@ if (! file.exists(args$inputFile)) { stop((paste("Input file", args$inputFile, "does not exist"))) } -input_file <- args$inputFile -alpha_file <- args$alphaFile -preproc_sqlite <- args$preproc_sqlite -imputed_data_file_name <- args$imputedDataFile -imp_qn_lt_data_filenm <- args$imputedQNLTDataFile -anova_ksea_metadata <- args$anova_ksea_metadata -report_file_name <- args$reportFile -ksea_sqlite <- args$ksea_sqlite -ksea_cutoff_statistic <- args$ksea_cutoff_statistic -ksea_cutoff_threshold <- args$ksea_cutoff_threshold + +# files +alpha_file <- args$alphaFile +anova_ksea_metadata_file <- args$anova_ksea_metadata +imp_qn_lt_data_file <- args$imputedQNLTDataFile +imputed_data_file <- args$imputedDataFile +input_file <- args$inputFile +ksea_sqlite_file <- args$ksea_sqlite +preproc_sqlite_file <- args$preproc_sqlite +report_file_name <- args$reportFile + +# parameters +# firstDataColumn - see below +group_filter <- args$sampleGroupFilter +group_filter_mode <- args$sampleGroupFilterMode +# imputationMethod - see below +intensity_min_values_per_class <- args$intensityMinValuesPerClass +ksea_cutoff_statistic <- args$ksea_cutoff_statistic +ksea_cutoff_threshold <- args$ksea_cutoff_threshold +ksea_min_substrate_count <- args$kseaMinSubstrateCount +ksea_use_absolute_log2_fc <- args$kseaUseAbsoluteLog2FC +# mean_percentile - see below +min_quality <- args$minQuality +# regexSampleNames - see below +# regexSampleGrouping - see below +# sampleGroupFilterPatterns - see below (becomes group_filter_patterns) +# sd_percentile - see below + if ( sum( grepl( @@ -192,32 +303,59 @@ # - regexSampleNames # - regexSampleGrouping read_config_file_string <- function(fname, limit) { + cat(sprintf("read_config_file_string: fname = '%s'\n", fname)) + cat(sprintf("length(fname) = '%s'\n", length(fname))) + result <- + if (file.exists(fname)) { + cat(sprintf("reading '%s' ...\n", fname)) + readChar(fname, limit) + } else { + cat(sprintf("not a file: '%s'\n", fname)) + fname + } + #AC print(paste0("read_config_file_string: opening file '", as.character(fname), "'")) # eliminate any leading whitespace - result <- gsub("^[ \t\n]*", "", readChar(fname, limit)) + result <- gsub("^[ \t\n]*", "", result) # eliminate any trailing whitespace - result <- gsub("[ \t\n]*$", "", result) + result <- gsub("[ \t\n]*$", "", result) # substitute characters escaped by Galaxy sanitizer - result <- gsub("__lt__", "<", result) - result <- gsub("__le__", "<=", result) - result <- gsub("__eq__", "==", result) - result <- gsub("__ne__", "!=", result) - result <- gsub("__gt__", ">", result) - result <- gsub("__ge__", ">=", result) - result <- gsub("__sq__", "'", result) - result <- gsub("__dq__", '"', result) - result <- gsub("__ob__", "[", result) - result <- gsub("__cb__", "]", result) + result <- gsub("__lt__", "<", result) + result <- gsub("__le__", "<=", result) + result <- gsub("__eq__", "==", result) + result <- gsub("__ne__", "!=", result) + result <- gsub("__gt__", ">", result) + result <- gsub("__ge__", ">=", result) + result <- gsub("__sq__", "'", result) + result <- gsub("__dq__", '"', result) + result <- gsub("__ob__", "[", result) + result <- gsub("__cb__", "]", result) } +nc <- 1000 + +sink(stderr()) + cat(paste0("first_data_column file: ", args$firstDataColumn, "\n")) -cat(paste0("regex_sample_names file: ", args$regexSampleNames, "\n")) -cat(paste0("regex_sample_grouping file: ", args$regexSampleGrouping, "\n")) -nc <- 1000 -regex_sample_names <- read_config_file_string(args$regexSampleNames, nc) -regex_sample_grouping <- read_config_file_string(args$regexSampleGrouping, nc) first_data_column <- read_config_file_string(args$firstDataColumn, nc) cat(paste0("first_data_column: ", first_data_column, "\n")) + +cat(paste0("regex_sample_grouping file: ", args$regexSampleGrouping, "\n")) +regex_sample_grouping <- read_config_file_string(args$regexSampleGrouping, nc) +cat(paste0("regex_sample_grouping: ", regex_sample_grouping, "\n")) + +cat(paste0("regex_sample_names file: ", args$regexSampleNames, "\n")) +regex_sample_names <- read_config_file_string(args$regexSampleNames, nc) cat(paste0("regex_sample_names: ", regex_sample_names, "\n")) -cat(paste0("regex_sample_grouping: ", regex_sample_grouping, "\n")) + +if (group_filter != "none") { + cat(paste0("group_filter_patterns file: '", args$sampleGroupFilterPatterns, "'\n")) + group_filter_patterns <- read_config_file_string(args$sampleGroupFilterPatterns, nc) +} else { + group_filter_patterns <- ".*" +} +cat(paste0("group_filter_patterns: ", group_filter_patterns, "\n")) + +sink() + # from: https://github.com/molgenis/molgenis-pipelines/wiki/ # How-to-source-another_file.R-from-within-your-R-script @@ -253,45 +391,72 @@ return(NULL) } -script_dir <- location_of_this_script() +# validation of input parameters is complete; it is now justifiable to +# install LaTeX tools to render markdown as PDF; this involves a big +# download from GitHub +if (!tinytex::is_tinytex()) tinytex::install_tinytex() rmarkdown_params <- list( - inputFile = input_file - , alphaFile = alpha_file - , preprocDb = preproc_sqlite + + # files + alphaFile = alpha_file + , anovaKseaMetadata = anova_ksea_metadata_file + , imputedDataFilename = imputed_data_file + , imputedQNLTDataFile = imp_qn_lt_data_file + , inputFile = input_file + , kseaAppPrepDb = ksea_sqlite_file + , preprocDb = preproc_sqlite_file + + # parameters , firstDataColumn = first_data_column + , groupFilter = group_filter + , groupFilterMode = group_filter_mode # arg sampleGroupFilterMode + , groupFilterPatterns = group_filter_patterns # arg sampleGroupFilterPatterns , imputationMethod = imputation_method + , intensityMinValuesPerGroup = intensity_min_values_per_class + , kseaCutoffStatistic = ksea_cutoff_statistic + , kseaCutoffThreshold = ksea_cutoff_threshold + , kseaMinSubstrateCount = ksea_min_substrate_count + , kseaUseAbsoluteLog2FC = ksea_use_absolute_log2_fc # add , meanPercentile = mean_percentile - , sdPercentile = sd_percentile + , minQuality = min_quality # add + , regexSampleGrouping = regex_sample_grouping , regexSampleNames = regex_sample_names - , regexSampleGrouping = regex_sample_grouping - , imputedDataFilename = imputed_data_file_name - , imputedQNLTDataFile = imp_qn_lt_data_filenm - , anovaKseaMetadata = anova_ksea_metadata - , kseaAppPrepDb = ksea_sqlite - , kseaCutoffThreshold = ksea_cutoff_threshold - , kseaCutoffStatistic = ksea_cutoff_statistic + , sdPercentile = sd_percentile ) print("rmarkdown_params") -str(rmarkdown_params) +print(rmarkdown_params) +print( + lapply( + X = rmarkdown_params, + FUN = function(x) { + paste0( + nchar(as.character(x)), + ": '", + as.character(x), + "'" + ) + } + ) +) + # freeze the random number generator so the same results will be produced # from run to run set.seed(28571) -# BUG (or "opportunity") -# To render as PDF for the time being requires installing the conda -# package `r-texlive` until this issue in `texlive-core` is resolved: -# https://github.com/conda-forge/texlive-core-feedstock/issues/19 -# This workaround is detailed in the fourth comment of: -# https://github.com/conda-forge/texlive-core-feedstock/issues/61 +script_dir <- location_of_this_script() -library(tinytex) -tinytex::install_tinytex() rmarkdown::render( input = paste(script_dir, "mqppep_anova_script.Rmd", sep = "/") -, output_format = rmarkdown::pdf_document(toc = TRUE) , output_file = report_file_name , params = rmarkdown_params +, output_format = rmarkdown::pdf_document( + includes = rmarkdown::includes(in_header = "mqppep_anova_preamble.tex") + , dev = "pdf" + , toc = TRUE + , toc_depth = 2 + , number_sections = FALSE + ) )