Mercurial > repos > lain > xseekerpreparator
view XSeekerPreparator.R @ 20:ce94e7a141bb draft default tip
" master branch Updating"
author | lain |
---|---|
date | Tue, 06 Dec 2022 10:18:10 +0000 |
parents | 2937e72e5891 |
children |
line wrap: on
line source
assign("TOOL_NAME", "XSeekerPreparator", envir = globalenv()) lockBinding("TOOL_NAME", globalenv()) assign("VERSION", "1.3.0", envir = globalenv()) lockBinding("VERSION", globalenv()) assign("DEBUG_FAST", FALSE, envir = globalenv()) lockBinding("DEBUG_FAST", globalenv()) assign("DEBUG_FAST_IGNORE_SLOW_OP", DEBUG_FAST, envir = globalenv()) lockBinding("DEBUG_FAST_IGNORE_SLOW_OP", globalenv()) assign("PROCESS_SMOL_BATCH", DEBUG_FAST, envir = globalenv()) lockBinding("PROCESS_SMOL_BATCH", globalenv()) assign("FAST_FEATURE_RATIO", 10, envir = globalenv()) lockBinding("FAST_FEATURE_RATIO", globalenv()) assign("OUTPUT_SPECIFIC_TOOL", "XSeeker_Galaxy", envir = globalenv()) lockBinding("OUTPUT_SPECIFIC_TOOL", globalenv()) assign( "ENRICHED_RDATA_VERSION", paste(VERSION, OUTPUT_SPECIFIC_TOOL, sep = "-"), envir = globalenv() ) lockBinding("ENRICHED_RDATA_VERSION", globalenv()) assign("ENRICHED_RDATA_DOC", sprintf(" Welcome to the enriched <Version %s> of the output of CAMERA/xcms. This doc was generated by the tool: %s - Version %s To show the different variables contained in this rdata, type: - `load('this_rdata.rdata', rdata_env <- new.env())` - `names(rdata_env)` Sections ###### This tools helpers ------ The version number is somewhat special because the evolution of the rdata's format is non-linear. There may be different branches, each evolving separatly. To reflect these branches's diversions, there may be a prepended branch name following this format: major.minor.patch-branch_name Like this, we can process rdata with the same tool, and output rdata formated differently, for each tool. - enriched_rdata: - Description: flag created by that tool to tell it was enriched. - Retrieval method: enriched_rdata <- TRUE - enriched_rdata_version: - Description: A flag created by that tool to tell which version of this tool has enriched the rdata. - Retrieval method: enriched_rdata_version <- sprintf( \"%s\", ENRICHED_RDATA_VERSION ) - enriched_rdata_doc: - Description: Contains the documentation string. Data from original mzxml file ------ - tic: - Description: Those are the tic values from the original mzxml file, extracted using xcms 2. - Retrieval method: xcms::xcmsRaw('original_file.mzxml')@tic - xcms version: 2.0 - mz: - Description: Those are the m/z values from the original mzxml file, extracted using xcms 2. - Retrieval method: xcms::xcmsRaw('original_file.mzxml')@env$mz - xcms version: 2.0 - scanindex: - Description: Those are the scanindex values from the original mzxml file, extracted using xcms 2. - Retrieval method: xcms::xcmsRaw('original_file.mzxml')@scanindex - xcms version: 2.0 - scantime: - Description: Those are the scantime values from the original mzxml file, extracted using xcms 2. - Retrieval method: xcms::xcmsRaw('original_file.mzxml')@scantime - xcms version: 2.0 - intensity: - Description: Those are the intensity values from the original mzxml file, extracted using xcms 2. - Retrieval method: xcms::xcmsRaw('original_file.mzxml')@env$intensity - xcms version: 2.0 - polarity: - Description: Those are the polarity values from the original mzxml file, extracted using xcms 2. - Retrieval method: as.character(xcms::xcmsRaw( 'original_file.mzxml' )@polarity[[1]]) - xcms version: 2.0 Data taken from incoming rdata ------ - variableMetadata: - Description: Unmodified copy of variableMetadata from incoming rdata. - Retrieval method: rdata_file$variableMetadata - process_params: - Description: Those are the processing parameters values from the curent rdata. They have been simplified to allow easy access like: for (params in process_params) { if (params[[\"xfunction\"]] == \"annotatediff\") { process_peak_picking_params(params) } } - Retrieval method: ## just he same list, but simplified process_params <- list() for (list_name in names(rdata_file$listOFlistArguments)) { param_list <- list() for (param_name in names( rdata_file$listOFlistArguments[[list_name]] )) { param_list[[param_name]] <- rdata_file$listOFlistArguments[[ list_name ]][[param_name]] } process_params[[length(process_params)+1]] <- param_list } ", ENRICHED_RDATA_VERSION, TOOL_NAME, VERSION, ENRICHED_RDATA_VERSION), envir = globalenv()) lockBinding("ENRICHED_RDATA_DOC", globalenv()) get_models <- function(path) { if (is.null(path)) { stop("No models to define the database schema") } else { message(sprintf("Loading models from %s", path)) } ## galaxy mangles the "@" to a "__at__" if (substr(path, 1, 9) == "git__at__") { path <- sub("^git__at__", "git@", path, perl = TRUE) } if ( substr(path, 1, 4) == "git@" || substr(path, length(path) - 4, 4) == ".git" ) { return(get_models_from_git(path)) } if (substr(path, 1, 4) == "http") { return(get_models_from_url(path)) } return(source(path)$value) } get_models_from_git <- function(url, target_file = "models.R", rm = TRUE) { tmp <- tempdir() message(sprintf("Cloning %s", url)) system2("git", c("clone", url, tmp)) result <- search_tree(file.path(tmp, dir), target_file) if (!is.null(result)) { models <- source(result)$value if (rm) { unlink(tmp, recursive = TRUE) } return(models) } if (rm) { unlink(tmp, recursive = TRUE) } stop(sprintf( "Could not find any file named \"%s\" in this repo", target_file )) } get_models_from_url <- function(url, target_file = "models.R", rm = TRUE) { tmp <- tempdir() message(sprintf("Downloading %s", url)) result <- file.path(tmp, target_file) if (download.file(url, destfile = result) == 0) { models <- source(result)$value if (rm) { unlink(tmp, recursive = TRUE) } return(models) } if (rm) { unlink(tmp, recursive = TRUE) } stop("Could not download any file at this adress.") } search_tree <- function(path, target) { target <- tolower(target) for (file in list.files(path)) { if (fs::is.dir(file)) { result <- search_tree(file.path(path, file), target) if (!is.null(result)) { return(result) } } else if (tolower(file) == target) { return(file.path(path, file)) } } return(NULL) } create_database <- function(orm) { orm$recreate_database(no_exists = FALSE) set_database_version(orm, "created") } insert_adducts <- function(orm) { message("Creating adducts...") adducts <- list( list("[M-H2O-H]-", 1, -1, -48.992020312000001069, 1, 0, 0.5, "H0", "H1O3"), list("[M-H-Cl+O]-", 1, -1, -19.981214542000000022, 2, 0, 0.5, "O1", "H1Cl1"), list("[M-Cl+O]-", 1, -1, -18.973389510000000512, 3, 0, 0.5, "O1", "Cl1"), list("[M-3H]3-", 1, -3, -3.0218293560000000219, 4, 0, 1.0, "H0", "H3"), list("[2M-3H]3-", 2, -3, -3.0218293560000000219, 4, 0, 0.5, "H0", "H3"), list("[3M-3H]3-", 3, -3, -3.0218293560000000219, 4, 0, 0.5, "H0", "H3"), list("[M-2H]2-", 1, -2, -2.0145529039999998666, 5, 0, 1.0, "H0", "H2"), list("[2M-2H]2-", 2, -2, -2.0145529039999998666, 5, 0, 0.5, "H0", "H2"), list("[3M-2H]2-", 3, -2, -2.0145529039999998666, 5, 0, 0.5, "H0", "H2"), list("[M-H]-", 1, -1, -1.0072764519999999333, 6, 1, 1.0, "H0", "H1"), list("[2M-H]-", 2, -1, -1.0072764519999999333, 6, 0, 0.5, "H0", "H1"), list("[3M-H]-", 3, -1, -1.0072764519999999333, 6, 0, 0.5, "H0", "H1"), list("[M]+", 1, 1, -0.00054858000000000000945, 7, 1, 1.0, "H0", "H0"), list("[M]-", 1, -1, 0.00054858000000000000945, 8, 1, 1.0, "H0", "H0"), list("[M+H]+", 1, 1, 1.0072764519999999333, 9, 1, 1.0, "H1", "H0"), list("[2M+H]+", 2, 1, 1.0072764519999999333, 9, 0, 0.5, "H1", "H0"), list("[3M+H]+", 3, 1, 1.0072764519999999333, 9, 0, 0.25, "H1", "H0"), list("[M+2H]2+", 1, 2, 2.0145529039999998666, 10, 0, 0.75, "H2", "H0"), list("[2M+2H]2+", 2, 2, 2.0145529039999998666, 10, 0, 0.5, "H2", "H0"), list("[3M+2H]2+", 3, 2, 2.0145529039999998666, 10, 0, 0.25, "H2", "H0"), list("[M+3H]3+", 1, 3, 3.0218293560000000219, 11, 0, 0.75, "H3", "H0"), list("[2M+3H]3+", 2, 3, 3.0218293560000000219, 11, 0, 0.5, "H3", "H0"), list("[3M+3H]3+", 3, 3, 3.0218293560000000219, 11, 0, 0.25, "H3", "H0"), list("[M-2H+NH4]-", 1, -1, 16.019272654000001665, 12, 0, 0.25, "N1H4", "H2"), list("[2M-2H+NH4]-", 2, -1, 16.019272654000001665, 12, 0, 0.0, "N1H4", "H2"), list("[3M-2H+NH4]-", 3, -1, 16.019272654000001665, 12, 0, 0.25, "N1H4", "H2"), list("[M+NH4]+", 1, 1, 18.033825558000000199, 13, 1, 1.0, "N1H4", "H0"), list("[2M+NH4]+", 2, 1, 18.033825558000000199, 13, 0, 0.5, "N1H4", "H0"), list("[3M+NH4]+", 3, 1, 18.033825558000000199, 13, 0, 0.25, "N1H4", "H0"), list("[M+H+NH4]2+", 1, 2, 19.041102009999999467, 14, 0, 0.5, "N1H5", "H0"), list("[2M+H+NH4]2+", 2, 2, 19.041102009999999467, 14, 0, 0.5, "N1H5", "H0"), list("[3M+H+NH4]2+", 3, 2, 19.041102009999999467, 14, 0, 0.25, "N1H5", "H0"), list("[M+Na-2H]-", 1, -1, 20.974668176000001551, 15, 0, 0.75, "Na1", "H2"), list("[2M-2H+Na]-", 2, -1, 20.974668176000001551, 15, 0, 0.25, "Na1", "H2"), list("[3M-2H+Na]-", 3, -1, 20.974668176000001551, 15, 0, 0.25, "Na1", "H2"), list("[M+Na]+", 1, 1, 22.989221080000000086, 16, 1, 1.0, "Na1", "H0"), list("[2M+Na]+", 2, 1, 22.989221080000000086, 16, 0, 0.5, "Na1", "H0"), list("[3M+Na]+", 3, 1, 22.989221080000000086, 16, 0, 0.25, "Na1", "H0"), list("[M+H+Na]2+", 1, 2, 23.996497531999999353, 17, 0, 0.5, "Na1H1", "H0"), list("[2M+H+Na]2+", 2, 2, 23.996497531999999353, 17, 0, 0.5, "Na1H1", "H0"), list("[3M+H+Na]2+", 3, 2, 23.996497531999999353, 17, 0, 0.25, "Na1H1", "H0"), list("[M+2H+Na]3+", 1, 3, 25.003773983999998619, 18, 0, 0.25, "H2Na1", "H0"), list("[M+CH3OH+H]+", 1, 1, 33.033491200000000276, 19, 0, 0.25, "C1O1H5", "H0"), list("[M-H+Cl]2-", 1, -2, 33.962124838000001148, 20, 0, 1.0, "Cl1", "H1"), list("[2M-H+Cl]2-", 2, -2, 33.962124838000001148, 20, 0, 0.5, "Cl1", "H1"), list("[3M-H+Cl]2-", 3, -2, 33.962124838000001148, 20, 0, 0.5, "Cl1", "H1"), list("[M+Cl]-", 1, -1, 34.969401290000000416, 21, 1, 1.0, "Cl1", "H0"), list("[2M+Cl]-", 2, -1, 34.969401290000000416, 21, 0, 0.5, "Cl1", "H0"), list("[3M+Cl]-", 3, -1, 34.969401290000000416, 21, 0, 0.5, "Cl1", "H0"), list("[M+K-2H]-", 1, -1, 36.948605415999999479, 22, 0, 0.5, "K1", "H2"), list("[2M-2H+K]-", 2, -1, 36.948605415999999479, 22, 0, 0.0, "K1", "H2"), list("[3M-2H+K]-", 3, -1, 36.948605415999999479, 22, 0, 0.0, "K1", "H2"), list("[M+K]+", 1, 1, 38.963158319999998013, 23, 1, 1.0, "K1", "H0"), list("[2M+K]+", 2, 1, 38.963158319999998013, 23, 0, 0.5, "K1", "H0"), list("[3M+K]+", 3, 1, 38.963158319999998013, 23, 0, 0.25, "K1", "H0"), list("[M+H+K]2+", 1, 2, 39.970434771999997281, 24, 0, 0.5, "K1H1", "H0"), list("[2M+H+K]2+", 2, 2, 39.970434771999997281, 24, 0, 0.5, "K1H1", "H0"), list("[3M+H+K]2+", 3, 2, 39.970434771999997281, 24, 0, 0.25, "K1H1", "H0"), list("[M+ACN+H]+", 1, 1, 42.033825557999996646, 25, 0, 0.25, "C2H4N1", "H0"), list("[2M+ACN+H]+", 2, 1, 42.033825557999996646, 25, 0, 0.25, "C2H4N1", "H0"), list("[M+2Na-H]+", 1, 1, 44.971165708000000902, 26, 0, 0.5, "Na2", "H1"), list("[2M+2Na-H]+", 2, 1, 44.971165708000000902, 26, 0, 0.25, "Na2", "H1"), list("[3M+2Na-H]+", 3, 1, 44.971165708000000902, 26, 0, 0.25, "Na2", "H1"), list("[2M+FA-H]-", 2, -1, 44.998202851999998586, 27, 0, 0.25, "C1O2H2", "H1"), list("[M+FA-H]-", 1, -1, 44.998202851999998586, 27, 0, 0.5, "C1O2H2", "H1"), list("[M+2Na]2+", 1, 2, 45.978442160000000172, 28, 0, 0.5, "Na2", "H0"), list("[2M+2Na]2+", 2, 2, 45.978442160000000172, 28, 0, 0.5, "Na2", "H0"), list("[3M+2Na]2+", 3, 2, 45.978442160000000172, 28, 0, 0.25, "Na2", "H0"), list("[M+H+2Na]3+", 1, 3, 46.985718611999999438, 29, 0, 0.25, "H1Na2", "H0"), list("[M+H+FA]+", 1, 1, 47.012755755999997122, 30, 0, 0.25, "C1O2H3", "H0"), list("[M+Hac-H]-", 1, -1, 59.013852915999997607, 31, 0, 0.25, "C2O2H4", "H1"), list("[2M+Hac-H]-", 2, -1, 59.013852915999997607, 31, 0, 0.25, "C2O2H4", "H1"), list("[M+IsoProp+H]+", 1, 1, 61.064791327999998317, 32, 0, 0.25, "C3H9O1", "H0"), list("[M+Na+K]2+", 1, 2, 61.9523793999999981, 33, 0, 0.5, "Na1K1", "H0"), list("[2M+Na+K]2+", 2, 2, 61.9523793999999981, 33, 0, 0.5, "Na1K1", "H0"), list("[3M+Na+K]2+", 3, 2, 61.9523793999999981, 33, 0, 0.25, "Na1K1", "H0"), list("[M+NO3]-", 1, -1, 61.988366450000000895, 34, 0, 0.5, "N1O3", "H0"), list("[M+ACN+Na]+", 1, 1, 64.015770185999997464, 35, 0, 0.25, "C2H3N1Na1", "H0"), list("[2M+ACN+Na]+", 2, 1, 64.015770185999997464, 35, 0, 0.25, "C2H3N1Na1", "H0"), list("[M+NH4+FA]+", 1, 1, 64.039304861999994502, 36, 0, 0.25, "N1C1O2H6", "H0"), list("[M-2H+Na+FA]-", 1, -1, 66.980147479999999405, 37, 0, 0.5, "NaC1O2H2", "H2"), list("[M+3Na]3+", 1, 3, 68.967663239999993153, 38, 0, 0.25, "Na3", "H0"), list("[M+Na+FA]+", 1, 1, 68.99470038399999794, 39, 0, 0.25, "Na1C1O2H2", "H0"), list("[M+2Cl]2-", 1, -2, 69.938802580000000832, 40, 0, 1.0, "Cl2", "H0"), list("[2M+2Cl]2-", 2, -2, 69.938802580000000832, 40, 0, 0.5, "Cl2", "H0"), list("[3M+2Cl]2-", 3, -2, 69.938802580000000832, 40, 0, 0.5, "Cl2", "H0"), list("[M+2K-H]+", 1, 1, 76.919040187999996758, 41, 0, 0.5, "K2", "H1"), list("[2M+2K-H]+", 2, 1, 76.919040187999996758, 41, 0, 0.25, "K2", "H1"), list("[3M+2K-H]+", 3, 1, 76.919040187999996758, 41, 0, 0.25, "K2", "H1"), list("[M+2K]2+", 1, 2, 77.926316639999996028, 42, 0, 0.5, "K2", "H0"), list("[2M+2K]2+", 2, 2, 77.926316639999996028, 42, 0, 0.5, "K2", "H0"), list("[3M+2K]2+", 3, 2, 77.926316639999996028, 42, 0, 0.25, "K2", "H0"), list("[M+Br]-", 1, -1, 78.918886479999997619, 43, 1, 1.0, "Br1", "H0"), list("[M+Cl+FA]-", 1, -1, 80.974880593999998268, 44, 0, 0.5, "Cl1C1O2H2", "H0"), list("[M+AcNa-H]-", 1, -1, 80.995797543999998426, 45, 0, 0.25, "C2H3Na1O2", "H1"), list("[M+2ACN+2H]2+", 1, 2, 84.067651115999993292, 46, 0, 0.25, "C4H8N2", "H0"), list("[M+K+FA]+", 1, 1, 84.968637623999995868, 47, 0, 0.25, "K1C1O2H2", "H0"), list("[M+Cl+Na+FA-H]-", 1, -1, 102.95682522200000619, 48, 0, 0.5, "Cl1Na1C1O2H2", "H1"), list("[2M+3H2O+2H]+", 2, 1, 104.03153939599999944, 49, 0, 0.25, "H8O6", "H0"), list("[M+TFA-H]-", 1, -1, 112.98558742000000165, 50, 0, 0.5, "C2F3O2H1", "H1"), list("[M+H+TFA]+", 1, 1, 115.00014032400000019, 51, 0, 0.25, "C2F3O2H2", "H0"), list("[M+3ACN+2H]2+", 1, 2, 125.09420022199999778, 52, 0, 0.25, "C6H11N3", "H0"), list("[M+NH4+TFA]+", 1, 1, 132.02668943000000468, 53, 0, 0.25, "N1C2F3O2H5", "H0"), list("[M+Na+TFA]+", 1, 1, 136.98208495200000811, 54, 0, 0.25, "Na1C2F3O2H1", "H0"), list("[M+Cl+TFA]-", 1, -1, 148.96226516199999423, 55, 0, 0.5, "Cl1C2F3O2H1", "H0"), list("[M+K+TFA]+", 1, 1, 152.95602219200000604, 56, 0, 0.25, "K1C2F3O2H1","H0") ) dummy_adduct <- orm$adduct() for (adduct in adducts) { i <- 0 dummy_adduct$set_name(adduct[[i <- i + 1]]) dummy_adduct$set_multi(adduct[[i <- i + 1]]) dummy_adduct$set_charge(adduct[[i <- i + 1]]) dummy_adduct$set_mass(adduct[[i <- i + 1]]) dummy_adduct$set_oidscore(adduct[[i <- i + 1]]) dummy_adduct$set_quasi(adduct[[i <- i + 1]]) dummy_adduct$set_ips(adduct[[i <- i + 1]]) dummy_adduct$set_formula_add(adduct[[i <- i + 1]]) dummy_adduct$set_formula_ded(adduct[[i <- i + 1]]) invisible(dummy_adduct$save()) dummy_adduct$clear(unset_id = TRUE) } message("Adducts created") } insert_base_data <- function(orm, path, archetype = FALSE) { if (archetype) { ## not implemented yet return() } base_data <- readLines(path) for (sql in strsplit(paste(base_data, collapse = " "), ";")[[1]]) { orm$execute(sql) } set_database_version(orm, "enriched") } insert_compounds <- function(orm, compounds_path) { compounds <- read.csv(file = compounds_path, sep = "\t") if (is.null(compounds <- translate_compounds(compounds))) { stop("Could not find asked compound's attributes in csv file.") } dummy_compound <- orm$compound() compound_list <- list() for (i in seq_len(nrow(compounds))) { dummy_compound$set_mz(compounds[i, "mz"]) dummy_compound$set_name(compounds[i, "name"]) dummy_compound$set_common_name(compounds[i, "common_name"]) dummy_compound$set_formula(compounds[i, "formula"]) compound_list[[length(compound_list) + 1]] <- as.list( dummy_compound, c("mz", "name", "common_name", "formula") ) dummy_compound$clear(unset_id = TRUE) } invisible(dummy_compound$save(bulk = compound_list)) } translate_compounds <- function(compounds) { recognized_headers <- list( c( "HMDB_ID", "MzBank", "X.M.H..", "X.M.H...1", "MetName", "ChemFormula", "INChIkey" ) ) header_translators <- list( hmdb_header_translator ) for (index in seq_along(recognized_headers)) { headers <- recognized_headers[[index]] if (identical(colnames(compounds), headers)) { return(header_translators[[index]](compounds)) } } if (is.null(translator <- guess_translator(colnames(compounds)))) { return(NULL) } return(csv_header_translator(translator, compounds)) } guess_translator <- function(header) { result <- list( mz = NULL, name = NULL, common_name = NULL, formula = NULL ) asked_cols <- names(result) for (asked_col in asked_cols) { for (col in header) { if ((twisted <- tolower(col)) == asked_col || gsub("-", "_", twisted) == asked_col || gsub(" ", "_", twisted) == asked_col || tolower(gsub("(.)([A-Z])", "\\1_\\2", col)) == asked_col ) { result[[asked_col]] <- col next } } } if (any(mapply(is.null, result))) { return(NULL) } return(result) } hmdb_header_translator <- function(compounds) { return(csv_header_translator( list( HMDB_ID = "HMDB_ID", mz = "MzBank", name = "MetName", common_name = "MetName", formula = "ChemFormula", inchi_key = "INChIkey" ), compounds )) } csv_header_translator <- function(translation_table, csv) { header_names <- names(translation_table) result <- data.frame(seq_len(nrow(csv))) for (i in seq_along(header_names)) { result[, header_names[[i]]] <- csv[, translation_table[[i]]] } result[, "mz"] <- as.numeric(result[, "mz"]) return(result) } set_database_version <- function(orm, version) { orm$set_tag( version, tag_name = "database_version", tag_table_name = "XSeeker_tagging_table" ) } process_rdata <- function(orm, rdata, options) { mzml_tmp_dir <- gather_mzml_files(rdata) samples <- names(rdata$singlefile) if (!is.null(options$samples)) { samples <- samples[options$samples %in% samples] } show_percent <- ( is.null(options$`not-show-percent`) || options$`not-show-percent` == FALSE ) error <- tryCatch({ process_sample_list( orm, rdata, samples, show_percent = show_percent, file_grouping_var = options$class, options = options ) NULL }, error = function(e) { return(e) }) if (!is.null(mzml_tmp_dir)) { unlink(mzml_tmp_dir, recursive = TRUE) } if (!is.null(error)) { stop(error) } return(!is.null(error)) } gather_mzml_files <- function(rdata) { if (is.null(rdata$singlefile)) { message("Extracting mxml files") tmp <- tempdir() rdata$singlefile <- utils::unzip(rdata$zipfile, exdir = tmp) names(rdata$singlefile) <- tools::file_path_sans_ext( basename(rdata$singlefile) ) message("Extracted") return(tmp) } else { message(sprintf( "Not a zip file, loading files directly from path: %s", paste(rdata$singlefile, collapse = " ; ") )) } return(NULL) } process_sample_list <- function( orm, rdata, sample_names, show_percent, file_grouping_var = NULL, options = list() ) { if (is.null(file_grouping_var)) { file_grouping_var <- find_grouping_var(rdata$variableMetadata) if (is.null(file_grouping_var)) { stop("Malformed variableMetada.") } } tryCatch({ headers <- colnames(rdata$variableMetadata) file_grouping_var <- headers[[as.numeric(file_grouping_var)]] }, error = function(e) NULL) if ( is.null(file_grouping_var) || !(file_grouping_var %in% colnames(rdata$variableMetadata)) ) { stop(sprintf( "Could not find grouping variable %s in var meta file.", file_grouping_var )) } message("Processing samples.") message(sprintf("File grouping variable: %s", file_grouping_var)) context <- new.env() context$samples <- list() context$peaks <- rdata$xa@xcmsSet@peaks context$groupidx <- rdata$xa@xcmsSet@groupidx xcms_set <- rdata$xa@xcmsSet singlefile <- rdata$singlefile process_arg_list <- rdata$listOFlistArguments var_meta <- rdata$variableMetadata process_params <- list() if (is.null(process_arg_list)) { for (history in xcms_set@.processHistory) { if ( class(history@param) == "CentWaveParam" && history@type == "Peak detection" ) { params <- history@param process_params <- list(list( xfunction = "annotatediff", ppm = params@ppm, peakwidth = sprintf( "%s - %s", params@peakwidth[[1]], params@peakwidth[[2]] ), snthresh = params@snthresh, prefilterStep = params@prefilter[[1]], prefilterLevel = params@prefilter[[2]], mzdiff = params@mzdiff, fitgauss = params@fitgauss, noise = params@noise, mzCenterFun = params@mzCenterFun, integrate = params@integrate, firstBaselineCheck = params@firstBaselineCheck, snthreshIsoROIs = !identical(params@roiScales, numeric(0)) )) break } } } else { for (list_name in names(process_arg_list)) { param_list <- list() for (param_name in names(process_arg_list[[list_name]])) { param_list[[param_name]] <- process_arg_list[[ list_name ]][[param_name]] } process_params[[length(process_params) + 1]] <- param_list } } message("Parameters from previous processes extracted.") smol_xcms_set <- orm$smol_xcms_set() mz_tab_info <- new.env() g <- xcms::groups(xcms_set) mz_tab_info$group_length <- nrow(g) mz_tab_info$dataset_path <- xcms::filepaths(xcms_set) mz_tab_info$sampnames <- xcms::sampnames(xcms_set) mz_tab_info$sampclass <- xcms::sampclass(xcms_set) mz_tab_info$rtmed <- g[, "rtmed"] mz_tab_info$mzmed <- g[, "mzmed"] mz_tab_info$smallmolecule_abundance_assay <- xcms::groupval( xcms_set, value = "into" ) blogified <- blob::blob(fst::compress_fst( serialize(mz_tab_info, NULL), compression = 100 )) rm(mz_tab_info) invisible(smol_xcms_set$set_raw(blogified)$save()) smol_xcms_set_id <- smol_xcms_set$get_id() rm(smol_xcms_set) for (no in seq_along(names(singlefile))) { sample_name <- names(singlefile)[[no]] sample_path <- singlefile[[no]] if ( is.na(no) || is.null(sample_path) || !(sample_name %in% sample_names) ) { next } env <- new.env() if (!DEBUG_FAST_IGNORE_SLOW_OP) { ms_file <- xcms::xcmsRaw(sample_path) env$tic <- ms_file@tic env$mz <- ms_file@env$mz env$scanindex <- ms_file@scanindex env$scantime <- ms_file@scantime * 60 env$intensity <- ms_file@env$intensity env$polarity <- as.character(ms_file@polarity[[1]]) ## Again, ms file is huge, so we get rid of it quickly. rm(ms_file) } env$sample_name <- sample_name env$dataset_path <- sample_path env$process_params <- process_params env$enriched_rdata <- TRUE env$enriched_rdata_version <- ENRICHED_RDATA_VERSION env$tool_name <- TOOL_NAME env$enriched_rdata_doc <- ENRICHED_RDATA_DOC sample <- add_sample_to_database(orm, env, context, smol_xcms_set_id) rm(env) context$samples[no] <- sample$get_id() rm(sample) } context$clusters <- list() context$show_percent <- show_percent context$cluster_mean_rt_abundance <- list() context$central_feature <- list() context$adducts <- list() load_variable_metadata(orm, var_meta, context) clusters <- context$clusters rm(context) message("Features enrichment") complete_features(orm, clusters, show_percent) message("Features enrichment done.") return(NULL) } find_grouping_var <- function(var_meta) { known_colnames <- c( "name", "namecustom", "mz", "mzmin", "mzmax", "rt", "rtmin", "rtmax", "npeaks", "isotopes", "adduct", "pcgroup", "ms_level" ) col_names <- colnames(var_meta) classes <- list() for (name in col_names) { if (!(name %in% known_colnames)) { classes[[length(classes) + 1]] <- name } } if (length(classes) > 1) { stop(sprintf( "Only one class expected in the variable metadata. Found %d .", length(classes) )) } if (length(classes) == 0) { stop("Could not find any class column in your variableMetadata.") } return(classes[[1]]) } add_sample_to_database <- function(orm, env, context, smol_xcms_set_id) { message(sprintf("Processing sample %s", env$sample_name)) sample <- ( orm$sample() $set_name(env$sample_name) $set_path(env$dataset_path) $set_kind("enriched_rdata") $set_polarity( if ( is.null(env$polarity) || identical(env$polarity, character(0)) ) "" else env$polarity ) $set_raw(blob::blob(fst::compress_fst( serialize(env, NULL), compression = 100 ))) ) sample[["smol_xcms_set_id"]] <- smol_xcms_set_id sample$modified__[["smol_xcms_set_id"]] <- smol_xcms_set_id sample <- sample$save() load_process_params(orm, sample, env$process_params) message(sprintf("Sample %s inserted.", env$sample_name)) return(sample) } load_variable_metadata <- function(orm, var_meta, context) { all_clusters <- orm$cluster()$all() next_feature_id <- get_next_id(orm$feature()$all(), "featureID") + 1 next_cluster_id <- 0 next_pc_group <- get_next_id(all_clusters, "pc_group") next_align_group <- get_next_id(all_clusters, "align_group") + 1 message("Extracting features") invisible(create_features( orm, var_meta, context, next_feature_id, next_cluster_id, next_pc_group, next_align_group )) message("Extracting features done.") return(NULL) } get_next_id <- function(models, attribute) { if ((id <- models$max(attribute)) == Inf || id == -Inf) { return(0) } return(id) } create_features <- function( orm, var_meta, context, next_feature_id, next_cluster_id, next_pc_group, next_align_group ) { field_names <- as.list(names(orm$feature()$fields__)) field_names[field_names == "id"] <- NULL dummy_feature <- orm$feature() if (show_percent <- context$show_percent) { percent <- -1 total <- nrow(var_meta) } rows <- seq_len(nrow(var_meta)) if (PROCESS_SMOL_BATCH) { rows <- rows[1:as.integer(FAST_FEATURE_RATIO / 100.0 * length(rows))] } # features <- list() features <- as.list(rows) ## allocate all memory before processing # cluster_row <- list() cluster_row <- as.list(rows) ## allocate all memory before processing for (row in rows) { if (show_percent && (row / total) * 100 > percent) { percent <- percent + 1 message("\r", sprintf("\r%d %%", percent), appendLF = FALSE) } dummy_feature$set_featureID(next_feature_id) next_feature_id <- next_feature_id + 1 curent_var_meta <- var_meta[row, ] set_feature_fields_from_var_meta(dummy_feature, curent_var_meta) fake_iso <- dummy_feature$get_iso() iso <- extract_iso(fake_iso) clusterID <- extract_clusterID(fake_iso, next_cluster_id) context$clusterID <- clusterID dummy_feature$set_iso(iso) peak_list <- context$peaks[context$groupidx[[row]], ] if (! ("matrix" %in% class(peak_list))) { peak_list <- matrix( peak_list, nrow = 1, ncol = length(peak_list), dimnames = list(c(), names(peak_list)) ) } clusterID <- as.character(clusterID) if (is.null(context$central_feature[[clusterID]])) { int_o <- extract_peak_var(peak_list, "into") context$central_feature[[clusterID]] <- ( peak_list[peak_list[, "into"] == int_o, ]["sample"] ) } if (!DEBUG_FAST_IGNORE_SLOW_OP) { central_feature <- context$central_feature[[clusterID]] sample_peak_list <- peak_list[ as.integer(peak_list[, "sample"]) == central_feature, , drop = FALSE ] if ( !identical(sample_peak_list, numeric(0)) && !is.null(nrow(sample_peak_list)) && nrow(sample_peak_list) != 0 ) { int_o <- extract_peak_var(sample_peak_list, "into") if (!is.na(int_o)) { dummy_feature$set_int_o(int_o) } int_b <- extract_peak_var(sample_peak_list, "intb") if (!is.na(int_b)) { dummy_feature$set_int_b(int_b) } max_o <- extract_peak_var(sample_peak_list, "maxo") if (!is.na(max_o)) { dummy_feature$set_max_o(max_o) } } } cluster_row[[row]] <- create_associated_cluster( orm, context$samples[context$central_feature[[clusterID]]][[1]], dummy_feature, clusterID, context, curent_var_meta, next_pc_group, next_align_group ) next_align_group <- next_align_group + 1 features[[row]] <- as.list(dummy_feature, field_names) # features[[length(features) + 1]] <- as.list(dummy_feature, field_names) dummy_feature$clear() } rm(var_meta) message("") message("Saving features") invisible(dummy_feature$save(bulk = features)) ## We link manually clusters to the sample they're in. link_cache <- list() for (row in rows) { sample_nos <- unique(context$peaks[context$groupidx[[row]], "sample"]) for (sample_id in context$samples[sample_nos]) { cluster_id <- cluster_row[[row]]$get_id() id <- paste(sample_id, cluster_id, sep = ";") if (is.null(link_cache[[id]])) { link_cache[[id]] <- 1 orm$cluster_sample( sample_id = sample_id, cluster_id = cluster_id )$save() } } } message("Saved.") return(context$clusters) } extract_peak_var <- function(peak_list, var_name, selector = max) { value <- peak_list[, var_name] names(value) <- NULL return(selector(value)) } set_feature_fields_from_var_meta <- function(feature, var_meta) { if (!is.null(mz <- var_meta[["mz"]]) && !is.na(mz)) { feature$set_mz(mz) } if (!is.null(mzmin <- var_meta[["mzmin"]]) && !is.na(mzmin)) { feature$set_mz_min(mzmin) } if (!is.null(mzmax <- var_meta[["mzmax"]]) && !is.na(mzmax)) { feature$set_mz_max(mzmax) } if (!is.null(rt <- var_meta[["rt"]]) && !is.na(rt)) { feature$set_rt(rt) } if (!is.null(rtmin <- var_meta[["rtmin"]]) && !is.na(rtmin)) { feature$set_rt_min(rtmin) } if (!is.null(rtmax <- var_meta[["rtmax"]]) && !is.na(rtmax)) { feature$set_rt_max(rtmax) } if (!is.null(isotopes <- var_meta[["isotopes"]]) && !is.na(isotopes)) { feature$set_iso(isotopes) } return(feature) } extract_iso <- function(weird_data) { if (grepl("^\\[\\d+\\]", weird_data)[[1]]) { return(sub("^\\[\\d+\\]", "", weird_data, perl = TRUE)) } return(weird_data) } extract_clusterID <- function(weird_data, next_cluster_id) { if (grepl("^\\[\\d+\\]", weird_data)[[1]]) { clusterID <- stringr::str_extract(weird_data, "^\\[\\d+\\]") clusterID <- as.numeric(stringr::str_extract(clusterID, "\\d+")) } else { clusterID <- 0 } return(clusterID + next_cluster_id) } create_associated_cluster <- function( orm, main_sample_id, feature, clusterID, context, curent_var_meta, next_pc_group, next_align_group ) { clusterID <- as.character(clusterID) if (is.null(cluster <- context$clusters[[clusterID]])) { pcgroup <- as.numeric(curent_var_meta[["pcgroup"]]) adduct_name <- as.character(curent_var_meta[["adduct"]]) annotation <- curent_var_meta[["isotopes"]] cluster <- context$clusters[[clusterID]] <- orm$cluster( pc_group = pcgroup + next_pc_group, # adduct=adduct, align_group = next_align_group, # curent_group=curent_group, clusterID = context$clusterID, annotation = annotation ) if (is.null(adduct <- context$adducts[[adduct_name]])) { context$adducts[[adduct_name]] <- orm$adduct()$load_by( name = adduct_name )$first() if (is.null(adduct <- context$adducts[[adduct_name]])) { adduct <- context$adducts[[adduct_name]] <- orm$adduct( name = adduct_name, charge = 0 ) adduct$save() } } cluster$set_adduct(adduct) ## Crappy hack to assign sample id to cluster without loading the ## sample. Samples are too big (their sample$env) and slows the ## process, and eat all the menory so we dont't want to load them. cluster[["sample_id"]] <- main_sample_id cluster$modified__[["sample_id"]] <- main_sample_id } else { if (context$clusterID != 0 && cluster$get_clusterID() == 0) { cluster$set_clusterID(context$clusterID) } } cluster$save() feature$set_cluster(cluster) feature$save() return(cluster) } complete_features <- function(orm, clusters, show_percent) { total <- length(clusters) percent <- -1 i <- 0 for (cluster in clusters) { i <- i + 1 if (show_percent && (i / total) * 100 > percent) { percent <- percent + 1 message("\r", sprintf("\r%d %%", percent), appendLF = FALSE) } features <- orm$feature()$load_by(cluster_id = cluster$get_id()) if (features$any()) { if (!is.null(rt <- features$mean("rt"))) { cluster$set_mean_rt(rt)$save() } features_df <- as.data.frame(features) central_feature <- features_df[ grepl("^\\[M\\]", features_df[, "iso"]), ] central_feature_into <- central_feature[["int_o"]] if ( !identical(central_feature_into, numeric(0)) && central_feature_into != 0 ) { for (feature in as.vector(features)) { feature$set_abundance( feature$get_int_o() / central_feature_into * 100 )$save() } } } } return(NULL) } load_process_params <- function(orm, sample, params) { for (param_list in params) { if (is.null(param_list[["xfunction"]])) { next } if (param_list[["xfunction"]] == "annotatediff") { load_process_params_peak_picking(orm, sample, param_list) } } return(sample) } load_process_params_peak_picking <- function( orm, sample, peak_picking_params ) { return(add_sample_process_parameters( params = peak_picking_params, params_translation = list( ppm = "ppm", maxcharge = "maxCharge", maxiso = "maxIso" ), param_model_generator = orm$peak_picking_parameters, sample_param_setter = sample$set_peak_picking_parameters )) } add_sample_process_parameters <- function( params, params_translation, param_model_generator, sample_param_setter ) { model_params <- list() for (rdata_param_name in names(params_translation)) { database_param_name <- params_translation[[rdata_param_name]] if (is.null(rdata_param <- params[[rdata_param_name]])) { next } model_params[[database_param_name]] <- rdata_param } params_models <- do.call(param_model_generator()$load_by, model_params) if (params_models$any()) { params_model <- params_models$first() } else { params_model <- do.call(param_model_generator, model_params) params_model$save() } return(sample_param_setter(params_model)$save()) } library(optparse) option_list <- list( optparse::make_option( c("-v", "--version"), action = "store_true", help = "Display this tool's version and exits" ), optparse::make_option( c("-V", "--verbose"), action = "store_true", help = "Does more verbose outputs", default = FALSE ), optparse::make_option( c("-i", "--input"), type = "character", help = "The rdata path to import in XSeeker" ), optparse::make_option( c("-s", "--samples"), type = "character", help = "Samples to visualise in XSeeker" ), optparse::make_option( c("-B", "--archetype"), type = "character", help = "The name of the base database" ), optparse::make_option( c("-b", "--database"), type = "character", help = "The base database's path" ), optparse::make_option( c("-c", "--compounds-csv"), type = "character", help = "The csv containing compounds" ), optparse::make_option( c("-m", "--models"), type = "character", help = paste( "The path or url (must begin with http[s]:// or git@) to", "the database's models" ) ), optparse::make_option( c("-k", "--class"), type = "character", help = "The name of the column containing the classes" ), optparse::make_option( c("-o", "--output"), type = "character", help = "The path where to output sqlite" ), optparse::make_option( c("-P", "--not-show-percent"), action = "store_true", help = "Flag not to show the percents", default = FALSE ) ) options(error = function(){traceback(3)}) parser <- OptionParser( usage = "%prog [options] file", option_list = option_list ) args <- parse_args(parser, positional_arguments = 0) err_code <- 0 if (!is.null(args$options$version)) { message(sprintf("%s %s", TOOL_NAME, VERSION)) quit() } models <- get_models(args$options$models) orm <- DBModelR::ORM( connection_params = list(dbname=args$options$output), dbms = "SQLite" ) invisible(orm$models(models)) invisible(create_database(orm)) message("Database model created") insert_adducts(orm) if (!is.null(args$options$database)) { insert_base_data(orm, args$options$database) } message(sprintf("Base data inserted using %s.", args$options$database)) if (!is.null(args$options$archetype)) { insert_base_data(orm, args$options$archetype, archetype = TRUE) } if (!is.null(args$options$`compounds-csv`)) { insert_compounds(orm, args$options$`compounds-csv`) } # if (!is.null(args$options$rdata)) { # load_rdata_in_base(args$options$rdata, args$options$samples, args$options$`not-show-percent`) # } load(args$options$input, rdata <- new.env()) args$options$verbose <- ( if (args$options$verbose) { message("Verbose outputs.") \(...) { message(sprintf(...)) } } else { \(...) { } } ) err_code <- process_rdata(orm, rdata, args$options) quit(status = err_code)