## Mercurial > repos > lain > xseekerpreparator



## Global, read-only configuration of the tool.
## Every constant is created in the global environment and its binding is
## locked right away, so later code cannot reassign it by accident.

## Name and version of this tool; both are embedded in the generated doc.
assign("TOOL_NAME", "XSeekerPreparator", envir = globalenv())
lockBinding("TOOL_NAME", globalenv())
assign("VERSION", "1.3.0", envir = globalenv())
lockBinding("VERSION", globalenv())
## Master debug switch: the two flags below default to its value.
assign("DEBUG_FAST", FALSE, envir = globalenv())
lockBinding("DEBUG_FAST", globalenv())
## When TRUE, the raw ms files are not read and the per-feature peak
## intensities are not extracted.
assign("DEBUG_FAST_IGNORE_SLOW_OP", DEBUG_FAST, envir = globalenv())
lockBinding("DEBUG_FAST_IGNORE_SLOW_OP", globalenv())
## When TRUE, only the first FAST_FEATURE_RATIO percent of the
## variableMetadata rows are turned into features.
assign("PROCESS_SMOL_BATCH", DEBUG_FAST, envir = globalenv())
lockBinding("PROCESS_SMOL_BATCH", globalenv())
assign("FAST_FEATURE_RATIO", 10, envir = globalenv())
lockBinding("FAST_FEATURE_RATIO", globalenv())
## Branch name appended to the version number (see ENRICHED_RDATA_VERSION).
assign("OUTPUT_SPECIFIC_TOOL", "XSeeker_Galaxy", envir = globalenv())
lockBinding("OUTPUT_SPECIFIC_TOOL", globalenv())

## "<VERSION>-<OUTPUT_SPECIFIC_TOOL>", i.e. major.minor.patch-branch_name.
assign(
    "ENRICHED_RDATA_VERSION",
    paste(VERSION, OUTPUT_SPECIFIC_TOOL, sep = "-"),
    envir = globalenv()
)
lockBinding("ENRICHED_RDATA_VERSION", globalenv())
## Human readable documentation of the enriched rdata format.
## The text is a sprintf() template with four "%s" placeholders, filled in
## order with: ENRICHED_RDATA_VERSION, TOOL_NAME, VERSION and
## ENRICHED_RDATA_VERSION. The resulting string is copied into each
## sample environment as `enriched_rdata_doc` (see process_sample_list).
## Like the other constants, the binding is locked once created.
assign("ENRICHED_RDATA_DOC", sprintf("
Welcome to the enriched <Version %s> of the output of CAMERA/xcms.
This doc was generated by the tool: %s - Version %s
To show the different variables contained in this rdata, type:
 - `load('this_rdata.rdata', rdata_env <- new.env())`
 - `names(rdata_env)`

Sections
######


This tools helpers
------
    The version number is somewhat special because the evolution of the
    rdata's format is non-linear.
    There may be different branches, each evolving separatly.
    To reflect these branches's diversions, there may be a prepended
    branch name following this format:
        major.minor.patch-branch_name
    Like this, we can process rdata with the same tool, and output
    rdata formated differently, for each tool.


  - enriched_rdata:
    - Description: flag created by that tool to tell it was enriched.
    - Retrieval method: enriched_rdata <- TRUE

  - enriched_rdata_version:
    - Description: A flag created by that tool to tell which version of
        this tool has enriched the rdata.
    - Retrieval method:
      enriched_rdata_version <- sprintf(
          \"%s\",
          ENRICHED_RDATA_VERSION
      )

  - enriched_rdata_doc:
    - Description: Contains the documentation string.

Data from original mzxml file
------
  - tic:
    - Description: Those are the tic values from the original mzxml
        file, extracted using xcms 2.
    - Retrieval method: xcms::xcmsRaw('original_file.mzxml')@tic
    - xcms version: 2.0

  - mz:
    - Description: Those are the m/z values from the original mzxml
        file, extracted using xcms 2.
    - Retrieval method: xcms::xcmsRaw('original_file.mzxml')@env$mz
    - xcms version: 2.0

  - scanindex:
    - Description: Those are the scanindex values from the original mzxml
        file, extracted using xcms 2.
    - Retrieval method: xcms::xcmsRaw('original_file.mzxml')@scanindex
    - xcms version: 2.0

  - scantime:
    - Description: Those are the scantime values from the original mzxml
        file, extracted using xcms 2.
    - Retrieval method: xcms::xcmsRaw('original_file.mzxml')@scantime
    - xcms version: 2.0

  - intensity:
    - Description: Those are the intensity values from the original mzxml
        file, extracted using xcms 2.
    - Retrieval method: xcms::xcmsRaw('original_file.mzxml')@env$intensity
    - xcms version: 2.0

  - polarity:
    - Description: Those are the polarity values from the original mzxml
        file, extracted using xcms 2.
    - Retrieval method:
        as.character(xcms::xcmsRaw(
            'original_file.mzxml'
        )@polarity[[1]])
    - xcms version: 2.0

Data taken from incoming rdata
------
  - variableMetadata:
    - Description: Unmodified copy of variableMetadata from incoming rdata.
    - Retrieval method: rdata_file$variableMetadata

  - process_params:
    - Description: Those are the processing parameters values from the
        curent rdata. They have been simplified to allow easy access like:
        for (params in process_params) {
            if (params[[\"xfunction\"]] == \"annotatediff\") {
                process_peak_picking_params(params)
            }
        }
    - Retrieval method:
        ## just he same list, but simplified
        process_params <- list()
        for (list_name in names(rdata_file$listOFlistArguments)) {
            param_list <- list()
            for (param_name in names(
                    rdata_file$listOFlistArguments[[list_name]]
            )) {
                param_list[[param_name]] <- rdata_file$listOFlistArguments[[
                    list_name
                ]][[param_name]]
            }
            process_params[[length(process_params)+1]] <- param_list
        }
", ENRICHED_RDATA_VERSION, TOOL_NAME, VERSION, ENRICHED_RDATA_VERSION),
envir = globalenv())
lockBinding("ENRICHED_RDATA_DOC", globalenv())


## Load the models describing the database schema.
##
## `path` can designate:
##   - a git repository: it starts with "git@" (or "git__at__" once
##     mangled by galaxy) or ends with ".git" -> get_models_from_git;
##   - an http(s) URL -> get_models_from_url;
##   - anything else is considered a local R file, which is source()d.
## The git test is done first, so an http URL ending with ".git" is cloned.
##
## Returns the value produced by the sourced models file.
## Stops when `path` is NULL.
get_models <- function(path) {
    if (is.null(path)) {
        stop("No models to define the database schema")
    }
    message(sprintf("Loading models from %s", path))
    ## galaxy mangles the "@" to a "__at__"
    if (substr(path, 1, 9) == "git__at__") {
        path <- sub("^git__at__", "git@", path, perl = TRUE)
    }
    ## The suffix was formerly tested with
    ## substr(path, length(path) - 4, 4): length() of a string is 1 and
    ## the third argument of substr is a stop index, so ".git" never matched.
    is_git_repo <- (
        substr(path, 1, 4) == "git@"
        || grepl("\\.git$", path)
    )
    if (is_git_repo) {
        return(get_models_from_git(path))
    }
    if (substr(path, 1, 4) == "http") {
        return(get_models_from_url(path))
    }
    return(source(path)$value)
}

## Clone the git repository at `url` and source the first file named
## `target_file` (case-insensitive) found in its tree.
##
## - url: address of the repository, handed over to `git clone`.
## - target_file: name of the file holding the models.
## - rm: when TRUE, the clone is deleted before returning (even on error).
##
## Returns the value of the sourced file. Stops when the clone fails or
## when no file named `target_file` exists in the repository.
get_models_from_git <- function(url, target_file = "models.R", rm = TRUE) {
    ## Clone into a dedicated, not yet existing directory: cloning into
    ## tempdir() itself fails as soon as it is not empty, and deleting it
    ## afterwards would wipe the temp directory of the whole R session.
    tmp <- tempfile("models_repo_")
    if (rm) {
        on.exit(unlink(tmp, recursive = TRUE), add = TRUE)
    }
    message(sprintf("Cloning %s", url))
    ## system2 builds a shell command line: quote what we do not control.
    status <- system2("git", c("clone", shQuote(url), shQuote(tmp)))
    if (!identical(as.integer(status), 0L)) {
        stop(sprintf(
            "Could not clone \"%s\" (git exit status: %s)",
            url, status
        ))
    }
    ## The former code searched file.path(tmp, dir), where `dir` was the
    ## base function and not a path; the clone lives directly in `tmp`.
    result <- search_tree(tmp, target_file)
    if (is.null(result)) {
        stop(sprintf(
            "Could not find any file named \"%s\" in this repo",
            target_file
        ))
    }
    return(source(result)$value)
}

## Download the models file located at `url` and source it.
##
## - url: address of the R file defining the models.
## - target_file: name given to the downloaded copy.
## - rm: when TRUE, the downloaded copy is deleted before returning
##   (even on error).
##
## Returns the value of the sourced file. Stops when download.file
## reports a non-zero status.
get_models_from_url <- function(url, target_file = "models.R", rm = TRUE) {
    ## Work in a dedicated sub-directory: deleting tempdir() itself would
    ## wipe the temp directory of the whole R session.
    tmp <- tempfile("models_download_")
    dir.create(tmp, recursive = TRUE)
    if (rm) {
        on.exit(unlink(tmp, recursive = TRUE), add = TRUE)
    }
    message(sprintf("Downloading %s", url))
    result <- file.path(tmp, target_file)
    if (download.file(url, destfile = result) != 0) {
        stop("Could not download any file at this adress.")
    }
    return(source(result)$value)
}

## Depth-first search of a file named `target` (case-insensitive) under
## the directory `path`.
##
## Entries are visited in the order given by list.files(); directories
## are explored as soon as they are met.
## Returns the path of the first match, or NULL when nothing matches.
search_tree <- function(path, target) {
    target <- tolower(target)
    for (file in list.files(path)) {
        ## list.files() returns bare names: the directory test has to be
        ## done on the full path (base dir.exists replaces the former
        ## fs::is.dir(file) call).
        full_path <- file.path(path, file)
        if (dir.exists(full_path)) {
            result <- search_tree(full_path, target)
            if (!is.null(result)) {
                return(result)
            }
        } else if (tolower(file) == target) {
            return(full_path)
        }
    }
    return(NULL)
}

## (Re)create the database through the orm, then tag it as "created".
## Returns whatever set_database_version returns.
create_database <- function(orm) {
    fresh_state <- "created"
    orm$recreate_database(no_exists = FALSE)
    set_database_version(orm, fresh_state)
}

## Insert the built-in list of adducts in the database.
##
## Each entry of the table below holds, in this order:
##   name, multi, charge, mass, oidscore, quasi, ips,
##   formula_add, formula_ded
## A single orm adduct object is reused: it is filled, saved, then cleared
## (id included) before handling the next entry.
insert_adducts <- function(orm) {
    message("Creating adducts...")
    adducts <- list(
        list("[M-H2O-H]-", 1, -1, -48.992020312000001069, 1, 0, 0.5, "H0", "H1O3"),
        list("[M-H-Cl+O]-", 1, -1, -19.981214542000000022, 2, 0, 0.5, "O1", "H1Cl1"),
        list("[M-Cl+O]-", 1, -1, -18.973389510000000512, 3, 0, 0.5, "O1", "Cl1"),
        list("[M-3H]3-", 1, -3, -3.0218293560000000219, 4, 0, 1.0, "H0", "H3"),
        list("[2M-3H]3-", 2, -3, -3.0218293560000000219, 4, 0, 0.5, "H0", "H3"),
        list("[3M-3H]3-", 3, -3, -3.0218293560000000219, 4, 0, 0.5, "H0", "H3"),
        list("[M-2H]2-", 1, -2, -2.0145529039999998666, 5, 0, 1.0, "H0", "H2"),
        list("[2M-2H]2-", 2, -2, -2.0145529039999998666, 5, 0, 0.5, "H0", "H2"),
        list("[3M-2H]2-", 3, -2, -2.0145529039999998666, 5, 0, 0.5, "H0", "H2"),
        list("[M-H]-", 1, -1, -1.0072764519999999333, 6, 1, 1.0, "H0", "H1"),
        list("[2M-H]-", 2, -1, -1.0072764519999999333, 6, 0, 0.5, "H0", "H1"),
        list("[3M-H]-", 3, -1, -1.0072764519999999333, 6, 0, 0.5, "H0", "H1"),
        list("[M]+", 1, 1, -0.00054858000000000000945, 7, 1, 1.0, "H0", "H0"),
        list("[M]-", 1, -1, 0.00054858000000000000945, 8, 1, 1.0, "H0", "H0"),
        list("[M+H]+", 1, 1, 1.0072764519999999333, 9, 1, 1.0, "H1", "H0"),
        list("[2M+H]+", 2, 1, 1.0072764519999999333, 9, 0, 0.5, "H1", "H0"),
        list("[3M+H]+", 3, 1, 1.0072764519999999333, 9, 0, 0.25, "H1", "H0"),
        list("[M+2H]2+", 1, 2, 2.0145529039999998666, 10, 0, 0.75, "H2", "H0"),
        list("[2M+2H]2+", 2, 2, 2.0145529039999998666, 10, 0, 0.5, "H2", "H0"),
        list("[3M+2H]2+", 3, 2, 2.0145529039999998666, 10, 0, 0.25, "H2", "H0"),
        list("[M+3H]3+", 1, 3, 3.0218293560000000219, 11, 0, 0.75, "H3", "H0"),
        list("[2M+3H]3+", 2, 3, 3.0218293560000000219, 11, 0, 0.5, "H3", "H0"),
        list("[3M+3H]3+", 3, 3, 3.0218293560000000219, 11, 0, 0.25, "H3", "H0"),
        list("[M-2H+NH4]-", 1, -1, 16.019272654000001665, 12, 0, 0.25, "N1H4", "H2"),
        list("[2M-2H+NH4]-", 2, -1, 16.019272654000001665, 12, 0, 0.0, "N1H4", "H2"),
        list("[3M-2H+NH4]-", 3, -1, 16.019272654000001665, 12, 0, 0.25, "N1H4", "H2"),
        list("[M+NH4]+", 1, 1, 18.033825558000000199, 13, 1, 1.0, "N1H4", "H0"),
        list("[2M+NH4]+", 2, 1, 18.033825558000000199, 13, 0, 0.5, "N1H4", "H0"),
        list("[3M+NH4]+", 3, 1, 18.033825558000000199, 13, 0, 0.25, "N1H4", "H0"),
        list("[M+H+NH4]2+", 1, 2, 19.041102009999999467, 14, 0, 0.5, "N1H5", "H0"),
        list("[2M+H+NH4]2+", 2, 2, 19.041102009999999467, 14, 0, 0.5, "N1H5", "H0"),
        list("[3M+H+NH4]2+", 3, 2, 19.041102009999999467, 14, 0, 0.25, "N1H5", "H0"),
        list("[M+Na-2H]-", 1, -1, 20.974668176000001551, 15, 0, 0.75, "Na1", "H2"),
        list("[2M-2H+Na]-", 2, -1, 20.974668176000001551, 15, 0, 0.25, "Na1", "H2"),
        list("[3M-2H+Na]-", 3, -1, 20.974668176000001551, 15, 0, 0.25, "Na1", "H2"),
        list("[M+Na]+", 1, 1, 22.989221080000000086, 16, 1, 1.0, "Na1", "H0"),
        list("[2M+Na]+", 2, 1, 22.989221080000000086, 16, 0, 0.5, "Na1", "H0"),
        list("[3M+Na]+", 3, 1, 22.989221080000000086, 16, 0, 0.25, "Na1", "H0"),
        list("[M+H+Na]2+", 1, 2, 23.996497531999999353, 17, 0, 0.5, "Na1H1", "H0"),
        list("[2M+H+Na]2+", 2, 2, 23.996497531999999353, 17, 0, 0.5, "Na1H1", "H0"),
        list("[3M+H+Na]2+", 3, 2, 23.996497531999999353, 17, 0, 0.25, "Na1H1", "H0"),
        list("[M+2H+Na]3+", 1, 3, 25.003773983999998619, 18, 0, 0.25, "H2Na1", "H0"),
        list("[M+CH3OH+H]+", 1, 1, 33.033491200000000276, 19, 0, 0.25, "C1O1H5", "H0"),
        list("[M-H+Cl]2-", 1, -2, 33.962124838000001148, 20, 0, 1.0, "Cl1", "H1"),
        list("[2M-H+Cl]2-", 2, -2, 33.962124838000001148, 20, 0, 0.5, "Cl1", "H1"),
        list("[3M-H+Cl]2-", 3, -2, 33.962124838000001148, 20, 0, 0.5, "Cl1", "H1"),
        list("[M+Cl]-", 1, -1, 34.969401290000000416, 21, 1, 1.0, "Cl1", "H0"),
        list("[2M+Cl]-", 2, -1, 34.969401290000000416, 21, 0, 0.5, "Cl1", "H0"),
        list("[3M+Cl]-", 3, -1, 34.969401290000000416, 21, 0, 0.5, "Cl1", "H0"),
        list("[M+K-2H]-", 1, -1, 36.948605415999999479, 22, 0, 0.5, "K1", "H2"),
        list("[2M-2H+K]-", 2, -1, 36.948605415999999479, 22, 0, 0.0, "K1", "H2"),
        list("[3M-2H+K]-", 3, -1, 36.948605415999999479, 22, 0, 0.0, "K1", "H2"),
        list("[M+K]+", 1, 1, 38.963158319999998013, 23, 1, 1.0, "K1", "H0"),
        list("[2M+K]+", 2, 1, 38.963158319999998013, 23, 0, 0.5, "K1", "H0"),
        list("[3M+K]+", 3, 1, 38.963158319999998013, 23, 0, 0.25, "K1", "H0"),
        list("[M+H+K]2+", 1, 2, 39.970434771999997281, 24, 0, 0.5, "K1H1", "H0"),
        list("[2M+H+K]2+", 2, 2, 39.970434771999997281, 24, 0, 0.5, "K1H1", "H0"),
        list("[3M+H+K]2+", 3, 2, 39.970434771999997281, 24, 0, 0.25, "K1H1", "H0"),
        list("[M+ACN+H]+", 1, 1, 42.033825557999996646, 25, 0, 0.25, "C2H4N1", "H0"),
        list("[2M+ACN+H]+", 2, 1, 42.033825557999996646, 25, 0, 0.25, "C2H4N1", "H0"),
        list("[M+2Na-H]+", 1, 1, 44.971165708000000902, 26, 0, 0.5, "Na2", "H1"),
        list("[2M+2Na-H]+", 2, 1, 44.971165708000000902, 26, 0, 0.25, "Na2", "H1"),
        list("[3M+2Na-H]+", 3, 1, 44.971165708000000902, 26, 0, 0.25, "Na2", "H1"),
        list("[2M+FA-H]-", 2, -1, 44.998202851999998586, 27, 0, 0.25, "C1O2H2", "H1"),
        list("[M+FA-H]-", 1, -1, 44.998202851999998586, 27, 0, 0.5, "C1O2H2", "H1"),
        list("[M+2Na]2+", 1, 2, 45.978442160000000172, 28, 0, 0.5, "Na2", "H0"),
        list("[2M+2Na]2+", 2, 2, 45.978442160000000172, 28, 0, 0.5, "Na2", "H0"),
        list("[3M+2Na]2+", 3, 2, 45.978442160000000172, 28, 0, 0.25, "Na2", "H0"),
        list("[M+H+2Na]3+", 1, 3, 46.985718611999999438, 29, 0, 0.25, "H1Na2", "H0"),
        list("[M+H+FA]+", 1, 1, 47.012755755999997122, 30, 0, 0.25, "C1O2H3", "H0"),
        list("[M+Hac-H]-", 1, -1, 59.013852915999997607, 31, 0, 0.25, "C2O2H4", "H1"),
        list("[2M+Hac-H]-", 2, -1, 59.013852915999997607, 31, 0, 0.25, "C2O2H4", "H1"),
        list("[M+IsoProp+H]+", 1, 1, 61.064791327999998317, 32, 0, 0.25, "C3H9O1", "H0"),
        list("[M+Na+K]2+", 1, 2, 61.9523793999999981, 33, 0, 0.5, "Na1K1", "H0"),
        list("[2M+Na+K]2+", 2, 2, 61.9523793999999981, 33, 0, 0.5, "Na1K1", "H0"),
        list("[3M+Na+K]2+", 3, 2, 61.9523793999999981, 33, 0, 0.25, "Na1K1", "H0"),
        list("[M+NO3]-", 1, -1, 61.988366450000000895, 34, 0, 0.5, "N1O3", "H0"),
        list("[M+ACN+Na]+", 1, 1, 64.015770185999997464, 35, 0, 0.25, "C2H3N1Na1", "H0"),
        list("[2M+ACN+Na]+", 2, 1, 64.015770185999997464, 35, 0, 0.25, "C2H3N1Na1", "H0"),
        list("[M+NH4+FA]+", 1, 1, 64.039304861999994502, 36, 0, 0.25, "N1C1O2H6", "H0"),
        list("[M-2H+Na+FA]-", 1, -1, 66.980147479999999405, 37, 0, 0.5, "NaC1O2H2", "H2"),
        list("[M+3Na]3+", 1, 3, 68.967663239999993153, 38, 0, 0.25, "Na3", "H0"),
        list("[M+Na+FA]+", 1, 1, 68.99470038399999794, 39, 0, 0.25, "Na1C1O2H2", "H0"),
        list("[M+2Cl]2-", 1, -2, 69.938802580000000832, 40, 0, 1.0, "Cl2", "H0"),
        list("[2M+2Cl]2-", 2, -2, 69.938802580000000832, 40, 0, 0.5, "Cl2", "H0"),
        list("[3M+2Cl]2-", 3, -2, 69.938802580000000832, 40, 0, 0.5, "Cl2", "H0"),
        list("[M+2K-H]+", 1, 1, 76.919040187999996758, 41, 0, 0.5, "K2", "H1"),
        list("[2M+2K-H]+", 2, 1, 76.919040187999996758, 41, 0, 0.25, "K2", "H1"),
        list("[3M+2K-H]+", 3, 1, 76.919040187999996758, 41, 0, 0.25, "K2", "H1"),
        list("[M+2K]2+", 1, 2, 77.926316639999996028, 42, 0, 0.5, "K2", "H0"),
        list("[2M+2K]2+", 2, 2, 77.926316639999996028, 42, 0, 0.5, "K2", "H0"),
        list("[3M+2K]2+", 3, 2, 77.926316639999996028, 42, 0, 0.25, "K2", "H0"),
        list("[M+Br]-", 1, -1, 78.918886479999997619, 43, 1, 1.0, "Br1", "H0"),
        list("[M+Cl+FA]-", 1, -1, 80.974880593999998268, 44, 0, 0.5, "Cl1C1O2H2", "H0"),
        list("[M+AcNa-H]-", 1, -1, 80.995797543999998426, 45, 0, 0.25, "C2H3Na1O2", "H1"),
        list("[M+2ACN+2H]2+", 1, 2, 84.067651115999993292, 46, 0, 0.25, "C4H8N2", "H0"),
        list("[M+K+FA]+", 1, 1, 84.968637623999995868, 47, 0, 0.25, "K1C1O2H2", "H0"),
        list("[M+Cl+Na+FA-H]-", 1, -1, 102.95682522200000619, 48, 0, 0.5, "Cl1Na1C1O2H2", "H1"),
        list("[2M+3H2O+2H]+", 2, 1, 104.03153939599999944, 49, 0, 0.25, "H8O6", "H0"),
        list("[M+TFA-H]-", 1, -1, 112.98558742000000165, 50, 0, 0.5, "C2F3O2H1", "H1"),
        list("[M+H+TFA]+", 1, 1, 115.00014032400000019, 51, 0, 0.25, "C2F3O2H2", "H0"),
        list("[M+3ACN+2H]2+", 1, 2, 125.09420022199999778, 52, 0, 0.25, "C6H11N3", "H0"),
        list("[M+NH4+TFA]+", 1, 1, 132.02668943000000468, 53, 0, 0.25, "N1C2F3O2H5", "H0"),
        list("[M+Na+TFA]+", 1, 1, 136.98208495200000811, 54, 0, 0.25, "Na1C2F3O2H1", "H0"),
        list("[M+Cl+TFA]-", 1, -1, 148.96226516199999423, 55, 0, 0.5, "Cl1C2F3O2H1", "H0"),
        list("[M+K+TFA]+", 1, 1, 152.95602219200000604, 56, 0, 0.25, "K1C2F3O2H1","H0")
    )
    adduct_row <- orm$adduct()
    for (spec in adducts) {
        ## positions follow the column order documented above
        adduct_row$set_name(spec[[1]])
        adduct_row$set_multi(spec[[2]])
        adduct_row$set_charge(spec[[3]])
        adduct_row$set_mass(spec[[4]])
        adduct_row$set_oidscore(spec[[5]])
        adduct_row$set_quasi(spec[[6]])
        adduct_row$set_ips(spec[[7]])
        adduct_row$set_formula_add(spec[[8]])
        adduct_row$set_formula_ded(spec[[9]])
        invisible(adduct_row$save())
        adduct_row$clear(unset_id = TRUE)
    }
    message("Adducts created")
}

## Execute the SQL statements stored in the file `path`, then tag the
## database as "enriched".
##
## - orm: object providing execute(sql).
## - path: text file holding ";"-separated SQL statements.
## - archetype: not implemented yet; when TRUE nothing is done and NULL
##   is returned.
##
## The lines are joined with a space and split on ";". Statements are
## passed to orm$execute untouched, one by one, in file order.
insert_base_data <- function(orm, path, archetype = FALSE) {
    if (archetype) {
        ## not implemented yet
        return()
    }
    base_data <- readLines(path)
    statements <- strsplit(paste(base_data, collapse = " "), ";")[[1]]
    for (sql in statements) {
        ## Blank lines and whatever follows the last ";" produce
        ## whitespace-only fragments: those are not statements.
        if (grepl("^\\s*$", sql)) {
            next
        }
        orm$execute(sql)
    }
    set_database_version(orm, "enriched")
}

## Read a tab-separated compounds file and bulk-insert its content.
##
## The table is first normalised by translate_compounds; when that
## fails (NULL), the function stops. A single orm compound object is
## reused to build the list of field values handed over to save(bulk = ).
insert_compounds <- function(orm, compounds_path) {
    raw_table <- read.csv(file = compounds_path, sep = "\t")
    compounds <- translate_compounds(raw_table)
    if (is.null(compounds)) {
        stop("Could not find asked compound's attributes in csv file.")
    }
    exported_fields <- c("mz", "name", "common_name", "formula")
    dummy_compound <- orm$compound()
    compound_list <- list()
    for (row_no in seq_len(nrow(compounds))) {
        dummy_compound$set_mz(compounds[row_no, "mz"])
        dummy_compound$set_name(compounds[row_no, "name"])
        dummy_compound$set_common_name(compounds[row_no, "common_name"])
        dummy_compound$set_formula(compounds[row_no, "formula"])
        ## snapshot the fields before the object is recycled
        next_slot <- length(compound_list) + 1
        compound_list[[next_slot]] <- as.list(
            dummy_compound,
            exported_fields
        )
        dummy_compound$clear(unset_id = TRUE)
    }
    invisible(dummy_compound$save(bulk = compound_list))
}

## Convert a compounds table into the column layout used by this tool.
##
## When the column names are exactly those of a known format, the
## dedicated translator is used. Otherwise the columns are guessed with
## guess_translator; NULL is returned when the guess fails.
translate_compounds <- function(compounds) {
    known_formats <- list(
        list(
            headers = c(
                "HMDB_ID", "MzBank", "X.M.H..", "X.M.H...1",
                "MetName", "ChemFormula", "INChIkey"
            ),
            translate = hmdb_header_translator
        )
    )
    column_names <- colnames(compounds)
    for (known in known_formats) {
        if (identical(column_names, known$headers)) {
            return(known$translate(compounds))
        }
    }
    translator <- guess_translator(column_names)
    if (is.null(translator)) {
        return(NULL)
    }
    return(csv_header_translator(translator, compounds))
}

## Guess which columns of `header` hold the attributes this tool needs
## (mz, name, common_name and formula).
##
## A column matches an attribute when its name equals the attribute's:
##   - once lowercased;
##   - once lowercased, with "-" or " " replaced by "_";
##   - once converted from camelCase to snake_case.
## The first matching column wins.
##
## Returns a list mapping each attribute to a column name, or NULL when
## at least one attribute has no matching column.
guess_translator <- function(header) {
    result <- list(
        mz = NULL,
        name = NULL,
        common_name = NULL,
        formula = NULL
    )
    for (asked_col in names(result)) {
        for (col in header) {
            if (is.na(col)) {
                ## an NA column name cannot match, and would make the
                ## comparisons below fail
                next
            }
            twisted <- tolower(col)
            if (twisted == asked_col
                || gsub("-", "_", twisted) == asked_col
                || gsub(" ", "_", twisted) == asked_col
                || tolower(gsub("(.)([A-Z])", "\\1_\\2", col)) == asked_col
            ) {
                result[[asked_col]] <- col
                ## Stop at the first match. This was a `next`, which is a
                ## no-op at the end of a loop body and let a later column
                ## silently override the match.
                break
            }
        }
    }
    if (any(vapply(result, is.null, logical(1)))) {
        return(NULL)
    }
    return(result)
}

## Translate a compounds table using the HMDB column names.
## The mapping (attribute -> HMDB column) is applied by
## csv_header_translator; note that name and common_name are both read
## from "MetName".
hmdb_header_translator <- function(compounds) {
    hmdb_columns <- list(
        HMDB_ID = "HMDB_ID",
        mz = "MzBank",
        name = "MetName",
        common_name = "MetName",
        formula = "ChemFormula",
        inchi_key = "INChIkey"
    )
    return(csv_header_translator(hmdb_columns, compounds))
}

## Build a data.frame whose columns are named after `translation_table`
## names, each filled with the `csv` column it designates.
##
## - translation_table: named list, attribute name -> csv column name.
## - csv: data.frame read from the compounds file.
##
## The result starts with a row-number column (created to give the
## data.frame its number of rows), followed by the translated columns.
## The "mz" column is converted to numeric.
csv_header_translator <- function(translation_table, csv) {
    header_names <- names(translation_table)
    result <- data.frame(seq_len(nrow(csv)))
    for (i in seq_along(header_names)) {
        result[, header_names[[i]]] <- csv[, translation_table[[i]]]
    }
    mz <- result[, "mz"]
    if (is.factor(mz)) {
        ## as.numeric on a factor gives the level codes, not the values
        ## (text columns are read as factors by read.csv before R 4.0).
        mz <- as.character(mz)
    }
    result[, "mz"] <- as.numeric(mz)
    return(result)
}

## Store `version` as the "database_version" tag of the database,
## in the tagging table "XSeeker_tagging_table".
## Returns whatever orm$set_tag returns.
set_database_version <- function(orm, version) {
    orm$set_tag(
        version,
        tag_table_name = "XSeeker_tagging_table",
        tag_name = "database_version"
    )
}

## Process the samples of an rdata and store them through the orm.
##
## - orm: database access object, handed over to process_sample_list.
## - rdata: loaded rdata; its `singlefile` element lists the ms files.
## - options: list of options. Those read here are:
##     samples: names of the samples to process (all when NULL);
##     `not-show-percent`: when set and not FALSE, hides the progress;
##     class: name or index of the file grouping variable.
##
## The directory returned by gather_mzml_files (if any) is deleted
## whether the processing succeeds or not; a processing error is then
## re-signalled.
process_rdata <- function(orm, rdata, options) {
    mzml_tmp_dir <- gather_mzml_files(rdata)
    samples <- names(rdata$singlefile)
    if (!is.null(options$samples)) {
        ## Keep the available samples that were asked for. The test was
        ## written the other way round (options$samples %in% samples),
        ## which indexed `samples` with a mask built for another vector.
        samples <- samples[samples %in% options$samples]
    }
    show_percent <- (
        is.null(options$`not-show-percent`)
        || options$`not-show-percent` == FALSE
    )
    ## The error is captured rather than propagated right away, so the
    ## extracted files are cleaned up first.
    error <- tryCatch({
        process_sample_list(
            orm, rdata, samples,
            show_percent = show_percent,
            file_grouping_var = options$class,
            options = options
        )
        NULL
    }, error = function(e) {
        return(e)
    })
    if (!is.null(mzml_tmp_dir)) {
        unlink(mzml_tmp_dir, recursive = TRUE)
    }
    if (!is.null(error)) {
        stop(error)
    }
    ## NOTE(review): `error` is necessarily NULL here, so this always
    ## returns FALSE. Left as is since callers are not visible from here.
    return(!is.null(error))
}

## Make sure the ms files of `rdata` are available on disk.
##
## When rdata$singlefile is NULL, rdata$zipfile is extracted and
## rdata$singlefile is set to the extracted paths, named after the file
## names without extension.
## NOTE(review): this assignment only reaches the caller if `rdata` is an
## environment -- confirm against the caller.
##
## Returns the directory holding the extracted files, which the caller
## is expected to delete, or NULL when nothing was extracted.
gather_mzml_files <- function(rdata) {
    if (is.null(rdata$singlefile)) {
        message("Extracting mxml files")
        ## Extract in a dedicated directory: the returned directory gets
        ## deleted recursively by the caller, which must not happen to
        ## tempdir() itself (it belongs to the whole R session).
        tmp <- tempfile("mzml_files_")
        dir.create(tmp, recursive = TRUE)
        rdata$singlefile <- utils::unzip(rdata$zipfile, exdir = tmp)
        names(rdata$singlefile) <- tools::file_path_sans_ext(
            basename(rdata$singlefile)
        )
        message("Extracted")
        return(tmp)
    }
    message(sprintf(
        "Not a zip file, loading files directly from path: %s",
        paste(rdata$singlefile, collapse = " ; ")
    ))
    return(NULL)
}

## Store the samples of an rdata, then its features and clusters.
##
## - orm: database access object.
## - rdata: loaded rdata; the elements read here are variableMetadata,
##   xa (its xcmsSet), singlefile and listOFlistArguments.
## - sample_names: names of the samples to process; the files of
##   `singlefile` whose name is not listed are skipped.
## - show_percent: whether to display the progress of the features
##   extraction.
## - file_grouping_var: name, or column index, of the grouping variable in
##   variableMetadata; guessed with find_grouping_var when NULL.
## - options: accepted but not read in this function.
##
## Stops when the grouping variable cannot be found.
## Returns NULL.
process_sample_list <- function(
    orm,
    rdata,
    sample_names,
    show_percent,
    file_grouping_var = NULL,
    options = list()
) {
    if (is.null(file_grouping_var)) {
        file_grouping_var <- find_grouping_var(rdata$variableMetadata)
        if (is.null(file_grouping_var)) {
            stop("Malformed variableMetada.")
        }
    }
    ## The grouping variable may be given as a column index: try to turn it
    ## into a column name. Any failure of the conversion is ignored on
    ## purpose, the value is then used as it is.
    tryCatch({
        headers <- colnames(rdata$variableMetadata)
        file_grouping_var <- headers[[as.numeric(file_grouping_var)]]
    }, error = function(e) NULL)
    if (
        is.null(file_grouping_var)
        || !(file_grouping_var %in% colnames(rdata$variableMetadata))
    ) {
        stop(sprintf(
            "Could not find grouping variable %s in var meta file.",
            file_grouping_var
        ))
    }
    message("Processing samples.")
    message(sprintf("File grouping variable: %s", file_grouping_var))

    ## `context` is the mutable state shared with the helpers called below.
    context <- new.env()
    context$samples <- list()
    context$peaks <- rdata$xa@xcmsSet@peaks
    context$groupidx <- rdata$xa@xcmsSet@groupidx
    xcms_set <- rdata$xa@xcmsSet
    singlefile <- rdata$singlefile
    process_arg_list <- rdata$listOFlistArguments
    var_meta <- rdata$variableMetadata

    ## Build the simplified list of processing parameters, either from the
    ## first centWave peak detection found in the xcms process history, or
    ## from the arguments stored in the rdata.
    process_params <- list()
    if (is.null(process_arg_list)) {
        for (history in xcms_set@.processHistory) {
            if (
                class(history@param) == "CentWaveParam"
                && history@type == "Peak detection"
            ) {
                params <- history@param
                process_params <- list(list(
                    xfunction = "annotatediff",
                    ppm = params@ppm,
                    peakwidth = sprintf(
                        "%s - %s",
                        params@peakwidth[[1]],
                        params@peakwidth[[2]]
                    ),
                    snthresh = params@snthresh,
                    prefilterStep = params@prefilter[[1]],
                    prefilterLevel = params@prefilter[[2]],
                    mzdiff = params@mzdiff,
                    fitgauss = params@fitgauss,
                    noise = params@noise,
                    mzCenterFun = params@mzCenterFun,
                    integrate = params@integrate,
                    firstBaselineCheck = params@firstBaselineCheck,
                    snthreshIsoROIs = !identical(params@roiScales, numeric(0))
                ))
                break
            }
        }
    } else {
        for (list_name in names(process_arg_list)) {
            param_list <- list()
            for (param_name in names(process_arg_list[[list_name]])) {
                param_list[[param_name]] <- process_arg_list[[
                    list_name
                ]][[param_name]]
            }
            process_params[[length(process_params) + 1]] <- param_list
        }
    }

    message("Parameters from previous processes extracted.")

    ## Summary of the xcms set, saved serialized and compressed in one
    ## smol_xcms_set record which every sample refers to.
    smol_xcms_set <- orm$smol_xcms_set()
    mz_tab_info <- new.env()
    g <- xcms::groups(xcms_set)
    mz_tab_info$group_length <- nrow(g)
    mz_tab_info$dataset_path <- xcms::filepaths(xcms_set)
    mz_tab_info$sampnames <- xcms::sampnames(xcms_set)
    mz_tab_info$sampclass <- xcms::sampclass(xcms_set)
    mz_tab_info$rtmed <- g[, "rtmed"]
    mz_tab_info$mzmed <- g[, "mzmed"]
    mz_tab_info$smallmolecule_abundance_assay <- xcms::groupval(
        xcms_set,
        value = "into"
    )
    blogified <- blob::blob(fst::compress_fst(
        serialize(mz_tab_info, NULL),
        compression = 100
    ))
    rm(mz_tab_info)

    invisible(smol_xcms_set$set_raw(blogified)$save())
    smol_xcms_set_id <- smol_xcms_set$get_id()
    rm(smol_xcms_set)

    ## One database sample per requested ms file.
    for (no in seq_along(names(singlefile))) {
        sample_name <- names(singlefile)[[no]]
        sample_path <- singlefile[[no]]
        if (
            is.na(no)
            || is.null(sample_path)
            || !(sample_name %in% sample_names)
        ) {
            next
        }
        ## Everything put in `env` ends up serialized in the sample record.
        env <- new.env()

        if (!DEBUG_FAST_IGNORE_SLOW_OP) {
            ms_file <- xcms::xcmsRaw(sample_path)
            env$tic <- ms_file@tic
            env$mz <- ms_file@env$mz
            env$scanindex <- ms_file@scanindex
            ## NOTE(review): scan times are multiplied by 60 here, which
            ## ENRICHED_RDATA_DOC does not mention -- confirm the unit.
            env$scantime <- ms_file@scantime * 60
            env$intensity <- ms_file@env$intensity
            env$polarity <- as.character(ms_file@polarity[[1]])

            ## Again, ms file is huge, so we get rid of it quickly.
            rm(ms_file)

        }
        env$sample_name <- sample_name
        env$dataset_path <- sample_path
        env$process_params <- process_params
        env$enriched_rdata <- TRUE
        env$enriched_rdata_version <- ENRICHED_RDATA_VERSION
        env$tool_name <- TOOL_NAME
        env$enriched_rdata_doc <- ENRICHED_RDATA_DOC

        sample <- add_sample_to_database(orm, env, context, smol_xcms_set_id)
        rm(env)
        ## Database ids are stored at the position of the file in
        ## `singlefile`.
        context$samples[no] <- sample$get_id()
        rm(sample)
    }
    context$clusters <- list()
    context$show_percent <- show_percent
    context$cluster_mean_rt_abundance <- list()
    context$central_feature <- list()
    context$adducts <- list()
    load_variable_metadata(orm, var_meta, context)
    clusters <- context$clusters
    rm(context)
    message("Features enrichment")
    complete_features(orm, clusters, show_percent)
    message("Features enrichment done.")
    return(NULL)
}

## Find the class (grouping) column of a variableMetadata table: the only
## column whose name is not one of the standard column names.
##
## Returns the name of that column.
## Stops when there is no such column, or more than one.
find_grouping_var <- function(var_meta) {
    known_colnames <- c(
        "name", "namecustom", "mz", "mzmin", "mzmax",
        "rt", "rtmin", "rtmax", "npeaks", "isotopes", "adduct",
        "pcgroup", "ms_level"
    )
    all_columns <- colnames(var_meta)
    candidates <- all_columns[!(all_columns %in% known_colnames)]
    candidate_count <- length(candidates)
    if (candidate_count > 1) {
        stop(sprintf(
            "Only one class expected in the variable metadata. Found %d .",
            candidate_count
        ))
    }
    if (candidate_count == 0) {
        stop("Could not find any class column in your variableMetadata.")
    }
    return(candidates[[1]])
}

## Create and save the database record of one sample.
##
## - orm: database access object.
## - env: environment describing the sample; it is serialized and
##   compressed as a whole into the record's raw field.
## - context: accepted but not read in this function.
## - smol_xcms_set_id: id of the smol_xcms_set record the sample
##   belongs to.
##
## The processing parameters found in `env` are then loaded with
## load_process_params. Returns the saved sample.
add_sample_to_database <- function(orm, env, context, smol_xcms_set_id) {
    message(sprintf("Processing sample %s", env$sample_name))
    sample <- orm$sample()
    sample <- sample$set_name(env$sample_name)
    sample <- sample$set_path(env$dataset_path)
    sample <- sample$set_kind("enriched_rdata")
    ## a missing or empty polarity is stored as an empty string
    polarity <- env$polarity
    if (is.null(polarity) || identical(polarity, character(0))) {
        polarity <- ""
    }
    sample <- sample$set_polarity(polarity)
    compressed_env <- blob::blob(fst::compress_fst(
        serialize(env, NULL),
        compression = 100
    ))
    sample <- sample$set_raw(compressed_env)
    ## the foreign key is set by hand, and flagged as modified
    sample[["smol_xcms_set_id"]] <- smol_xcms_set_id
    sample$modified__[["smol_xcms_set_id"]] <- smol_xcms_set_id
    sample <- sample$save()
    load_process_params(orm, sample, env$process_params)
    message(sprintf("Sample %s inserted.", env$sample_name))
    return(sample)
}


## Create the features described by `var_meta`.
##
## The ids and group numbers to start from are computed from what the
## database already holds, then everything is delegated to
## create_features. Returns NULL.
load_variable_metadata <- function(orm, var_meta, context) {
    existing_clusters <- orm$cluster()$all()

    first_feature_id <- get_next_id(orm$feature()$all(), "featureID") + 1
    first_cluster_id <- 0
    first_pc_group <- get_next_id(existing_clusters, "pc_group")
    first_align_group <- get_next_id(existing_clusters, "align_group") + 1
    message("Extracting features")
    created <- create_features(
        orm, var_meta, context,
        first_feature_id, first_cluster_id,
        first_pc_group, first_align_group
    )
    invisible(created)
    message("Extracting features done.")
    return(NULL)
}

## Highest value of `attribute` among `models`, as given by models$max.
## An infinite maximum (Inf or -Inf) is replaced by 0.
get_next_id <- function(models, attribute) {
    highest <- models$max(attribute)
    no_usable_maximum <- highest == Inf || highest == -Inf
    if (no_usable_maximum) {
        return(0)
    }
    return(highest)
}

## Create one feature per row of `var_meta`, with its associated cluster,
## then link every cluster to the samples it has peaks in.
##
## - orm: database access object.
## - var_meta: the variableMetadata table.
## - context: shared state; the elements read here are show_percent,
##   peaks, groupidx, samples, central_feature and clusters.
## - next_feature_id: featureID given to the first feature; incremented
##   for each row.
## - next_cluster_id, next_pc_group: passed to the helpers as received.
## - next_align_group: align group of the first row; incremented for each
##   row.
##
## Returns context$clusters.
create_features <- function(
    orm, var_meta, context,
    next_feature_id, next_cluster_id,
    next_pc_group, next_align_group
) {
    ## every field of a feature, except its id, is exported for the
    ## bulk save
    field_names <- as.list(names(orm$feature()$fields__))
    field_names[field_names == "id"] <- NULL

    ## a single feature object is reused for all the rows
    dummy_feature <- orm$feature()

    if (show_percent <- context$show_percent) {
        percent <- -1
        total <- nrow(var_meta)
    }
    rows <- seq_len(nrow(var_meta))
    if (PROCESS_SMOL_BATCH) {

        ## debug mode: only process the first FAST_FEATURE_RATIO percent
        rows <- rows[1:as.integer(FAST_FEATURE_RATIO / 100.0 * length(rows))]
    }
    # features <- list()
    features <- as.list(rows) ## allocate all memory before processing
    # cluster_row <- list()
    cluster_row <- as.list(rows) ## allocate all memory before processing
    for (row in rows) {
        if (show_percent && (row / total) * 100 > percent) {
            percent <- percent + 1
            message("\r", sprintf("\r%d %%", percent), appendLF = FALSE)
        }

        dummy_feature$set_featureID(next_feature_id)
        next_feature_id <- next_feature_id + 1

        curent_var_meta <- var_meta[row, ]
        set_feature_fields_from_var_meta(dummy_feature, curent_var_meta)
        ## the iso field, as first set from var_meta, is decoded into the
        ## final iso value and the cluster id by the two helpers below
        fake_iso <- dummy_feature$get_iso()
        iso <- extract_iso(fake_iso)
        clusterID <- extract_clusterID(fake_iso, next_cluster_id)
        context$clusterID <- clusterID
        dummy_feature$set_iso(iso)

        ## peaks grouped in this row; a single peak comes out as a plain
        ## vector, which is turned back into a one-row matrix
        peak_list <- context$peaks[context$groupidx[[row]], ]
        if (! ("matrix" %in% class(peak_list))) {
            peak_list <- matrix(
                peak_list,
                nrow = 1,
                ncol = length(peak_list),
                dimnames = list(c(), names(peak_list))
            )
        }

        ## The first feature met for a cluster decides of its central
        ## sample: the sample of the peak having the highest "into".
        ## NOTE(review): if several peaks share that maximum, the row
        ## selection yields a matrix and `["sample"]` no longer designates
        ## the sample column -- confirm this cannot happen.
        clusterID <- as.character(clusterID)
        if (is.null(context$central_feature[[clusterID]])) {
            int_o <- extract_peak_var(peak_list, "into")
            context$central_feature[[clusterID]] <- (
                peak_list[peak_list[, "into"] == int_o, ]["sample"]
            )
        }

        if (!DEBUG_FAST_IGNORE_SLOW_OP) {
            ## intensities are taken from the peaks of the central sample
            central_feature <- context$central_feature[[clusterID]]
            sample_peak_list <- peak_list[
                as.integer(peak_list[, "sample"]) == central_feature,
                ,
                drop = FALSE
            ]
            if (
                !identical(sample_peak_list, numeric(0))
                && !is.null(nrow(sample_peak_list))
                && nrow(sample_peak_list) != 0
            ) {
                int_o <- extract_peak_var(sample_peak_list, "into")
                if (!is.na(int_o)) {
                    dummy_feature$set_int_o(int_o)
                }
                int_b <- extract_peak_var(sample_peak_list, "intb")
                if (!is.na(int_b)) {
                    dummy_feature$set_int_b(int_b)
                }
                max_o <- extract_peak_var(sample_peak_list, "maxo")
                if (!is.na(max_o)) {
                    dummy_feature$set_max_o(max_o)
                }
            }
        }

        cluster_row[[row]] <- create_associated_cluster(
            orm,
            context$samples[context$central_feature[[clusterID]]][[1]],
            dummy_feature, clusterID,
            context, curent_var_meta, next_pc_group,
            next_align_group
        )
        next_align_group <- next_align_group + 1
        ## snapshot the fields: the feature object is cleared and reused
        features[[row]] <- as.list(dummy_feature, field_names)
        # features[[length(features) + 1]] <- as.list(dummy_feature, field_names)
        dummy_feature$clear()
    }
    rm(var_meta)
    message("")
    message("Saving features")
    invisible(dummy_feature$save(bulk = features))

    ## We link manually clusters to the sample they're in.
    ## `link_cache` remembers the (sample, cluster) pairs already saved, so
    ## each link is only created once.
    link_cache <- list()
    for (row in rows) {
        sample_nos <- unique(context$peaks[context$groupidx[[row]], "sample"])
        for (sample_id in context$samples[sample_nos]) {
            cluster_id <- cluster_row[[row]]$get_id()
            id <- paste(sample_id, cluster_id, sep = ";")
            if (is.null(link_cache[[id]])) {
                link_cache[[id]] <- 1
                orm$cluster_sample(
                    sample_id = sample_id,
                    cluster_id = cluster_id
                )$save()
            }
        }
    }

    message("Saved.")
    return(context$clusters)
}

## Reduce one column of a peak table to a single value.
##
## peak_list: peak table, indexed as peak_list[, var_name].
## var_name:  name of the column to read.
## selector:  function applied to the column once its names have been
##            stripped (defaults to max).
## Returns whatever `selector` returns for that column.
extract_peak_var <- function(peak_list, var_name, selector = max) {
    bare_column <- `names<-`(peak_list[, var_name], NULL)
    selector(bare_column)
}

## Copy the usable fields of one variable-metadata record onto a feature.
##
## feature:  object exposing set_mz, set_mz_min, set_mz_max, set_rt,
##           set_rt_min, set_rt_max and set_iso.
## var_meta: record read with var_meta[["mz"]], [["mzmin"]], [["mzmax"]],
##           [["rt"]], [["rtmin"]], [["rtmax"]] and [["isotopes"]].
## A field is forwarded only when it is neither NULL nor NA.
## Returns `feature`.
set_feature_fields_from_var_meta <- function(feature, var_meta) {
    ## var_meta field name -> setter to call, in the order they are applied.
    setters <- list(
        mz = function(value) feature$set_mz(value),
        mzmin = function(value) feature$set_mz_min(value),
        mzmax = function(value) feature$set_mz_max(value),
        rt = function(value) feature$set_rt(value),
        rtmin = function(value) feature$set_rt_min(value),
        rtmax = function(value) feature$set_rt_max(value),
        isotopes = function(value) feature$set_iso(value)
    )
    for (field in names(setters)) {
        value <- var_meta[[field]]
        if (!is.null(value) && !is.na(value)) {
            setters[[field]](value)
        }
    }
    feature
}

## Strip a leading "[<digits>]" tag from an isotope annotation.
##
## weird_data: annotation string as read from the feature.
## Returns the annotation without the tag, or `weird_data` untouched when
## its first element does not start with such a tag.
extract_iso  <- function(weird_data) {
    starts_with_tag <- grepl("^\\[\\d+\\]", weird_data)[[1]]
    if (!starts_with_tag) {
        return(weird_data)
    }
    sub("^\\[\\d+\\]", "", weird_data, perl = TRUE)
}

## Compute a cluster id from the leading "[<digits>]" tag of an annotation.
##
## weird_data:      annotation string as read from the feature.
## next_cluster_id: offset added to the number found in the tag.
## Returns the tag's number plus `next_cluster_id`; when the first element
## carries no such tag, the number is taken as 0.
extract_clusterID <- function(weird_data, next_cluster_id) {
    tag_number <- 0
    if (grepl("^\\[\\d+\\]", weird_data)[[1]]) {
        tag <- stringr::str_extract(weird_data, "^\\[\\d+\\]")
        tag_number <- as.numeric(stringr::str_extract(tag, "\\d+"))
    }
    tag_number + next_cluster_id
}

## Fetch (or create) the cluster a feature belongs to and link the two.
##
## orm:              ORM handle; orm$cluster() and orm$adduct() are used.
## main_sample_id:   sample id written on a newly created cluster.
## feature:          feature model; receives the cluster and is saved.
## clusterID:        key under which the cluster is cached in
##                   context$clusters (converted to character).
## context:          shared state; reads/writes context$clusters and
##                   context$adducts, reads context$clusterID.
## curent_var_meta:  record providing "pcgroup", "adduct" and "isotopes".
## next_pc_group:    offset added to the record's pcgroup.
## next_align_group: align_group given to a newly created cluster.
## Returns the cluster, after it and the feature have been saved.
create_associated_cluster <- function(
    orm,
    main_sample_id, feature, clusterID,
    context, curent_var_meta, next_pc_group, next_align_group
) {
    cluster_key <- as.character(clusterID)
    cluster <- context$clusters[[cluster_key]]
    if (!is.null(cluster)) {
        ## Cached cluster: only replace a clusterID of 0 by a non-zero one.
        if (context$clusterID != 0 && cluster$get_clusterID() == 0) {
            cluster$set_clusterID(context$clusterID)
        }
    } else {
        pcgroup <- as.numeric(curent_var_meta[["pcgroup"]])
        adduct_name <- as.character(curent_var_meta[["adduct"]])
        annotation <- curent_var_meta[["isotopes"]]
        cluster <- orm$cluster(
            pc_group = pcgroup + next_pc_group,
            align_group = next_align_group,
            clusterID = context$clusterID,
            annotation = annotation
        )
        context$clusters[[cluster_key]] <- cluster

        ## Adduct lookup order: context cache, then database, then creation.
        adduct <- context$adducts[[adduct_name]]
        if (is.null(adduct)) {
            context$adducts[[adduct_name]] <- orm$adduct()$load_by(
                name = adduct_name
            )$first()
            adduct <- context$adducts[[adduct_name]]
            if (is.null(adduct)) {
                adduct <- orm$adduct(
                    name = adduct_name,
                    charge = 0
                )
                context$adducts[[adduct_name]] <- adduct
                adduct$save()
            }
        }
        cluster$set_adduct(adduct)

        ## The sample id is written straight into the model's fields (and
        ## flagged in modified__) so that the sample itself never has to be
        ## loaded: samples are large and loading them is slow and
        ## memory-hungry.
        cluster[["sample_id"]] <- main_sample_id
        cluster$modified__[["sample_id"]] <- main_sample_id
    }
    cluster$save()
    feature$set_cluster(cluster)
    feature$save()
    cluster
}

## Fill in per-cluster statistics once the features have been stored.
##
## For every cluster, its features are loaded through the ORM, their mean
## "rt" is stored on the cluster, and each feature's abundance is set to its
## int_o expressed as a percentage of the int_o of the cluster's reference
## feature (a feature whose iso starts with "[M]").
##
## orm:          ORM handle; only orm$feature()$load_by() is used.
## clusters:     collection of cluster models (get_id, set_mean_rt).
## show_percent: when TRUE, the progress percentage is written with message().
## Returns NULL.
complete_features <- function(orm, clusters, show_percent) {
    total <- length(clusters)
    percent <- -1
    i <- 0
    for (cluster in clusters) {
        i <- i + 1
        if (show_percent) {
            ## Report the real progress; a counter incremented by one per
            ## iteration never reaches 100 when there are < 100 clusters.
            current_percent <- floor(i / total * 100)
            if (current_percent > percent) {
                percent <- current_percent
                message("\r", sprintf("\r%d %%", percent), appendLF = FALSE)
            }
        }
        features <- orm$feature()$load_by(cluster_id = cluster$get_id())
        if (!features$any()) {
            next
        }
        if (!is.null(rt <- features$mean("rt"))) {
            cluster$set_mean_rt(rt)$save()
        }
        features_df <- as.data.frame(features)
        is_reference <- grepl("^\\[M\\]", features_df[, "iso"])
        reference_into <- features_df[is_reference, ][["int_o"]]
        ## Keep only intensities usable as a divisor. Working on lengths
        ## rather than on identical(x, numeric(0)) also covers integer
        ## columns, NA values and a missing int_o column.
        reference_into <- reference_into[
            !is.na(reference_into) & reference_into != 0
        ]
        if (length(reference_into) == 0) {
            next
        }
        ## NOTE(review): several "[M]" features may match (different
        ## charges, for instance); the first usable one is taken as the
        ## 100 % reference so that the abundance stays a scalar - confirm
        ## this is the wanted reference.
        reference_into <- reference_into[[1]]
        for (feature in as.vector(features)) {
            feature$set_abundance(
                feature$get_int_o() / reference_into * 100
            )$save()
        }
    }
    return(NULL)
}

## Attach to a sample the processing parameters found in a list of steps.
##
## orm:    ORM handle, forwarded to the step-specific loader.
## sample: sample model, forwarded to the step-specific loader.
## params: list of parameter lists; each may carry an "xfunction" entry.
## Only the steps whose xfunction is "annotatediff" are handled; steps
## without an xfunction are ignored.
## Returns `sample`.
load_process_params <- function(orm, sample, params) {
    for (step_params in params) {
        step_name <- step_params[["xfunction"]]
        if (!is.null(step_name)) {
            if (step_name == "annotatediff") {
                load_process_params_peak_picking(orm, sample, step_params)
            }
        }
    }
    sample
}

## Store the peak picking parameters of a processing step on a sample.
##
## orm:                 ORM handle; provides orm$peak_picking_parameters.
## sample:              sample model; provides set_peak_picking_parameters.
## peak_picking_params: parameter list of the step.
## Returns the result of add_sample_process_parameters().
load_process_params_peak_picking <- function(
    orm,
    sample,
    peak_picking_params
) {
    ## rdata parameter name -> database field name
    rdata_to_database <- list(
        ppm = "ppm",
        maxcharge = "maxCharge",
        maxiso = "maxIso"
    )
    add_sample_process_parameters(
        params = peak_picking_params,
        params_translation = rdata_to_database,
        param_model_generator = orm$peak_picking_parameters,
        sample_param_setter = sample$set_peak_picking_parameters
    )
}

## Find or create a parameter model and attach it to a sample.
##
## params:                parameter list read from the rdata.
## params_translation:    named list, rdata parameter name -> database
##                        field name; parameters absent from `params` are
##                        left out.
## param_model_generator: model constructor; called without argument to
##                        reach load_by(), and with the translated fields to
##                        build a new model.
## sample_param_setter:   function receiving the model; its result must
##                        expose save().
## An existing model matching the translated fields is reused, otherwise a
## new one is created and saved.
## Returns the result of sample_param_setter(model)$save().
add_sample_process_parameters <- function(
    params,
    params_translation,
    param_model_generator,
    sample_param_setter
) {
    model_fields <- list()
    for (rdata_name in names(params_translation)) {
        field_name <- params_translation[[rdata_name]]
        value <- params[[rdata_name]]
        if (!is.null(value)) {
            model_fields[[field_name]] <- value
        }
    }
    matches <- do.call(param_model_generator()$load_by, model_fields)
    params_model <- if (matches$any()) {
        matches$first()
    } else {
        fresh_model <- do.call(param_model_generator, model_fields)
        fresh_model$save()
        fresh_model
    }
    sample_param_setter(params_model)$save()
}


library(optparse)

## Command line options of the tool. Built inside local() so that the two
## small constructors below do not leak into the global environment.
option_list <- local({
    ## Switch stored as TRUE when present on the command line.
    flag <- function(short, long, help, ...) {
        optparse::make_option(
            c(short, long),
            action = "store_true",
            help = help,
            ...
        )
    }
    ## Option carrying a character value.
    text <- function(short, long, help) {
        optparse::make_option(
            c(short, long),
            type = "character",
            help = help
        )
    }
    list(
        flag("-v", "--version", "Display this tool's version and exits"),
        flag("-V", "--verbose", "Does more verbose outputs", default = FALSE),
        text("-i", "--input", "The rdata path to import in XSeeker"),
        text("-s", "--samples", "Samples to visualise in XSeeker"),
        text("-B", "--archetype", "The name of the base database"),
        text("-b", "--database", "The base database's path"),
        text("-c", "--compounds-csv", "The csv containing compounds"),
        text(
            "-m", "--models",
            paste(
                "The path or url (must begin with http[s]:// or git@) to",
                "the database's models"
            )
        ),
        text("-k", "--class", "The name of the column containing the classes"),
        text("-o", "--output", "The path where to output sqlite"),
        flag(
            "-P", "--not-show-percent",
            "Flag not to show the percents",
            default = FALSE
        )
    )
})

## Print the call stack of any uncaught error, then abort with a failing
## exit status. Once a custom `error` handler is installed, a
## non-interactive R session no longer halts on errors by itself: without
## the explicit quit() the script would carry on with the next top-level
## statement and could still end with a status of 0.
options(error = function() {
    traceback(3)
    if (!interactive()) {
        quit(save = "no", status = 1, runLast = FALSE)
    }
})

parser <- OptionParser(
    usage = "%prog [options] file",
    option_list = option_list
)
args <- parse_args(parser, positional_arguments = 0)

## Exit status of the script; replaced by process_rdata()'s result.
err_code <- 0

## --version only prints the tool's identity and stops there.
if (!is.null(args$options$version)) {
    message(sprintf("%s %s", TOOL_NAME, VERSION))
    quit()
}

## Build the output SQLite database from the models.
models <- get_models(args$options$models)
orm <- DBModelR::ORM(
    connection_params = list(dbname = args$options$output),
    dbms = "SQLite"
)

invisible(orm$models(models))
invisible(create_database(orm))

message("Database model created")

insert_adducts(orm)

## Optional base data. The confirmation is only printed when a database was
## actually given, so it never announces an insertion that did not happen.
if (!is.null(args$options$database)) {
    insert_base_data(orm, args$options$database)
    message(sprintf("Base data inserted using %s.", args$options$database))
}

if (!is.null(args$options$archetype)) {
    insert_base_data(orm, args$options$archetype, archetype = TRUE)
}
if (!is.null(args$options$`compounds-csv`)) {
    insert_compounds(orm, args$options$`compounds-csv`)
}

# if (!is.null(args$options$rdata)) {
#     load_rdata_in_base(args$options$rdata, args$options$samples, args$options$`not-show-percent`)
# }


## load() fails on a NULL path with an obscure message; say what is missing.
if (is.null(args$options$input)) {
    stop("No rdata to process: the --input option is missing.")
}
rdata <- new.env()
load(args$options$input, envir = rdata)

## Replace the verbose flag by a logging function: a sprintf-style message
## writer when verbose, a no-op otherwise.
args$options$verbose <- (
    if (args$options$verbose) {
        message("Verbose outputs.")
        \(...) {
            message(sprintf(...))
        }
    } else {
        \(...) {
        }
    }
)

err_code <- process_rdata(orm, rdata, args$options)

quit(status = err_code)
author	lain
date	Tue, 06 Dec 2022 10:18:10 +0000
parents	2937e72e5891
children