Mercurial > repos > prog > lcmsmatching
diff msdb-common.R @ 6:f86fec07f392 draft default tip
planemo upload commit c397cd8a93953798d733fd62653f7098caac30ce
author | prog |
---|---|
date | Fri, 22 Feb 2019 16:04:22 -0500 |
parents | fb9c0409d85c |
children |
line wrap: on
line diff
--- a/msdb-common.R Wed Apr 19 10:00:05 2017 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,212 +0,0 @@ -if ( ! exists('.parse_chrom_col_desc')) { # Do not load again if already loaded - - library('stringr') - source('strhlp.R', chdir = TRUE) - source('biodb-common.R', chdir = TRUE) - - ############# - # CONSTANTS # - ############# - - # Field tags - MSDB.TAG.MZ <- 'mz' - MSDB.TAG.MZEXP <- 'mzexp' - MSDB.TAG.MZTHEO <- 'mztheo' - MSDB.TAG.RT <- 'rt' - MSDB.TAG.MODE <- 'msmode' - MSDB.TAG.MOLID <- 'compoundid' - MSDB.TAG.COL <- 'chromcol' - MSDB.TAG.COLRT <- 'chromcolrt' - MSDB.TAG.ATTR <- 'peakattr' - MSDB.TAG.INT <- 'intensity' - MSDB.TAG.REL <- 'relative.intensity' - MSDB.TAG.COMP <- 'peakcomp' - MSDB.TAG.MOLNAMES <- 'fullnames' - MSDB.TAG.MOLCOMP <- 'compoundmass' - MSDB.TAG.MOLMASS <- 'compoundcomp' - MSDB.TAG.INCHI <- 'inchi' - MSDB.TAG.INCHIKEY <- 'inchikey' - MSDB.TAG.PUBCHEM <- 'pubchemcompid' - MSDB.TAG.CHEBI <- 'chebiid' - MSDB.TAG.HMDB <- 'hmdbid' - MSDB.TAG.KEGG <- 'keggid' - - # Mode tags - MSDB.TAG.POS <- 'neg' - MSDB.TAG.NEG <- 'pos' - - # Fields containing multiple values - MSDB.MULTIVAL.FIELDS <- c(MSDB.TAG.MOLNAMES) - MSDB.MULTIVAL.FIELD.SEP <- ';' - - # Authorized mz tolerance unit values - MSDB.MZTOLUNIT.PPM <- 'ppm' - MSDB.MZTOLUNIT.PLAIN <- 'plain' # same as mz: mass-to-charge ratio - MSDB.MZTOLUNIT.VALS <- c(MSDB.MZTOLUNIT.PPM, MSDB.MZTOLUNIT.PLAIN) - - # Authorized rt units - MSDB.RTUNIT.SEC <- 'sec' - MSDB.RTUNIT.MIN <- 'min' - MSDB.RTUNIT.VALS <- c(MSDB.RTUNIT.SEC ,MSDB.RTUNIT.MIN) - - # Default values - MSDB.DFT.PREC <- list() - MSDB.DFT.PREC[[MSDB.TAG.POS]] <- c("[(M+H)]+", "[M+H]+", "[(M+Na)]+", "[M+Na]+", "[(M+K)]+", "[M+K]+") - MSDB.DFT.PREC[[MSDB.TAG.NEG]] <- c("[(M-H)]-", "[M-H]-", "[(M+Cl)]-", "[M+Cl]-") - MSDB.DFT.OUTPUT.MULTIVAL.FIELD.SEP <- MSDB.MULTIVAL.FIELD.SEP - MSDB.DFT.MATCH.FIELDS <- list( molids = 'molid', molnames = 'molnames') - MSDB.DFT.MATCH.SEP <- ',' - MSDB.DFT.MODES <- list( pos = 'POS', neg = 'NEG') - MSDB.DFT.MZTOLUNIT <- MSDB.MZTOLUNIT.PPM - - ############################ - # GET DEFAULT INPUT FIELDS # - ############################ - - msdb.get.dft.input.fields <- function () { - - dft.fields <- list() - - for(f in c(MSDB.TAG.MZ, MSDB.TAG.RT)) - dft.fields[[f]] <- f - - return(dft.fields) - } - - ######################### - # GET DEFAULT DB FIELDS # - ######################### - - msdb.get.dft.db.fields <- function () { - - dft.fields <- list() - - for (f in c(MSDB.TAG.MZTHEO, MSDB.TAG.COLRT, MSDB.TAG.MOLID, MSDB.TAG.COL, MSDB.TAG.MODE, MSDB.TAG.ATTR, MSDB.TAG.COMP, MSDB.TAG.MOLNAMES, MSDB.TAG.MOLCOMP, MSDB.TAG.MOLMASS, MSDB.TAG.INCHI, MSDB.TAG.INCHIKEY, MSDB.TAG.PUBCHEM, MSDB.TAG.CHEBI, MSDB.TAG.HMDB, MSDB.TAG.KEGG)) - dft.fields[[f]] <- f - - return(dft.fields) - } - - ################## - # MAKE DB FIELDS # - ################## - - msdb.make.db.fields <- function(fields) { - - # Merge with default fields - dft.fields <- msdb.get.dft.db.fields() - absent <- ! names(dft.fields) %in% names(fields) - if (length(absent) > 0) - fields <- c(fields, dft.fields[absent]) - - return(fields) - } - - ######################### - # MAKE INPUT DATA FRAME # - ######################### - - msdb.make.input.df <- function(mz, rt = NULL, rtunit = MSDB.RTUNIT.SEC) { - - field <- msdb.get.dft.input.fields() - - x <- data.frame() - - # Set mz - if (length(mz) > 1) - x[seq(mz), field[[MSDB.TAG.MZ]]] <- mz - else if (length(mz) == 1) - x[1, field[[MSDB.TAG.MZ]]] <- mz - else - x[, field[[MSDB.TAG.MZ]]] <- numeric() - - # Set rt - if ( ! is.null(rt)) { - if (rtunit == MSDB.RTUNIT.MIN) - rtunit <- rtunit * 60 - if (length(rt) > 1) - x[seq(rt), field[[MSDB.TAG.RT]]] <- rt - else if (length(rt) == 1) - x[1, field[[MSDB.TAG.RT]]] <- rt - else - x[, field[[MSDB.TAG.RT]]] <- numeric() - } - - return(x) - } - - ############################### - # GET EMPTY RESULT DATA FRAME # - ############################### - - .get.empty.result.df <- function(rt = FALSE) { - - df <- data.frame(stringsAsFactors = FALSE) - df[MSDB.TAG.MOLID] <- character() - df[MSDB.TAG.MOLNAMES] <- character() - df[MSDB.TAG.MZ] <- numeric() - df[MSDB.TAG.MZTHEO] <- numeric() - df[MSDB.TAG.ATTR] <- character() - df[MSDB.TAG.COMP] <- character() - if (rt) { - df[MSDB.TAG.RT] <- numeric() - df[MSDB.TAG.COL] <- character() - df[MSDB.TAG.COLRT] <- numeric() - } - - return(df) - } - - ############################ - # PARSE COLUMN DESCRIPTION # - ############################ - - .parse_chrom_col_desc <- function(desc) { - - # Clean string - s <- desc - s <- gsub('\\.+', ' ', s, perl = TRUE) # Replace '.' characters by spaces - s <- gsub('[*-]', ' ', s, perl = TRUE) # Replace dashes and asterisks by spaces - s <- gsub('[)(]', '', s, perl = TRUE) # Remove paranthesis - s <- trim(s) - s <- tolower(s) # put in lowercase - - # Match 2 3 4 5 6 7 8 9 10 1112 13 - pattern <- "^(uplc|hsf5|hplc|zicphilic)( (c8|c18|150 5 2 1))?( (\\d+)mn)?( (orbitrap|exactive|qtof|shimadzu exactive))?( (\\d+)mn)?( (bis|ter))?( 1)?$" - g <- str_match(s, pattern) - if (is.na(g[1, 1])) - stop(paste0("Impossible to parse column description \"", desc, "\".")) - - type <- g[1, 2] - stationary_phase <- if ( ! is.na(g[1, 4]) && nchar(g[1, 4]) > 0) g[1, 4] else NA_character_ - msdevice <- if ( ! is.na(g[1, 8]) && nchar(g[1, 8]) > 0) g[1, 8] else NA_character_ - time <- if ( ! is.na(g[1,6]) && nchar(g[1, 6]) > 0) as.integer(g[1, 6]) else ( if ( ! is.na(g[1, 10]) && nchar(g[1, 10]) > 0) as.integer(g[1, 10]) else NA_integer_ ) - - # Correct values - if ( ! is.na(stationary_phase) && stationary_phase == '150 5 2 1') stationary_phase <- '150*5*2.1' - if ( ! is.na(msdevice)) msdevice <- gsub(' ', '', msdevice) # remove spaces - - return(list( type = type, stationary_phase = stationary_phase, time = time, msdevice = msdevice)) - - } - - ######################### - # NORMALIZE COLUMN NAME # - ######################### - - .normalize_column_name <- function(desc) { - - lst <- .parse_chrom_col_desc(desc) - - v <- c(lst$type) - if ( ! is.na(lst$stationary_phase)) - v <- c(v, lst$stationary_phase) - if ( ! is.na(lst$time)) - v <- c(v, paste0(lst$time, "min")) - if ( ! is.na(lst$msdevice)) - v <- c(v, lst$msdevice) - - return(paste(v, collapse = '-')) - } - -} # end of load safe guard