Mercurial > repos > prog > lcmsmatching
comparison msdb-common.R @ 6:f86fec07f392 draft default tip
planemo upload commit c397cd8a93953798d733fd62653f7098caac30ce
| author | prog |
|---|---|
| date | Fri, 22 Feb 2019 16:04:22 -0500 |
| parents | fb9c0409d85c |
| children |
comparison
equal
deleted
inserted
replaced
| 5:fb9c0409d85c | 6:f86fec07f392 |
|---|---|
| 1 if ( ! exists('.parse_chrom_col_desc')) { # Do not load again if already loaded | |
| 2 | |
| 3 library('stringr') | |
| 4 source('strhlp.R', chdir = TRUE) | |
| 5 source('biodb-common.R', chdir = TRUE) | |
| 6 | |
| 7 ############# | |
| 8 # CONSTANTS # | |
| 9 ############# | |
| 10 | |
| 11 # Field tags | |
| 12 MSDB.TAG.MZ <- 'mz' | |
| 13 MSDB.TAG.MZEXP <- 'mzexp' | |
| 14 MSDB.TAG.MZTHEO <- 'mztheo' | |
| 15 MSDB.TAG.RT <- 'rt' | |
| 16 MSDB.TAG.MODE <- 'msmode' | |
| 17 MSDB.TAG.MOLID <- 'compoundid' | |
| 18 MSDB.TAG.COL <- 'chromcol' | |
| 19 MSDB.TAG.COLRT <- 'chromcolrt' | |
| 20 MSDB.TAG.ATTR <- 'peakattr' | |
| 21 MSDB.TAG.INT <- 'intensity' | |
| 22 MSDB.TAG.REL <- 'relative.intensity' | |
| 23 MSDB.TAG.COMP <- 'peakcomp' | |
| 24 MSDB.TAG.MOLNAMES <- 'fullnames' | |
| 25 MSDB.TAG.MOLCOMP <- 'compoundmass' | |
| 26 MSDB.TAG.MOLMASS <- 'compoundcomp' | |
| 27 MSDB.TAG.INCHI <- 'inchi' | |
| 28 MSDB.TAG.INCHIKEY <- 'inchikey' | |
| 29 MSDB.TAG.PUBCHEM <- 'pubchemcompid' | |
| 30 MSDB.TAG.CHEBI <- 'chebiid' | |
| 31 MSDB.TAG.HMDB <- 'hmdbid' | |
| 32 MSDB.TAG.KEGG <- 'keggid' | |
| 33 | |
| 34 # Mode tags | |
| 35 MSDB.TAG.POS <- 'neg' | |
| 36 MSDB.TAG.NEG <- 'pos' | |
| 37 | |
| 38 # Fields containing multiple values | |
| 39 MSDB.MULTIVAL.FIELDS <- c(MSDB.TAG.MOLNAMES) | |
| 40 MSDB.MULTIVAL.FIELD.SEP <- ';' | |
| 41 | |
| 42 # Authorized mz tolerance unit values | |
| 43 MSDB.MZTOLUNIT.PPM <- 'ppm' | |
| 44 MSDB.MZTOLUNIT.PLAIN <- 'plain' # same as mz: mass-to-charge ratio | |
| 45 MSDB.MZTOLUNIT.VALS <- c(MSDB.MZTOLUNIT.PPM, MSDB.MZTOLUNIT.PLAIN) | |
| 46 | |
| 47 # Authorized rt units | |
| 48 MSDB.RTUNIT.SEC <- 'sec' | |
| 49 MSDB.RTUNIT.MIN <- 'min' | |
| 50 MSDB.RTUNIT.VALS <- c(MSDB.RTUNIT.SEC ,MSDB.RTUNIT.MIN) | |
| 51 | |
| 52 # Default values | |
| 53 MSDB.DFT.PREC <- list() | |
| 54 MSDB.DFT.PREC[[MSDB.TAG.POS]] <- c("[(M+H)]+", "[M+H]+", "[(M+Na)]+", "[M+Na]+", "[(M+K)]+", "[M+K]+") | |
| 55 MSDB.DFT.PREC[[MSDB.TAG.NEG]] <- c("[(M-H)]-", "[M-H]-", "[(M+Cl)]-", "[M+Cl]-") | |
| 56 MSDB.DFT.OUTPUT.MULTIVAL.FIELD.SEP <- MSDB.MULTIVAL.FIELD.SEP | |
| 57 MSDB.DFT.MATCH.FIELDS <- list( molids = 'molid', molnames = 'molnames') | |
| 58 MSDB.DFT.MATCH.SEP <- ',' | |
| 59 MSDB.DFT.MODES <- list( pos = 'POS', neg = 'NEG') | |
| 60 MSDB.DFT.MZTOLUNIT <- MSDB.MZTOLUNIT.PPM | |
| 61 | |
| 62 ############################ | |
| 63 # GET DEFAULT INPUT FIELDS # | |
| 64 ############################ | |
| 65 | |
| 66 msdb.get.dft.input.fields <- function () { | |
| 67 | |
| 68 dft.fields <- list() | |
| 69 | |
| 70 for(f in c(MSDB.TAG.MZ, MSDB.TAG.RT)) | |
| 71 dft.fields[[f]] <- f | |
| 72 | |
| 73 return(dft.fields) | |
| 74 } | |
| 75 | |
| 76 ######################### | |
| 77 # GET DEFAULT DB FIELDS # | |
| 78 ######################### | |
| 79 | |
| 80 msdb.get.dft.db.fields <- function () { | |
| 81 | |
| 82 dft.fields <- list() | |
| 83 | |
| 84 for (f in c(MSDB.TAG.MZTHEO, MSDB.TAG.COLRT, MSDB.TAG.MOLID, MSDB.TAG.COL, MSDB.TAG.MODE, MSDB.TAG.ATTR, MSDB.TAG.COMP, MSDB.TAG.MOLNAMES, MSDB.TAG.MOLCOMP, MSDB.TAG.MOLMASS, MSDB.TAG.INCHI, MSDB.TAG.INCHIKEY, MSDB.TAG.PUBCHEM, MSDB.TAG.CHEBI, MSDB.TAG.HMDB, MSDB.TAG.KEGG)) | |
| 85 dft.fields[[f]] <- f | |
| 86 | |
| 87 return(dft.fields) | |
| 88 } | |
| 89 | |
| 90 ################## | |
| 91 # MAKE DB FIELDS # | |
| 92 ################## | |
| 93 | |
| 94 msdb.make.db.fields <- function(fields) { | |
| 95 | |
| 96 # Merge with default fields | |
| 97 dft.fields <- msdb.get.dft.db.fields() | |
| 98 absent <- ! names(dft.fields) %in% names(fields) | |
| 99 if (length(absent) > 0) | |
| 100 fields <- c(fields, dft.fields[absent]) | |
| 101 | |
| 102 return(fields) | |
| 103 } | |
| 104 | |
| 105 ######################### | |
| 106 # MAKE INPUT DATA FRAME # | |
| 107 ######################### | |
| 108 | |
| 109 msdb.make.input.df <- function(mz, rt = NULL, rtunit = MSDB.RTUNIT.SEC) { | |
| 110 | |
| 111 field <- msdb.get.dft.input.fields() | |
| 112 | |
| 113 x <- data.frame() | |
| 114 | |
| 115 # Set mz | |
| 116 if (length(mz) > 1) | |
| 117 x[seq(mz), field[[MSDB.TAG.MZ]]] <- mz | |
| 118 else if (length(mz) == 1) | |
| 119 x[1, field[[MSDB.TAG.MZ]]] <- mz | |
| 120 else | |
| 121 x[, field[[MSDB.TAG.MZ]]] <- numeric() | |
| 122 | |
| 123 # Set rt | |
| 124 if ( ! is.null(rt)) { | |
| 125 if (rtunit == MSDB.RTUNIT.MIN) | |
| 126 rtunit <- rtunit * 60 | |
| 127 if (length(rt) > 1) | |
| 128 x[seq(rt), field[[MSDB.TAG.RT]]] <- rt | |
| 129 else if (length(rt) == 1) | |
| 130 x[1, field[[MSDB.TAG.RT]]] <- rt | |
| 131 else | |
| 132 x[, field[[MSDB.TAG.RT]]] <- numeric() | |
| 133 } | |
| 134 | |
| 135 return(x) | |
| 136 } | |
| 137 | |
| 138 ############################### | |
| 139 # GET EMPTY RESULT DATA FRAME # | |
| 140 ############################### | |
| 141 | |
| 142 .get.empty.result.df <- function(rt = FALSE) { | |
| 143 | |
| 144 df <- data.frame(stringsAsFactors = FALSE) | |
| 145 df[MSDB.TAG.MOLID] <- character() | |
| 146 df[MSDB.TAG.MOLNAMES] <- character() | |
| 147 df[MSDB.TAG.MZ] <- numeric() | |
| 148 df[MSDB.TAG.MZTHEO] <- numeric() | |
| 149 df[MSDB.TAG.ATTR] <- character() | |
| 150 df[MSDB.TAG.COMP] <- character() | |
| 151 if (rt) { | |
| 152 df[MSDB.TAG.RT] <- numeric() | |
| 153 df[MSDB.TAG.COL] <- character() | |
| 154 df[MSDB.TAG.COLRT] <- numeric() | |
| 155 } | |
| 156 | |
| 157 return(df) | |
| 158 } | |
| 159 | |
| 160 ############################ | |
| 161 # PARSE COLUMN DESCRIPTION # | |
| 162 ############################ | |
| 163 | |
| 164 .parse_chrom_col_desc <- function(desc) { | |
| 165 | |
| 166 # Clean string | |
| 167 s <- desc | |
| 168 s <- gsub('\\.+', ' ', s, perl = TRUE) # Replace '.' characters by spaces | |
| 169 s <- gsub('[*-]', ' ', s, perl = TRUE) # Replace dashes and asterisks by spaces | |
| 170 s <- gsub('[)(]', '', s, perl = TRUE) # Remove paranthesis | |
| 171 s <- trim(s) | |
| 172 s <- tolower(s) # put in lowercase | |
| 173 | |
| 174 # Match 2 3 4 5 6 7 8 9 10 1112 13 | |
| 175 pattern <- "^(uplc|hsf5|hplc|zicphilic)( (c8|c18|150 5 2 1))?( (\\d+)mn)?( (orbitrap|exactive|qtof|shimadzu exactive))?( (\\d+)mn)?( (bis|ter))?( 1)?$" | |
| 176 g <- str_match(s, pattern) | |
| 177 if (is.na(g[1, 1])) | |
| 178 stop(paste0("Impossible to parse column description \"", desc, "\".")) | |
| 179 | |
| 180 type <- g[1, 2] | |
| 181 stationary_phase <- if ( ! is.na(g[1, 4]) && nchar(g[1, 4]) > 0) g[1, 4] else NA_character_ | |
| 182 msdevice <- if ( ! is.na(g[1, 8]) && nchar(g[1, 8]) > 0) g[1, 8] else NA_character_ | |
| 183 time <- if ( ! is.na(g[1,6]) && nchar(g[1, 6]) > 0) as.integer(g[1, 6]) else ( if ( ! is.na(g[1, 10]) && nchar(g[1, 10]) > 0) as.integer(g[1, 10]) else NA_integer_ ) | |
| 184 | |
| 185 # Correct values | |
| 186 if ( ! is.na(stationary_phase) && stationary_phase == '150 5 2 1') stationary_phase <- '150*5*2.1' | |
| 187 if ( ! is.na(msdevice)) msdevice <- gsub(' ', '', msdevice) # remove spaces | |
| 188 | |
| 189 return(list( type = type, stationary_phase = stationary_phase, time = time, msdevice = msdevice)) | |
| 190 | |
| 191 } | |
| 192 | |
| 193 ######################### | |
| 194 # NORMALIZE COLUMN NAME # | |
| 195 ######################### | |
| 196 | |
| 197 .normalize_column_name <- function(desc) { | |
| 198 | |
| 199 lst <- .parse_chrom_col_desc(desc) | |
| 200 | |
| 201 v <- c(lst$type) | |
| 202 if ( ! is.na(lst$stationary_phase)) | |
| 203 v <- c(v, lst$stationary_phase) | |
| 204 if ( ! is.na(lst$time)) | |
| 205 v <- c(v, paste0(lst$time, "min")) | |
| 206 if ( ! is.na(lst$msdevice)) | |
| 207 v <- c(v, lst$msdevice) | |
| 208 | |
| 209 return(paste(v, collapse = '-')) | |
| 210 } | |
| 211 | |
| 212 } # end of load safe guard |
