Mercurial > repos > prog > lcmsmatching
comparison msdb-common.R @ 6:f86fec07f392 draft default tip
planemo upload commit c397cd8a93953798d733fd62653f7098caac30ce
author | prog |
---|---|
date | Fri, 22 Feb 2019 16:04:22 -0500 |
parents | fb9c0409d85c |
children |
comparison
equal
deleted
inserted
replaced
5:fb9c0409d85c | 6:f86fec07f392 |
---|---|
1 if ( ! exists('.parse_chrom_col_desc')) { # Do not load again if already loaded | |
2 | |
3 library('stringr') | |
4 source('strhlp.R', chdir = TRUE) | |
5 source('biodb-common.R', chdir = TRUE) | |
6 | |
7 ############# | |
8 # CONSTANTS # | |
9 ############# | |
10 | |
11 # Field tags | |
12 MSDB.TAG.MZ <- 'mz' | |
13 MSDB.TAG.MZEXP <- 'mzexp' | |
14 MSDB.TAG.MZTHEO <- 'mztheo' | |
15 MSDB.TAG.RT <- 'rt' | |
16 MSDB.TAG.MODE <- 'msmode' | |
17 MSDB.TAG.MOLID <- 'compoundid' | |
18 MSDB.TAG.COL <- 'chromcol' | |
19 MSDB.TAG.COLRT <- 'chromcolrt' | |
20 MSDB.TAG.ATTR <- 'peakattr' | |
21 MSDB.TAG.INT <- 'intensity' | |
22 MSDB.TAG.REL <- 'relative.intensity' | |
23 MSDB.TAG.COMP <- 'peakcomp' | |
24 MSDB.TAG.MOLNAMES <- 'fullnames' | |
25 MSDB.TAG.MOLCOMP <- 'compoundmass' | |
26 MSDB.TAG.MOLMASS <- 'compoundcomp' | |
27 MSDB.TAG.INCHI <- 'inchi' | |
28 MSDB.TAG.INCHIKEY <- 'inchikey' | |
29 MSDB.TAG.PUBCHEM <- 'pubchemcompid' | |
30 MSDB.TAG.CHEBI <- 'chebiid' | |
31 MSDB.TAG.HMDB <- 'hmdbid' | |
32 MSDB.TAG.KEGG <- 'keggid' | |
33 | |
34 # Mode tags | |
35 MSDB.TAG.POS <- 'neg' | |
36 MSDB.TAG.NEG <- 'pos' | |
37 | |
38 # Fields containing multiple values | |
39 MSDB.MULTIVAL.FIELDS <- c(MSDB.TAG.MOLNAMES) | |
40 MSDB.MULTIVAL.FIELD.SEP <- ';' | |
41 | |
42 # Authorized mz tolerance unit values | |
43 MSDB.MZTOLUNIT.PPM <- 'ppm' | |
44 MSDB.MZTOLUNIT.PLAIN <- 'plain' # same as mz: mass-to-charge ratio | |
45 MSDB.MZTOLUNIT.VALS <- c(MSDB.MZTOLUNIT.PPM, MSDB.MZTOLUNIT.PLAIN) | |
46 | |
47 # Authorized rt units | |
48 MSDB.RTUNIT.SEC <- 'sec' | |
49 MSDB.RTUNIT.MIN <- 'min' | |
50 MSDB.RTUNIT.VALS <- c(MSDB.RTUNIT.SEC ,MSDB.RTUNIT.MIN) | |
51 | |
52 # Default values | |
53 MSDB.DFT.PREC <- list() | |
54 MSDB.DFT.PREC[[MSDB.TAG.POS]] <- c("[(M+H)]+", "[M+H]+", "[(M+Na)]+", "[M+Na]+", "[(M+K)]+", "[M+K]+") | |
55 MSDB.DFT.PREC[[MSDB.TAG.NEG]] <- c("[(M-H)]-", "[M-H]-", "[(M+Cl)]-", "[M+Cl]-") | |
56 MSDB.DFT.OUTPUT.MULTIVAL.FIELD.SEP <- MSDB.MULTIVAL.FIELD.SEP | |
57 MSDB.DFT.MATCH.FIELDS <- list( molids = 'molid', molnames = 'molnames') | |
58 MSDB.DFT.MATCH.SEP <- ',' | |
59 MSDB.DFT.MODES <- list( pos = 'POS', neg = 'NEG') | |
60 MSDB.DFT.MZTOLUNIT <- MSDB.MZTOLUNIT.PPM | |
61 | |
62 ############################ | |
63 # GET DEFAULT INPUT FIELDS # | |
64 ############################ | |
65 | |
66 msdb.get.dft.input.fields <- function () { | |
67 | |
68 dft.fields <- list() | |
69 | |
70 for(f in c(MSDB.TAG.MZ, MSDB.TAG.RT)) | |
71 dft.fields[[f]] <- f | |
72 | |
73 return(dft.fields) | |
74 } | |
75 | |
76 ######################### | |
77 # GET DEFAULT DB FIELDS # | |
78 ######################### | |
79 | |
80 msdb.get.dft.db.fields <- function () { | |
81 | |
82 dft.fields <- list() | |
83 | |
84 for (f in c(MSDB.TAG.MZTHEO, MSDB.TAG.COLRT, MSDB.TAG.MOLID, MSDB.TAG.COL, MSDB.TAG.MODE, MSDB.TAG.ATTR, MSDB.TAG.COMP, MSDB.TAG.MOLNAMES, MSDB.TAG.MOLCOMP, MSDB.TAG.MOLMASS, MSDB.TAG.INCHI, MSDB.TAG.INCHIKEY, MSDB.TAG.PUBCHEM, MSDB.TAG.CHEBI, MSDB.TAG.HMDB, MSDB.TAG.KEGG)) | |
85 dft.fields[[f]] <- f | |
86 | |
87 return(dft.fields) | |
88 } | |
89 | |
90 ################## | |
91 # MAKE DB FIELDS # | |
92 ################## | |
93 | |
94 msdb.make.db.fields <- function(fields) { | |
95 | |
96 # Merge with default fields | |
97 dft.fields <- msdb.get.dft.db.fields() | |
98 absent <- ! names(dft.fields) %in% names(fields) | |
99 if (length(absent) > 0) | |
100 fields <- c(fields, dft.fields[absent]) | |
101 | |
102 return(fields) | |
103 } | |
104 | |
105 ######################### | |
106 # MAKE INPUT DATA FRAME # | |
107 ######################### | |
108 | |
109 msdb.make.input.df <- function(mz, rt = NULL, rtunit = MSDB.RTUNIT.SEC) { | |
110 | |
111 field <- msdb.get.dft.input.fields() | |
112 | |
113 x <- data.frame() | |
114 | |
115 # Set mz | |
116 if (length(mz) > 1) | |
117 x[seq(mz), field[[MSDB.TAG.MZ]]] <- mz | |
118 else if (length(mz) == 1) | |
119 x[1, field[[MSDB.TAG.MZ]]] <- mz | |
120 else | |
121 x[, field[[MSDB.TAG.MZ]]] <- numeric() | |
122 | |
123 # Set rt | |
124 if ( ! is.null(rt)) { | |
125 if (rtunit == MSDB.RTUNIT.MIN) | |
126 rtunit <- rtunit * 60 | |
127 if (length(rt) > 1) | |
128 x[seq(rt), field[[MSDB.TAG.RT]]] <- rt | |
129 else if (length(rt) == 1) | |
130 x[1, field[[MSDB.TAG.RT]]] <- rt | |
131 else | |
132 x[, field[[MSDB.TAG.RT]]] <- numeric() | |
133 } | |
134 | |
135 return(x) | |
136 } | |
137 | |
138 ############################### | |
139 # GET EMPTY RESULT DATA FRAME # | |
140 ############################### | |
141 | |
142 .get.empty.result.df <- function(rt = FALSE) { | |
143 | |
144 df <- data.frame(stringsAsFactors = FALSE) | |
145 df[MSDB.TAG.MOLID] <- character() | |
146 df[MSDB.TAG.MOLNAMES] <- character() | |
147 df[MSDB.TAG.MZ] <- numeric() | |
148 df[MSDB.TAG.MZTHEO] <- numeric() | |
149 df[MSDB.TAG.ATTR] <- character() | |
150 df[MSDB.TAG.COMP] <- character() | |
151 if (rt) { | |
152 df[MSDB.TAG.RT] <- numeric() | |
153 df[MSDB.TAG.COL] <- character() | |
154 df[MSDB.TAG.COLRT] <- numeric() | |
155 } | |
156 | |
157 return(df) | |
158 } | |
159 | |
160 ############################ | |
161 # PARSE COLUMN DESCRIPTION # | |
162 ############################ | |
163 | |
164 .parse_chrom_col_desc <- function(desc) { | |
165 | |
166 # Clean string | |
167 s <- desc | |
168 s <- gsub('\\.+', ' ', s, perl = TRUE) # Replace '.' characters by spaces | |
169 s <- gsub('[*-]', ' ', s, perl = TRUE) # Replace dashes and asterisks by spaces | |
170 s <- gsub('[)(]', '', s, perl = TRUE) # Remove paranthesis | |
171 s <- trim(s) | |
172 s <- tolower(s) # put in lowercase | |
173 | |
174 # Match 2 3 4 5 6 7 8 9 10 1112 13 | |
175 pattern <- "^(uplc|hsf5|hplc|zicphilic)( (c8|c18|150 5 2 1))?( (\\d+)mn)?( (orbitrap|exactive|qtof|shimadzu exactive))?( (\\d+)mn)?( (bis|ter))?( 1)?$" | |
176 g <- str_match(s, pattern) | |
177 if (is.na(g[1, 1])) | |
178 stop(paste0("Impossible to parse column description \"", desc, "\".")) | |
179 | |
180 type <- g[1, 2] | |
181 stationary_phase <- if ( ! is.na(g[1, 4]) && nchar(g[1, 4]) > 0) g[1, 4] else NA_character_ | |
182 msdevice <- if ( ! is.na(g[1, 8]) && nchar(g[1, 8]) > 0) g[1, 8] else NA_character_ | |
183 time <- if ( ! is.na(g[1,6]) && nchar(g[1, 6]) > 0) as.integer(g[1, 6]) else ( if ( ! is.na(g[1, 10]) && nchar(g[1, 10]) > 0) as.integer(g[1, 10]) else NA_integer_ ) | |
184 | |
185 # Correct values | |
186 if ( ! is.na(stationary_phase) && stationary_phase == '150 5 2 1') stationary_phase <- '150*5*2.1' | |
187 if ( ! is.na(msdevice)) msdevice <- gsub(' ', '', msdevice) # remove spaces | |
188 | |
189 return(list( type = type, stationary_phase = stationary_phase, time = time, msdevice = msdevice)) | |
190 | |
191 } | |
192 | |
193 ######################### | |
194 # NORMALIZE COLUMN NAME # | |
195 ######################### | |
196 | |
197 .normalize_column_name <- function(desc) { | |
198 | |
199 lst <- .parse_chrom_col_desc(desc) | |
200 | |
201 v <- c(lst$type) | |
202 if ( ! is.na(lst$stationary_phase)) | |
203 v <- c(v, lst$stationary_phase) | |
204 if ( ! is.na(lst$time)) | |
205 v <- c(v, paste0(lst$time, "min")) | |
206 if ( ! is.na(lst$msdevice)) | |
207 v <- c(v, lst$msdevice) | |
208 | |
209 return(paste(v, collapse = '-')) | |
210 } | |
211 | |
212 } # end of load safe guard |