Mercurial > repos > prog > lcmsmatching
comparison msdb-common.R @ 0:e66bb061af06 draft
planemo upload for repository https://github.com/workflow4metabolomics/lcmsmatching.git commit 3529b25417f8e1a5836474c9adec4b696d35099d-dirty
author | prog |
---|---|
date | Tue, 12 Jul 2016 12:02:37 -0400 |
parents | |
children | 20d69a062da3 |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:e66bb061af06 |
---|---|
1 if ( ! exists('.parse_chrom_col_desc')) { # Do not load again if already loaded | |
2 | |
3 library('stringr') | |
4 source('strhlp.R', chdir = TRUE) | |
5 | |
6 ############# | |
7 # CONSTANTS # | |
8 ############# | |
9 | |
10 # Field tags | |
11 MSDB.TAG.MZ <- 'mz' | |
12 MSDB.TAG.MZEXP <- 'mzexp' | |
13 MSDB.TAG.MZTHEO <- 'mztheo' | |
14 MSDB.TAG.RT <- 'rt' | |
15 MSDB.TAG.MODE <- 'mode' | |
16 MSDB.TAG.MOLID <- 'molid' | |
17 MSDB.TAG.COL <- 'col' | |
18 MSDB.TAG.COLRT <- 'colrt' | |
19 MSDB.TAG.ATTR <- 'attr' | |
20 MSDB.TAG.INT <- 'int' # Absolute intensity | |
21 MSDB.TAG.REL <- 'rel' # Relative intensity | |
22 MSDB.TAG.COMP <- 'comp' | |
23 MSDB.TAG.MOLNAMES <- 'molnames' | |
24 MSDB.TAG.MOLCOMP <- 'molcomp' | |
25 MSDB.TAG.MOLATTR <- 'molattr' | |
26 MSDB.TAG.MOLMASS <- 'molmass' | |
27 MSDB.TAG.INCHI <- 'inchi' | |
28 MSDB.TAG.INCHIKEY <- 'inchikey' | |
29 MSDB.TAG.PUBCHEM <- 'pubchem' | |
30 MSDB.TAG.CHEBI <- 'chebi' | |
31 MSDB.TAG.HMDB <- 'hmdb' | |
32 MSDB.TAG.KEGG <- 'kegg' | |
33 | |
34 # Mode tags | |
35 MSDB.TAG.POS <- 'ms.pos' | |
36 MSDB.TAG.NEG <- 'ms.neg' | |
37 | |
38 # Fields containing multiple values | |
39 MSDB.MULTIVAL.FIELDS <- c(MSDB.TAG.MOLNAMES) | |
40 MSDB.MULTIVAL.FIELD.SEP <- ';' | |
41 | |
42 # Authorized mz tolerance unit values | |
43 MSDB.MZTOLUNIT.PPM <- 'ppm' | |
44 MSDB.MZTOLUNIT.PLAIN <- 'plain' # same as mz: mass-to-charge ratio | |
45 MSDB.MZTOLUNIT.VALS <- c(MSDB.MZTOLUNIT.PPM, MSDB.MZTOLUNIT.PLAIN) | |
46 | |
47 # Default values | |
48 MSDB.DFT.PREC <- list() | |
49 MSDB.DFT.PREC[[MSDB.TAG.POS]] <- c("[(M+H)]+", "[M+H]+", "[(M+Na)]+", "[M+Na]+", "[(M+K)]+", "[M+K]+") | |
50 MSDB.DFT.PREC[[MSDB.TAG.NEG]] <- c("[(M-H)]-", "[M-H]-", "[(M+Cl)]-", "[M+Cl]-") | |
51 MSDB.DFT.OUTPUT.FIELDS <- list( mz = 'mz', rt = 'rt', col = 'col', colrt = 'colrt', molid = 'id', attr = 'attribution', comp = 'composition', int = 'intensity', rel = 'relative', mzexp = 'mzexp', mztheo = 'mztheo', msmatching = 'msmatching', molnames = 'molnames', molcomp = 'molcomp', molmass = 'molmass', inchi = 'inchi', inchikey = 'inchikey', pubchem = 'pubchem', chebi = 'chebi', hmdb = 'hmdb', kegg = 'kegg') | |
52 MSDB.DFT.OUTPUT.MULTIVAL.FIELD.SEP <- MSDB.MULTIVAL.FIELD.SEP | |
53 MSDB.DFT.MATCH.FIELDS <- list( molids = 'molid', molnames = 'molnames') | |
54 MSDB.DFT.MATCH.SEP <- ',' | |
55 MSDB.DFT.MODES <- list( pos = 'POS', neg = 'NEG') | |
56 MSDB.DFT.MZTOLUNIT <- MSDB.MZTOLUNIT.PPM | |
57 | |
58 ############################ | |
59 # GET DEFAULT INPUT FIELDS # | |
60 ############################ | |
61 | |
62 msdb.get.dft.input.fields <- function () { | |
63 | |
64 dft.fields <- list() | |
65 | |
66 for(f in c(MSDB.TAG.MZ, MSDB.TAG.RT)) | |
67 dft.fields[[f]] <- f | |
68 | |
69 return(dft.fields) | |
70 } | |
71 | |
72 ############################# | |
73 # GET DEFAULT OUTPUT FIELDS # | |
74 ############################# | |
75 | |
76 msdb.get.dft.output.fields <- function () { | |
77 | |
78 dft.fields <- list() | |
79 | |
80 for(f in c(MSDB.TAG.MZ, MSDB.TAG.RT, MSDB.TAG.COL, MSDB.TAG.COLRT, MSDB.TAG.MOLID, MSDB.TAG.ATTR, MSDB.TAG.COMP, MSDB.TAG.INT, MSDB.TAG.REL, MSDB.TAG.MZEXP, MSDB.TAG.MZTHEO, MSDB.TAG.MOLNAMES, MSDB.TAG.MOLCOMP, MSDB.TAG.MOLMASS, MSDB.TAG.INCHI, MSDB.TAG.INCHIKEY, MSDB.TAG.PUBCHEM, MSDB.TAG.CHEBI, MSDB.TAG.HMDB, MSDB.TAG.KEGG)) | |
81 dft.fields[[f]] <- f | |
82 | |
83 return(dft.fields) | |
84 } | |
85 | |
86 ######################### | |
87 # GET DEFAULT DB FIELDS # | |
88 ######################### | |
89 | |
90 msdb.get.dft.db.fields <- function () { | |
91 | |
92 dft.fields <- list() | |
93 | |
94 for (f in c(MSDB.TAG.MZTHEO, MSDB.TAG.COLRT, MSDB.TAG.MOLID, MSDB.TAG.COL, MSDB.TAG.MODE, MSDB.TAG.ATTR, MSDB.TAG.COMP, MSDB.TAG.MOLNAMES, MSDB.TAG.MOLCOMP, MSDB.TAG.MOLMASS, MSDB.TAG.INCHI, MSDB.TAG.INCHIKEY, MSDB.TAG.PUBCHEM, MSDB.TAG.CHEBI, MSDB.TAG.HMDB, MSDB.TAG.KEGG)) | |
95 dft.fields[[f]] <- f | |
96 | |
97 return(dft.fields) | |
98 } | |
99 | |
100 ################## | |
101 # MAKE DB FIELDS # | |
102 ################## | |
103 | |
104 msdb.make.db.fields <- function(fields) { | |
105 | |
106 # Merge with default fields | |
107 dft.fields <- msdb.get.dft.db.fields() | |
108 absent <- ! names(dft.fields) %in% names(fields) | |
109 if (length(absent) > 0) | |
110 fields <- c(fields, dft.fields[absent]) | |
111 | |
112 return(fields) | |
113 } | |
114 | |
115 ######################### | |
116 # MAKE INPUT DATA FRAME # | |
117 ######################### | |
118 | |
119 msdb.make.input.df <- function(mz, rt = NULL) { | |
120 | |
121 field <- msdb.get.dft.input.fields() | |
122 | |
123 x <- data.frame() | |
124 | |
125 # Set mz | |
126 if (length(mz) > 1) | |
127 x[seq(mz), field[[MSDB.TAG.MZ]]] <- mz | |
128 else if (length(mz) == 1) | |
129 x[1, field[[MSDB.TAG.MZ]]] <- mz | |
130 else | |
131 x[, field[[MSDB.TAG.MZ]]] <- numeric() | |
132 | |
133 # Set rt | |
134 if ( ! is.null(rt)) { | |
135 if (length(rt) > 1) | |
136 x[seq(rt), field[[MSDB.TAG.RT]]] <- rt | |
137 else if (length(rt) == 1) | |
138 x[1, field[[MSDB.TAG.RT]]] <- rt | |
139 else | |
140 x[, field[[MSDB.TAG.RT]]] <- numeric() | |
141 } | |
142 | |
143 return(x) | |
144 } | |
145 | |
146 ############################### | |
147 # GET EMPTY RESULT DATA FRAME # | |
148 ############################### | |
149 | |
150 .get.empty.result.df <- function(rt = FALSE) { | |
151 | |
152 df <- data.frame(stringsAsFactors = FALSE) | |
153 df[MSDB.TAG.MOLID] <- character() | |
154 df[MSDB.TAG.MOLNAMES] <- character() | |
155 df[MSDB.TAG.MZ] <- numeric() | |
156 df[MSDB.TAG.MZTHEO] <- numeric() | |
157 df[MSDB.TAG.ATTR] <- character() | |
158 df[MSDB.TAG.COMP] <- character() | |
159 if (rt) { | |
160 df[MSDB.TAG.RT] <- numeric() | |
161 df[MSDB.TAG.COL] <- character() | |
162 df[MSDB.TAG.COLRT] <- numeric() | |
163 } | |
164 | |
165 return(df) | |
166 } | |
167 | |
168 ############################ | |
169 # PARSE COLUMN DESCRIPTION # | |
170 ############################ | |
171 | |
172 .parse_chrom_col_desc <- function(desc) { | |
173 | |
174 # Clean string | |
175 s <- desc | |
176 s <- gsub('\\.+', ' ', s, perl = TRUE) # Replace '.' characters by spaces | |
177 s <- gsub('[*-]', ' ', s, perl = TRUE) # Replace dashes and asterisks by spaces | |
178 s <- gsub('[)(]', '', s, perl = TRUE) # Remove paranthesis | |
179 s <- trim(s) | |
180 s <- tolower(s) # put in lowercase | |
181 | |
182 # Match 2 3 4 5 6 7 8 9 10 1112 13 | |
183 pattern <- "^(uplc|hsf5|hplc|zicphilic)( (c8|c18|150 5 2 1))?( (\\d+)mn)?( (orbitrap|exactive|qtof|shimadzu exactive))?( (\\d+)mn)?( (bis|ter))?( 1)?$" | |
184 g <- str_match(s, pattern) | |
185 if (is.na(g[1, 1])) | |
186 stop(paste0("Impossible to parse column description \"", desc, "\".")) | |
187 | |
188 type <- g[1, 2] | |
189 stationary_phase <- if ( ! is.na(g[1, 4]) && nchar(g[1, 4]) > 0) g[1, 4] else NA_character_ | |
190 msdevice <- if ( ! is.na(g[1, 8]) && nchar(g[1, 8]) > 0) g[1, 8] else NA_character_ | |
191 time <- if ( ! is.na(g[1,6]) && nchar(g[1, 6]) > 0) as.integer(g[1, 6]) else ( if ( ! is.na(g[1, 10]) && nchar(g[1, 10]) > 0) as.integer(g[1, 10]) else NA_integer_ ) | |
192 | |
193 # Correct values | |
194 if ( ! is.na(stationary_phase) && stationary_phase == '150 5 2 1') stationary_phase <- '150*5*2.1' | |
195 if ( ! is.na(msdevice)) msdevice <- gsub(' ', '', msdevice) # remove spaces | |
196 | |
197 return(list( type = type, stationary_phase = stationary_phase, time = time, msdevice = msdevice)) | |
198 | |
199 } | |
200 | |
201 ######################### | |
202 # NORMALIZE COLUMN NAME # | |
203 ######################### | |
204 | |
205 .normalize_column_name <- function(desc) { | |
206 | |
207 lst <- .parse_chrom_col_desc(desc) | |
208 | |
209 v <- c(lst$type) | |
210 if ( ! is.na(lst$stationary_phase)) | |
211 v <- c(v, lst$stationary_phase) | |
212 if ( ! is.na(lst$time)) | |
213 v <- c(v, paste0(lst$time, "min")) | |
214 if ( ! is.na(lst$msdevice)) | |
215 v <- c(v, lst$msdevice) | |
216 | |
217 return(paste(v, collapse = '-')) | |
218 } | |
219 | |
220 } # end of load safe guard |