comparison msdb-common.R @ 0:e66bb061af06 draft

planemo upload for repository https://github.com/workflow4metabolomics/lcmsmatching.git commit 3529b25417f8e1a5836474c9adec4b696d35099d-dirty
author prog
date Tue, 12 Jul 2016 12:02:37 -0400
parents
children 20d69a062da3
comparison
equal deleted inserted replaced
-1:000000000000 0:e66bb061af06
1 if ( ! exists('.parse_chrom_col_desc')) { # Do not load again if already loaded
2
3 library('stringr')
4 source('strhlp.R', chdir = TRUE)
5
6 #############
7 # CONSTANTS #
8 #############
9
10 # Field tags
11 MSDB.TAG.MZ <- 'mz'
12 MSDB.TAG.MZEXP <- 'mzexp'
13 MSDB.TAG.MZTHEO <- 'mztheo'
14 MSDB.TAG.RT <- 'rt'
15 MSDB.TAG.MODE <- 'mode'
16 MSDB.TAG.MOLID <- 'molid'
17 MSDB.TAG.COL <- 'col'
18 MSDB.TAG.COLRT <- 'colrt'
19 MSDB.TAG.ATTR <- 'attr'
20 MSDB.TAG.INT <- 'int' # Absolute intensity
21 MSDB.TAG.REL <- 'rel' # Relative intensity
22 MSDB.TAG.COMP <- 'comp'
23 MSDB.TAG.MOLNAMES <- 'molnames'
24 MSDB.TAG.MOLCOMP <- 'molcomp'
25 MSDB.TAG.MOLATTR <- 'molattr'
26 MSDB.TAG.MOLMASS <- 'molmass'
27 MSDB.TAG.INCHI <- 'inchi'
28 MSDB.TAG.INCHIKEY <- 'inchikey'
29 MSDB.TAG.PUBCHEM <- 'pubchem'
30 MSDB.TAG.CHEBI <- 'chebi'
31 MSDB.TAG.HMDB <- 'hmdb'
32 MSDB.TAG.KEGG <- 'kegg'
33
34 # Mode tags
35 MSDB.TAG.POS <- 'ms.pos'
36 MSDB.TAG.NEG <- 'ms.neg'
37
38 # Fields containing multiple values
39 MSDB.MULTIVAL.FIELDS <- c(MSDB.TAG.MOLNAMES)
40 MSDB.MULTIVAL.FIELD.SEP <- ';'
41
42 # Authorized mz tolerance unit values
43 MSDB.MZTOLUNIT.PPM <- 'ppm'
44 MSDB.MZTOLUNIT.PLAIN <- 'plain' # same as mz: mass-to-charge ratio
45 MSDB.MZTOLUNIT.VALS <- c(MSDB.MZTOLUNIT.PPM, MSDB.MZTOLUNIT.PLAIN)
46
47 # Default values
48 MSDB.DFT.PREC <- list()
49 MSDB.DFT.PREC[[MSDB.TAG.POS]] <- c("[(M+H)]+", "[M+H]+", "[(M+Na)]+", "[M+Na]+", "[(M+K)]+", "[M+K]+")
50 MSDB.DFT.PREC[[MSDB.TAG.NEG]] <- c("[(M-H)]-", "[M-H]-", "[(M+Cl)]-", "[M+Cl]-")
51 MSDB.DFT.OUTPUT.FIELDS <- list( mz = 'mz', rt = 'rt', col = 'col', colrt = 'colrt', molid = 'id', attr = 'attribution', comp = 'composition', int = 'intensity', rel = 'relative', mzexp = 'mzexp', mztheo = 'mztheo', msmatching = 'msmatching', molnames = 'molnames', molcomp = 'molcomp', molmass = 'molmass', inchi = 'inchi', inchikey = 'inchikey', pubchem = 'pubchem', chebi = 'chebi', hmdb = 'hmdb', kegg = 'kegg')
52 MSDB.DFT.OUTPUT.MULTIVAL.FIELD.SEP <- MSDB.MULTIVAL.FIELD.SEP
53 MSDB.DFT.MATCH.FIELDS <- list( molids = 'molid', molnames = 'molnames')
54 MSDB.DFT.MATCH.SEP <- ','
55 MSDB.DFT.MODES <- list( pos = 'POS', neg = 'NEG')
56 MSDB.DFT.MZTOLUNIT <- MSDB.MZTOLUNIT.PPM
57
58 ############################
59 # GET DEFAULT INPUT FIELDS #
60 ############################
61
62 msdb.get.dft.input.fields <- function () {
63
64 dft.fields <- list()
65
66 for(f in c(MSDB.TAG.MZ, MSDB.TAG.RT))
67 dft.fields[[f]] <- f
68
69 return(dft.fields)
70 }
71
72 #############################
73 # GET DEFAULT OUTPUT FIELDS #
74 #############################
75
76 msdb.get.dft.output.fields <- function () {
77
78 dft.fields <- list()
79
80 for(f in c(MSDB.TAG.MZ, MSDB.TAG.RT, MSDB.TAG.COL, MSDB.TAG.COLRT, MSDB.TAG.MOLID, MSDB.TAG.ATTR, MSDB.TAG.COMP, MSDB.TAG.INT, MSDB.TAG.REL, MSDB.TAG.MZEXP, MSDB.TAG.MZTHEO, MSDB.TAG.MOLNAMES, MSDB.TAG.MOLCOMP, MSDB.TAG.MOLMASS, MSDB.TAG.INCHI, MSDB.TAG.INCHIKEY, MSDB.TAG.PUBCHEM, MSDB.TAG.CHEBI, MSDB.TAG.HMDB, MSDB.TAG.KEGG))
81 dft.fields[[f]] <- f
82
83 return(dft.fields)
84 }
85
86 #########################
87 # GET DEFAULT DB FIELDS #
88 #########################
89
90 msdb.get.dft.db.fields <- function () {
91
92 dft.fields <- list()
93
94 for (f in c(MSDB.TAG.MZTHEO, MSDB.TAG.COLRT, MSDB.TAG.MOLID, MSDB.TAG.COL, MSDB.TAG.MODE, MSDB.TAG.ATTR, MSDB.TAG.COMP, MSDB.TAG.MOLNAMES, MSDB.TAG.MOLCOMP, MSDB.TAG.MOLMASS, MSDB.TAG.INCHI, MSDB.TAG.INCHIKEY, MSDB.TAG.PUBCHEM, MSDB.TAG.CHEBI, MSDB.TAG.HMDB, MSDB.TAG.KEGG))
95 dft.fields[[f]] <- f
96
97 return(dft.fields)
98 }
99
100 ##################
101 # MAKE DB FIELDS #
102 ##################
103
104 msdb.make.db.fields <- function(fields) {
105
106 # Merge with default fields
107 dft.fields <- msdb.get.dft.db.fields()
108 absent <- ! names(dft.fields) %in% names(fields)
109 if (length(absent) > 0)
110 fields <- c(fields, dft.fields[absent])
111
112 return(fields)
113 }
114
115 #########################
116 # MAKE INPUT DATA FRAME #
117 #########################
118
119 msdb.make.input.df <- function(mz, rt = NULL) {
120
121 field <- msdb.get.dft.input.fields()
122
123 x <- data.frame()
124
125 # Set mz
126 if (length(mz) > 1)
127 x[seq(mz), field[[MSDB.TAG.MZ]]] <- mz
128 else if (length(mz) == 1)
129 x[1, field[[MSDB.TAG.MZ]]] <- mz
130 else
131 x[, field[[MSDB.TAG.MZ]]] <- numeric()
132
133 # Set rt
134 if ( ! is.null(rt)) {
135 if (length(rt) > 1)
136 x[seq(rt), field[[MSDB.TAG.RT]]] <- rt
137 else if (length(rt) == 1)
138 x[1, field[[MSDB.TAG.RT]]] <- rt
139 else
140 x[, field[[MSDB.TAG.RT]]] <- numeric()
141 }
142
143 return(x)
144 }
145
146 ###############################
147 # GET EMPTY RESULT DATA FRAME #
148 ###############################
149
150 .get.empty.result.df <- function(rt = FALSE) {
151
152 df <- data.frame(stringsAsFactors = FALSE)
153 df[MSDB.TAG.MOLID] <- character()
154 df[MSDB.TAG.MOLNAMES] <- character()
155 df[MSDB.TAG.MZ] <- numeric()
156 df[MSDB.TAG.MZTHEO] <- numeric()
157 df[MSDB.TAG.ATTR] <- character()
158 df[MSDB.TAG.COMP] <- character()
159 if (rt) {
160 df[MSDB.TAG.RT] <- numeric()
161 df[MSDB.TAG.COL] <- character()
162 df[MSDB.TAG.COLRT] <- numeric()
163 }
164
165 return(df)
166 }
167
168 ############################
169 # PARSE COLUMN DESCRIPTION #
170 ############################
171
172 .parse_chrom_col_desc <- function(desc) {
173
174 # Clean string
175 s <- desc
176 s <- gsub('\\.+', ' ', s, perl = TRUE) # Replace '.' characters by spaces
177 s <- gsub('[*-]', ' ', s, perl = TRUE) # Replace dashes and asterisks by spaces
178 s <- gsub('[)(]', '', s, perl = TRUE) # Remove paranthesis
179 s <- trim(s)
180 s <- tolower(s) # put in lowercase
181
182 # Match 2 3 4 5 6 7 8 9 10 1112 13
183 pattern <- "^(uplc|hsf5|hplc|zicphilic)( (c8|c18|150 5 2 1))?( (\\d+)mn)?( (orbitrap|exactive|qtof|shimadzu exactive))?( (\\d+)mn)?( (bis|ter))?( 1)?$"
184 g <- str_match(s, pattern)
185 if (is.na(g[1, 1]))
186 stop(paste0("Impossible to parse column description \"", desc, "\"."))
187
188 type <- g[1, 2]
189 stationary_phase <- if ( ! is.na(g[1, 4]) && nchar(g[1, 4]) > 0) g[1, 4] else NA_character_
190 msdevice <- if ( ! is.na(g[1, 8]) && nchar(g[1, 8]) > 0) g[1, 8] else NA_character_
191 time <- if ( ! is.na(g[1,6]) && nchar(g[1, 6]) > 0) as.integer(g[1, 6]) else ( if ( ! is.na(g[1, 10]) && nchar(g[1, 10]) > 0) as.integer(g[1, 10]) else NA_integer_ )
192
193 # Correct values
194 if ( ! is.na(stationary_phase) && stationary_phase == '150 5 2 1') stationary_phase <- '150*5*2.1'
195 if ( ! is.na(msdevice)) msdevice <- gsub(' ', '', msdevice) # remove spaces
196
197 return(list( type = type, stationary_phase = stationary_phase, time = time, msdevice = msdevice))
198
199 }
200
201 #########################
202 # NORMALIZE COLUMN NAME #
203 #########################
204
205 .normalize_column_name <- function(desc) {
206
207 lst <- .parse_chrom_col_desc(desc)
208
209 v <- c(lst$type)
210 if ( ! is.na(lst$stationary_phase))
211 v <- c(v, lst$stationary_phase)
212 if ( ! is.na(lst$time))
213 v <- c(v, paste0(lst$time, "min"))
214 if ( ! is.na(lst$msdevice))
215 v <- c(v, lst$msdevice)
216
217 return(paste(v, collapse = '-'))
218 }
219
220 } # end of load safe guard