comparison msdb-common.R @ 6:f86fec07f392 draft default tip

planemo upload commit c397cd8a93953798d733fd62653f7098caac30ce
author prog
date Fri, 22 Feb 2019 16:04:22 -0500
parents fb9c0409d85c
children
comparison
equal deleted inserted replaced
5:fb9c0409d85c 6:f86fec07f392
1 if ( ! exists('.parse_chrom_col_desc')) { # Do not load again if already loaded
2
3 library('stringr')
4 source('strhlp.R', chdir = TRUE)
5 source('biodb-common.R', chdir = TRUE)
6
7 #############
8 # CONSTANTS #
9 #############
10
11 # Field tags
12 MSDB.TAG.MZ <- 'mz'
13 MSDB.TAG.MZEXP <- 'mzexp'
14 MSDB.TAG.MZTHEO <- 'mztheo'
15 MSDB.TAG.RT <- 'rt'
16 MSDB.TAG.MODE <- 'msmode'
17 MSDB.TAG.MOLID <- 'compoundid'
18 MSDB.TAG.COL <- 'chromcol'
19 MSDB.TAG.COLRT <- 'chromcolrt'
20 MSDB.TAG.ATTR <- 'peakattr'
21 MSDB.TAG.INT <- 'intensity'
22 MSDB.TAG.REL <- 'relative.intensity'
23 MSDB.TAG.COMP <- 'peakcomp'
24 MSDB.TAG.MOLNAMES <- 'fullnames'
25 MSDB.TAG.MOLCOMP <- 'compoundmass'
26 MSDB.TAG.MOLMASS <- 'compoundcomp'
27 MSDB.TAG.INCHI <- 'inchi'
28 MSDB.TAG.INCHIKEY <- 'inchikey'
29 MSDB.TAG.PUBCHEM <- 'pubchemcompid'
30 MSDB.TAG.CHEBI <- 'chebiid'
31 MSDB.TAG.HMDB <- 'hmdbid'
32 MSDB.TAG.KEGG <- 'keggid'
33
34 # Mode tags
35 MSDB.TAG.POS <- 'neg'
36 MSDB.TAG.NEG <- 'pos'
37
38 # Fields containing multiple values
39 MSDB.MULTIVAL.FIELDS <- c(MSDB.TAG.MOLNAMES)
40 MSDB.MULTIVAL.FIELD.SEP <- ';'
41
42 # Authorized mz tolerance unit values
43 MSDB.MZTOLUNIT.PPM <- 'ppm'
44 MSDB.MZTOLUNIT.PLAIN <- 'plain' # same as mz: mass-to-charge ratio
45 MSDB.MZTOLUNIT.VALS <- c(MSDB.MZTOLUNIT.PPM, MSDB.MZTOLUNIT.PLAIN)
46
47 # Authorized rt units
48 MSDB.RTUNIT.SEC <- 'sec'
49 MSDB.RTUNIT.MIN <- 'min'
50 MSDB.RTUNIT.VALS <- c(MSDB.RTUNIT.SEC ,MSDB.RTUNIT.MIN)
51
52 # Default values
53 MSDB.DFT.PREC <- list()
54 MSDB.DFT.PREC[[MSDB.TAG.POS]] <- c("[(M+H)]+", "[M+H]+", "[(M+Na)]+", "[M+Na]+", "[(M+K)]+", "[M+K]+")
55 MSDB.DFT.PREC[[MSDB.TAG.NEG]] <- c("[(M-H)]-", "[M-H]-", "[(M+Cl)]-", "[M+Cl]-")
56 MSDB.DFT.OUTPUT.MULTIVAL.FIELD.SEP <- MSDB.MULTIVAL.FIELD.SEP
57 MSDB.DFT.MATCH.FIELDS <- list( molids = 'molid', molnames = 'molnames')
58 MSDB.DFT.MATCH.SEP <- ','
59 MSDB.DFT.MODES <- list( pos = 'POS', neg = 'NEG')
60 MSDB.DFT.MZTOLUNIT <- MSDB.MZTOLUNIT.PPM
61
62 ############################
63 # GET DEFAULT INPUT FIELDS #
64 ############################
65
66 msdb.get.dft.input.fields <- function () {
67
68 dft.fields <- list()
69
70 for(f in c(MSDB.TAG.MZ, MSDB.TAG.RT))
71 dft.fields[[f]] <- f
72
73 return(dft.fields)
74 }
75
76 #########################
77 # GET DEFAULT DB FIELDS #
78 #########################
79
80 msdb.get.dft.db.fields <- function () {
81
82 dft.fields <- list()
83
84 for (f in c(MSDB.TAG.MZTHEO, MSDB.TAG.COLRT, MSDB.TAG.MOLID, MSDB.TAG.COL, MSDB.TAG.MODE, MSDB.TAG.ATTR, MSDB.TAG.COMP, MSDB.TAG.MOLNAMES, MSDB.TAG.MOLCOMP, MSDB.TAG.MOLMASS, MSDB.TAG.INCHI, MSDB.TAG.INCHIKEY, MSDB.TAG.PUBCHEM, MSDB.TAG.CHEBI, MSDB.TAG.HMDB, MSDB.TAG.KEGG))
85 dft.fields[[f]] <- f
86
87 return(dft.fields)
88 }
89
90 ##################
91 # MAKE DB FIELDS #
92 ##################
93
94 msdb.make.db.fields <- function(fields) {
95
96 # Merge with default fields
97 dft.fields <- msdb.get.dft.db.fields()
98 absent <- ! names(dft.fields) %in% names(fields)
99 if (length(absent) > 0)
100 fields <- c(fields, dft.fields[absent])
101
102 return(fields)
103 }
104
105 #########################
106 # MAKE INPUT DATA FRAME #
107 #########################
108
109 msdb.make.input.df <- function(mz, rt = NULL, rtunit = MSDB.RTUNIT.SEC) {
110
111 field <- msdb.get.dft.input.fields()
112
113 x <- data.frame()
114
115 # Set mz
116 if (length(mz) > 1)
117 x[seq(mz), field[[MSDB.TAG.MZ]]] <- mz
118 else if (length(mz) == 1)
119 x[1, field[[MSDB.TAG.MZ]]] <- mz
120 else
121 x[, field[[MSDB.TAG.MZ]]] <- numeric()
122
123 # Set rt
124 if ( ! is.null(rt)) {
125 if (rtunit == MSDB.RTUNIT.MIN)
126 rtunit <- rtunit * 60
127 if (length(rt) > 1)
128 x[seq(rt), field[[MSDB.TAG.RT]]] <- rt
129 else if (length(rt) == 1)
130 x[1, field[[MSDB.TAG.RT]]] <- rt
131 else
132 x[, field[[MSDB.TAG.RT]]] <- numeric()
133 }
134
135 return(x)
136 }
137
138 ###############################
139 # GET EMPTY RESULT DATA FRAME #
140 ###############################
141
142 .get.empty.result.df <- function(rt = FALSE) {
143
144 df <- data.frame(stringsAsFactors = FALSE)
145 df[MSDB.TAG.MOLID] <- character()
146 df[MSDB.TAG.MOLNAMES] <- character()
147 df[MSDB.TAG.MZ] <- numeric()
148 df[MSDB.TAG.MZTHEO] <- numeric()
149 df[MSDB.TAG.ATTR] <- character()
150 df[MSDB.TAG.COMP] <- character()
151 if (rt) {
152 df[MSDB.TAG.RT] <- numeric()
153 df[MSDB.TAG.COL] <- character()
154 df[MSDB.TAG.COLRT] <- numeric()
155 }
156
157 return(df)
158 }
159
160 ############################
161 # PARSE COLUMN DESCRIPTION #
162 ############################
163
164 .parse_chrom_col_desc <- function(desc) {
165
166 # Clean string
167 s <- desc
168 s <- gsub('\\.+', ' ', s, perl = TRUE) # Replace '.' characters by spaces
169 s <- gsub('[*-]', ' ', s, perl = TRUE) # Replace dashes and asterisks by spaces
170 s <- gsub('[)(]', '', s, perl = TRUE) # Remove paranthesis
171 s <- trim(s)
172 s <- tolower(s) # put in lowercase
173
174 # Match 2 3 4 5 6 7 8 9 10 1112 13
175 pattern <- "^(uplc|hsf5|hplc|zicphilic)( (c8|c18|150 5 2 1))?( (\\d+)mn)?( (orbitrap|exactive|qtof|shimadzu exactive))?( (\\d+)mn)?( (bis|ter))?( 1)?$"
176 g <- str_match(s, pattern)
177 if (is.na(g[1, 1]))
178 stop(paste0("Impossible to parse column description \"", desc, "\"."))
179
180 type <- g[1, 2]
181 stationary_phase <- if ( ! is.na(g[1, 4]) && nchar(g[1, 4]) > 0) g[1, 4] else NA_character_
182 msdevice <- if ( ! is.na(g[1, 8]) && nchar(g[1, 8]) > 0) g[1, 8] else NA_character_
183 time <- if ( ! is.na(g[1,6]) && nchar(g[1, 6]) > 0) as.integer(g[1, 6]) else ( if ( ! is.na(g[1, 10]) && nchar(g[1, 10]) > 0) as.integer(g[1, 10]) else NA_integer_ )
184
185 # Correct values
186 if ( ! is.na(stationary_phase) && stationary_phase == '150 5 2 1') stationary_phase <- '150*5*2.1'
187 if ( ! is.na(msdevice)) msdevice <- gsub(' ', '', msdevice) # remove spaces
188
189 return(list( type = type, stationary_phase = stationary_phase, time = time, msdevice = msdevice))
190
191 }
192
193 #########################
194 # NORMALIZE COLUMN NAME #
195 #########################
196
197 .normalize_column_name <- function(desc) {
198
199 lst <- .parse_chrom_col_desc(desc)
200
201 v <- c(lst$type)
202 if ( ! is.na(lst$stationary_phase))
203 v <- c(v, lst$stationary_phase)
204 if ( ! is.na(lst$time))
205 v <- c(v, paste0(lst$time, "min"))
206 if ( ! is.na(lst$msdevice))
207 v <- c(v, lst$msdevice)
208
209 return(paste(v, collapse = '-'))
210 }
211
212 } # end of load safe guard