Mercurial > repos > prog > lcmsmatching
comparison search-mz @ 6:f86fec07f392 draft default tip
planemo upload commit c397cd8a93953798d733fd62653f7098caac30ce
author | prog |
---|---|
date | Fri, 22 Feb 2019 16:04:22 -0500 |
parents | fb9c0409d85c |
children |
comparison
equal
deleted
inserted
replaced
5:fb9c0409d85c | 6:f86fec07f392 |
---|---|
1 #!/usr/bin/env Rscript | |
2 # vi: ft=R fdm=marker | |
3 args <- commandArgs(trailingOnly = F) | |
4 script.path <- sub("--file=","",args[grep("--file=",args)]) | |
5 library(getopt) | |
6 source(file.path(dirname(script.path), 'msdb-common.R'), chdir = TRUE) | |
7 source(file.path(dirname(script.path), 'MsDbLogger.R'), chdir = TRUE) | |
8 source(file.path(dirname(script.path), 'MsDbInputDataFrameStream.R'), chdir = TRUE) | |
9 source(file.path(dirname(script.path), 'MsDbOutputDataFrameStream.R'), chdir = TRUE) | |
10 source(file.path(dirname(script.path), 'htmlhlp.R'), chdir = TRUE) | |
11 source(file.path(dirname(script.path), 'strhlp.R'), chdir = TRUE) | |
12 source(file.path(dirname(script.path), 'fshlp.R'), chdir = TRUE) | |
13 source(file.path(dirname(script.path), 'biodb-common.R'), chdir = TRUE) | |
14 source(file.path(dirname(script.path), 'nethlp.R'), chdir = TRUE) | |
15 | |
16 # Missing paste0() function in R 2.14.1 | |
17 if (as.integer(R.Version()$major) == 2 && as.numeric(R.Version()$minor) < 15) | |
18 paste0 <- function(...) paste(..., sep = '') | |
19 | |
20 # Constants {{{1 | |
21 ################################################################ | |
22 | |
23 PROG <- sub('^.*/([^/]+)$', '\\1', commandArgs()[4], perl = TRUE) | |
24 USERAGENT <- 'search-mz ; pierrick.roger@gmail.com' | |
25 | |
26 # Authorized database types | |
27 MSDB.XLS <- 'xls' | |
28 MSDB.4TABSQL <- '4tabsql' | |
29 MSDB.FILE <- 'file' | |
30 MSDB.PEAKFOREST <- 'peakforest' | |
31 MSDB.VALS <- c(MSDB.XLS, MSDB.4TABSQL, MSDB.FILE, MSDB.PEAKFOREST) | |
32 DB.SRC.FILE <- list () | |
33 DB.SRC.FILE[[MSDB.FILE]] <- 'MsFileDb.R' | |
34 DB.SRC.FILE[[MSDB.PEAKFOREST]] <- 'MsPeakForestDb.R' | |
35 DB.SRC.FILE[[MSDB.XLS]] <- 'MsXlsDb.R' | |
36 DB.SRC.FILE[[MSDB.4TABSQL]] <- 'Ms4TabSqlDb.R' | |
37 | |
38 # Authorized mode values | |
39 POS_MODE <- 'pos' | |
40 NEG_MODE <- 'neg' | |
41 MSDB.MODE.VALS <- c(POS_MODE, NEG_MODE) | |
42 | |
43 # Default | |
44 MSDB.DFT <- list() | |
45 MSDB.DFT[['mzshift']] <- 0 # in ppm | |
46 MSDB.DFT[['mzprec']] <- 5 # in ppm | |
47 MSDB.DFT[['mztolunit']] <- MSDB.DFT.MZTOLUNIT | |
48 MSDB.DFT[['precursor-rt-tol']] <- 5 | |
49 MSDB.DFT[['molids-sep']] <- MSDB.DFT.MATCH.SEP | |
50 MSDB.DFT[['db-fields']] <- concat.kv.list(msdb.get.dft.db.fields()) | |
51 MSDB.DFT[['db-ms-modes']] <- concat.kv.list(MSDB.DFT.MODES) | |
52 MSDB.DFT[['pos-prec']] <- paste(MSDB.DFT.PREC[[MSDB.TAG.POS]], collapse = ',') | |
53 MSDB.DFT[['neg-prec']] <- paste(MSDB.DFT.PREC[[MSDB.TAG.NEG]], collapse = ',') | |
54 MSDB.DFT[['db-rt-unit']] <- MSDB.RTUNIT.SEC | |
55 MSDB.DFT[['rtunit']] <- MSDB.RTUNIT.SEC | |
56 DEFAULT.ARG.VALUES <- MSDB.DFT | |
57 DEFAULT.ARG.VALUES[['input-col-names']] <- concat.kv.list(msdb.get.dft.input.fields()) | |
58 | |
59 # Print help {{{1 | |
60 ################################################################ | |
61 | |
62 print.help <- function() { | |
63 | |
64 cat("USAGE:\n") | |
65 prog.mz.match <- paste(PROG, ' -d (', paste(MSDB.VALS, collapse = '|'), ') --url (file|dir|database URL) -i <file> -m (', paste(MSDB.MODE.VALS, collapse = '|'), ") -p <mz precision> -s <mz shift> -u (", paste(MSDB.MZTOLUNIT.VALS, collapse = '|'), ") -o <file>", sep = '') | |
66 cat("\t(1) ", prog.mz.match, " ...\n", sep = '') | |
67 cat("\n") | |
68 cat("\t(2) ", prog.mz.match, "(--all-cols|-c <cols>) -x <X RT tolerance> -y <Y RT tolerance>", " ...\n", sep = '') | |
69 cat("\n") | |
70 cat("\t(3) ", PROG, ' -d (', paste(MSDB.VALS, collapse = '|'), ") --url (file|dir|database URL) --list-cols\n", sep = '') | |
71 | |
72 cat("\nDETAILS:\n") | |
73 cat("Form (1) is for running an MZ match on a database.\n") | |
74 cat("Form (2) is for running an MZ/RT match on a database.\n") | |
75 cat("Form (3) is for getting a list of available chromatographic columns in a database.\n") | |
76 | |
77 cat("\nOPTIONS:\n") | |
78 spec <- matrix(make.getopt.spec(), byrow = TRUE, ncol = 5) | |
79 max.length.opt.cols <- max(nchar(spec[,1])) + 1 | |
80 sections <- list(database = "Database setting", input = "Input file", output = "Output files", mz = "M/Z matching", rt = "RT matching", precursor = "Precursor matching", misc = "Miscellaneous") | |
81 for (section in names(sections)) { | |
82 cat("\n\t", sections[[section]], ":\n", sep = '') | |
83 spec <- matrix(make.getopt.spec(section), byrow = TRUE, ncol = 5) | |
84 for (i in seq(nrow(spec))) { | |
85 opt <- '' | |
86 if ( ! is.na(spec[i,2])) | |
87 opt <- paste('-', spec[i,2], '|', sep = '') | |
88 opt <- paste(opt, '--', spec[i, 1], sep = '') | |
89 nb.space.padding <- max.length.opt.cols - nchar(opt) + 6 | |
90 padding <- paste(rep(' ', nb.space.padding), sep = '') | |
91 cat("\t\t", opt, padding, "\t", spec[i, 5], "\n", sep = '') | |
92 } | |
93 } | |
94 | |
95 cat("\nEXAMPLES:\n") | |
96 | |
97 cat("\nSimple M/Z matching with a file database:\n") | |
98 cat("\t./", PROG, " -d file --url mydbfile.tsv -i input.tsv -m pos -o output.tsv\n", sep = '') | |
99 | |
100 cat("\nFile database with M/Z tolerance:\n") | |
101 cat("\t./", PROG, " -d file --url mydbfile.tsv -i input.tsv -m pos -o output.tsv -p 0.5 -s 0\n", sep = '') | |
102 | |
103 cat("\nFile database with M/Z tolerance unit:\n") | |
104 cat("\t./", PROG, " -d file --url mydbfile.tsv -i input.tsv -m pos -o output.tsv -p 1 -s 0.5 -u plain\n", sep = '') | |
105 | |
106 cat("\nPeakforest database:\n") | |
107 cat("\t./", PROG, " -d peakforest --url https://rest.peakforest.org/ --db-token <your Peakforest token> -i input.tsv -m pos -o output.tsv\n", sep = '') | |
108 } | |
109 | |
110 # Set default argument values {{{1 | |
111 ################################################################ | |
112 | |
113 set.dft.arg.val <-function(opt) { | |
114 | |
115 for (f in names(MSDB.DFT)) | |
116 if (is.null(opt[[f]])) | |
117 opt[[f]] <- MSDB.DFT[[f]] | |
118 | |
119 # Set default values | |
120 if ( opt$database == MSDB.XLS && ! is.null(opt$url) && is.null(opt[['cache-dir']])) | |
121 opt[['cache-dir']] <- file.path(opt$url, 'cache') | |
122 | |
123 if ( ! is.null(opt$rtcol) && opt$rtcol == '') | |
124 opt$rtcol <- NULL | |
125 | |
126 return(opt) | |
127 } | |
128 | |
129 # Parse argument values {{{1 | |
130 ################################################################ | |
131 | |
132 parse.arg.val <- function(opt) { | |
133 | |
134 # Parse input column names | |
135 if ( ! is.null(opt[['db-fields']])) { | |
136 cust <- split.kv.list(opt[['db-fields']]) | |
137 cust <- cust[cust != 'NA'] | |
138 opt[['db-fields']] <- split.kv.list(MSDB.DFT[['db-fields']]) | |
139 cust <- cust[names(cust) %in% names(opt[['db-fields']])] | |
140 opt[['db-fields']][names(cust)] <- cust | |
141 } | |
142 | |
143 # Parse MS modes | |
144 if ( ! is.null(opt[['db-ms-modes']])) { | |
145 cust <- split.kv.list(opt[['db-ms-modes']]) | |
146 opt[['db-ms-modes']] <- split.kv.list(MSDB.DFT[['db-ms-modes']]) | |
147 opt[['db-ms-modes']][names(cust)] <- cust | |
148 } | |
149 | |
150 # Parse retention time columns | |
151 if ( ! is.null(opt$rtcol)) | |
152 opt$rtcol <- strsplit(opt$rtcol, ',')[[1]] | |
153 | |
154 # Parse input column names | |
155 if (is.null(opt[['input-col-names']])) { | |
156 opt[['input-col-names']] <- msdb.get.dft.input.fields() | |
157 } | |
158 else { | |
159 custcols <- split.kv.list(opt[['input-col-names']]) | |
160 custcols <- custcols[custcols != 'NA'] | |
161 dftcols <- msdb.get.dft.input.fields() | |
162 opt[['input-col-names']] <- c(custcols, dftcols[ ! names(dftcols) %in% names(custcols)]) | |
163 } | |
164 | |
165 # Parse lists of precursors | |
166 if ( ! is.null(opt[['pos-prec']])) | |
167 opt[['pos-prec']] <- split.str(opt[['pos-prec']], unlist = TRUE) | |
168 if ( ! is.null(opt[['neg-prec']])) | |
169 opt[['neg-prec']] <- split.str(opt[['neg-prec']], unlist = TRUE) | |
170 | |
171 return(opt) | |
172 } | |
173 | |
174 # Make getopt specifications {{{1 | |
175 ################################################################ | |
176 | |
177 make.getopt.spec <- function(sections = NULL) { | |
178 | |
179 spec <- character(0) | |
180 | |
181 if (is.null(sections) || 'input' %in% sections) | |
182 spec <- c(spec, | |
183 'input-file', 'i', 1, 'character', 'Set input file.', | |
184 'input-col-names', 'j', 1, 'character', paste0('Set the input column names. Default is "', DEFAULT.ARG.VALUES[['input-col-names']], '".') | |
185 ) | |
186 | |
187 if (is.null(sections) || 'mz' %in% sections) | |
188 spec <- c(spec, | |
189 'mode', 'm', 1, 'character', paste0('MS mode. Possible values are:', paste(MSDB.MODE.VALS, collapse = ", "), '.'), | |
190 'mzshift', 's', 1, 'numeric', paste0('Shift on m/z. Default is ', MSDB.DFT$mzshift,'.'), | |
191 'mzprec', 'p', 1, 'numeric', paste0('Tolerance on m/z. Default is ', MSDB.DFT$mzprec,'.'), | |
192 'mztolunit', 'u', 1, 'character', paste0('Unit used for tolerance values (options -s and -p) on M/Z. Default is ', MSDB.DFT$mztolunit,'.') | |
193 ) | |
194 | |
195 if (is.null(sections) || 'rt' %in% sections) | |
196 spec <- c(spec, | |
197 'all-cols', 'A', 0, 'logical', 'Use all available chromatographic columns to match retention times.', | |
198 'rtcol', 'c', 1, 'character', paste0('Chromatographic column to use. Unset by default. If set, use the corresponding column to filter on retention times, if retention times are provided.'), | |
199 'check-cols', 'k', 0, 'logical', 'Check that the chromatographic column names specified with option -c really exist.', | |
200 'list-cols', 'l', 0, 'logical', 'List all chromatographic columns present in the database. Write list inside the file specified by -o option.', | |
201 'rttol', 'r', 1, 'numeric', paste0('Tolerance on retention times. Unset by default.'), | |
202 'rttolx', 'x', 1, 'numeric', paste0('Tolerance on retention times. Unset by default.'), | |
203 'rttoly', 'y', 1, 'numeric', paste0('Tolerance on retention times. Unset by default.'), | |
204 'rtunit', 'v', 1, 'character', paste0('Retention time unit for the input file. Default is ', MSDB.DFT$rtunit, '. Allowed values are:', paste(MSDB.RTUNIT.VALS, collapse = ", "), '.') | |
205 ) | |
206 | |
207 if (is.null(sections) || 'precursor' %in% sections) | |
208 spec <- c(spec, | |
209 'precursor-match', 'Q', 0, 'logical', 'Remove peaks whose molecule precursor peak has not been matched. Unset by default.', | |
210 'precursor-rt-tol', 'R', 1, 'numeric', paste0('Precursor retention time tolerance. Only used when precursor-match is enabled. Default is ', MSDB.DFT[['precursor-rt-tol']], '.'), | |
211 'pos-prec', 'Y', 1, 'character', paste0('Set the list of precursors to use in positive mode. Default is "', MSDB.DFT[['pos-prec']], '".'), | |
212 'neg-prec', 'Z', 1, 'character', paste0('Set the list of precursors to use in negative mode. Default is "', MSDB.DFT[['neg-prec']], '".') | |
213 ) | |
214 | |
215 if (is.null(sections) || 'output' %in% sections) | |
216 spec <- c(spec, | |
217 'output-file', 'o', 1, 'character', 'Set file to use for the main output.', | |
218 'peak-output-file', 'O', 1, 'character', 'If set and if --same-rows is set, then output all matches inside the specified file, with one mz match per line. The output columns are: mz, rt, id, col, colrt, composition, attribution. This means that if an mz value is matched several times, then it will repeated on several lines, with one match description per line.', | |
219 'html-output-file', 'H', 1, 'character', 'Set file to use for the HTML output.', | |
220 'no-main-table-in-html-output', 't', 0, 'logical', 'Do not display main table in HTML output.', | |
221 'same-rows', 'a', 0, 'logical', 'If set, output exactly the same number of rows as the input. This means that in case of multiple matches for one mz, then only one line is output (i.e.: the mz value is not duplicated on several lines). In the main output file, an "ms.matching" column is output with inside, for each mz, a comma separated list of matched component/molecule IDs. If unset, then only the main output file is used, and one single is written to it with one line per peak match, and eventual mz line duplicated if there are multiple matches for this mz.', | |
222 'same-cols', 'b', 0, 'logical', 'If set, output the same columns as inside the input. All input columns are copied to the output.', | |
223 'molids-sep', 'S', 1, 'character', paste0('Set character separator used to when concatenating molecule IDs in output. Default is "', MSDB.DFT[['molids-sep']] , '".'), | |
224 'first-val', '1', 0, 'logical', 'Keep only the first value in multi-value fields. Unset by default.', | |
225 'excel2011comp', 'X', 0, 'logical', 'Excel 2011 compatiblity mode. Output ASCII text files instead of UTF-8 files, where greek letters are replaced with their latin names, plusminus sign is replaced with +- and apostrophe is replaced with \"prime\". All other non-ASCII characters are repladed with underscore.' | |
226 ) | |
227 | |
228 if (is.null(sections) || 'database' %in% sections) | |
229 spec <- c(spec, | |
230 'database', 'd', 1, 'character', paste0('Set database to use: "xls" for an Excel database, "file" for a single file database, "4tabsql" for a 4Tab SQL database, and "peakforest" for a connection to PeakForest database.'), | |
231 'url', 'W', 1, 'character', 'URL of database. For "peakforest" database it is the HTTP URL, for the "xls" database it is the path to the directory containing the Excel files, for the "file" database it is the path to the file database and for the "4tabsql" database it is the IP address of the server.', | |
232 'cache-dir', 'C', 1, 'character', 'Path to directory where to store cache files. Only used when database flag is set to "xls".', | |
233 'db-name', 'N', 1, 'character', 'Name of the database. Used by the "4tabsql" database.', | |
234 'db-user', 'U', 1, 'character', 'User of the database. Used by the "4tabsql" database.', | |
235 'db-password', 'P', 1, 'character', 'Password of the database user. Used by the "4tabsql" database.', | |
236 'db-ms-modes', 'M', 1, 'character', paste0('Comma separated key/value list giving the MS modes to be used in the single file database. Default is "', MSDB.DFT[['db-ms-modes']], '".'), | |
237 'db-rt-unit', 'V', 1, 'character', paste0('Retention time unit for the database, used in the single file database. Default is "', MSDB.DFT[['db-rt-unit']], '". Allowed values are:', paste(MSDB.RTUNIT.VALS, collapse = ", "), '.'), | |
238 'db-token', 'T', 1, 'character', 'Database token. Used by Peakforest database.', | |
239 'db-fields', 'F', 1, 'character', paste0('Comma separated key/value list giving the field names to be used in the single file database. Default is "', MSDB.DFT[['db-fields']], '".') | |
240 ) | |
241 | |
242 if (is.null(sections) || 'misc' %in% sections) | |
243 spec <- c(spec, | |
244 'help', 'h', 0, 'logical', 'Print this help.', | |
245 'debug', 'g', 0, 'logical', 'Set debug mode.' | |
246 ) | |
247 | |
248 return(spec) | |
249 } | |
250 | |
251 # Read args {{{1 | |
252 ################################################################ | |
253 | |
254 read_args <- function() { | |
255 | |
256 # Get options | |
257 opt <- getopt(matrix(make.getopt.spec(), byrow = TRUE, ncol = 5)) | |
258 | |
259 # help | |
260 if ( ! is.null(opt$help)) { | |
261 print.help() | |
262 quit() | |
263 } | |
264 | |
265 opt <- set.dft.arg.val(opt) # Set default values | |
266 opt <- parse.arg.val(opt) # Parse list values | |
267 | |
268 # Check values | |
269 error <- check.args(opt) | |
270 | |
271 return(opt) | |
272 } | |
273 | |
274 # Check args {{{1 | |
275 ################################################################ | |
276 | |
277 check.args <- function(opt) { | |
278 | |
279 # Check database type | |
280 if (is.null(opt$database)) | |
281 stop("You must provide a database type through --database option.") | |
282 if ( ! opt$database %in% MSDB.VALS) | |
283 stop(paste0("Invalid value \"", opt$database, "\" for --database option.")) | |
284 | |
285 # Check filedb database | |
286 if (opt$database == MSDB.FILE) { | |
287 if (is.null(opt$url)) | |
288 stop("When using single file database, you must specify the location of the database file with option --url.") | |
289 if ( ! file.exists(opt$url)) | |
290 stop(paste0("The file path \"", opt$url,"\" specified with --db-file option is not valid.")) | |
291 } | |
292 | |
293 # Check Excel database | |
294 if (opt$database == MSDB.XLS) { | |
295 if (is.null(opt$url)) | |
296 stop("When using Excel database, you must specify the location of the Excel files directory with option --url.") | |
297 if ( ! file.exists(opt$url)) | |
298 stop(paste0("The directory path \"", opt$url,"\" specified with --xls-dir option is not valid.")) | |
299 } | |
300 | |
301 # Check 4 tab database | |
302 if (opt$database == MSDB.4TABSQL) { | |
303 if (is.null(opt$url)) | |
304 stop("When using 4Tab SQL database, you must specify the URL of the SQL server with option --url.") | |
305 if (is.null(opt[['db-name']])) | |
306 stop("When using 4Tab SQL database, you must specify the database name through the --db-name option.") | |
307 if (is.null(opt[['db-user']])) | |
308 stop("When using 4Tab SQL database, you must specify the database user through the --db-user option.") | |
309 if (is.null(opt[['db-password']])) | |
310 stop("When using 4Tab SQL database, you must specify the database user password through the --db-password option.") | |
311 } | |
312 | |
313 # Check Peakforest database | |
314 if (opt$database == MSDB.PEAKFOREST) { | |
315 if (is.null(opt$url)) | |
316 stop("When using PeakForest database, you must specify the URL of the PeakForest server with option --url.") | |
317 } | |
318 | |
319 if (is.null(opt[['list-cols']])) { | |
320 | |
321 if (is.null(opt[['output-file']])) | |
322 stop("You must set a path for the output file.") | |
323 | |
324 if (is.null(opt[['input-file']])) | |
325 stop("You must provide an input file.") | |
326 | |
327 if (is.null(opt$mode) || ( ! opt$mode %in% MSDB.MODE.VALS)) | |
328 stop("You must specify a mode through the --mode option.") | |
329 | |
330 if (is.null(opt$mzprec)) | |
331 stop("You must set a precision in MZ with the --mzprec option.") | |
332 | |
333 if ( ( ! is.null(opt$rtcol) || ! is.null(opt[['all-cols']])) && (is.null(opt$rttolx) || is.null(opt$rttoly))) | |
334 stop("When chromatographic columns are set, you must provide values for --rttolx and -rttoly.") | |
335 | |
336 if (is.null(opt$mztolunit) || ( ! opt$mztolunit %in% MSDB.MZTOLUNIT.VALS)) | |
337 stop("You must specify an M/Z tolerance unit through the --mztolunit option.") | |
338 } | |
339 } | |
340 | |
341 # Load database {{{1 | |
342 ################################################################ | |
343 | |
344 .load.db <- function(opt) { | |
345 | |
346 if (is.null(opt[['pos-prec']]) && is.null(opt[['neg-prec']])) { | |
347 precursors <- NULL | |
348 } else { | |
349 precursors <- list() | |
350 precursors[[MSDB.TAG.POS]] <- opt[['pos-prec']] | |
351 precursors[[MSDB.TAG.NEG]] <- opt[['neg-prec']] | |
352 } | |
353 | |
354 db <- switch(opt$database, | |
355 peakforest = MsPeakForestDb$new(url = opt$url, useragent = USERAGENT, token = opt[['db-token']]), | |
356 xls = MsXlsDb$new(db_dir = opt$url, cache_dir = opt[['cache-dir']]), | |
357 '4tabsql' = Ms4TabSqlDb$new(host = extract.address(opt$url), port = extract.port(opt$url), dbname = opt[['db-name']], user = opt[['db-user']], password = opt[['db-password']]), | |
358 file = MsFileDb$new(file = opt$url), | |
359 NULL) | |
360 db$setPrecursors(precursors) | |
361 if (db$areDbFieldsSettable()) | |
362 db$setDbFields(opt[['db-fields']]) | |
363 if (db$areDbMsModesSettable()) | |
364 db$setDbMsModes(opt[['db-ms-modes']]) | |
365 db$addObservers(MsDbLogger$new()) | |
366 | |
367 return(db) | |
368 } | |
369 | |
370 # Output HTML {{{1 | |
371 ################################################################ | |
372 | |
373 output.html <- function(db, peaks, file) { | |
374 | |
375 # Replace public database IDs by URLs | |
376 if ( ! is.null(peaks)) { | |
377 # Conversion from extdb id field to extdb name | |
378 extdb2classdb = list() | |
379 extdb2classdb[MSDB.TAG.KEGG] = BIODB.KEGG | |
380 extdb2classdb[MSDB.TAG.HMDB] = BIODB.HMDB | |
381 extdb2classdb[MSDB.TAG.CHEBI] = BIODB.CHEBI | |
382 extdb2classdb[MSDB.TAG.PUBCHEM] = BIODB.PUBCHEMCOMP | |
383 | |
384 # Loop on all dbs | |
385 for (extdb in c(MSDB.TAG.KEGG, MSDB.TAG.HMDB, MSDB.TAG.CHEBI, MSDB.TAG.PUBCHEM)) { | |
386 if ( ! is.null(peaks) && extdb %in% colnames(peaks)) | |
387 peaks[[extdb]] <- vapply(peaks[[extdb]], function(id) if (is.na(id)) '' else paste0('<a href="', get.entry.url(class = extdb2classdb[[extdb]], accession = id, content.type = BIODB.HTML), '">', id, '</a>'), FUN.VALUE = '') | |
388 } | |
389 } | |
390 | |
391 # Write HTML | |
392 html <- HtmlWriter(file = file) | |
393 html$writeBegTag('html') | |
394 html$writeBegTag('header') | |
395 html$writeTag('meta', attr = c(charset = "UTF-8")) | |
396 html$writeTag('title', text = "LC/MS matching results") | |
397 html$writeBegTag('style') | |
398 html$write('table, th, td { border-collapse: collapse; }') | |
399 html$write('table, th { border: 1px solid black; }') | |
400 html$write('td { border-left: 1px solid black; border-right: 1px solid black; }') | |
401 html$write('th, td { padding: 5px; }') | |
402 html$write('th { background-color: LightBlue; }') | |
403 html$write('tr:nth-child(even) { background-color: LemonChiffon; }') | |
404 html$write('tr:nth-child(odd) { background-color: LightGreen; }') | |
405 html$writeEndTag('style') | |
406 html$writeEndTag('header') | |
407 html$writeBegTag('body') | |
408 | |
409 # Write results | |
410 results <- FALSE | |
411 if ( ! is.null(peaks) && nrow(peaks) > 0) { | |
412 html$writeTag('h3', text = "Matched peaks") | |
413 html$writeTable(peaks) | |
414 results <- TRUE | |
415 } | |
416 if ( ! results) | |
417 html$writeTag('p', 'None.') | |
418 | |
419 html$writeEndTag('body') | |
420 html$writeEndTag('html') | |
421 } | |
422 | |
423 # MAIN {{{1 | |
424 ################################################################ | |
425 | |
426 # Read command line arguments | |
427 opt <- read_args() | |
428 | |
429 if (is.null(opt$debug)) { | |
430 options(error = function() { quit(status = 1) }, warn = 0 ) | |
431 } | |
432 | |
433 # Load database | |
434 source(file.path(dirname(script.path), DB.SRC.FILE[[opt$database]]), chdir = TRUE) | |
435 db <- .load.db(opt) | |
436 | |
437 # Print columns | |
438 if ( ! is.null(opt[['list-cols']])) { | |
439 cols <- db$getChromCol() | |
440 df.write.tsv(cols, file = if (is.null(opt[['output-file']])) stdout() else opt[['output-file']]) | |
441 q(status = 0) | |
442 } | |
443 | |
444 # Read input | |
445 if ( ! is.null(opt[['input-file']]) && ! file.exists(opt[['input-file']])) | |
446 stop(paste0("Input file \"", opt[['input-file']], "\" does not exist.")) | |
447 if (file.info(opt[['input-file']])$size > 0) { | |
448 | |
449 # Load file into data frame | |
450 input <- read.table(file = opt[['input-file']], header = TRUE, sep = "\t", stringsAsFactor = FALSE, check.names = FALSE, comment.char = '') | |
451 | |
452 # Convert each column that is identified by a number into a name | |
453 for (field in names(opt[['input-col-names']])) { | |
454 if ( ! opt[['input-col-names']][[field]] %in% colnames(input) && length(grep('^[0-9]+$', opt[['input-col-names']][[field]])) > 0) { | |
455 col.index <- as.integer(opt[['input-col-names']][[field]]) | |
456 if (col.index < 1 || col.index > length(colnames(input))) | |
457 stop(paste0("No column n°", col.index, " for input field ", field, ".")) | |
458 opt[['input-col-names']][[field]] <- colnames(input)[[col.index]] | |
459 } | |
460 } | |
461 } else { | |
462 input <- data.frame() | |
463 input[[opt[['input-col-names']][['mz']]]] <- double() | |
464 input[[opt[['input-col-names']][['rt']]]] <- double() | |
465 } | |
466 | |
467 # Check mz column | |
468 if ( ! opt[['input-col-names']][['mz']] %in% colnames(input)) | |
469 stop(paste0('No column named "', opt[['input-col-names']][['mz']], '" in input file.')) | |
470 | |
471 # Set columns 'all-cols' specified | |
472 if ( ! is.null(opt[['all-cols']])) | |
473 opt$rtcol <- db$getChromCol()[['id']] | |
474 | |
475 # Check chrom columns | |
476 if ( ! is.null(opt[['check-cols']]) && ! is.null(opt$rtcol)) { | |
477 dbcols <- db$getChromCol()[['id']] | |
478 unknown.cols <- opt$rtcol[ ! opt$rtcol %in% dbcols] | |
479 if (length(unknown.cols) > 0) { | |
480 stop(paste0("Unknown chromatographic column", (if (length(unknown.cols) > 1) 's' else ''), ': ', paste(unknown.cols, collapse = ', '), ".\nAllowed chromatographic column names are:\n", paste(dbcols, collapse = "\n"))) | |
481 } | |
482 } | |
483 | |
484 # Check that an RT column exists when using MZ/RT matching | |
485 if ( ! is.null(opt$rtcol) && ! opt[['input-col-names']][['rt']] %in% colnames(input)) | |
486 stop(paste0("You are running an MZ/RT match run on your input data, but no retention time column named '", opt[['input-col-names']][['rt']],"' can be found inside your input file.")) | |
487 | |
488 # Set output col names | |
489 output.col.names <- opt[['input-col-names']] | |
490 | |
491 # Set streams | |
492 input.stream <- MsDbInputDataFrameStream$new(df = input, input.fields = opt[['input-col-names']], rtunit = opt[['rtunit']]) | |
493 main.output <- MsDbOutputDataFrameStream$new(keep.unused = ! is.null(opt[['same-cols']]), output.fields = output.col.names, one.line = ! is.null(opt[['same-rows']]), match.sep = opt[['molids-sep']], first.val = ! is.null(opt[['first-val']]), ascii = ! is.null(opt[['excel2011comp']]), nogreek = ! is.null(opt[['excel2011comp']]), noapostrophe = ! is.null(opt[['excel2011comp']]), noplusminus = ! is.null(opt[['excel2011comp']]), rtunit = opt[['rtunit']]) | |
494 peaks.output <- MsDbOutputDataFrameStream$new(keep.unused = ! is.null(opt[['same-cols']]), output.fields = output.col.names, first.val = ! is.null(opt[['first-val']]), ascii = ! is.null(opt[['excel2011comp']]), nogreek = ! is.null(opt[['excel2011comp']]), noapostrophe = ! is.null(opt[['excel2011comp']]), noplusminus = ! is.null(opt[['excel2011comp']]), rtunit = opt[['rtunit']]) | |
495 invisible(db$setInputStream(input.stream)) | |
496 db$addOutputStreams(c(main.output, peaks.output)) | |
497 | |
498 # Set database units | |
499 db$setMzTolUnit(opt$mztolunit) | |
500 if ( ! is.null(opt[['db-rt-unit']]) && opt$database == 'file') | |
501 db$setRtUnit(opt[['db-rt-unit']]) | |
502 | |
503 # Search database | |
504 mode <- if (opt$mode == POS_MODE) MSDB.TAG.POS else MSDB.TAG.NEG | |
505 db$searchForMzRtList(mode = mode, shift = opt$mzshift, prec = opt$mzprec, rt.tol = opt$rttol, rt.tol.x = opt$rttolx, rt.tol.y = opt$rttoly, col = opt$rtcol, precursor.match = ! is.null(opt[['precursor-match']]), precursor.rt.tol = opt[['precursor-rt-tol']]) | |
506 | |
507 # Write output | |
508 main.output$moveColumnsToBeginning(colnames(input)) | |
509 peaks.output$moveColumnsToBeginning(colnames(input)) | |
510 # TODO Create a class MsDbOutputCsvFileStream | |
511 df.write.tsv(main.output$getDataFrame(), file = opt[['output-file']], row.names = FALSE) | |
512 if ( ! is.null(opt[['peak-output-file']])) | |
513 # TODO Create a class MsDbOutputCsvFileStream | |
514 df.write.tsv(peaks.output$getDataFrame(), file = opt[['peak-output-file']], row.names = FALSE) | |
515 if ( ! is.null(opt[['html-output-file']])) | |
516 # TODO Create a class MsDbOutputHtmlFileStream | |
517 output.html(db = db, peaks = peaks.output$getDataFrame(), file = opt[['html-output-file']]) |