lcmsmatching: search-mz comparison

comparison search-mz @ 5:fb9c0409d85c draft

planemo upload for repository https://github.com/workflow4metabolomics/lcmsmatching.git commit 608d9e59a0d2dcf85a037968ddb2c61137fb9bce

author	prog
date	Wed, 19 Apr 2017 10:00:05 -0400
parents	20d69a062da3
children

comparison

equal deleted inserted replaced

-:b34c14151f25
+:fb9c0409d85c
 #!/usr/bin/env Rscript
-# vi: ft=R
+# vi: ft=R fdm=marker
 args <- commandArgs(trailingOnly = F)
 script.path <- sub("--file=","",args[grep("--file=",args)])
 library(getopt)
 source(file.path(dirname(script.path), 'msdb-common.R'), chdir = TRUE)
-source(file.path(dirname(script.path), 'MsFileDb.R'), chdir = TRUE)
-source(file.path(dirname(script.path), 'MsPeakForestDb.R'), chdir = TRUE)
-source(file.path(dirname(script.path), 'MsXlsDb.R'), chdir = TRUE)
-source(file.path(dirname(script.path), 'Ms4TabSqlDb.R'), chdir = TRUE)
 source(file.path(dirname(script.path), 'MsDbLogger.R'), chdir = TRUE)
 source(file.path(dirname(script.path), 'MsDbInputDataFrameStream.R'), chdir = TRUE)
 source(file.path(dirname(script.path), 'MsDbOutputDataFrameStream.R'), chdir = TRUE)
 source(file.path(dirname(script.path), 'htmlhlp.R'), chdir = TRUE)
 source(file.path(dirname(script.path), 'strhlp.R'), chdir = TRUE)
 # Missing paste0() function in R 2.14.1
 if (as.integer(R.Version()$major) == 2 && as.numeric(R.Version()$minor) < 15)
 	paste0 <- function(...) paste(..., sep = '')
-#############
+# Constants {{{1
-# CONSTANTS #
+################################################################
-#############
 PROG <- sub('^.*/([^/]+)$', '\\1', commandArgs()[4], perl = TRUE)
 USERAGENT <- 'search-mz ; pierrick.roger@gmail.com'
 # Authorized database types
 MSDB.XLS <- 'xls'
 MSDB.4TABSQL <- '4tabsql'
 MSDB.FILE <- 'file'
 MSDB.PEAKFOREST <- 'peakforest'
 MSDB.VALS <- c(MSDB.XLS, MSDB.4TABSQL, MSDB.FILE, MSDB.PEAKFOREST)
+DB.SRC.FILE <- list ()
+DB.SRC.FILE[[MSDB.FILE]] <- 'MsFileDb.R'
+DB.SRC.FILE[[MSDB.PEAKFOREST]] <- 'MsPeakForestDb.R'
+DB.SRC.FILE[[MSDB.XLS]] <- 'MsXlsDb.R'
+DB.SRC.FILE[[MSDB.4TABSQL]] <- 'Ms4TabSqlDb.R'
 # Authorized mode values
 POS_MODE <- 'pos'
 NEG_MODE <- 'neg'
 MSDB.MODE.VALS <- c(POS_MODE, NEG_MODE)
 MSDB.DFT[['molids-sep']] <- MSDB.DFT.MATCH.SEP
 MSDB.DFT[['db-fields']] <- concat.kv.list(msdb.get.dft.db.fields())
 MSDB.DFT[['db-ms-modes']] <- concat.kv.list(MSDB.DFT.MODES)
 MSDB.DFT[['pos-prec']] <- paste(MSDB.DFT.PREC[[MSDB.TAG.POS]], collapse = ',')
 MSDB.DFT[['neg-prec']] <- paste(MSDB.DFT.PREC[[MSDB.TAG.NEG]], collapse = ',')
+MSDB.DFT[['db-rt-unit']] <- MSDB.RTUNIT.SEC
+MSDB.DFT[['rtunit']] <- MSDB.RTUNIT.SEC
 DEFAULT.ARG.VALUES <- MSDB.DFT
 DEFAULT.ARG.VALUES[['input-col-names']] <- concat.kv.list(msdb.get.dft.input.fields())
-DEFAULT.ARG.VALUES[['output-col-names']] <- concat.kv.list(msdb.get.dft.output.fields())
+# Print help {{{1
-##############
+################################################################
-# PRINT HELP #
-##############
+print.help <- function() {
-print.help <- function(spec, status = 0) {
+	cat("USAGE:\n")
-	cat(getopt(spec, usage = TRUE, command = PROG))
+	prog.mz.match <- paste(PROG, ' -d (', paste(MSDB.VALS, collapse = '|'), ') --url (file|dir|database URL) -i <file> -m (', paste(MSDB.MODE.VALS, collapse = '|'), ") -p <mz precision> -s <mz shift> -u (", paste(MSDB.MZTOLUNIT.VALS, collapse = '|'), ") -o <file>", sep = '')
-	q(status = status)
+	cat("\t(1) ", prog.mz.match, " ...\n", sep = '')
-}
+	cat("\n")
+	cat("\t(2) ", prog.mz.match, "(--all-cols|-c <cols>) -x <X RT tolerance> -y <Y RT tolerance>", " ...\n", sep = '')
-###############################
+	cat("\n")
-# SET DEFAULT ARGUMENT VALUES #
+	cat("\t(3) ", PROG, ' -d (', paste(MSDB.VALS, collapse = '|'), ") --url (file|dir|database URL) --list-cols\n", sep = '')
-###############################
+	cat("\nDETAILS:\n")
+	cat("Form (1) is for running an MZ match on a database.\n")
+	cat("Form (2) is for running an MZ/RT match on a database.\n")
+	cat("Form (3) is for getting a list of available chromatographic columns in a database.\n")
+	cat("\nOPTIONS:\n")
+	spec <- matrix(make.getopt.spec(), byrow = TRUE, ncol = 5)
+	max.length.opt.cols <- max(nchar(spec[,1])) + 1
+	sections <- list(database = "Database setting", input = "Input file", output = "Output files", mz = "M/Z matching", rt = "RT matching", precursor = "Precursor matching", misc = "Miscellaneous")
+	for (section in names(sections)) {
+		cat("\n\t", sections[[section]], ":\n", sep = '')
+		spec <- matrix(make.getopt.spec(section), byrow = TRUE, ncol = 5)
+		for (i in seq(nrow(spec))) {
+			opt <- ''
+			if ( ! is.na(spec[i,2]))
+				opt <- paste('-', spec[i,2], '|', sep = '')
+			opt <- paste(opt, '--', spec[i, 1], sep = '')
+			nb.space.padding <- max.length.opt.cols - nchar(opt) + 6
+			padding <- paste(rep(' ', nb.space.padding), sep = '')
+			cat("\t\t", opt, padding, "\t", spec[i, 5], "\n", sep = '')
+		}
+	}
+	cat("\nEXAMPLES:\n")
+	cat("\nSimple M/Z matching with a file database:\n")
+	cat("\t./", PROG, " -d file --url mydbfile.tsv -i input.tsv -m pos -o output.tsv\n", sep = '')
+	cat("\nFile database with M/Z tolerance:\n")
+	cat("\t./", PROG, " -d file --url mydbfile.tsv -i input.tsv -m pos -o output.tsv -p 0.5 -s 0\n", sep = '')
+	cat("\nFile database with M/Z tolerance unit:\n")
+	cat("\t./", PROG, " -d file --url mydbfile.tsv -i input.tsv -m pos -o output.tsv -p 1 -s 0.5 -u plain\n", sep = '')
+	cat("\nPeakforest database:\n")
+	cat("\t./", PROG, " -d peakforest --url https://rest.peakforest.org/ --db-token <your Peakforest token> -i input.tsv -m pos -o output.tsv\n", sep = '')
+}
+# Set default argument values {{{1
+################################################################
 set.dft.arg.val <-function(opt) {
 	for (f in names(MSDB.DFT))
 		if (is.null(opt[[f]]))
 		opt$rtcol <- NULL
 	return(opt)
 }
-#########################
+# Parse argument values {{{1
-# PARSE ARGUMENT VALUES #
+################################################################
-#########################
 parse.arg.val <- function(opt) {
 	# Parse input column names
 	if ( ! is.null(opt[['db-fields']])) {
 		cust <- split.kv.list(opt[['db-fields']])
+		cust <- cust[cust != 'NA']
 		opt[['db-fields']] <- split.kv.list(MSDB.DFT[['db-fields']])
+		cust <- cust[names(cust) %in% names(opt[['db-fields']])]
 		opt[['db-fields']][names(cust)] <- cust
 	}
 	# Parse MS modes
 	if ( ! is.null(opt[['db-ms-modes']])) {
 	if (is.null(opt[['input-col-names']])) {
 		opt[['input-col-names']] <- msdb.get.dft.input.fields()
 	}
 	else {
 		custcols <- split.kv.list(opt[['input-col-names']])
+		custcols <- custcols[custcols != 'NA']
 		dftcols <- msdb.get.dft.input.fields()
 		opt[['input-col-names']] <- c(custcols, dftcols[ ! names(dftcols) %in% names(custcols)])
-	}
-	# Parse output column names
-	if (is.null(opt[['output-col-names']])) {
-		# By default keep input col names for output
-		opt[['output-col-names']] <- msdb.get.dft.output.fields()
-		input.cols <- names(opt[['input-col-names']])
-		output.cols <- names(opt[['output-col-names']])
-		opt[['output-col-names']] <- c(opt[['input-col-names']][input.cols %in% output.cols], opt[['output-col-names']][ ! output.cols %in% input.cols])
-	}
-	else {
-		custcols <- split.kv.list(opt[['output-col-names']])
-		dftcols <- msdb.get.dft.output.fields()
-		opt[['output-col-names']] <- c(custcols, dftcols[ ! names(dftcols) %in% names(custcols)])
 	}
 	# Parse lists of precursors
 	if ( ! is.null(opt[['pos-prec']]))
 		opt[['pos-prec']] <- split.str(opt[['pos-prec']], unlist = TRUE)
 		opt[['neg-prec']] <- split.str(opt[['neg-prec']], unlist = TRUE)
 	return(opt)
 }
-#################################
+# Make getopt specifications {{{1
-# PRINT DEFAULT ARGUMENT VALUES #
+################################################################
-#################################
+make.getopt.spec <- function(sections = NULL) {
-print.dft.arg.val <- function(opt) {
+	spec <- character(0)
-	print.flags <- DEFAULT.ARG.VALUES
-	names(print.flags) <- vapply(names(print.flags), function(x) paste0('print-', x), FUN.VALUE = '')
+	if (is.null(sections) || 'input' %in% sections)
-	for (f in names(print.flags))
+		spec <- c(spec,
-		if ( ! is.null(opt[[f]])) {
+			'input-file',       'i',            1,  'character',    'Set input file.',
-			cat(print.flags[[f]])
+			'input-col-names',  'j',            1,  'character',    paste0('Set the input column names. Default is "', DEFAULT.ARG.VALUES[['input-col-names']], '".')
-			q(status = 0)
+		)
-		}
-}
+	if (is.null(sections) || 'mz' %in% sections)
+		spec <- c(spec,
-make.getopt.spec.print.dft <- function() {
+			'mode',             'm',            1,  'character',    paste0('MS mode. Possible values are:', paste(MSDB.MODE.VALS, collapse = ", "), '.'),
+			'mzshift',          's',            1,  'numeric',      paste0('Shift on m/z. Default is ', MSDB.DFT$mzshift,'.'),
-	spec <- character()
+			'mzprec',           'p',            1,  'numeric',      paste0('Tolerance on m/z. Default is ', MSDB.DFT$mzprec,'.'),
+			'mztolunit',        'u',            1,  'character',    paste0('Unit used for tolerance values (options -s and -p) on M/Z. Default is ', MSDB.DFT$mztolunit,'.')
-	for (f in names(DEFAULT.ARG.VALUES))
+		)
-		spec <- c(spec, paste0('print-', f), NA_character_, 0, 'logical', paste0('Print default value of --', f))
+	if (is.null(sections) || 'rt' %in% sections)
+		spec <- c(spec,
+			'all-cols',         'A',            0,  'logical',      'Use all available chromatographic columns to match retention times.',
+			'rtcol',            'c',            1,  'character',    paste0('Chromatographic column to use. Unset by default. If set, use the corresponding column to filter on retention times, if retention times are provided.'),
+			'check-cols',       'k',            0,  'logical',      'Check that the chromatographic column names specified with option -c really exist.',
+			'list-cols',        'l',            0,  'logical',      'List all chromatographic columns present in the database. Write list inside the file specified by -o option.',
+			'rttol',            'r',            1,  'numeric',      paste0('Tolerance on retention times. Unset by default.'),
+			'rttolx',           'x',            1,  'numeric',      paste0('Tolerance on retention times. Unset by default.'),
+			'rttoly',           'y',            1,  'numeric',      paste0('Tolerance on retention times. Unset by default.'),
+			'rtunit',           'v',            1,  'character',    paste0('Retention time unit for the input file. Default is ', MSDB.DFT$rtunit, '. Allowed values are:', paste(MSDB.RTUNIT.VALS, collapse = ", "), '.')
+		)
+	if (is.null(sections) || 'precursor' %in% sections)
+		spec <- c(spec,
+			'precursor-match',  'Q',            0,  'logical',      'Remove peaks whose molecule precursor peak has not been matched. Unset by default.',
+			'precursor-rt-tol', 'R',            1,  'numeric',      paste0('Precursor retention time tolerance. Only used when precursor-match is enabled. Default is ', MSDB.DFT[['precursor-rt-tol']], '.'),
+			'pos-prec',         'Y',            1,  'character',    paste0('Set the list of precursors to use in positive mode. Default is "', MSDB.DFT[['pos-prec']], '".'),
+			'neg-prec',         'Z',            1,  'character',    paste0('Set the list of precursors to use in negative mode. Default is "', MSDB.DFT[['neg-prec']], '".')
+		)
+	if (is.null(sections) || 'output' %in% sections)
+		spec <- c(spec,
+			'output-file',      'o',            1,  'character',    'Set file to use for the main output.',
+			'peak-output-file', 'O',            1,  'character',    'If set and if --same-rows is set, then output all matches inside the specified file, with one mz match per line. The output columns are: mz, rt, id, col, colrt, composition, attribution. This means that if an mz value is matched several times, then it will repeated on several lines, with one match description per line.',
+			'html-output-file', 'H',            1,  'character',    'Set file to use for the HTML output.',
+			'no-main-table-in-html-output',  't',  0,  'logical',   'Do not display main table in HTML output.',
+			'same-rows',        'a',            0,  'logical',      'If set, output exactly the same number of rows as the input. This means that in case of multiple matches for one mz, then only one line is output (i.e.: the mz value is not duplicated on several lines). In the main output file, an "ms.matching" column is output with inside, for each mz, a comma separated list of matched component/molecule IDs. If unset, then only the main output file is used, and one single is written to it with one line per peak match, and eventual mz line duplicated if there are multiple matches for this mz.',
+			'same-cols',        'b',            0,  'logical',      'If set, output the same columns as inside the input. All input columns are copied to the output.',
+			'molids-sep',       'S',            1,  'character',    paste0('Set character separator used to when concatenating molecule IDs in output. Default is "', MSDB.DFT[['molids-sep']] , '".'),
+			'first-val',        '1',            0,  'logical',      'Keep only the first value in multi-value fields. Unset by default.',
+			'excel2011comp',    'X',            0,  'logical',      'Excel 2011 compatiblity mode. Output ASCII text files instead of UTF-8 files, where greek letters are replaced with their latin names, plusminus sign is replaced with +- and apostrophe is replaced with \"prime\". All other non-ASCII characters are repladed with underscore.'
+		)
+	if (is.null(sections) || 'database' %in% sections)
+		spec <- c(spec,
+			'database',         'd',            1,  'character',    paste0('Set database to use: "xls" for an Excel database, "file" for a single file database, "4tabsql" for a 4Tab SQL database, and "peakforest" for a connection to PeakForest database.'),
+			'url',              'W',            1,  'character',    'URL of database. For "peakforest" database it is the HTTP URL, for the "xls" database it is the path to the directory containing the Excel files, for the "file" database it is the path to the file database and for the "4tabsql" database it is the IP address of the server.',
+			'cache-dir',        'C',            1,  'character',    'Path to directory where to store cache files. Only used when database flag is set to "xls".',
+			'db-name',          'N',            1,  'character',    'Name of the database. Used by the "4tabsql" database.',
+			'db-user',          'U',            1,  'character',    'User of the database. Used by the "4tabsql" database.',
+			'db-password',      'P',            1,  'character',    'Password of the database user. Used by the "4tabsql" database.',
+			'db-ms-modes',      'M',            1,  'character',    paste0('Comma separated key/value list giving the MS modes to be used in the single file database. Default is "', MSDB.DFT[['db-ms-modes']], '".'),
+			'db-rt-unit',       'V',            1,  'character',    paste0('Retention time unit for the database, used in the single file database. Default is "', MSDB.DFT[['db-rt-unit']], '". Allowed values are:', paste(MSDB.RTUNIT.VALS, collapse = ", "), '.'),
+			'db-token',         'T',            1,  'character',    'Database token. Used by Peakforest database.',
+			'db-fields',        'F',            1,  'character',    paste0('Comma separated key/value list giving the field names to be used in the single file database. Default is "', MSDB.DFT[['db-fields']], '".')
+		)
+	if (is.null(sections) || 'misc' %in% sections)
+		spec <- c(spec,
+			'help',             'h',            0,  'logical',      'Print this help.',
+			'debug',            'g',            0,  'logical',      'Set debug mode.'
+		)
 	return(spec)
 }
-##############################
+# Read args {{{1
-# MAKE GETOPT SPECIFICATIONS #
+################################################################
-##############################
-make.getopt.spec <- function() {
-	spec = c(
-		'help',             'h',            0,  'logical',      'Print this help.',
-		'mode',             'm',            1,  'character',    paste0('MS mode. Possible values are:', paste(MSDB.MODE.VALS, collapse = ", "), '.'),
-		'mzshift',          's',            1,  'numeric',      paste0('Shift on m/z, in ppm. Default is ', MSDB.DFT$mzshift,'.'),
-		'mzprec',           'p',            1,  'numeric',      paste0('Tolerance on m/z, in ppm. Default is ', MSDB.DFT$mzprec,'.'),
-		'mztolunit',        NA_character_,  1,  'character',    paste0('Tolerance on m/z, in ppm. Default is ', MSDB.DFT$mztolunit,'.'),
-		'rttol',            'r',            1,  'numeric',      paste0('Tolerance on retention times. Unset by default.'),
-		'rttolx',           'x',            1,  'numeric',      paste0('Tolerance on retention times. Unset by default.'),
-		'rttoly',           'y',            1,  'numeric',      paste0('Tolerance on retention times. Unset by default.'),
-		'rtcol',            'c',            1,  'character',    paste0('Chromatographic column to use. Unset by default. If set, use the corresponding column to filter on retention times, if retention times are provided.'),
-		'all-cols',         NA_character_,  0,  'logical',      'Use all available chromatographic columns to match retention times.',
-		'check-cols',       NA_character_,  0,  'logical',      'Check that the chromatographic column names specified with option -c really exist.',
-		'list-cols',        NA_character_,  0,  'logical',      'List all chromatographic columns present in the database. Write list inside the file specified by -o option.',
-		'same-rows',        'a',            0,  'logical',      'If set, output exactly the same number of rows as the input. This means that in case of multiple matches for one mz, then only one line is output (i.e.: the mz value is not duplicated on several lines). In the main output file, an "ms.matching" column is output with inside, for each mz, a comma separated list of matched component/molecule IDs. If unset, then only the main output file is used, and one single is written to it with one line per peak match, and eventual mz line duplicated if there are multiple matches for this mz.',
-		'same-cols',        'b',            0,  'logical',      'If set, output the same columns as inside the input. All input columns are copied to the output.',
-		'input-file',       'i',            1,  'character',    'Set input file.',
-		'output-file',      'o',            1,  'character',    'Set file to use for the main output.',
-		'peak-output-file', NA_character_,  1,  'character',    'If set and if --same-rows is set, then output all matches inside the specified file, with one mz match per line. The output columns are: mz, rt, id, col, colrt, composition, attribution. This means that if an mz value is matched several times, then it will repeated on several lines, with one match description per line.',
-		'html-output-file', NA_character_,  1,  'character',    'Set file to use for the HTML output.',
-		'no-main-table-in-html-output',  NA_character_,  0,  'logical',      'Do not display main table in HTML output.',
-		'precursor-match',  NA_character_,  0,  'logical',      'Remove peaks whose molecule precursor peak has not been matched. Unset by default.',
-		'precursor-rt-tol', NA_character_,  1,  'numeric',      paste0('Precursor retention time tolerance. Only used when precursor-match is enabled. Default is ', MSDB.DFT[['precursor-rt-tol']], '.'),
-		'pos-prec',         NA_character_,  1,  'character',    paste0('Set the list of precursors to use in positive mode. Default is "', MSDB.DFT[['pos-prec']], '".'),
-		'neg-prec',         NA_character_,  1,  'character',    paste0('Set the list of precursors to use in negative mode. Default is "', MSDB.DFT[['neg-prec']], '".'),
-		'input-col-names',  NA_character_,  1,  'character',    paste0('Set the input column names. Default is "', DEFAULT.ARG.VALUES[['input-col-names']], '".'),
-		'output-col-names', NA_character_,  1,  'character',    paste0('Set the output column names. Default is "', DEFAULT.ARG.VALUES[['output-col-names']], '".'),
-		'molids-sep',       NA_character_,  1,  'character',    paste0('Set character separator used to when concatenating molecule IDs in output. Default is "', MSDB.DFT[['molids-sep']] , '".'),
-		'first-val',        NA_character_,  0,  'logical',      'Keep only the first value in multi-value fields. Unset by default.',
-		'excel2011comp',            NA_character_,  0,  'logical',      'Excel 2011 compatiblity mode. Output ASCII text files instead of UTF-8 files, where greek letters are replaced with their latin names, plusminus sign is replaced with +- and apostrophe is replaced with \"prime\". All other non-ASCII characters are repladed with underscore.',
-		'database',         'd',            1,  'character',    paste0('Set database to use: "xls" for an Excel database, "file" for a single file database, "4tabsql" for a 4Tab SQL database, and "peakforest" for a connection to PeakForest database.'),
-		'url',              NA_character_,  1,  'character',    'URL of database. For "peakforest" database it is the HTTP URL, for the "xls" database it is the path to the directory containing the Excel files, for the "file" database it is the path to the file database and for the "4tabsql" database it is the IP address of the server.',
-		'cache-dir',        NA_character_,  1,  'character',    'Path to directory where to store cache files. Only used when database flag is set to "xls".',
-		'db-name',          NA_character_,  1,  'character',    'Name of the database. Used by the "4tabsql" database.',
-		'db-user',          NA_character_,  1,  'character',    'User of the database. Used by the "4tabsql" database.',
-		'db-password',      NA_character_,  1,  'character',    'Password of the database user. Used by the "4tabsql" database.',
-		'db-fields',        NA_character_,  1,  'character',    paste0('Comma separated key/value list giving the field names to be used in the single file database (option --db-file). Default is "', MSDB.DFT[['db-fields']], '".'),
-		'db-ms-modes',      NA_character_,  1,  'character',    paste0('Comma separated key/value list giving the MS modes to be used in the single file database (option --db-file). Default is "', MSDB.DFT[['db-ms-modes']], '".'),
-		'db-token',         NA_character_,  1,  'character',    'Database token. Used by Peakforest database.',
-		'debug',            NA_character_,  0,  'logical',      'Set debug mode.'
-		)
-	spec <- c(spec, make.getopt.spec.print.dft())
-	if ( ! is.null(spec))
-		spec <- matrix(spec, byrow = TRUE, ncol = 5)
-	return(spec)
-}
-#############
-# READ ARGS #
-#############
 read_args <- function() {
-	# options
+	# Get options
-	spec <- make.getopt.spec()
+	opt <- getopt(matrix(make.getopt.spec(), byrow = TRUE, ncol = 5))
-	opt <- getopt(spec)
 	# help
-	if ( ! is.null(opt$help))
+	if ( ! is.null(opt$help)) {
-		print.help(spec)
+		print.help()
+		quit()
-	print.dft.arg.val(opt) # Print default values
+	}
 	opt <- set.dft.arg.val(opt) # Set default values
 	opt <- parse.arg.val(opt) # Parse list values
 	# Check values
-	error <- .check.db.conn.opts(opt)
+	error <- check.args(opt)
-	if (is.null(opt[['output-file']]) && is.null(opt[['list-cols']])) {
-		warning("You must set a path for the output file.")
+	return(opt)
-		error <- TRUE
+}
-	}
+# Check args {{{1
+################################################################
+check.args <- function(opt) {
+	# Check database type
+	if (is.null(opt$database))
+		stop("You must provide a database type through --database option.")
+	if ( ! opt$database %in% MSDB.VALS)
+		stop(paste0("Invalid value \"", opt$database, "\" for --database option."))
+	# Check filedb database
+	if (opt$database == MSDB.FILE) {
+		if (is.null(opt$url))
+			stop("When using single file database, you must specify the location of the database file with option --url.")
+		if ( ! file.exists(opt$url))
+			stop(paste0("The file path \"", opt$url,"\" specified with --db-file option is not valid."))
+	}
+	# Check Excel database
+	if (opt$database == MSDB.XLS) {
+		if (is.null(opt$url))
+			stop("When using Excel database, you must specify the location of the Excel files directory with option --url.")
+		if ( ! file.exists(opt$url))
+			stop(paste0("The directory path \"", opt$url,"\" specified with --xls-dir option is not valid."))
+	}
+	# Check 4 tab database
+	if (opt$database == MSDB.4TABSQL) {
+		if (is.null(opt$url))
+			stop("When using 4Tab SQL database, you must specify the URL of the SQL server with option --url.")
+		if (is.null(opt[['db-name']]))
+			stop("When using 4Tab SQL database, you must specify the database name through the --db-name option.")
+		if (is.null(opt[['db-user']]))
+			stop("When using 4Tab SQL database, you must specify the database user through the --db-user option.")
+		if (is.null(opt[['db-password']]))
+			stop("When using 4Tab SQL database, you must specify the database user password through the --db-password option.")
+	}
+	# Check Peakforest database
+	if (opt$database == MSDB.PEAKFOREST) {
+		if (is.null(opt$url))
+			stop("When using PeakForest database, you must specify the URL of the PeakForest server with option --url.")
+	}
 	if (is.null(opt[['list-cols']])) {
-		if (is.null(opt[['input-file']])) {
-			warning("You must provide an input file.")
+		if (is.null(opt[['output-file']]))
-			error <- TRUE
+			stop("You must set a path for the output file.")
-		}
-		if (is.null(opt$mode) || ( ! opt$mode %in% MSDB.MODE.VALS)) {
+		if (is.null(opt[['input-file']]))
-			warning("You must specify a mode through the --mode option.")
+			stop("You must provide an input file.")
-			error <- TRUE
-		}
+		if (is.null(opt$mode) || ( ! opt$mode %in% MSDB.MODE.VALS))
-		if (is.null(opt$mzprec)) {
+			stop("You must specify a mode through the --mode option.")
-			warning("You must set a precision in MZ with the --mzprec option.")
-			error <- TRUE
+		if (is.null(opt$mzprec))
-		}
+			stop("You must set a precision in MZ with the --mzprec option.")
-		if ( ( ! is.null(opt$rtcol) || ! is.null(opt[['all-cols']])) && (is.null(opt$rttolx) || is.null(opt$rttoly))) {
-			warning("When chromatographic columns are set, you must provide values for --rttolx and -rttoly.")
+		if ( ( ! is.null(opt$rtcol) || ! is.null(opt[['all-cols']])) && (is.null(opt$rttolx) || is.null(opt$rttoly)))
-			error <- TRUE
+			stop("When chromatographic columns are set, you must provide values for --rttolx and -rttoly.")
-		}
-		if (is.null(opt$mztolunit) || ( ! opt$mztolunit %in% MSDB.MZTOLUNIT.VALS)) {
+		if (is.null(opt$mztolunit) || ( ! opt$mztolunit %in% MSDB.MZTOLUNIT.VALS))
-			warning("You must specify an M/Z tolerance unit through the --mztolunit option.")
+			stop("You must specify an M/Z tolerance unit through the --mztolunit option.")
-			error <- TRUE
+	}
-		}
+}
-	}
+# Load database {{{1
-	# help
+################################################################
-	if (error)
-		print.help(spec, status = 1)
+.load.db <- function(opt) {
-	return(opt)
+	if (is.null(opt[['pos-prec']]) && is.null(opt[['neg-prec']])) {
-}
+		precursors <- NULL
+	} else {
-	#####################################
+		precursors <- list()
-	# CHECK DATABASE CONNECTION OPTIONS #
+		precursors[[MSDB.TAG.POS]] <- opt[['pos-prec']]
-	#####################################
+		precursors[[MSDB.TAG.NEG]] <- opt[['neg-prec']]
+	}
-	.check.db.conn.opts <- function(opt) {
+	db <- switch(opt$database,
-		# Print default values
+		         peakforest = MsPeakForestDb$new(url = opt$url, useragent = USERAGENT, token = opt[['db-token']]),
-		if ( ! is.null(opt[['print-db-fields']])) {
+		         xls = MsXlsDb$new(db_dir = opt$url, cache_dir = opt[['cache-dir']]),
-			cat(MSDB.DFT[['db-fields']])
+		         '4tabsql' = Ms4TabSqlDb$new(host = extract.address(opt$url), port = extract.port(opt$url), dbname = opt[['db-name']], user = opt[['db-user']], password = opt[['db-password']]),
-			q(status = 0)
+		         file = MsFileDb$new(file = opt$url),
-		}
+		         NULL)
-		if ( ! is.null(opt[['print-db-ms-modes']])) {
+	db$setPrecursors(precursors)
-			cat(MSDB.DFT[['db-ms-modes']])
+	if (db$areDbFieldsSettable())
-			q(status = 0)
+		db$setDbFields(opt[['db-fields']])
-		}
+	if (db$areDbMsModesSettable())
+		db$setDbMsModes(opt[['db-ms-modes']])
-		# Check values
+	db$addObservers(MsDbLogger$new())
-		error <- FALSE
-		if (is.null(opt$database)) {
+	return(db)
-			warning("You must provide a database type through --database option.")
+}
-			error <- TRUE
-		}
+# Output HTML {{{1
-		if ( ! opt$database %in% MSDB.VALS) {
+################################################################
-			warning(paste0("Invalid value \"", opt$database, "\" for --database option."))
-			error <- TRUE
+output.html <- function(db, peaks, file) {
-		}
-		if (opt$database == MSDB.FILE) {
-			if (is.null(opt$url)) {
-			warning("When using single file database, you must specify the location of the database file with option --url.")
-			error <- TRUE
-			}
-			if ( ! file.exists(opt$url)) {
-				warning(paste0("The file path \"", opt$url,"\" specified with --db-file option is not valid."))
-				error <- TRUE
-			}
-		}
-		if (opt$database == MSDB.XLS) {
-			if (is.null(opt$url)) {
-				warning("When using Excel database, you must specify the location of the Excel files directory with option --url.")
-				error <- TRUE
-			}
-			if ( ! file.exists(opt$url)) {
-				warning(paste0("The directory path \"", opt$url,"\" specified with --xls-dir option is not valid."))
-				error <- TRUE
-			}
-		}
-		if (opt$database == MSDB.4TABSQL) {
-			if (is.null(opt$url)) {
-				warning("When using 4Tab SQL database, you must specify the URL of the SQL server with option --url.")
-				error <- TRUE
-			}
-			if (is.null(opt[['db-name']])) {
-				warning("When using 4Tab SQL database, you must specify the database name through the --db-name option.")
-				error <- TRUE
-			}
-			if (is.null(opt[['db-user']])) {
-				warning("When using 4Tab SQL database, you must specify the database user through the --db-user option.")
-				error <- TRUE
-			}
-			if (is.null(opt[['db-password']])) {
-				warning("When using 4Tab SQL database, you must specify the database user password through the --db-password option.")
-				error <- TRUE
-			}
-		}
-		if (opt$database == MSDB.PEAKFOREST) {
-			if (is.null(opt$url)) {
-				warning("When using PeakForest database, you must specify the URL of the PeakForest server with option --url.")
-				error <- TRUE
-			}
-		}
-		return(error)
-	}
-	#############################
-	# DISPLAY COMMAND LINE HELP #
-	#############################
-	.disp.cmd.line.help <- function(optspec, opt, prog, error = FALSE) {
-		if ( ! is.null(opt$help) || error ) {
-			cat(getopt(optspec, usage = TRUE, command = prog))
-			q(status = 1)
-		}
-	}
-	#################
-	# LOAD DATABASE #
-	#################
-	.load.db <- function(opt) {
-		if (is.null(opt[['pos-prec']]) && is.null(opt[['neg-prec']])) {
-			precursors <- NULL
-		} else {
-			precursors <- list()
-			precursors[[MSDB.TAG.POS]] <- opt[['pos-prec']]
-			precursors[[MSDB.TAG.NEG]] <- opt[['neg-prec']]
-		}
-		db <- switch(opt$database,
-		             peakforest = MsPeakForestDb$new(url = opt$url, useragent = USERAGENT, token = opt[['db-token']]),
-		             xls = MsXlsDb$new(db_dir = opt$url, cache_dir = opt[['cache-dir']]),
-		             '4tabsql' = Ms4TabSqlDb$new(host = extract.address(opt$url), port = extract.port(opt$url), dbname = opt[['db-name']], user = opt[['db-user']], password = opt[['db-password']]),
-		             file = MsFileDb$new(file = opt$url),
-		             NULL)
-		db$setPrecursors(precursors)
-		if (db$areDbFieldsSettable())
-			db$setDbFields(opt[['db-fields']])
-		if (db$areDbMsModesSettable())
-			db$setDbMsModes(opt[['db-ms-modes']])
-		db$addObservers(MsDbLogger$new())
-		return(db)
-	}
-###############
-# OUTPUT HTML #
-###############
-output.html <- function(db, main, peaks, file, opt, output.fields) {
 	# Replace public database IDs by URLs
-	if ( ! is.null(peaks) || ! is.null(main)) {
+	if ( ! is.null(peaks)) {
 		# Conversion from extdb id field to extdb name
 		extdb2classdb = list()
 		extdb2classdb[MSDB.TAG.KEGG] = BIODB.KEGG
 		extdb2classdb[MSDB.TAG.HMDB] = BIODB.HMDB
 		extdb2classdb[MSDB.TAG.CHEBI] = BIODB.CHEBI
 		extdb2classdb[MSDB.TAG.PUBCHEM] = BIODB.PUBCHEMCOMP
 		# Loop on all dbs
 		for (extdb in c(MSDB.TAG.KEGG, MSDB.TAG.HMDB, MSDB.TAG.CHEBI, MSDB.TAG.PUBCHEM)) {
-			field <- output.fields[[extdb]]
+			if ( ! is.null(peaks) && extdb %in% colnames(peaks))
-			if ( ! is.null(peaks) && field %in% colnames(peaks))
+				peaks[[extdb]] <- vapply(peaks[[extdb]], function(id) if (is.na(id)) '' else paste0('<a href="', get.entry.url(class = extdb2classdb[[extdb]], accession = id, content.type = BIODB.HTML), '">', id, '</a>'), FUN.VALUE = '')
-				peaks[[field]] <- vapply(peaks[[field]], function(id) if (is.na(id)) '' else paste0('<a href="', get.entry.url(class = extdb2classdb[[extdb]], accession = id, content.type = BIODB.HTML), '">', id, '</a>'), FUN.VALUE = '')
-			if ( ! is.null(main) && field %in% colnames(main))
-				main[[field]] <- vapply(main[[field]], function(ids) if (is.na(ids) || nchar(ids) == 0) '' else paste(vapply(strsplit(ids, opt[['molids-sep']])[[1]], function(id) paste0('<a href="', get.entry.url(class = extdb2classdb[[extdb]], accession = id, content.type = BIODB.HTML), '">', id, '</a>'), FUN.VALUE = ''), collapse = opt[['molids-sep']]), FUN.VALUE = '')
 		}
 	}
 	# Write HTML
 	html <- HtmlWriter(file = file)
 	html$write('tr:nth-child(even) { background-color: LemonChiffon; }')
 	html$write('tr:nth-child(odd) { background-color: LightGreen; }')
 	html$writeEndTag('style')
 	html$writeEndTag('header')
 	html$writeBegTag('body')
-	html$writeTag('h1', text = "LC/MS matching")
-	# Write parameters
-	html$writeTag('h2', text = "Parameters")
-	html$writeBegTag('ul')
-	html$writeTag('li', text = paste0("Mode = ", opt$mode, "."))
-	html$writeTag('li', text = paste0("M/Z precision = ", opt$mzprec, "."))
-	html$writeTag('li', text = paste0("M/Z shift = ", opt$mzshift, "."))
-	html$writeTag('li', text = paste0("Precursor match = ", (if (is.null(opt[['precursor-match']])) "no" else "yes"), "."))
-	if ( ! is.null(opt[['precursor-match']])) {
-		html$writeTag('li', text = paste0("Positive precursors = ", paste0(opt[['pos-prec']], collapse = ', '), "."))
-		html$writeTag('li', text = paste0("Negative precursors = ", paste0(opt[['neg-prec']], collapse = ', '), "."))
-	}
-	if ( ! is.null(opt$rtcol)) {
-		html$writeTag('li', text = paste0("Columns = ", paste(opt$rtcol, collapse = ", "), "."))
-		html$writeTag('li', text = paste0("RTX = ", opt$rttolx, "."))
-		html$writeTag('li', text = paste0("RTY = ", opt$rttoly, "."))
-		if ( ! is.null(opt[['precursor-match']]))
-			html$writeTag('li', text = paste0("RTZ = ", opt[['precursor-rt-tol']], "."))
-	}
-	html$writeEndTag('ul')
 	# Write results
-	html$writeTag('h2', text = "Results")
 	results <- FALSE
-	if ( ! is.null(main) && nrow(main) > 0 && is.null(opt[['no-main-table-in-html-output']])) {
-		html$writeTag('h3', text = "Main output")
-		html$writeTable(main)
-		results <- TRUE
-	}
 	if ( ! is.null(peaks) && nrow(peaks) > 0) {
 		html$writeTag('h3', text = "Matched peaks")
 		html$writeTable(peaks)
 		results <- TRUE
 	}
 	html$writeEndTag('body')
 	html$writeEndTag('html')
 }
-########
+# MAIN {{{1
-# MAIN #
+################################################################
-########
-options(error = function() { traceback(2) ; quit(status = 1) }, warn = 2 )
 # Read command line arguments
 opt <- read_args()
 if (is.null(opt$debug)) {
 	options(error = function() { quit(status = 1) }, warn = 0 )
 }
 # Load database
+source(file.path(dirname(script.path), DB.SRC.FILE[[opt$database]]), chdir = TRUE)
 db <- .load.db(opt)
 # Print columns
 if ( ! is.null(opt[['list-cols']])) {
 	cols <- db$getChromCol()
 if ( ! is.null(opt[['input-file']]) && ! file.exists(opt[['input-file']]))
 	stop(paste0("Input file \"", opt[['input-file']], "\" does not exist."))
 if (file.info(opt[['input-file']])$size > 0) {
 	# Load file into data frame
-	input <- read.table(file = opt[['input-file']], header = TRUE, sep = "\t", stringsAsFactor = FALSE)
+	input <- read.table(file = opt[['input-file']], header = TRUE, sep = "\t", stringsAsFactor = FALSE, check.names = FALSE, comment.char = '')
 	# Convert each column that is identified by a number into a name
 	for (field in names(opt[['input-col-names']])) {
 		if ( ! opt[['input-col-names']][[field]] %in% colnames(input) && length(grep('^[0-9]+$', opt[['input-col-names']][[field]])) > 0) {
 			col.index <- as.integer(opt[['input-col-names']][[field]])
 if ( ! opt[['input-col-names']][['mz']] %in% colnames(input))
 	stop(paste0('No column named "', opt[['input-col-names']][['mz']], '" in input file.'))
 # Set columns 'all-cols' specified
 if ( ! is.null(opt[['all-cols']]))
-	opt$rtcol <- db$getChromCol()
+	opt$rtcol <- db$getChromCol()[['id']]
 # Check chrom columns
 if ( ! is.null(opt[['check-cols']]) && ! is.null(opt$rtcol)) {
 	dbcols <- db$getChromCol()[['id']]
 	unknown.cols <- opt$rtcol[ ! opt$rtcol %in% dbcols]
 # Check that an RT column exists when using MZ/RT matching
 if ( ! is.null(opt$rtcol) && ! opt[['input-col-names']][['rt']] %in% colnames(input))
 	stop(paste0("You are running an MZ/RT match run on your input data, but no retention time column named '", opt[['input-col-names']][['rt']],"' can be found inside your input file."))
+# Set output col names
+output.col.names <- opt[['input-col-names']]
 # Set streams
-input.stream <- MsDbInputDataFrameStream$new(df = input, input.fields = opt[['input-col-names']])
+input.stream <- MsDbInputDataFrameStream$new(df = input, input.fields = opt[['input-col-names']], rtunit = opt[['rtunit']])
-main.output <- MsDbOutputDataFrameStream$new(keep.unused = ! is.null(opt[['same-cols']]), output.fields = opt[['output-col-names']], one.line = ! is.null(opt[['same-rows']]), match.sep = opt[['molids-sep']], first.val = ! is.null(opt[['first-val']]), ascii = ! is.null(opt[['excel2011comp']]), nogreek = ! is.null(opt[['excel2011comp']]), noapostrophe = ! is.null(opt[['excel2011comp']]), noplusminus = ! is.null(opt[['excel2011comp']]))
+main.output <- MsDbOutputDataFrameStream$new(keep.unused = ! is.null(opt[['same-cols']]), output.fields = output.col.names, one.line = ! is.null(opt[['same-rows']]), match.sep = opt[['molids-sep']], first.val = ! is.null(opt[['first-val']]), ascii = ! is.null(opt[['excel2011comp']]), nogreek = ! is.null(opt[['excel2011comp']]), noapostrophe = ! is.null(opt[['excel2011comp']]), noplusminus = ! is.null(opt[['excel2011comp']]), rtunit = opt[['rtunit']])
-peaks.output <- MsDbOutputDataFrameStream$new(keep.unused = ! is.null(opt[['same-cols']]), output.fields = opt[['output-col-names']], first.val = ! is.null(opt[['first-val']]), ascii = ! is.null(opt[['excel2011comp']]), nogreek = ! is.null(opt[['excel2011comp']]), noapostrophe = ! is.null(opt[['excel2011comp']]), noplusminus = ! is.null(opt[['excel2011comp']]))
+peaks.output <- MsDbOutputDataFrameStream$new(keep.unused = ! is.null(opt[['same-cols']]), output.fields = output.col.names, first.val = ! is.null(opt[['first-val']]), ascii = ! is.null(opt[['excel2011comp']]), nogreek = ! is.null(opt[['excel2011comp']]), noapostrophe = ! is.null(opt[['excel2011comp']]), noplusminus = ! is.null(opt[['excel2011comp']]), rtunit = opt[['rtunit']])
 invisible(db$setInputStream(input.stream))
 db$addOutputStreams(c(main.output, peaks.output))
-# Set M/Z tolerance unit
+# Set database units
 db$setMzTolUnit(opt$mztolunit)
+if ( ! is.null(opt[['db-rt-unit']]) && opt$database == 'file')
+	db$setRtUnit(opt[['db-rt-unit']])
 # Search database
 mode <- if (opt$mode == POS_MODE) MSDB.TAG.POS else MSDB.TAG.NEG
 db$searchForMzRtList(mode = mode, shift = opt$mzshift, prec = opt$mzprec, rt.tol = opt$rttol, rt.tol.x = opt$rttolx, rt.tol.y = opt$rttoly, col = opt$rtcol, precursor.match = ! is.null(opt[['precursor-match']]), precursor.rt.tol = opt[['precursor-rt-tol']])
 if ( ! is.null(opt[['peak-output-file']]))
 	# TODO Create a class MsDbOutputCsvFileStream
 	df.write.tsv(peaks.output$getDataFrame(), file = opt[['peak-output-file']], row.names = FALSE)
 if ( ! is.null(opt[['html-output-file']]))
 	# TODO Create a class MsDbOutputHtmlFileStream
-	output.html(db = db, main = main.output$getDataFrame(), peaks = peaks.output$getDataFrame(), file = opt[['html-output-file']], opt = opt, output.fields = opt[['output-col-names']])
+	output.html(db = db, peaks = peaks.output$getDataFrame(), file = opt[['html-output-file']])

Mercurial > repos > prog > lcmsmatching

comparison search-mz @ 5:fb9c0409d85c draft