Mercurial > repos > prog > lcmsmatching
changeset 6:f86fec07f392 draft default tip
planemo upload commit c397cd8a93953798d733fd62653f7098caac30ce
author | prog |
---|---|
date | Fri, 22 Feb 2019 16:04:22 -0500 (2019-02-22) |
parents | fb9c0409d85c |
children | |
files | BiodbFactory.R BiodbObject.R MassFiledbConn.R MassbankConn.R MassbankEntry.R MassdbConn.R Ms4TabSqlDb.R MsBioDb.R MsDb.R MsDbChecker.R MsDbInputDataFrameStream.R MsDbInputStream.R MsDbLogger.R MsDbObserver.R MsDbOutputDataFrameStream.R MsDbOutputStream.R MsFileDb.R MsPeakForestDb.R MsXlsDb.R PeakforestConn.R PeakforestEntry.R README.md UrlRequestScheduler.R biodb-common.R build.xml dfhlp.R excelhlp.R fshlp.R htmlhlp.R lcmsmatching lcmsmatching.xml list-chrom-cols.py list-file-cols.py list-ms-mode-values.py msdb-common.R nethlp.R search-mz search.R strhlp.R test-data/filedb.tsv test-data/mz-input-small_with_nas.tsv test-data/mzrt-input-small.tsv test-data/test_1_main_output.tsv test-data/test_1_peaks_output.html test-data/test_1_peaks_output.tsv test-data/test_2_main_output.tsv test-data/test_2_peaks_output.html test-data/test_2_peaks_output.tsv test-data/test_3_main_output.tsv test-data/test_3_peaks_output.html test-data/test_3_peaks_output.tsv |
diffstat | 51 files changed, 3405 insertions(+), 6873 deletions(-) [+] |
line wrap: on
line diff
# File: BiodbFactory.R
# vi: fdm=marker

##########################
# CLASS DECLARATION {{{1 #
##########################

# Factory of database connectors and entries.
#
# Fields:
#   .useragent    User agent string handed to the remote connectors.
#   .conn         List of already created connectors, indexed by class name.
#   .cache.dir    Directory used to cache entry contents (NA disables the cache).
#   .cache.mode   One of the BIODB.CACHE.* constants.
#   .debug        If TRUE, debug messages are printed.
#   .chunk.size   Number of ids requested at once from a connector (NA: all at once).
#   .use.env.var  If TRUE, missing URL and token are read from environment variables.
BiodbFactory <- methods::setRefClass("BiodbFactory", contains = 'BiodbObject', fields = list(.useragent = "character",
                                                                                              .conn = "list",
                                                                                              .cache.dir = "character",
                                                                                              .cache.mode = "character",
                                                                                              .debug = "logical",
                                                                                              .chunk.size = "integer",
                                                                                              .use.env.var = "logical"))

###############
# CONSTRUCTOR #
###############

BiodbFactory$methods( initialize = function(useragent = NA_character_, cache.dir = NA_character_, cache.mode = BIODB.CACHE.READ.WRITE, debug = FALSE, chunk.size = NA_integer_, use.env.var = FALSE, ...) {

    .useragent <<- useragent
    .conn <<- list()
    .cache.dir <<- cache.dir
    .cache.mode <<- cache.mode
    .debug <<- debug
    .chunk.size <<- as.integer(chunk.size)
    .use.env.var <<- use.env.var

    callSuper(...) # calls super-class initializer with remaining parameters
})

#######################
# PRINT DEBUG MESSAGE #
#######################

# Prints msg through .print.msg(), only when the debug flag is set.
BiodbFactory$methods( .print.debug.msg = function(msg) {
    if (.self$.debug)
        .print.msg(msg = msg, class = class(.self))
})

##################
# GET USER AGENT #
##################

BiodbFactory$methods( getUserAgent = function() {
    return(.self$.useragent)
})

##################
# SET USER AGENT #
##################

BiodbFactory$methods( setUserAgent = function(useragent) {
    "Set useragent of BiodbFactory."
    .useragent <<- useragent
})

###############
# CREATE CONN #
###############

# class  Name of the database class (e.g. 'massbank', 'massfiledb').
# url    Optional URL (or file path for 'massfiledb').
# token  Optional access token (used by 'chemspider').
# Returns the newly registered connector. Stops if a connector of this class
# already exists or if the class is unknown.
BiodbFactory$methods( createConn = function(class, url = NA_character_, token = NA_character_) {
    " Create connection to databases useful for metabolomics."
    if (class %in% names(.self$.conn))
        stop(paste0('A connection of type ', class, ' already exists. Please use method getConn() to access it.'))

    # Use environment variables
    if (.self$.use.env.var) {
        if (is.na(url))
            url <- .biodb.get.env.var(c(class, 'URL'))
        if (is.na(token))
            token <- .biodb.get.env.var(c(class, 'TOKEN'))
    }

    # Create connection instance
    conn <- switch(class,
                   chebi       = ChebiConn$new(useragent = .self$.useragent, debug = .self$.debug),
                   kegg        = KeggConn$new(useragent = .self$.useragent, debug = .self$.debug),
                   pubchemcomp = PubchemConn$new(useragent = .self$.useragent, db = BIODB.PUBCHEMCOMP, debug = .self$.debug),
                   pubchemsub  = PubchemConn$new(useragent = .self$.useragent, db = BIODB.PUBCHEMSUB, debug = .self$.debug),
                   hmdb        = HmdbConn$new(useragent = .self$.useragent, debug = .self$.debug),
                   chemspider  = ChemspiderConn$new(useragent = .self$.useragent, debug = .self$.debug, token = token),
                   enzyme      = EnzymeConn$new(useragent = .self$.useragent, debug = .self$.debug),
                   lipidmaps   = LipidmapsConn$new(useragent = .self$.useragent, debug = .self$.debug),
                   mirbase     = MirbaseConn$new(useragent = .self$.useragent, debug = .self$.debug),
                   ncbigene    = NcbigeneConn$new(useragent = .self$.useragent, debug = .self$.debug),
                   ncbiccds    = NcbiccdsConn$new(useragent = .self$.useragent, debug = .self$.debug),
                   uniprot     = UniprotConn$new(useragent = .self$.useragent, debug = .self$.debug),
                   massbank    = MassbankConn$new(useragent = .self$.useragent, url = url, debug = .self$.debug),
                   massfiledb  = MassFiledbConn$new(file = url, debug = .self$.debug),
                   peakforest  = PeakforestConn$new(useragent = .self$.useragent, debug = .self$.debug),
                   NULL)

    # Unknown class
    if (is.null(conn))
        stop(paste0("Unknown r-biodb class \"", class,"\"."))

    # Register new class
    .self$.conn[[class]] <- conn

    return (.self$.conn[[class]])
})

############
# GET CONN #
############

BiodbFactory$methods( getConn = function(class) {
    "Get connection to a database."

    # Connectors are created lazily on first access
    if ( ! class %in% names(.self$.conn))
        .self$createConn(class)

    return (.self$.conn[[class]])
})

################
# CREATE ENTRY #
################

# Exactly one of id and content must be provided. When ids are given, the
# contents are first obtained through getEntryContent(). The connector's
# createEntry() builds the entries, and this factory is then registered into
# each non-NULL entry.
BiodbFactory$methods( createEntry = function(class, id = NULL, content = NULL, drop = TRUE) {
    "Create Entry from a database by id."

    is.null(id) && is.null(content) && stop("One of id or content must be set.")
    ! is.null(id) && ! is.null(content) && stop("id and content cannot be both set.")

    # Debug (only the first 10 ids are listed)
    .self$.print.debug.msg(paste0("Creating ", if (is.null(id)) length(content) else length(id), " entries from ", if (is.null(id)) "contents" else paste("ids", paste(utils::head(id, 10), collapse = ", ")), "..."))

    # Get content
    if ( ! is.null(id))
        content <- .self$getEntryContent(class, id)
    conn <- .self$getConn(class)
    entry <- conn$createEntry(content = content, drop = drop)

    # Set factory
    .self$.print.debug.msg(paste0("Setting factory reference into entries..."))
    for (e in c(entry))
        if ( ! is.null(e))
            e$setFactory(.self)

    return(entry)
})

########################
# GET CACHE FILE PATHS #
########################

# Returns one cache file path per id (NA for NA ids), named
# <class>-<id>.<ext> inside the cache directory. The cache directory is
# created if it does not exist yet.
BiodbFactory$methods( .get.cache.file.paths = function(class, id) {

    # Get extension
    ext <- .self$getConn(class)$getEntryContentType()

    # Set filenames
    filenames <- vapply(id, function(x) { if (is.na(x)) NA_character_ else paste0(class, '-', x, '.', ext) }, FUN.VALUE = '')

    # set file paths
    file.paths <- vapply(filenames, function(x) { if (is.na(x)) NA_character_ else file.path(.self$.cache.dir, x) }, FUN.VALUE = '')

    # Create cache dir if needed. recursive = TRUE so that a nested path whose
    # parents are missing can be created too.
    if ( ! is.na(.self$.cache.dir) && ! file.exists(.self$.cache.dir))
        dir.create(.self$.cache.dir, recursive = TRUE)

    return(file.paths)
})

###########################
# LOAD CONTENT FROM CACHE #
###########################

# Returns a list with one element per id: NA for an NA id, the file content
# (lines joined with "\n") when a cache file exists, NULL otherwise.
BiodbFactory$methods( .load.content.from.cache = function(class, id) {

    # Read contents from files
    file.paths <- .self$.get.cache.file.paths(class, id)
    content <- lapply(file.paths, function(x) { if (is.na(x)) NA_character_ else ( if (file.exists(x)) paste(readLines(x), collapse = "\n") else NULL )} )

    return(content)
})

############################
# IS CACHE READING ENABLED #
############################

BiodbFactory$methods( .is.cache.reading.enabled = function() {
    return( ! is.na(.self$.cache.dir) && .self$.cache.mode %in% c(BIODB.CACHE.READ.ONLY, BIODB.CACHE.READ.WRITE))
})

############################
# IS CACHE WRITING ENABLED #
############################

BiodbFactory$methods( .is.cache.writing.enabled = function() {
    return( ! is.na(.self$.cache.dir) && .self$.cache.mode %in% c(BIODB.CACHE.WRITE.ONLY, BIODB.CACHE.READ.WRITE))
})

#########################
# SAVE CONTENT TO CACHE #
#########################

# Writes each content into its cache file. Nothing is written for a NULL or
# NA content (writeLines() would otherwise store the text "NA", which would
# later be read back as a valid content) nor for an NA id (no file path).
BiodbFactory$methods( .save.content.to.cache = function(class, id, content) {

    # Write contents into files
    file.paths <- .self$.get.cache.file.paths(class, id)
    invisible(mapply(function(c, f) {
        if ( ! is.null(c) && ! is.na(f) && ! all(is.na(c)))
            writeLines(c, f)
    }, content, file.paths))
})

#####################
# GET ENTRY CONTENT #
#####################

# Returns a list of contents, one per element of id, in the same order.
# Contents found in the cache (when cache reading is enabled) are reused; the
# others are requested from the connector, by chunks of .chunk.size ids, and
# saved into the cache after each chunk when cache writing is enabled.
BiodbFactory$methods( getEntryContent = function(class, id) {

    # Debug
    .self$.print.debug.msg(paste0("Get entry content(s) for ", length(id)," id(s)..."))

    # Initialize content
    if (.self$.is.cache.reading.enabled()) {
        content <- .self$.load.content.from.cache(class, id)
        missing.ids <- id[vapply(content, is.null, FUN.VALUE = TRUE)]
    }
    else {
        content <- lapply(id, as.null)
        missing.ids <- id
    }

    # Remove duplicates
    n.duplicates <- sum(duplicated(missing.ids))
    missing.ids <- missing.ids[ ! duplicated(missing.ids)]

    # Debug
    if (any(is.na(id)))
        .self$.print.debug.msg(paste0(sum(is.na(id)), " entry ids are NA."))
    if (.self$.is.cache.reading.enabled()) {
        .self$.print.debug.msg(paste0(sum( ! is.na(id)) - length(missing.ids), " entry content(s) loaded from cache."))
        if (n.duplicates > 0)
            .self$.print.debug.msg(paste0(n.duplicates, " entry ids, whose content needs to be fetched, are duplicates."))
        .self$.print.debug.msg(paste0(length(missing.ids), " entry content(s) need to be fetched."))
    }

    # Get contents
    if (length(missing.ids) > 0) {

        # Use connector to get missing contents
        conn <- .self$getConn(class)

        # Divide list of missing ids in chunks (in order to save in cache regularly)
        chunks.of.missing.ids <- if (is.na(.self$.chunk.size)) list(missing.ids) else split(missing.ids, ceiling(seq_along(missing.ids) / .self$.chunk.size))

        # Loop on chunks
        missing.contents <- NULL
        for (ch.missing.ids in chunks.of.missing.ids) {

            ch.missing.contents <- conn$getEntryContent(ch.missing.ids)

            if (is.null(ch.missing.contents)) {
                # Keep missing.contents aligned with missing.ids: a chunk for
                # which nothing was returned still occupies its positions.
                ch.missing.contents <- rep(NA_character_, length(ch.missing.ids))
            }
            else if (.self$.is.cache.writing.enabled()) {
                # Save to cache
                .self$.save.content.to.cache(class, ch.missing.ids, ch.missing.contents)
            }

            # Append
            missing.contents <- c(missing.contents, ch.missing.contents)

            # Debug
            if (.self$.is.cache.reading.enabled())
                .self$.print.debug.msg(paste0("Now ", length(missing.ids) - length(missing.contents)," id(s) left to be retrieved..."))
        }

        # Merge content and missing.contents. match() gives, for each id, its
        # position inside missing.ids (which holds no duplicates); a position
        # beyond the returned contents simply yields NA.
        to.fill <- id %in% missing.ids
        content[to.fill] <- as.list(missing.contents[match(id[to.fill], missing.ids)])
    }

    return(content)
})
# File: BiodbObject.R

##########################
# CLASS DECLARATION {{{1 #
##########################

# Base class holding a list of observers.
BiodbObject <- methods::setRefClass("BiodbObject", fields = list( .observers = "ANY" ))

########################
# ABSTRACT METHOD {{{1 #
########################

# To be called from the body of a method that has no implementation in the
# current class. Stops with a message naming the calling method and the class
# of the object.
BiodbObject$methods( .abstract.method = function() {

    class <- class(.self)

    # Call of the method that invoked .abstract.method()
    method <- sys.call(length(sys.calls()) - 1)

    # A call object must be deparsed into a single string before applying the
    # regex: coercing it directly splits it into function and arguments, the
    # pattern then never matches and the message gets garbled.
    method <- paste(deparse(method), collapse = '')
    method <- sub('^[^$]*\\$([^(]*)\\(.*$', '\\1()', method)

    stop(paste("Method", method, "is not implemented in", class, "class."))
})

######################
# ADD OBSERVERS {{{1 #
######################

# obs  A single observer or a list of observers, all of which must inherit
#      from class BiodbObserver.
# The observers are appended to the current list of observers.
BiodbObject$methods( addObservers = function(obs) {

    # Check types of observers
    if ( ( ! is.list(obs) && ! inherits(obs, "BiodbObserver")) || (is.list(obs) && any( ! vapply(obs, function(o) inherits(o, "BiodbObserver"), FUN.VALUE = TRUE))))
        stop("Observers must inherit from BiodbObserver class.")

    # Always handle a list of observers
    if ( ! is.list(obs))
        obs <- list(obs)

    # A field declared with class "ANY" that was never assigned holds an
    # "uninitializedField" placeholder, not NULL: treat both as "no observer yet".
    current <- .self$.observers
    if (is.null(current) || inherits(current, "uninitializedField"))
        current <- list()

    # Add observers to current list
    .observers <<- c(current, obs)
})
# File: MassFiledbConn.R

# LCMS File db.
# In this type of database, a single file is provided in CSV format. Default separator is tabulation.
# Each line is a MS peak measure.
# The file contains molecule and spectrum information. Each spectrum has an accession id.

# TODO Rename setField into setFieldName + addNewField, and setMsMode into setMsModeValue

#############
# CONSTANTS #
#############

# Default database fields: each field tag is mapped onto a column of the same name.
.BIODB.DFT.DB.FIELDS <- list()
for (f in c(BIODB.ACCESSION, BIODB.NAME, BIODB.FULLNAMES, BIODB.COMPOUND.ID, BIODB.MSMODE, BIODB.PEAK.MZEXP, BIODB.PEAK.MZTHEO, BIODB.PEAK.COMP, BIODB.PEAK.ATTR, BIODB.CHROM.COL, BIODB.CHROM.COL.RT, BIODB.FORMULA, BIODB.MASS))
    .BIODB.DFT.DB.FIELDS[[f]] <- f

#####################
# CLASS DECLARATION #
#####################

# Fields:
#   .file               Path of the database file.
#   .file.sep           Column separator of the file.
#   .file.quote         Quoting characters of the file.
#   .field.multval.sep  Separator of multiple values inside one cell.
#   .db                 Loaded data frame (NULL until first use).
#   .db.orig.colnames   Column names of the data frame before any renaming.
#   .fields             Mapping from field tags to column names.
#   .ms.modes           Mapping from MS mode tags to the values used in the file.
MassFiledbConn <- methods::setRefClass("MassFiledbConn", contains = "MassdbConn", fields = list(.file = "character", .file.sep = "character", .file.quote = "character", .field.multval.sep = 'character', .db = "ANY", .db.orig.colnames = "character", .fields = "list", .ms.modes = "character"))

###############
# CONSTRUCTOR #
###############

MassFiledbConn$methods( initialize = function(file = NA_character_, file.sep = "\t", file.quote = "\"", ...) {

    # Check file
    (! is.null(file) && ! is.na(file)) || stop("You must specify a file database to load.")
    file.exists(file) || stop(paste0("Cannot locate the file database \"", file ,"\"."))

    # Set fields
    .db <<- NULL
    .db.orig.colnames <<- NA_character_
    .file <<- file
    .file.sep <<- file.sep
    .file.quote <<- file.quote
    .fields <<- .BIODB.DFT.DB.FIELDS
    .field.multval.sep <<- ';'
    .ms.modes <<- c(BIODB.MSMODE.NEG, BIODB.MSMODE.POS)
    names(.self$.ms.modes) <- .self$.ms.modes

    callSuper(...)
})

######################
# Is valid field tag #
######################

MassFiledbConn$methods( isValidFieldTag = function(tag) {
    return (tag %in% names(.self$.fields))
})

###########
# INIT DB #
###########

# Loads the database file on first call; later calls do nothing.
MassFiledbConn$methods( .init.db = function() {

    if (is.null(.self$.db)) {

        # Load database (quote is passed by name rather than by position)
        .db <<- read.table(.self$.file, sep = .self$.file.sep, quote = .self$.file.quote, header = TRUE, stringsAsFactors = FALSE, row.names = NULL, comment.char = '')

        # Save column names
        .db.orig.colnames <<- colnames(.self$.db)
    }
})

#############
# Set field #
#############

# tag      A field tag defined in the fields list.
# colname  One or more column names of the database file. When several
#          columns are given, a new column is created whose values are the
#          values of these columns joined with '.'.
MassFiledbConn$methods( setField = function(tag, colname) {

    ( ! is.null(tag) && ! is.na(tag)) || stop("No tag specified.")
    ( ! is.null(colname) && ! all(is.na(colname))) || stop("No column name specified.")

    # Load database file
    .self$.init.db()

    # Check that this field tag is defined in the fields list
    .self$isValidFieldTag(tag) || stop(paste0("Database field tag \"", tag, "\" is not valid."))

    # Check that columns are defined in database file
    all(colname %in% names(.self$.db)) || stop(paste0("One or more columns among ", paste(colname, collapse = ", "), " are not defined in database file."))

    # Set new definition
    if (length(colname) == 1)
        .fields[[tag]] <<- colname
    else {
        new.col <- paste(colname, collapse = ".")
        is.new <- ! new.col %in% colnames(.self$.db)

        # Row-wise concatenation of the selected columns (vectorised; also
        # valid for an empty table)
        .self$.db[[new.col]] <- do.call(paste, c(unname(as.list(.self$.db[colname])), sep = '.'))
        .fields[[tag]] <<- new.col

        # The list of original column names must grow with the data frame,
        # otherwise the renaming below assigns too few names.
        if (is.new)
            .db.orig.colnames <<- c(.self$.db.orig.colnames, new.col)
    }

    # Update data frame column names: a column mapped by a field is named after its tag
    colnames(.self$.db) <- vapply(.self$.db.orig.colnames, function(c) if (c %in% .self$.fields) names(.self$.fields)[.self$.fields %in% c] else c, FUN.VALUE = '')
})

######################################
# SET FIELD MULTIPLE VALUE SEPARATOR #
######################################

MassFiledbConn$methods( setFieldMultValSep = function(sep) {
    .field.multval.sep <<- sep
})

################
# SET MS MODES #
################

# Sets the value used inside the file for the MS mode tag `mode`.
MassFiledbConn$methods( setMsMode = function(mode, value) {
    .self$.ms.modes[[mode]] <- value
})

##########################
# GET ENTRY CONTENT TYPE #
##########################

# The `type` parameter is not used.
MassFiledbConn$methods( getEntryContentType = function(type) {
    return(BIODB.DATAFRAME)
})

################
# CHECK FIELDS #
################

# Stops if one of the field tags is not declared in the fields list or is not
# a column of the loaded data frame. Does nothing for an empty or NA input.
MassFiledbConn$methods( .check.fields = function(fields) {

    # Nothing to check (a bare `return` without parentheses would not return)
    if (length(fields) == 0 || (length(fields) == 1 && is.na(fields)))
        return(invisible(NULL))

    # Check if fields are known
    unknown.fields <- fields[ ! fields %in% names(.self$.fields)]
    if (length(unknown.fields) > 0)
        stop(paste0("Field(s) ", paste(unknown.fields, collapse = ", "), " is/are unknown."))

    # Init db
    .self$.init.db()

    # Check if fields are defined in file database
    undefined.fields <- fields[ ! fields %in% colnames(.self$.db)]
    if (length(undefined.fields) > 0)
        stop(paste0("Column(s) ", paste(undefined.fields, collapse = ", "), " is/are undefined in file database."))

    invisible(NULL)
})

##########
# SELECT #
##########

# Select data from database
# cols          Field tags of the columns to return (NULL or NA: all columns).
# mode          MS mode tag used to filter rows (NULL or NA: no filtering).
# compound.ids  Not used yet.
# drop          Passed to the data frame subsetting: if TRUE a single column is returned as a vector.
# uniq, sort    Applied only when the result is a vector.
# max.rows      Maximum number of rows/values returned (NA: no limit).
MassFiledbConn$methods( .select = function(cols = NULL, mode = NULL, compound.ids = NULL, drop = FALSE, uniq = FALSE, sort = FALSE, max.rows = NA_integer_) {

    # Init db
    .self$.init.db()

    # Get db
    db <- .self$.db

    # Filter db on mode
    if ( ! is.null(mode) && ! is.na(mode)) {

        # Check mode value
        mode %in% names(.self$.ms.modes) || stop(paste0("Unknown mode value '", mode, "'."))
        .self$.check.fields(BIODB.MSMODE)

        # Filter on mode
        db <- db[db[[unlist(.self$.fields[BIODB.MSMODE])]] %in% .self$.ms.modes[[mode]], , drop = FALSE]
    }

    # Filter db on compound ids
    # TODO

    # cols may hold several tags: reduce the test to a single logical so that
    # it is valid inside `if`
    has.cols <- ! is.null(cols) && ! all(is.na(cols))
    if (has.cols)
        .self$.check.fields(cols)

    # Get subset
    x <- if (has.cols) db[, unlist(.self$.fields[cols]), drop = drop] else db

    # Rearrange
    if (drop && is.vector(x)) {
        if (uniq)
            x <- x[ ! duplicated(x)]
        if (sort)
            x <- sort(x)
    }

    # Cut. head() never goes past the end of the data, whereas indexing with
    # 1:max.rows pads the result with NA values.
    if ( ! is.na(max.rows))
        x <- utils::head(x, max.rows)

    return(x)
})

#################
# GET ENTRY IDS #
#################

# Returns the sorted unique ids of the spectra (BIODB.SPECTRUM) or compounds
# (BIODB.COMPOUND) as a character vector, NA for any other type.
MassFiledbConn$methods( getEntryIds = function(type) {

    ids <- NA_character_

    if (type %in% c(BIODB.SPECTRUM, BIODB.COMPOUND))
        ids <- as.character(.self$.select(cols = if (type == BIODB.SPECTRUM) BIODB.ACCESSION else BIODB.COMPOUND.ID, drop = TRUE, uniq = TRUE, sort = TRUE))

    return(ids)
})

##################
# GET NB ENTRIES #
##################

MassFiledbConn$methods( getNbEntries = function(type) {
    return(length(.self$getEntryIds(type)))
})

###############################
# GET CHROMATOGRAPHIC COLUMNS #
###############################

# Inherited from MassdbConn.
MassFiledbConn$methods( getChromCol = function(compound.ids = NULL) {

    # Extract needed columns
    db <- .self$.select(cols = c(BIODB.COMPOUND.ID, BIODB.CHROM.COL))

    # Filter on molecule IDs
    if ( ! is.null(compound.ids))
        db <- db[db[[BIODB.COMPOUND.ID]] %in% compound.ids, ]

    # Get column names
    cols <- db[[BIODB.CHROM.COL]]

    # Remove duplicates
    cols <- cols[ ! duplicated(cols)]

    # Make data frame
    chrom.cols <- data.frame(cols, cols, stringsAsFactors = FALSE)
    colnames(chrom.cols) <- c(BIODB.ID, BIODB.TITLE)

    return(chrom.cols)
})

#################
# GET MZ VALUES #
#################

# Inherited from MassdbConn.
# NOTE(review): BIODB.PEAK.MZ is not one of the tags put in
# .BIODB.DFT.DB.FIELDS above; unless its value equals one of them, the field
# check in .select() rejects it. To be confirmed against the constants file.
MassFiledbConn$methods( getMzValues = function(mode = NULL, max.results = NA_integer_) {

    # Get mz values
    mz <- .self$.select(cols = BIODB.PEAK.MZ, mode = mode, drop = TRUE, uniq = TRUE, sort = TRUE, max.rows = max.results)

    return(mz)
})

################
# GET NB PEAKS #
################

# Inherited from MassdbConn.
MassFiledbConn$methods( getNbPeaks = function(mode = NULL, compound.ids = NULL) {

    # Get peaks as a vector: the length of a one-column data frame is 1,
    # whatever its number of rows.
    peaks <- .self$.select(cols = BIODB.PEAK.MZTHEO, mode = mode, compound.ids = compound.ids, drop = TRUE)

    return(length(peaks))
})
# File: MassbankConn.R

#####################
# CLASS DECLARATION #
#####################

# Connector to a Massbank web service. The .url field holds the base URL of the service.
MassbankConn <- methods::setRefClass("MassbankConn", contains = c("RemotedbConn", "MassdbConn"), fields = list( .url = "character" ))

###############
# CONSTRUCTOR #
###############

MassbankConn$methods( initialize = function(url = NA_character_, ...) {

    # Set URL
    .url <<- if (is.null(url) || is.na(url)) BIODB.MASSBANK.EU.WS.URL else url

    callSuper(...)
})

##########################
# GET ENTRY CONTENT TYPE #
##########################

MassbankConn$methods( getEntryContentType = function() {
    return(BIODB.TXT)
})

#####################
# GET ENTRY CONTENT #
#####################

# ids  A vector of accession ids.
# Returns a character vector of the same length as ids, containing the text of
# each entry, or NA for the ids for which nothing was returned.
MassbankConn$methods( getEntryContent = function(ids) {

    # Debug
    .self$.print.debug.msg(paste0("Get entry content(s) for ", length(ids)," id(s)..."))

    URL.MAX.LENGTH <- 2083

    # Initialize return values
    content <- rep(NA_character_, length(ids))

    # Loop on all
    n <- 0
    while (n < length(ids)) {

        # Get list of accession ids to retrieve
        accessions <- ids[(n + 1):length(ids)]

        # Create URL request
        x <- get.entry.url(class = BIODB.MASSBANK, accession = accessions, content.type = BIODB.TXT, max.length = URL.MAX.LENGTH, base.url = .self$.url)

        # Without progress the loop would never end.
        # NOTE(review): assumes x$n is the number of ids put inside the URL — confirm in get.entry.url().
        if (is.null(x$n) || is.na(x$n) || x$n < 1) {
            warning(paste0("No URL could be built for the ", length(accessions), " remaining id(s)."))
            break
        }

        # Debug
        .self$.print.debug.msg(paste0("Send URL request for ", x$n," id(s)..."))

        # Send request
        xmlstr <- .self$.get.url(x$url)

        # Increase number of entries retrieved
        n <- n + x$n

        # Parse XML and get text
        if ( ! is.null(xmlstr) && ! is.na(xmlstr)) {
            xml <- xmlInternalTreeParse(xmlstr, asText = TRUE)
            ns <- c(ax21 = "http://api.massbank/xsd")
            returned.ids <- xpathSApply(xml, "//ax21:id", xmlValue, namespaces = ns)
            if (length(returned.ids) > 0) {
                infos <- xpathSApply(xml, "//ax21:info", xmlValue, namespaces = ns)

                # Keep only the returned ids that were requested and that have
                # a text: an NA position is not allowed in an assignment.
                pos <- match(returned.ids, ids)
                ok <- ! is.na(pos) & seq_along(pos) <= length(infos)
                content[pos[ok]] <- unlist(infos[ok])
            }
        }

        # Debug
        .self$.print.debug.msg(paste0("Now ", length(ids) - n," id(s) left to be retrieved..."))
    }

    return(content)
})

################
# CREATE ENTRY #
################

# Creates a Spectrum instance from file content.
# content       A file content, downloaded from the public database.
# RETURN        A spectrum instance.
MassbankConn$methods( createEntry = function(content, drop = TRUE) {
    return(createMassbankEntryFromTxt(content, drop = drop))
})

#################
# GET MZ VALUES #
#################

# Not implemented for Massbank: the body is empty, hence NULL is returned.
MassbankConn$methods( getMzValues = function(mode = NULL, max.results = NA_integer_) {
})

#################
# GET ENTRY IDS #
#################

# max.results  Value sent as maxNumResults (0 is sent when NA).
# Returns the ids found inside the response, or NULL when no response was obtained.
MassbankConn$methods( getEntryIds = function(max.results = NA_integer_) {

    # Set URL
    url <- paste0(.self$.url, 'searchPeak?mzs=1000&relativeIntensity=100&tolerance=1000&instrumentTypes=all&ionMode=Both')
    url <- paste0(url, '&maxNumResults=', (if (is.na(max.results)) 0 else max.results))

    # Send request
    xmlstr <- .self$.get.url(url)

    # No response
    if (is.null(xmlstr) || is.na(xmlstr))
        return(NULL)

    # Parse XML and get text
    xml <- xmlInternalTreeParse(xmlstr, asText = TRUE)
    ns <- c(ax21 = "http://api.massbank/xsd")
    returned.ids <- xpathSApply(xml, "//ax21:id", xmlValue, namespaces = ns)

    return(returned.ids)
})

##################
# GET NB ENTRIES #
##################

MassbankConn$methods( getNbEntries = function() {
    return(length(.self$getEntryIds()))
})
# File: MassbankEntry.R

###########################
# MASSBANK SPECTRUM CLASS #
###########################

MassbankEntry <- methods::setRefClass("MassbankEntry", contains = "BiodbEntry")

###########
# FACTORY #
###########

# Builds MassbankEntry objects from Massbank text records.
# contents  The text records, one record per element (NULL and NA elements are accepted).
# drop      If TRUE and a single record was given, the entry itself is returned instead of a list.
# RETURN    A list with one element per record. An element is NULL when no
#           accession id was found for it.
createMassbankEntryFromTxt <- function(contents, drop = TRUE) {

    # Fields read with one regular expression each; the first capture group is the field value
    patterns <- character()
    patterns[[BIODB.ACCESSION]]      <- "^ACCESSION: (.+)$"
    patterns[[BIODB.MSDEV]]          <- "^AC\\$INSTRUMENT: (.+)$"
    patterns[[BIODB.MSDEVTYPE]]      <- "^AC\\$INSTRUMENT_TYPE: (.+)$"
    patterns[[BIODB.MSTYPE]]         <- "^AC\\$MASS_SPECTROMETRY: MS_TYPE (.+)$"
    patterns[[BIODB.MSPRECMZ]]       <- "^MS\\$FOCUSED_ION: PRECURSOR_M/Z (.+)$"
    patterns[[BIODB.NB.PEAKS]]       <- "^PK\\$NUM_PEAK: ([0-9]+)$"
    patterns[[BIODB.MSPRECANNOT]]    <- "^MS\\$FOCUSED_ION: PRECURSOR_TYPE (.+)$"
    patterns[[BIODB.CHEBI.ID]]       <- "^CH\\$LINK: CHEBI\\s+(.+)$"
    patterns[[BIODB.KEGG.ID]]        <- "^CH\\$LINK: KEGG\\s+(.+)$"
    patterns[[BIODB.INCHI]]          <- "^CH\\$IUPAC:\\s+(.+)$"
    patterns[[BIODB.INCHIKEY]]       <- "^CH\\$LINK: INCHIKEY\\s+(.+)$"
    patterns[[BIODB.CHEMSPIDER.ID]]  <- "^CH\\$LINK: CHEMSPIDER\\s+(.+)$"
    patterns[[BIODB.CAS.ID]]         <- "^CH\\$LINK: CAS\\s+(.+)$"
    patterns[[BIODB.FORMULA]]        <- "^CH\\$FORMULA:\\s+(.+)$"
    patterns[[BIODB.SMILES]]         <- "^CH\\$SMILES:\\s+(.+)$"
    patterns[[BIODB.MASS]]           <- "^CH\\$EXACT_MASS:\\s+(.+)$"
    patterns[[BIODB.PUBCHEMCOMP.ID]] <- "^CH\\$LINK: PUBCHEM\\s+.*CID:([0-9]+)"
    patterns[[BIODB.PUBCHEMSUB.ID]]  <- "^CH\\$LINK: PUBCHEM\\s+.*SID:([0-9]+)"

    # Parse one record into a new entry
    parse.record <- function(record) {

        new.entry <- MassbankEntry$new()

        # Nothing to parse: the entry is left empty
        if (is.null(record) || is.na(record))
            return(new.entry)

        for (ln in strsplit(record, "\n")[[1]]) {

            # Generic fields: the first matching pattern wins
            matched <- FALSE
            for (tag in names(patterns)) {
                m <- stringr::str_match(ln, patterns[[tag]])
                if (is.na(m[1,1]))
                    next
                new.entry$setField(tag, m[1,2])
                matched <- TRUE
                break
            }
            if (matched)
                next

            # Name: only the first name met is kept
            if (is.na(new.entry$getField(BIODB.NAME))) {
                m <- stringr::str_match(ln, "^CH\\$NAME:\\s+(.+)$")
                if ( ! is.na(m[1,1]))
                    new.entry$setField(BIODB.NAME, m[1,2])
            }

            # PubChem link made of a bare number
            m <- stringr::str_match(ln, "^CH\\$LINK: PUBCHEM\\s+([0-9]+)$")
            if ( ! is.na(m[1,1]))
                new.entry$setField(BIODB.PUBCHEMSUB.ID, m[1,2])

            # MS mode
            m <- stringr::str_match(ln, "^AC\\$MASS_SPECTROMETRY: ION_MODE (.+)$")
            if ( ! is.na(m[1,1])) {
                ms.mode <- if (m[1,2] == 'POSITIVE') BIODB.MSMODE.POS else BIODB.MSMODE.NEG
                new.entry$setField(BIODB.MSMODE, ms.mode)
                next
            }

            # Peaks
            .parse.peak.line(new.entry, ln)
        }

        new.entry
    }

    entries <- unname(lapply(contents, parse.record))

    # Replace elements with no accession id by NULL
    entries <- lapply(entries, function(e) {
        if (is.na(e$getField(BIODB.ACCESSION)))
            return(NULL)
        e
    })

    # If the input was a single element, then output a single object
    if (drop && length(contents) == 1)
        return(entries[[1]])

    return(entries)
}

###################
# PARSE PEAK LINE #
###################

# Tries to read `line` as a peak annotation line or as a peak line, starting
# from the BIODB.PEAK.DF.EXAMPLE data frame. When the resulting data frame has
# at least one row, it is appended to the peaks already stored in `entry`
# (field BIODB.PEAKS) and TRUE is returned; FALSE is returned otherwise.
.parse.peak.line <- function(entry, line) {

    new.peaks <- BIODB.PEAK.DF.EXAMPLE

    # Annotation
    annot <- stringr::str_match(line, "^\\s+([0-9][0-9.]*) ([A-Z0-9+-]+) ([0-9]+) ([0-9][0-9.]*) ([0-9][0-9.]*)$")
    if ( ! is.na(annot[1,1])) {
        annot.cols <- c(BIODB.PEAK.MZ, BIODB.PEAK.FORMULA, BIODB.PEAK.FORMULA.COUNT, BIODB.PEAK.MASS, BIODB.PEAK.ERROR.PPM)
        new.peaks[1, annot.cols] <- list(as.double(annot[1,2]), annot[1,3], as.integer(annot[1,4]), as.double(annot[1,5]), as.double(annot[1,6]))
    }

    # Peak
    pk <- stringr::str_match(line, "^\\s+([0-9][0-9.]*) ([0-9][0-9.]*) ([0-9]+)$")
    if ( ! is.na(pk[1,1])) {
        pk.cols <- c(BIODB.PEAK.MZ, BIODB.PEAK.INTENSITY, BIODB.PEAK.RELATIVE.INTENSITY)
        new.peaks[1, pk.cols] <- list(as.double(pk[1,2]), as.double(pk[1,3]), as.integer(pk[1,4]))
    }

    # Nothing to store
    if (nrow(new.peaks) <= 0)
        return(FALSE)

    # Get current peaks and merge with new peaks
    stored.peaks <- entry$getField(BIODB.PEAKS)
    if ( ! is.null(stored.peaks))
        new.peaks <- rbind(stored.peaks, new.peaks)

    entry$setField(BIODB.PEAKS, new.peaks)

    return(TRUE)
}
# File: MassdbConn.R

#####################
# CLASS DECLARATION #
#####################

# Base class of the connectors to mass spectra databases. Except
# msmsSearch(), all the methods declared here stop with an error and are
# meant to be overridden by the concrete connector classes.
MassdbConn <- methods::setRefClass("MassdbConn", contains = "BiodbConn")

###############################
# GET CHROMATOGRAPHIC COLUMNS #
###############################

# Get a list of chromatographic columns contained in this database.
# compound.ids  A list of compound IDs used to filter results.
# The returned value is a data.frame with two columns : one for the ID (BIODB.ID) and another one for the title (BIODB.TITLE).
MassdbConn$methods( getChromCol = function(compound.ids = NULL) {
    stop("Method getChromCol() is not implemented in concrete class.")
})

#################
# GET MZ VALUES #
#################

# Returns a numeric vector of all masses stored inside the database.
MassdbConn$methods( getMzValues = function(mode = NULL, max.results = NA_integer_) {
    stop("Method getMzValues() not implemented in concrete class.")
})

################
# GET NB PEAKS #
################

# Returns the number of peaks contained in the database
MassdbConn$methods( getNbPeaks = function(mode = NULL, compound.ids = NULL) {
    stop("Method getNbPeaks() not implemented in concrete class.")
})

#########################
# FIND COMPOUND BY NAME #
#########################

# Find a molecule by name
# name   A vector of molecule names to search for.
# Return an integer vector of the same size as the name input vector, containing the found molecule IDs, in the same order.
MassdbConn$methods( findCompoundByName = function(name) {
    stop("Method findCompoundByName() not implemented in concrete class.")
})

####################################
# FIND SPECTRA IN GIVEN MASS RANGE #
####################################

# Find spectra in the given mass range.
# rtype the type of return, objects, dfspecs data.frame of spectra, dfpeaks data.frame of peaks.
MassdbConn$methods( searchMzRange = function(mzmin, mzmax, rtype = c("objects","dfspecs","dfpeaks")){
    stop("Method searchMzRange() not implemented in concrete class.")
})

########################################
# FIND SPECTRA WITHIN A MASS TOLERANCE #
########################################

MassdbConn$methods( searchMzTol = function(mz, tol, tolunit=BIODB.MZTOLUNIT.PLAIN, rtype = c("objects","dfspecs","dfpeaks")){
    stop("Method searchMzTol() not implemented in concrete class.")
})

####################################################
# FIND MOLECULES WITH PRECURSOR WITHIN A TOLERANCE #
####################################################

MassdbConn$methods( searchSpecPrecTol = function(mz, tol, tolunit=BIODB.MZTOLUNIT.PLAIN, mode = NULL){
    stop("Method searchSpecPrecTol() not implemented in concrete class.")
})

###################################
# PERFORM A DATABASE MS-MS SEARCH #
###################################

### spec : the spec to match against the database.
### precursor : the mass/charge of the precursor to be looked for.
### mztol : the size of the window around the precursor to be looked for.
### tolunit : currently not used, the precursor search is always run with BIODB.MZTOLUNIT.PLAIN.
### ppm : the matching ppm tolerance (currently not used by this method).
### fun : the similarity function, passed to compareSpectra().
### params : parameters passed to compareSpectra().
### npmin : the minimum number of peaks to detect a match (2 recommended).
### dmz : the mass tolerance is taken as the minimum between this quantity and the ppm (currently not used by this method).
### mode : the MS mode used for the precursor search.
### return.ids.only : if TRUE the accession ids of the matched spectra are returned, otherwise the spectra themselves.
### RETURN : NULL if compareSpectra() returns NULL, otherwise a list with
###          elements `measure`, `matchedpeaks` and `id`, ordered by res$ord.
MassdbConn$methods( msmsSearch = function(spec, precursor, mztol, tolunit,
                                         ppm, fun = BIODB.MSMS.DIST.WCOSINE,
                                         params = list(), npmin=2, dmz = 0.001,
                                         mode = BIODB.MSMODE.POS, return.ids.only = TRUE){

    # TODO replace by msms precursor search when available.
    lspec <- .self$searchSpecPrecTol( precursor, mztol, BIODB.MZTOLUNIT.PLAIN, mode = mode)

    # Extract from each spectrum the peak table reduced to its m/z and intensity columns
    rspec <- lapply(lspec, function(x) {
        peaks <- x$getFieldValue(BIODB.PEAKS)

        # Getting the correct fields
        vcomp <- c(BIODB.PEAK.MZ, BIODB.PEAK.RELATIVE.INTENSITY, BIODB.PEAK.INTENSITY)
        foundfields <- vcomp %in% colnames(peaks)

        # The column names are collapsed into one string, so that a single message is raised
        if (sum(foundfields) < 2)
            stop(paste0("fields can't be coerced to mz and intensity : ", paste(colnames(peaks), collapse = ", ")))

        peaks[ , vcomp[which(foundfields)]]
    })

    # TODO Import compareSpectra into biodb and put it inside massdb-helper.R or hide it as a private method.
    res <- compareSpectra(spec, rspec, npmin = npmin, fun = fun, params = params)

    if (is.null(res)) return(NULL) # To decide at MassdbConn level: return empty list (or empty data frame) or NULL.

    # Values to return for the matched spectra
    if (return.ids.only) {
        lret <- sapply(lspec, function(x) {
            x$getFieldValue(BIODB.ACCESSION)
        })
    } else {
        # TODO implement three types of return.
        lret <- lspec
    }

    # Reordering the list.
    lret <- lret[res$ord]

    return(list(measure = res$similarity[res$ord], matchedpeaks = res$matched[res$ord], id = lret))
})
# --- a/Ms4TabSqlDb.R	Wed Apr 19 10:00:05 2017 -0400 (file removed by this changeset)
if ( ! exists('Ms4TabSqlDb')) { # Do not load again if already loaded

	library('methods')
	source('msdb-common.R')
	source('MsDb.R')

	#####################
	# CLASS DECLARATION #
	#####################

	# MsDb implementation that reads its data from a PostgreSQL database
	# (tables peaklist, peaklist_name, peaklist_ret and method) through the
	# RPostgreSQL package. Molecule IDs are stored in the database with an
	# 'N' prefix, which is added when querying and stripped when returning.
	Ms4TabSqlDb <- setRefClass("Ms4TabSqlDb", contains = "MsDb", fields = list(.host = "character", .port = "integer", .dbname = "character", .user = "character", .password = "character", .drv = "ANY", .conn = "ANY"))

	###############
	# CONSTRUCTOR #
	###############

	# host, port, dbname, user, password: connection parameters. A NULL value
	# is stored as NA. The connection itself is opened lazily.
	Ms4TabSqlDb$methods( initialize = function(host = NA_character_, port = NA_integer_, dbname = NA_character_, user = NA_character_, password = NA_character_, ...) {

		# Initialize members
		.host <<- if ( ! is.null(host)) host else NA_character_
		.port <<- if ( ! is.null(port)) port else NA_integer_
		.dbname <<- if ( ! is.null(dbname)) dbname else NA_character_
		.user <<- if ( ! is.null(user)) user else NA_character_
		.password <<- if ( ! is.null(password)) password else NA_character_
		.drv <<- NULL
		.conn <<- NULL

		callSuper(...)
	})

	##################
	# GET CONNECTION #
	##################

	# Returns the database connection, opening it on first use.
	Ms4TabSqlDb$methods( .get.connection = function() {

		# Initialize connection
		if (is.null(.self$.conn)) {
			library('RPostgreSQL')
			.drv <<- dbDriver("PostgreSQL")
			.conn <<- dbConnect(.self$.drv, host = .self$.host, port = .self$.port, dbname = .self$.dbname, user = .self$.user, password = .self$.password)
		}

		return(.self$.conn)
	})

	##############
	# SEND QUERY #
	##############

	# Sends a query and returns the result set, or a "try-error" object if
	# the query failed.
	Ms4TabSqlDb$methods( .send.query = function(query) {
		conn <- .self$.get.connection() # Call it first separately, so library RPostgreSQL is loaded.
		rs <- try(dbSendQuery(conn, query))
		return(rs)
	})

	#############
	# FETCH ALL #
	#############

	# Runs a query, fetches all its rows into a data frame and releases the
	# result set. Stops with an explicit message if the query failed.
	Ms4TabSqlDb$methods( .fetch.all = function(query) {

		rs <- .self$.send.query(query)
		if (inherits(rs, 'try-error'))
			stop(paste0("SQL query failed: ", query))

		# Always free the result set, even if fetch() fails
		on.exit(dbClearResult(rs), add = TRUE)

		return(fetch(rs, n = -1))
	})

	###############
	# SQL HELPERS #
	###############

	# Returns the values as single-quoted SQL string literals, with embedded
	# single quotes doubled.
	Ms4TabSqlDb$methods( .sql.str = function(s) {
		return(paste0("'", gsub("'", "''", as.character(s), fixed = TRUE), "'"))
	})

	# Builds the "pkmol.molecule_id = 'N<id>' or ..." clause for a vector of
	# molecule IDs.
	Ms4TabSqlDb$methods( .molid.where = function(molid) {
		return(paste0("pkmol.molecule_id = ", .self$.sql.str(paste0('N', molid)), collapse = ' or '))
	})

	####################
	# GET MOLECULE IDS #
	####################

	# Returns the sorted numeric molecule IDs, with the 'N' prefix removed
	# when present.
	Ms4TabSqlDb$methods( getMoleculeIds = function() {

		df <- .self$.fetch.all('select pkmol.molecule_id as id from peaklist_name as pkmol;')
		ids <- df[['id']] # Get 'id' column
		ids <- vapply(ids, function(x) { if (substring(x, 1, 1) == 'N') as.integer(substring(x, 2)) else as.integer(x) } , FUN.VALUE = 1, USE.NAMES = FALSE)

		return(sort(ids))
	})

	####################
	# GET NB MOLECULES #
	####################

	# Returns the number of rows of table peaklist_name.
	Ms4TabSqlDb$methods( getNbMolecules = function() {

		df <- .self$.fetch.all('select count(*) from peaklist_name;')

		return(df[[1]])
	})

	#####################
	# GET MOLECULE NAME #
	#####################

	# molid   A vector of molecule IDs (without 'N' prefix).
	# Returns the molecule names in the same order as molid, NA for IDs that
	# were not found.
	Ms4TabSqlDb$methods( getMoleculeName = function(molid) {

		# Build request
		request <- paste0('select pkmol.molecule_id as id, pkmol.name from peaklist_name as pkmol where ', .self$.molid.where(molid), ';')

		# Run request
		df <- .self$.fetch.all(request)

		# Get IDs
		ids <- vapply(df[['id']], function(x) as.integer(substring(x, 2)), FUN.VALUE = 1, USE.NAMES = FALSE)

		# Get names in the same order as the input vector. match() gives,
		# for each requested ID, the row in which it was returned.
		names <- df[['name']][match(molid, ids)]

		return(if (is.null(names)) NA_character_ else names)
	})

	###############################
	# GET CHROMATOGRAPHIC COLUMNS #
	###############################

	# molid   Optional vector of molecule IDs used to restrict the columns.
	# Returns a data frame with columns 'id' and 'title', both containing the
	# normalized column names, without 'FIA' and without duplicates.
	Ms4TabSqlDb$methods( getChromCol = function(molid = NULL) {

		# Get all columns
		if (is.null(molid)) {
			request <- 'select name from method;'

		# Get columns of the specified molecules
		} else {
			where <- paste0('pk.name_id = pkmol.id and pk.id = pkret.id_peak and pkret.id_method = method.id and (', .self$.molid.where(molid), ')')
			request <- paste0('select distinct method.name from method, peaklist as pk, peaklist_name as pkmol, peaklist_ret as pkret where ', where, ';')
		}

		# Run request
		df <- .self$.fetch.all(request)

		# Gets column list
		cols <- df[['name']]

		# Remove FIA
		cols <- cols[ cols != 'FIA']

		# Normalize names
		cols <- vapply(cols, .normalize_column_name, FUN.VALUE = '', USE.NAMES = FALSE)

		# Remove duplicates
		cols <- cols[ ! duplicated(cols)]

		# Make data frame
		cols <- data.frame(id = cols, title = cols, stringsAsFactors = FALSE)

		return(cols)
	})

	################
	# FIND BY NAME #
	################

	# name   A vector of molecule names, matched case-insensitively.
	# Returns the molecule IDs in the same order as name, NA for names that
	# were not found.
	Ms4TabSqlDb$methods( findByName = function(name) {

		if (is.null(name)) return(NA_integer_)

		# Put names in uppercase
		uname <- toupper(name)

		# Build request
		where <- paste0("upper(pkmol.name) = ", .self$.sql.str(uname), collapse = ' or ')
		request <- paste0('select pkmol.molecule_id as id, pkmol.name from peaklist_name as pkmol where ', where, ';')

		# Run request
		df <- .self$.fetch.all(request)

		# Adds missing names/IDs
		missing_names <- uname[ ! uname %in% toupper(df[['name']])]
		df <- rbind(df, data.frame(id = rep(NA_integer_, length(missing_names)), name = missing_names, stringsAsFactors = FALSE))

		# Get IDs and names
		ids <- vapply(df[['id']], function(x) as.integer(substring(x, 2)), FUN.VALUE = 1, USE.NAMES = FALSE)
		names <- toupper(as.character(df[['name']]))

		# Get IDs in the same order as the input vector
		ids[order(uname)] <- ids[order(names)]

		return(if (is.null(ids)) NA_integer_ else ids)
	})

	#######################
	# GET RETENTION TIMES #
	#######################

	# molid   A single molecule ID.
	# col     Optional vector of database column (method) names.
	# Returns a list indexed by normalized column name, each element being the
	# vector of retention values (stored value multiplied by 60).
	Ms4TabSqlDb$methods( getRetentionTimes = function(molid, col = NA_character_) {

		# Check the length before is.na(), so that a vector never reaches ||
		if (is.null(molid) || length(molid) != 1 || is.na(molid))
			stop("The parameter molid must consist only in a single integer.")

		# Build request
		request <- paste0("select distinct method.name as col, (pkret.retention * 60) as ret from peaklist as pk, peaklist_name as pkmol, peaklist_ret as pkret, method where pkret.id_peak = pk.id and pkmol.id = pk.name_id and pkret.id_method = method.id and ", .self$.molid.where(molid))
		if ( ! all(is.na(col))) {
			where_cols <- paste0("method.name = ", .self$.sql.str(col[ ! is.na(col)]), collapse = ' or ')
			request <- paste0(request, ' and (', where_cols, ')')
		}
		request <- paste0(request, ';')

		# Run request
		df <- .self$.fetch.all(request)

		# Remove FIA
		df <- df[df[['col']] != 'FIA', , drop = FALSE]

		# Normalize names
		df[['col']] <- vapply(df[['col']], .normalize_column_name, FUN.VALUE = '', USE.NAMES = FALSE)

		# Build output list
		lst <- list()
		for (i in seq_len(nrow(df))) {
			colname <- df[i, 'col']
			lst[[colname]] <- c(lst[[colname]], df[i, 'ret'])
		}

		return(lst)
	})

	################
	# GET NB PEAKS #
	################

	# molid   Optional vector of molecule IDs used to restrict the count.
	# type    Optional MS mode; MSDB.TAG.POS selects rows where ion_pos is
	#         true, any other value rows where it is false.
	# Returns the number of matching rows of table peaklist.
	Ms4TabSqlDb$methods( getNbPeaks = function(molid = NA_integer_, type = NA_character_) {

		# Build request
		request <- paste0("select count(*) from peaklist as pk, peaklist_name as pkmol where pkmol.id = pk.name_id")
		if ( length(molid) > 1 || ! is.na(molid))
			request <- paste0(request, ' and (', .self$.molid.where(molid), ')')
		if ( ! is.na(type))
			request <- paste0(request, ' and ', if (type == MSDB.TAG.POS) '' else 'not ', 'ion_pos')
		request <- paste0(request, ';')

		# Run request
		df <- .self$.fetch.all(request)

		return(df[1,1])
	})

	#################
	# TO DB COLUMNS #
	#################

	# col   A vector of normalized column names.
	# Returns the database column (method) names, 'FIA' excluded, whose
	# normalized form is one of the values of col.
	Ms4TabSqlDb$methods( .to.dbcols = function(col) {

		# Get all column names
		df <- .self$.fetch.all('select name from method;')

		# Get database column names
		dbcols <- df[['name']]
		dbcols <- dbcols[ dbcols != 'FIA']

		# Get normalize names
		normcols <- vapply(dbcols, .normalize_column_name, FUN.VALUE = '', USE.NAMES = FALSE)

		# %in% and not ==, so that several columns can be requested at once
		return(dbcols[normcols %in% tolower(col)])
	})

	#################
	# GET MZ VALUES #
	#################

	# Returns a numeric vector of all masses stored inside the database.
	# mode          Optional MS mode (MSDB.TAG.POS, or any other value for
	#               rows where ion_pos is false).
	# max.results   Optional maximum number of values to return.
	Ms4TabSqlDb$methods( getMzValues = function(mode = NULL, max.results = NA_integer_) {

		# Build request
		select <- paste0("select distinct pk.mass as ", MSDB.TAG.MZTHEO)
		from <- " from peaklist as pk"
		where <- ""
		if ( ! is.null(mode))
			where <- paste0(" where ", if (mode == MSDB.TAG.POS) '' else 'not ', 'pk.ion_pos')
		limit <- ""
		if ( ! is.null(max.results) && ! is.na(max.results))
			limit <- paste(" limit", as.integer(max.results)) # as.integer avoids scientific notation

		# Assemble request
		request <- paste0(select, from, where, limit, ';')

		# Run request
		df <- .self$.fetch.all(request)

		return(df[[MSDB.TAG.MZTHEO]])
	})

	##########
	# SEARCH #
	##########

	# Searches for peaks whose mass lies inside [mz.low, mz.high] for the given
	# mode, optionally restricted by attributions, molecule IDs, chromatographic
	# columns and retention time bounds (rt.low and rt.high are compared to the
	# stored retention multiplied by 60).
	# Returns a data frame of matching peaks, or the result of
	# .get.empty.result.df() when nothing matches.
	Ms4TabSqlDb$methods( .do.search.for.mz.rt.bounds = function(mode, mz.low, mz.high, rt.low = NULL, rt.high = NULL, col = NULL, attribs = NULL, molids = NULL) {

		# Build request
		select <- paste0("select pkmol.molecule_id as ", MSDB.TAG.MOLID, ", pkmol.name as ", MSDB.TAG.MOLNAMES,", pk.mass as ", MSDB.TAG.MZTHEO, ", pk.composition as ", MSDB.TAG.COMP,", pk.attribution as ", MSDB.TAG.ATTR)
		from <- " from peaklist as pk, peaklist_name as pkmol"
		where <- paste0(" where pkmol.id = pk.name_id and pk.mass >= ", mz.low, " and pk.mass <= ", mz.high)
		where <- paste0(where, ' and ', if (mode == MSDB.TAG.POS) '' else 'not ', 'pk.ion_pos')

		# Insert where clause on attribs
		if ( ! is.null(attribs)) {
			where.attribs <- paste0("pk.attribution = ", .self$.sql.str(attribs), collapse = " or ")
			where <- paste0(where, ' and (', where.attribs, ')')
		}

		# Insert where clause on molids
		if ( ! is.null(molids))
			where <- paste0(where, ' and (', .self$.molid.where(molids), ')')

		# Insert where clause on columns
		if ( ! is.null(col)) {
			dbcols <- .self$.to.dbcols(col)
			if ( ! is.null(dbcols)) {

				# Can't find specified columns
				if (length(dbcols) == 0 && length(col) > 0)
					return(.get.empty.result.df(rt = TRUE))

				select <- paste0(select, ", (60 * pkret.retention) as ", MSDB.TAG.COLRT, ", method.name as ", MSDB.TAG.COL)
				from <- paste0(from, ", method, peaklist_ret as pkret")
				where.cols <- if (length(dbcols) == 0) 'TRUE' else paste0("method.name = ", .self$.sql.str(dbcols), collapse = " or ")
				where <- paste0(where, " and pk.id = pkret.id_peak and pkret.id_method = method.id and (", where.cols, ")")
				if (! is.null(rt.low) && ! is.null(rt.high))
					where <- paste0(where, " and pkret.retention * 60 >= ", rt.low, " and pkret.retention * 60 <= ", rt.high)
			}
		}

		# Assemble request
		request <- paste0(select, from, where, ';')

		# Run request
		df <- .self$.fetch.all(request)

		# Remove N prefix from IDs, or build the empty result
		if (nrow(df) > 0)
			df[[MSDB.TAG.MOLID]] <- substring(as.character(df[[MSDB.TAG.MOLID]]), 2)
		else
			df <- .get.empty.result.df(rt = ! is.null(col))

		return(df)
	})

} # end of load safe guard
# --- a/MsBioDb.R	Wed Apr 19 10:00:05 2017 -0400 (file removed by this changeset)
if ( ! exists('MsBioDb')) { # Do not load again if already loaded

	library(methods)
	source('MsDb.R')
	source('BiodbObject.R', chdir = TRUE)
	source('BiodbFactory.R', chdir = TRUE)

	#####################
	# CLASS DECLARATION #
	#####################

	# Adapter exposing a MassdbConn object through the MsDb interface: every
	# method forwards to the wrapped connection stored in field .massdb.
	MsBioDb <- setRefClass("MsBioDb", contains = "MsDb", fields = list(.massdb = "ANY"))

	MsBioDb$methods(

		# Constructor.
		# massdb   The wrapped database; must inherit from MassdbConn.
		initialize = function(massdb = NULL, ...) {

			# Check bio database
			if (is.null(massdb))
				stop("You must set a bio database.")
			if ( ! inherits(massdb, "MassdbConn"))
				stop("The bio database must inherit from MassdbConn class.")

			.massdb <<- massdb

			callSuper(...)
		},

		# Tells whether the wrapped database handles entries of type BIODB.COMPOUND.
		handleCompounds = function() {
			.self$.massdb$handlesEntryType(BIODB.COMPOUND)
		},

		# Forwards to getEntryIds() for entries of type BIODB.COMPOUND.
		getMoleculeIds = function(max.results = NA_integer_) {
			.self$.massdb$getEntryIds(type = BIODB.COMPOUND, max.results = max.results)
		},

		# Forwards to getNbEntries() for entries of type BIODB.COMPOUND.
		getNbMolecules = function() {
			.self$.massdb$getNbEntries(type = BIODB.COMPOUND)
		},

		# Forwards to getMzValues() of the wrapped database.
		getMzValues = function(mode = NULL, max.results = NA_integer_) {
			.self$.massdb$getMzValues(mode = mode, max.results = max.results)
		},

		# Forwards to getMoleculeName() of the wrapped database.
		getMoleculeName = function(molid) {
			.self$.massdb$getMoleculeName(molid)
		},

		# Forwards to getChromCol() of the wrapped database.
		getChromCol = function(molid = NULL) {
			.self$.massdb$getChromCol(molid)
		},

		# Forwards to findCompoundByName() of the wrapped database.
		findByName = function(name) {
			.self$.massdb$findCompoundByName(name)
		},

		# Forwards to getRetentionTimes(), col being passed as chrom.cols.
		getRetentionTimes = function(molid, col = NA_character_) {
			.self$.massdb$getRetentionTimes(molid, chrom.cols = col)
		},

		# Forwards to getNbPeaks(), molid being passed as compound.ids.
		getNbPeaks = function(molid = NA_integer_, mode = NA_character_) {
			.self$.massdb$getNbPeaks(compound.ids = molid, mode = mode)
		}
	)

}
# --- a/MsDb.R	Wed Apr 19 10:00:05 2017 -0400 (file removed by this changeset)
if ( ! exists('MsDb')) { # Do not load again if already loaded

	library('methods')
	source('msdb-common.R')
	source('MsDbObserver.R')
	source('MsDbOutputStream.R')

	#####################
	# CLASS DECLARATION #
	#####################

	# Abstract base class of MS databases. Concrete classes implement the
	# getters and .do.search.for.mz.rt.bounds(); this class implements the
	# m/z and retention time matching on top of them, reading its input from
	# an input stream and sending matches to output streams.
	MsDb <- setRefClass("MsDb", fields = list(.observers = "ANY", .prec = "list", .output.streams = "ANY", .input.stream = "ANY", .mz.tol.unit = "character", .rt.unit = "character"))

	###############
	# CONSTRUCTOR #
	###############

	MsDb$methods( initialize = function(...) {

		callSuper(...)

		.observers <<- NULL
		.output.streams <<- NULL
		.input.stream <<- NULL
		.prec <<- MSDB.DFT.PREC
		.mz.tol.unit <<- MSDB.DFT.MZTOLUNIT
		.rt.unit <<- MSDB.RTUNIT.SEC
	})

	####################
	# SET INPUT STREAM #
	####################

	# stream   An MsDbInputStream object, or NULL.
	# Returns the input stream that was previously set.
	MsDb$methods( setInputStream = function(stream) {

		# Check types of input stream
		if ( ! inherits(stream, "MsDbInputStream") && ! is.null(stream))
			stop("Input stream must inherit from MsDbInputStream class.")

		# Save current stream
		cur.stream <- .self$.input.stream

		# Set stream
		.input.stream <<- stream

		return(cur.stream)
	})

	######################
	# ADD OUTPUT STREAMS #
	######################

	# stream   An MsDbOutputStream object, or a list of such objects.
	MsDb$methods( addOutputStreams = function(stream) {

		# Check types of output streams
		if ( ( ! is.list(stream) && ! inherits(stream, "MsDbOutputStream")) || (is.list(stream) && any( ! vapply(stream, function(s) inherits(s, "MsDbOutputStream"), FUN.VALUE = TRUE))))
			stop("Output streams must inherit from MsDbOutputStream class.")

		# Add streams to current list
		.output.streams <<- if (is.null(.self$.output.streams)) c(stream) else c(.self$.output.streams, stream)
	})

	#########################
	# REMOVE OUTPUT STREAMS #
	#########################

	# stream   An MsDbOutputStream object, or a list of such objects, to
	#          remove from the registered output streams.
	MsDb$methods( removeOutputStreams = function(stream) {

		# Check types of output streams
		if ( ( ! is.list(stream) && ! inherits(stream, "MsDbOutputStream")) || (is.list(stream) && any( ! vapply(stream, function(s) inherits(s, "MsDbOutputStream"), FUN.VALUE = TRUE))))
			stop("Output streams must inherit from MsDbOutputStream class.")

		if (length(.self$.output.streams) == 0)
			return(invisible(NULL))

		# Remove streams from current list. Objects are compared with
		# identical(), i.e. a registered stream is removed only if it is
		# the very same object as one of the submitted streams.
		to.remove <- if (is.list(stream)) stream else list(stream)
		is.removed <- vapply(.self$.output.streams, function(s) any(vapply(to.remove, function(r) identical(s, r), FUN.VALUE = TRUE)), FUN.VALUE = TRUE)
		remaining <- .self$.output.streams[ ! is.removed]
		.output.streams <<- if (length(remaining) == 0) NULL else remaining
	})

	########################
	# RESET OUTPUT STREAMS #
	########################

	# Removes all output streams. The parameter is not used.
	MsDb$methods( resetOutputStreams = function(stream) {
		.output.streams <<- NULL
	})

	#################
	# ADD OBSERVERS #
	#################

	# obs   An MsDbObserver object, or a list of such objects.
	MsDb$methods( addObservers = function(obs) {

		# Check types of observers
		if ( ( ! is.list(obs) && ! inherits(obs, "MsDbObserver")) || (is.list(obs) && any( ! vapply(obs, function(o) inherits(o, "MsDbObserver"), FUN.VALUE = TRUE))))
			stop("Observers must inherit from MsDbObserver class.")

		# Add observers to current list
		.observers <<- if (is.null(.self$.observers)) c(obs) else c(.self$.observers, obs)
	})

	##################
	# SET PRECURSORS #
	##################

	# prec   A list indexed by MS mode, giving the precursor attributions.
	MsDb$methods( setPrecursors = function(prec) {
		.prec <<- prec
	})

	#################
	# SET DB FIELDS #
	#################

	MsDb$methods( areDbFieldsSettable = function() {
		return(FALSE)
	})

	MsDb$methods( setDbFields = function(fields) {
		stop("Method setDbFields() not implemented in concrete class.")
	})

	################
	# SET MS MODES #
	################

	MsDb$methods( areDbMsModesSettable = function() {
		return(FALSE)
	})

	MsDb$methods( setDbMsModes = function(modes) {
		stop("Method setDbMsModes() not implemented in concrete class.")
	})

	###################
	# SET MZ TOL UNIT #
	###################

	# mztolunit   One of the values of MSDB.MZTOLUNIT.VALS.
	MsDb$methods( setMzTolUnit = function(mztolunit) {

		if ( ! mztolunit %in% MSDB.MZTOLUNIT.VALS)
			stop(paste0("M/Z tolerance unit must be one of: ", paste(MSDB.MZTOLUNIT.VALS, collapse = ', '), "."))

		.mz.tol.unit <<- mztolunit
	})

	###############
	# SET RT UNIT #
	###############

	# unit   One of the values of MSDB.RTUNIT.VALS.
	MsDb$methods( setRtUnit = function(unit) {

		if ( ! unit %in% MSDB.RTUNIT.VALS)
			stop(paste0("RT unit must be one of: ", paste(MSDB.RTUNIT.VALS, collapse = ', '), "."))

		.rt.unit <<- unit
	})

	###############
	# GET RT UNIT #
	###############

	# Returns the current RT unit. The parameter is not used.
	MsDb$methods( getRtUnit = function(unit) {
		return(.self$.rt.unit)
	})

	####################
	# HANDLE COMPOUNDS #
	####################

	# Returns TRUE if this database handles compounds directly (by IDs)
	MsDb$methods( handleCompounds = function() {
		return(TRUE)
	})

	####################
	# GET MOLECULE IDS #
	####################

	# Returns an integer vector of all molecule IDs stored inside the database.
	MsDb$methods( getMoleculeIds = function(max.results = NA_integer_) {
		stop("Method getMoleculeIds() not implemented in concrete class.")
	})

	####################
	# GET NB MOLECULES #
	####################

	# Returns the number of molecules in the database.
	MsDb$methods( getNbMolecules = function() {
		stop("Method getNbMolecules() not implemented in concrete class.")
	})

	#################
	# GET MZ VALUES #
	#################

	# Returns a numeric vector of all masses stored inside the database.
	MsDb$methods( getMzValues = function(mode = NULL, max.results = NA_integer_) {
		stop("Method getMzValues() not implemented in concrete class.")
	})

	#####################
	# GET MOLECULE NAME #
	#####################

	# Get molecule names
	# molid     An integer vector of molecule IDs.
	# Returns a character vector containing the names of the molecule IDs, in the same order as the input vector.
	MsDb$methods( getMoleculeName = function(molid) {
		stop("Method getMoleculeName() not implemented in concrete class.")
	})

	###############################
	# GET CHROMATOGRAPHIC COLUMNS #
	###############################

	# Get chromatographic columns.
	# Returns a vector of character listing the chromatographic column names. The name must be formatted in lowercase as following: uplc(-c8)?(-20min)?.
	MsDb$methods( getChromCol = function(molid = NULL) {
		stop("Method getChromCol() not implemented in concrete class.")
	})

	################
	# FIND BY NAME #
	################

	# Find a molecule by name
	# name   A vector of molecule names to search for.
	# Return an integer vector of the same size as the name input vector, containing the found molecule IDs, in the same order.
	MsDb$methods( findByName = function(name) {
		stop("Method findByName() not implemented in concrete class.")
	})

	#######################
	# GET RETENTION TIMES #
	#######################

	# Get the retention times of a molecule.
	# Returns a list of numeric vectors. The list has for keys/names the columns, and for values vectors of numerics (the retention times). If no retention times are registered for this molecule, then returns an empty list.
	MsDb$methods( getRetentionTimes = function(molid, col = NA_character_) {
		stop("Method getRetentionTimes() not implemented in concrete class.")
	})

	################
	# GET NB PEAKS #
	################

	# Get the total number of MS peaks stored inside the database.
	# molid The ID of the molecule.
	# type  The MS type.
	MsDb$methods( getNbPeaks = function(molid = NA_integer_, type = NA_character_) {
		stop("Method getNbPeaks() not implemented in concrete class.")
	})

	##################
	# GET PEAK TABLE #
	##################

	MsDb$methods( getPeakTable = function(molid = NA_integer_, mode = NA_character_) {
		stop("Method getPeakTable() not implemented in concrete class.")
	})

	##########
	# SEARCH #
	##########

	# Find molecule MS peaks whose m/z matches the submitted m/z in the tolerance specified.
	# x                 Optional data frame of input values. When set, it is used as a temporary input stream and the matches are returned as a data frame.
	# mode              The mode to use: either MSDB.TAG.POS or MSDB.TAG.NEG.
	# shift             The m/z shift to use, expressed in the current m/z tolerance unit.
	# prec              The m/z precision to use, expressed in the current m/z tolerance unit.
	# col               The chromatographic column used.
	# rt.tol            Simple retention tolerance parameter: rtinf = rt - rt.tol and rtsup = rt + rt.tol
	# rt.tol.x          Tolerance parameter for the equations : rtinf = rt - rt.tol.x - rt ^ rt.tol.y and rtsup = rt + rt.tol.x + rt ^ rt.tol.y
	# rt.tol.y          Tolerance parameter. See rt.tol.x parameter.
	# attribs           Only search for peaks whose attribution is among this set of attributions.
	# molids            Only search for peaks whose molecule ID is among this vector of integer molecule IDs. Can also be a data frame with a retention time column MSDB.TAG.COLRT and a molecule ID column MSDB.TAG.MOLID.
	# molids.rt.tol     Retention time tolerance used when molids parameter is a data frame (rt, id)
	# precursor.match   Remove peaks whose molecule precursor peak has not also been matched.
	# precursor.rt.tol  Retention time tolerance applied around the retention time of the matched precursors.
	# Returns the data frame collected by the temporary output stream when x is set, NULL (invisibly) otherwise.
	MsDb$methods( searchForMzRtList = function(x = NULL, mode, shift = NULL, prec = NULL, col = NULL, rt.tol = NULL, rt.tol.x = NULL, rt.tol.y = NULL, molids = NULL, molids.rt.tol = NULL, attribs = NULL, precursor.match = FALSE, precursor.rt.tol = NULL, same.cols = FALSE, same.rows = FALSE, peak.table = FALSE) {

		# Use provided data frame
		tmp.output <- NULL
		if ( ! is.null(x)) {
			tmp.input <- MsDbInputDataFrameStream$new(df = x)
			tmp.output <- MsDbOutputDataFrameStream$new()
			old.input <- .self$setInputStream(tmp.input)
			.self$addOutputStreams(tmp.output)

			# Restore the streams whatever happens during the search
			on.exit({
				.self$removeOutputStreams(tmp.output)
				.self$setInputStream(old.input)
			}, add = TRUE)
		}

		if (precursor.match) {
			# Get IDs of all molecules whose precursor peak matches one of the mz in the list
			precursors.df <- .self$.doSearchForMzRtList(mode = mode, shift = shift, prec = prec, col = col, rt.tol = rt.tol, rt.tol.x = rt.tol.x, rt.tol.y = rt.tol.y, attribs = .self$.prec[[mode]], output.to.stream = FALSE)
			cols.to.keep <- if (is.null(col)) MSDB.TAG.MOLID else c(MSDB.TAG.MOLID, MSDB.TAG.COL, MSDB.TAG.COLRT)
			precursors.ids <- precursors.df[, cols.to.keep, drop = FALSE]
			precursors.ids <- precursors.ids[ ! is.na(precursors.ids[[MSDB.TAG.MOLID]]), , drop = FALSE]
			# No drop = FALSE here: with a single column (col is NULL) this
			# yields a plain vector of IDs, which searchForMzRtTols() then
			# uses directly instead of filtering on retention time.
			precursors.ids <- precursors.ids[ ! duplicated(precursors.ids), ]

			# Get all matching peaks whose molecule is inside the previously obtained list of molecules
			.self$.doSearchForMzRtList(mode = mode, shift = shift, prec = prec, col = col, rt.tol = NULL, rt.tol.x = NULL, rt.tol.y = NULL, molids = precursors.ids, molids.rt.tol = precursor.rt.tol, same.cols = same.cols, same.rows = same.rows, peak.table = peak.table)

			# TODO Merge the results with the column/rt found for the
			# precursors, sort them, reorder the columns and remove the
			# duplicates.
		}
		else
			.self$.doSearchForMzRtList(mode = mode, shift = shift, prec = prec, col = col, rt.tol = rt.tol, rt.tol.x = rt.tol.x, rt.tol.y = rt.tol.y, molids = molids, molids.rt.tol = molids.rt.tol, attribs = attribs, same.cols = same.cols, same.rows = same.rows, peak.table = peak.table)

		if ( ! is.null(x))
			return(tmp.output$getDataFrame())
	})

	# Runs searchForMzRtTols() on each value of the input stream.
	# output.to.stream   If TRUE, the matches of each input value are sent to the output streams.
	# The parameters same.cols, same.rows and peak.table are currently not used.
	# Returns all the matched peaks, bound in one data frame, or NULL if the input stream is empty.
	MsDb$methods( .doSearchForMzRtList = function(mode, shift = NULL, prec = NULL, col = NULL, rt.tol = NULL, rt.tol.x = NULL, rt.tol.y = NULL, molids = NULL, molids.rt.tol = NULL, attribs = NULL, same.cols = FALSE, same.rows = FALSE, peak.table = FALSE, output.to.stream = TRUE) {

		# TODO Build typed empty output data frames, so that an empty input
		# gives an empty output with all the columns set with the right type.

		# Loop on all lines of input
		matches <- list()
		.self$.input.stream$reset()
		while (.self$.input.stream$hasNextValues()) {

			.self$.input.stream$nextValues()
			mz <- .self$.input.stream$getMz()
			rt <- .self$.input.stream$getRt()

			# Search for m/z
			results <- .self$searchForMzRtTols(mode = mode, mz = mz, shift = shift, prec = prec, rt = rt, col = col, rt.tol = rt.tol, rt.tol.x = rt.tol.x, rt.tol.y = rt.tol.y, attribs = attribs, molids = molids, molids.rt.tol = molids.rt.tol)

			# Call output streams
			if (output.to.stream && ! is.null(.self$.output.streams))
				for (s in .self$.output.streams)
					s$matchedPeaks(mz = mz, rt = if (is.null(col)) NULL else rt, peaks = results, unused = .self$.input.stream$getAll(but = if (is.null(col)) c(MSDB.TAG.MZ) else c(MSDB.TAG.MZ, MSDB.TAG.RT)))

			# Append to peak list (bound once after the loop)
			matches[[length(matches) + 1]] <- results
		}

		return(do.call(rbind, matches))
	})

	# Searches the peaks matching one m/z value, and optionally one retention time, within tolerances.
	# mz        The m/z value to search for.
	# rt        Retention time in seconds.
	# shift     The m/z shift, in the current m/z tolerance unit. NULL means no shift.
	# prec      The m/z precision, in the current m/z tolerance unit.
	# molids    An option vector of molecule IDs, used to restrict the search. Can also be a data frame, see searchForMzRtList().
	MsDb$methods( searchForMzRtTols = function(mode, mz, rt = NULL, shift = NULL, prec = NULL, col = NULL, rt.tol = NULL, rt.tol.x = NULL, rt.tol.y = NULL, attribs = NULL, molids = NULL, molids.rt.tol = NULL, colnames = MSDB.DFT.INPUT.FIELDS) {

		# A missing shift would otherwise turn the bounds into empty vectors
		if (is.null(shift))
			shift <- 0

		# Set M/Z bounds
		if (.self$.mz.tol.unit == MSDB.MZTOLUNIT.PPM) {
			mz.low <- mz * (1 + (- shift - prec) * 1e-6)
			mz.high <- mz * (1 + (- shift + prec) * 1e-6)
		}
		else { # PLAIN
			mz.low <- mz - shift - prec
			mz.high <- mz - shift + prec
		}

		# Set retention time bounds. Without retention time, no bound is set.
		rt.low <- NULL
		rt.high <- NULL
		if ( ! is.null(rt)) {
			if ( ! is.null(rt.tol)) {
				rt.low <- rt - rt.tol
				rt.high <- rt + rt.tol
			}
			if ( ! is.null(rt.tol.x)) {
				low <- rt - rt.tol.x - rt ^ rt.tol.y
				high <- rt + rt.tol.x + rt ^ rt.tol.y
				# Keep the narrowest interval when both tolerances are set
				rt.low <- if (is.null(rt.low)) low else max(low, rt.low)
				rt.high <- if (is.null(rt.high)) high else min(high, rt.high)
			}
		}

		# List molecule IDs
		if ( ! is.null(molids.rt.tol) && is.data.frame(molids)) {
			ids <- molids[(rt >= molids[[MSDB.TAG.COLRT]] - molids.rt.tol) & (rt <= molids[[MSDB.TAG.COLRT]] + molids.rt.tol), MSDB.TAG.MOLID]
			if (length(ids) == 0)
				# No molecule ID match for this retention time
				return(data.frame()) # return empty result set
		} else {
			ids <- molids
		}

		# Use the bounds computed above, so the m/z tolerance unit is honoured
		return(.self$searchForMzRtBounds(mode,
		                                 mz.low = mz.low,
		                                 mz.high = mz.high,
		                                 rt.low = rt.low,
		                                 rt.high = rt.high,
		                                 col = col,
		                                 attribs = attribs,
		                                 molids = ids))
	})

	# rt.low    Lower bound of the retention time in seconds.
	# rt.high   Higher bound of the retention time in seconds.
	MsDb$methods( searchForMzRtBounds = function(mode, mz.low, mz.high, rt.low = NULL, rt.high = NULL, col = NULL, attribs = NULL, molids = NULL) {

		results <- .self$.do.search.for.mz.rt.bounds(mode = mode, mz.low = mz.low, mz.high = mz.high, rt.low = rt.low, rt.high = rt.high, col = col, attribs = attribs, molids = molids)

		return(results)
	})

	# TODO Write description of output: data frame with which columns ?
	MsDb$methods( .do.search.for.mz.rt.bounds = function(mode, mz.low, mz.high, rt.low = NULL, rt.high = NULL, col = NULL, attribs = NULL, molids = NULL) {
		stop("Method .do.search.for.mz.rt.bounds() not implemented in concrete class.")
	})

	# DEPRECATED
	MsDb$methods( searchForMz = function(x, mode, tol = 5, col = NULL, rt.tol.x = 5, rt.tol.y = 0.80) {
		warning("Method searchForMz() is deprecated. Use searchForMzRtList() instead.")
		.self$searchForMzRtList(x = x, mode = mode, prec = tol, col = col, rt.tol.x = rt.tol.x, rt.tol.y = rt.tol.y)
	})

} # end of load safe guard
# --- a/MsDbChecker.R	Wed Apr 19 10:00:05 2017 -0400 (file removed by this changeset)
if ( ! exists('MsDbChecker')) { # Do not load again if already loaded

	source('MsDbObserver.R')

	#####################
	# CLASS DECLARATION #
	#####################

	# Observer that prints the warnings and errors it receives on the
	# standard error stream, and can quit the application on error.
	MsDbChecker <- setRefClass("MsDbChecker", contains = 'MsDbObserver', fields = list(.fail = 'logical'))

	MsDbChecker$methods(

		# Constructor.
		# fail   If set to TRUE, will fail (i.e.: quit application with a status set to 1) on error.
		#        NULL and NA are treated as FALSE.
		initialize = function(fail = FALSE, ...) {

			fail.is.unset <- is.null(fail) || is.na(fail)
			.fail <<- if (fail.is.unset) FALSE else fail

			callSuper(...) # calls super-class initializer with remaining parameters
		},

		# Prints a warning message on stderr.
		warning = function(msg) {
			text <- paste('WARNING: ', msg)
			write(text, stderr())
		},

		# Prints an error message on stderr, then quits with status 1 if
		# the checker has been configured to fail.
		error = function(msg) {

			text <- paste('ERROR:', msg)
			write(text, stderr())

			# Fail
			if (.self$.fail) {
				quit(status = 1)
			}
		}
	)

} # end of load safe guard
# --- a/MsDbInputDataFrameStream.R	Wed Apr 19 10:00:05 2017 -0400 (file removed by this changeset)
if ( ! exists('MsDbInputDataFrameStream')) { # Do not load again if already loaded

	library(methods)
	source('MsDbInputStream.R')

	#####################
	# CLASS DECLARATION #
	#####################

	# Input stream reading its values row by row from a data frame. Field .i
	# is the index of the current row, 0 meaning that no row has been read yet.
	MsDbInputDataFrameStream <- setRefClass("MsDbInputDataFrameStream", contains = 'MsDbInputStream', fields = list( .df = "ANY", .i = "integer", .rtunit = 'character'))

	MsDbInputDataFrameStream$methods(

		# Constructor.
		# df             The data frame to read.
		# input.fields   The names of the data frame columns, indexed by tag.
		# rtunit         The unit of the retention times stored inside df.
		initialize = function(df = data.frame(), input.fields = msdb.get.dft.input.fields(), rtunit = MSDB.RTUNIT.SEC, ...) {

			callSuper(input.fields = input.fields, ...)

			.df <<- df
			.i <<- 0L
			.rtunit <<- rtunit
		},

		# Returns the m/z value of the current row, or NULL if there is no
		# current row or no m/z field.
		getMz = function() {

			row <- .self$.i
			if (row <= 0 || row > nrow(.self$.df))
				return(NULL)

			field <- .self$.input.fields[[MSDB.TAG.MZ]]
			if (is.null(field))
				return(NULL)

			return(.self$.df[row, field])
		},

		# Returns the retention time of the current row, multiplied by 60 when
		# the unit is MSDB.RTUNIT.MIN, or NULL if there is no current row or
		# no retention time field.
		getRt = function() {

			row <- .self$.i
			if (row <= 0 || row > nrow(.self$.df))
				return(NULL)

			field <- .self$.input.fields[[MSDB.TAG.RT]]
			if (is.null(field))
				return(NULL)

			value <- .self$.df[row, field]
			in.minutes <- .self$.rtunit == MSDB.RTUNIT.MIN

			return(if (in.minutes) value * 60 else value)
		},

		# Returns the current row as a one-row data frame, or NULL if there is
		# no current row.
		# but   Tags of the fields whose columns must be left out.
		getAll = function(but = NULL) {

			row <- .self$.i
			if (row <= 0 || row > nrow(.self$.df))
				return(NULL)

			current <- .self$.df[row, , drop = FALSE]
			if (is.null(but))
				return(current)

			excluded <- colnames(current) %in% .self$.input.fields[but]

			return(current[, ! excluded, drop = FALSE])
		},

		# Moves to the next row, going at most one position past the last row.
		nextValues = function() {

			row <- .self$.i
			if (row <= nrow(.self$.df)) {
				.i <<- row + 1L
			}
		},

		# Tells whether at least one row remains after the current one.
		hasNextValues = function() {
			remaining <- nrow(.self$.df) - .self$.i
			return(remaining > 0)
		},

		# Goes back before the first row.
		reset = function() {
			.i <<- 0L
		}
	)

} # end of load safe guard
# MsDbInputStream.R
#
# Abstract input stream of (m/z, retention time) values. Every accessor
# below stops with an error; concrete sub-classes are expected to override
# them.

if ( ! exists('MsDbInputStream')) { # Do not load again if already loaded

    library('methods')
    source('msdb-common.R')

    # Class declaration {{{1

    MsDbInputStream <- setRefClass("MsDbInputStream", fields = list(.input.fields = "ANY"))

    # Constructor {{{1

    # input.fields  Mapping of input fields, stored as is in .input.fields.
    # ...           Forwarded to the super-class initializer.
    MsDbInputStream$methods( initialize = function(input.fields = msdb.get.dft.input.fields(), ...) {

        .input.fields <<- input.fields

        callSuper(...)
    })

    # Get mz {{{1

    # Abstract: must be overridden by the concrete class.
    MsDbInputStream$methods( getMz = function() {
        stop("Method getMz() not implemented in concrete class.")
    })

    # Get rt {{{1

    # Abstract: must be overridden by the concrete class.
    MsDbInputStream$methods( getRt = function() {
        stop("Method getRt() not implemented in concrete class.")
    })

    # Get all {{{1

    # Abstract: must be overridden by the concrete class.
    # The error message used to name a non-existent getUnused() method; it
    # now names the method that was actually called.
    MsDbInputStream$methods( getAll = function(but = NULL) {
        stop("Method getAll() not implemented in concrete class.")
    })

    # Next values {{{1

    # Abstract: must be overridden by the concrete class.
    MsDbInputStream$methods( nextValues = function() {
        stop("Method nextValues() not implemented in concrete class.")
    })

    # Has next values {{{1

    # Abstract: must be overridden by the concrete class.
    MsDbInputStream$methods( hasNextValues = function() {
        stop("Method hasNextValues() not implemented in concrete class.")
    })

} # end of load safe guard
# MsDbLogger.R
#
# Observer that prints progress messages whose level does not exceed the
# configured verbosity.

if ( ! exists('MsDbLogger')) { # Load guard: skip everything if the class is already defined

    source('MsDbObserver.R')

    # Class declaration {{{1

    MsDbLogger <- setRefClass("MsDbLogger",
                              contains = 'MsDbObserver',
                              fields = list(.verbose = 'numeric',
                                            .file = 'ANY'))

    # Methods {{{1

    MsDbLogger$methods(

        # Constructor.
        # verbose   Verbosity threshold; NULL or NA falls back to 1.
        # file      Destination given to cat(); NULL or NA falls back to stderr().
        # ...       Forwarded to the super-class initializer.
        initialize = function(verbose = 1, file = NULL, ...) {

            verbose.given <- ! is.null(verbose) && ! is.na(verbose)
            .verbose <<- if (verbose.given) verbose else 1

            file.given <- ! is.null(file) && ! is.na(file)
            .file <<- if (file.given) file else stderr()

            callSuper(...)
        },

        # Prints msg followed by a newline when the verbosity is at least level.
        # msg    The message to print.
        # level  Minimum verbosity required for the message to be shown.
        progress = function(msg, level = 1) {

            if (.self$.verbose < level)
                return(invisible(NULL))

            cat(msg, "\n", sep = '', file = .self$.file)
        }
    )

} # end of load safe guard
# MsDbObserver.R
#
# Base observer class. All three notification methods are deliberately
# empty: sub-classes override the ones they are interested in.

if ( ! exists('MsDbObserver')) { # Load guard: skip everything if the class is already defined

    library('methods')

    # Class declaration {{{1

    MsDbObserver <- setRefClass("MsDbObserver", fields = list())

    # Notification methods {{{1

    # NOTE: the bodies must stay empty (no docstring literal), otherwise the
    # methods would return that string instead of NULL.
    MsDbObserver$methods(

        # Progress notification; does nothing here.
        progress = function(msg, level = 1) {
        },

        # Warning notification; does nothing here.
        warning = function(msg) {
        },

        # Error notification; does nothing here.
        error = function(msg) {
        }
    )

} # end of load safe guard
# MsDbOutputDataFrameStream.R
#
# Output stream that accumulates the matched peaks into a data frame.

if ( ! exists('MsDbOutputDataFrameStream')) { # Do not load again if already loaded

    library(methods)
    source('MsDbOutputStream.R')
    source('dfhlp.R', chdir = TRUE)

    # Class declaration {{{1

    MsDbOutputDataFrameStream <- setRefClass("MsDbOutputDataFrameStream", contains = 'MsDbOutputStream', fields = list( .df = "ANY", .output.fields = "ANY"))

    # Constructor {{{1

    # output.fields  Mapping used to rename the output columns (names are the
    #                internal tags, values the output column names).
    # All other named parameters are passed unchanged to the super-class
    # initializer.
    MsDbOutputDataFrameStream$methods( initialize = function(keep.unused = FALSE, one.line = FALSE, match.sep = MSDB.DFT.MATCH.SEP, output.fields = NULL, multval.field.sep = MSDB.DFT.OUTPUT.MULTIVAL.FIELD.SEP, first.val = FALSE, ascii = FALSE, noapostrophe = FALSE, noplusminus = FALSE, nogreek = FALSE, ...) {

        callSuper(keep.unused = keep.unused, one.line = one.line, match.sep = match.sep, multval.field.sep = multval.field.sep, first.val = first.val, ascii = ascii, noapostrophe = noapostrophe, noplusminus = noplusminus, nogreek = nogreek, ...)

        .df <<- data.frame()
        .output.fields <<- output.fields
    })

    # Get data frame {{{1

    # Returns the accumulated data frame. When it has no row, an empty numeric
    # m/z column is added so that the result has at least one column name.
    MsDbOutputDataFrameStream$methods( getDataFrame = function() {

        if (nrow(.self$.df) == 0) {

            # output.fields defaults to NULL in the constructor: in that case
            # (or when the m/z tag is not mapped) use the tag itself as the
            # column name instead of failing on a NULL subscript.
            mz.col <- if (MSDB.TAG.MZ %in% names(.self$.output.fields)) .self$.output.fields[[MSDB.TAG.MZ]] else MSDB.TAG.MZ

            empty <- .self$.df
            empty[[mz.col]] <- numeric()
            .df <<- empty
        }

        return(.self$.df)
    })

    # Move columns to beginning {{{1

    # Reorders the columns of the data frame so that those listed in cols (and
    # present in the data frame) come first, in the order given.
    MsDbOutputDataFrameStream$methods( moveColumnsToBeginning = function(cols) {
        all.cols <- colnames(.self$.df)
        other.cols <- all.cols[ ! all.cols %in% cols]
        cols <- cols[cols %in% all.cols]
        .df <<- .self$.df[c(cols, other.cols)]
    })

    # Matched peaks {{{1

    # Appends the result of one search to the data frame.
    # mz      The input m/z value.
    # rt      The input retention time, or NULL. Divided by 60 when the
    #         stream unit is minutes.
    # unused  Input columns not used by the search; appended when the stream
    #         was created with keep.unused = TRUE.
    # peaks   Data frame of matched peaks, or NULL.
    MsDbOutputDataFrameStream$methods( matchedPeaks = function(mz, rt = NULL, unused = NULL, peaks = NULL) {

        library(plyr) # provides rbind.fill()

        # Set input values
        x <- data.frame(mz = mz)
        colnames(x) <- MSDB.TAG.MZ
        if ( ! is.null(rt)) {
            x.rt <- data.frame(rt = rt)
            colnames(x.rt) <- MSDB.TAG.RT
            if (.self$.rtunit == MSDB.RTUNIT.MIN)
                x.rt[[MSDB.TAG.RT]] <- x.rt[[MSDB.TAG.RT]] / 60
            x <- cbind(x, x.rt)
        }

        # Merge input values with matched peaks
        if ( ! is.null(peaks)) {

            if (nrow(peaks) == 0) {
                # No match: add one row of NA values
                peaks[1, ] <- NA

            } else {

                # Convert RT
                if (.self$.rtunit == MSDB.RTUNIT.MIN && MSDB.TAG.COLRT %in% colnames(peaks))
                    peaks[[MSDB.TAG.COLRT]] <- peaks[[MSDB.TAG.COLRT]] / 60

                # Process multi-value fields
                for (field in colnames(peaks))
                    if (field %in% MSDB.MULTIVAL.FIELDS) {

                        # Keep only first value in multi-value fields
                        if (.self$.first.val)
                            peaks[[field]] <- vapply(peaks[[field]], function(s) split.str(s, sep = MSDB.MULTIVAL.FIELD.SEP, unlist = TRUE)[[1]], FUN.VALUE = '')

                        # Change separator
                        else
                            peaks[[field]] <- vapply(peaks[[field]], function(s) paste0(split.str(s, sep = MSDB.MULTIVAL.FIELD.SEP, unlist = TRUE), collapse = .self$.multval.field.sep), FUN.VALUE = '')
                    }

                # Concatenate results in one line
                if (.self$.one.line) {

                    # For each column, concatenate all distinct non-NA values
                    # in one string. seq_along() is required here: seq() on a
                    # one-column data frame does not yield 1.
                    for (j in seq_along(peaks)) {
                        v <- peaks[[j]]
                        v <- v[ ! is.na(v)]
                        v <- v[ ! duplicated(v)]
                        peaks[1, j] <- paste0(v, collapse = .self$.match.sep)
                    }

                    # Keep only first line; drop = FALSE so that a one-column
                    # data frame is not turned into a bare vector.
                    peaks <- peaks[1, , drop = FALSE]
                }
            }

            # Merge
            x <- cbind(x, peaks, row.names = NULL)
        }

        # Rename columns for output
        x <- rename.col(x, names(.self$.output.fields), .self$.output.fields)

        # Add unused columns
        if ( .self$.keep.unused && ! is.null(unused)) {
            x <- cbind(x, unused, row.names = NULL)
        }

        # Convert strings to ASCII
        if (.self$.ascii || .self$.noapostrophe || .self$.noplusminus || .self$.nogreek)
            for (j in seq_along(x))
                if (is.character(x[[j]])) {
                    if (.self$.noapostrophe)
                        x[[j]] <- gsub("'", 'prime', x[[j]], perl = TRUE)
                    if (.self$.noplusminus)
                        x[[j]] <- gsub('±', '+-', x[[j]], perl = TRUE)
                    if (.self$.nogreek) {
                        x[[j]] <- gsub('α', 'alpha', x[[j]], perl = TRUE)
                        x[[j]] <- gsub('β', 'beta', x[[j]], perl = TRUE)
                        x[[j]] <- gsub('γ', 'gamma', x[[j]], perl = TRUE)
                        x[[j]] <- gsub('δ', 'delta', x[[j]], perl = TRUE)
                    }
                    # Replace every remaining non-ASCII character
                    if (.self$.ascii) {
                        x[[j]] <- gsub('[^\u0001-\u007F]', '_', x[[j]], perl = TRUE)
                    }
                }

        # Add new rows to data frame
        .df <<- rbind.fill(.self$.df, x)
    })

} # end of load safe guard
# MsDbOutputStream.R
#
# Abstract output stream. It only stores the formatting options; the actual
# output is produced by matchedPeaks(), which concrete classes must override.

if ( ! exists('MsDbOutputStream')) { # Load guard: skip everything if the class is already defined

    library('methods')
    source('msdb-common.R')

    # Class declaration {{{1

    MsDbOutputStream <- setRefClass(
        "MsDbOutputStream",
        fields = list(.keep.unused = "logical",
                      .one.line = "logical",
                      .match.sep = "character",
                      .multval.field.sep = "character",
                      .first.val = "logical",
                      .ascii = "logical",
                      .noapostrophe = "logical",
                      .noplusminus = "logical",
                      .nogreek = "logical",
                      .rtunit = 'character'))

    # Methods {{{1

    MsDbOutputStream$methods(

        #' Constructor.
        #'
        #' Every named argument is stored in the field of the same name
        #' prefixed with a dot; remaining arguments go to the super-class
        #' initializer.
        #'
        #' @param keep.unused Set to \code{TRUE} to keep unused input columns in the output.
        #' @param one.line Set to \code{TRUE} to output only one line for each input line.
        #' @param match.sep Stored in \code{.match.sep}.
        #' @param multval.field.sep Stored in \code{.multval.field.sep}.
        #' @param first.val Stored in \code{.first.val}.
        #' @param ascii Stored in \code{.ascii}.
        #' @param noapostrophe Stored in \code{.noapostrophe}.
        #' @param noplusminus Stored in \code{.noplusminus}.
        #' @param nogreek Stored in \code{.nogreek}.
        #' @param rtunit Stored in \code{.rtunit}.
        #' @examples
        #' stream <- MsDbOutputDataFrameStream$new(one.line = TRUE)
        initialize = function(keep.unused = FALSE, one.line = FALSE, match.sep = MSDB.DFT.MATCH.SEP, multval.field.sep = MSDB.DFT.OUTPUT.MULTIVAL.FIELD.SEP, first.val = FALSE, ascii = FALSE, noapostrophe = FALSE, noplusminus = FALSE, nogreek = FALSE, rtunit = MSDB.RTUNIT.SEC, ...) {

            callSuper(...)

            .keep.unused       <<- keep.unused
            .one.line          <<- one.line
            .match.sep         <<- match.sep
            .multval.field.sep <<- multval.field.sep
            .first.val         <<- first.val
            .ascii             <<- ascii
            .noapostrophe      <<- noapostrophe
            .noplusminus       <<- noplusminus
            .nogreek           <<- nogreek
            .rtunit            <<- rtunit
        },

        # Abstract: receives the result of one search. Must be overridden by
        # the concrete class.
        matchedPeaks = function(mz, rt = NULL, unused = NULL, peaks = NULL) {
            stop("Method matchedPeaks() not implemented in concrete class.")
        }
    )

} # end of load safe guard
# MsFileDb.R
#
# MS database stored in a single tab-separated file. The file is loaded
# lazily (see .init.db) and its columns are renamed to the internal field
# tags according to the field map (.fields).

if ( ! exists('MsFileDb')) { # Do not load again if already loaded

    library('methods')
    source('MsDb.R')
    source('msdb-common.R')
    source('search.R', chdir = TRUE)

    # Private helper {{{1

    # TRUE when v carries at least one non-NA value. Unlike a bare
    # `! is.null(v) && ! is.na(v)`, this is safe for vectors of any length.
    .msfiledb.has.value <- function(v) {
        ! is.null(v) && ! all(is.na(v))
    }

    # Class declaration {{{1

    MsFileDb <- setRefClass("MsFileDb", contains = "MsDb", fields = list(.file = "character", .db = "ANY", .fields = "list", .modes = "list", .name.to.id = "ANY"))

    # Constructor {{{1

    # file  Path to the tab-separated database file.
    # ...   Forwarded to the super-class initializer.
    MsFileDb$methods( initialize = function(file = NA_character_, ...) {

        # Initialize members
        .file <<- if ( ! is.null(file)) file else NA_character_
        .db <<- NULL
        .fields <<- msdb.get.dft.db.fields()
        .modes <<- MSDB.DFT.MODES
        .name.to.id <<- NULL

        callSuper(...)
    })

    # Set db fields {{{1

    MsFileDb$methods( areDbFieldsSettable = function() {
        return(TRUE)
    })

    # fields  Mapping from internal field tags (names) to file column names (values).
    MsFileDb$methods( setDbFields = function(fields) {
        .fields <<- as.list(fields)
    })

    # Check fields {{{1

    # Stops with an explicit message when one of the requested field tags is
    # not part of the field map, or is not a column of the loaded database.
    MsFileDb$methods( .check.fields = function(fields) {

        if (is.null(fields))
            stop("No fields specified for .check.fields()")

        # Check that fields are defined in the fields list
        unknown <- fields[ ! fields %in% names(.self$.fields)]
        if (length(unknown) > 0) {
            plural <- length(unknown) != 1
            stop(paste0("Database field", if (plural) "s" else "", " \"", paste(unknown, collapse = ", "), "\" ", if (plural) "are" else "is", " not defined."))
        }

        # Check that field values are real columns inside the database.
        # Columns have already been renamed to the field tags by .init.db().
        .self$.init.db()
        unknown.cols <- fields[ ! fields %in% colnames(.self$.db)]
        if (length(unknown.cols) > 0) {
            plural <- length(unknown.cols) != 1
            stop(paste0("Column", if (plural) "s" else "", " \"", paste(unknown.cols, collapse = ", "), "\" ", if (plural) "are" else "is", " not defined inside the database \"", .self$.file, "\"."))
        }
    })

    # Set MS modes {{{1

    MsFileDb$methods( areDbMsModesSettable = function() {
        return(TRUE)
    })

    # modes  Values used in the file for the MS modes; read as $pos and $neg.
    MsFileDb$methods( setDbMsModes = function(modes) {
        .modes <<- as.list(modes)
    })

    # Init db {{{1

    # Loads the database file on first use. The data frame is stored in .db
    # only once every check has passed, so that a failed load is retried (and
    # fails again) on the next call instead of leaving a half-initialized
    # database behind.
    MsFileDb$methods( .init.db = function() {

        if (is.null(.self$.db)) {

            # Load database
            db <- read.table(.self$.file, sep = "\t", quote = "\"", header = TRUE, stringsAsFactors = FALSE, row.names = NULL, check.names = FALSE, comment.char = '')

            # Check that colnames are unique
            dupcol <- duplicated(colnames(db))
            if (any(dupcol))
                stop(paste("Database header contains duplicated names: ", paste(unique(colnames(db)[dupcol]), collapse = ', '), "."))

            # Check that columns names supplied through field map are unique
            dupfields <- duplicated(.self$.fields)
            if (any(dupfields))
                stop(paste("Some db column names supplied are duplicated: ", paste(unique(.self$.fields[dupfields]), collapse = ', '), "."))

            # Rename columns: a file column listed in the field map takes the
            # name of its tag, other columns keep their name.
            colnames(db) <- vapply(colnames(db), function(cn) if (cn %in% .self$.fields) names(.self$.fields)[.self$.fields %in% cn] else cn, FUN.VALUE = '', USE.NAMES = FALSE)

            .db <<- db
        }
    })

    # Get data {{{1

    # Returns the columns col of db, or of the whole database when db is NULL.
    MsFileDb$methods( .get = function(db = NULL, col = NULL) {

        # Init db
        if (is.null(db)) {
            .self$.init.db()
            db <- .self$.db
        }

        # Check fields
        .self$.check.fields(col)

        return(db[, col])
    })

    # Get row {{{1

    # Returns the rows `row` of the database, restricted to cols when given.
    MsFileDb$methods( .get.row = function(row, cols = NULL) {

        # Init db
        .self$.init.db()

        if (is.null(cols))
            return(.self$.db[row, ])

        # Check fields
        .self$.check.fields(cols)

        return(.self$.db[row, cols])
    })

    # Get col {{{1

    # Returns the database column whose field tag is col.
    MsFileDb$methods( .get.col = function(col) {

        # Init db
        .self$.init.db()

        # Check fields
        .self$.check.fields(col)

        return(.self$.db[[col]])
    })

    # Get molecule ids {{{1

    # Returns the sorted, distinct molecule IDs as a character vector.
    # max.results  Maximum number of IDs to return; NA means no limit.
    MsFileDb$methods( getMoleculeIds = function(max.results = NA_integer_) {

        # Init db
        .self$.init.db()

        # Get IDs
        mol.ids <- sort(unique(as.character(.self$.get.col(MSDB.TAG.MOLID))))

        # Cut results
        if ( ! is.na(max.results))
            mol.ids <- head(mol.ids, max.results)

        return(mol.ids)
    })

    # Get nb molecules {{{1

    # Returns the number of molecules in the database.
    MsFileDb$methods( getNbMolecules = function() {

        # Init db
        .self$.init.db()

        return(length(unique(.self$.get.col(MSDB.TAG.MOLID))))
    })

    # Get molecule name {{{1

    # Returns the first name of molecule id found in db, or NA when id is NA,
    # unknown, or has no name.
    MsFileDb$methods( .get.name.from.id = function(db, id) {

        if (is.na(id))
            return(NA_character_)

        # Get names
        names <- db[db[[MSDB.TAG.MOLID]] %in% id, MSDB.TAG.MOLNAMES]
        if (length(names) == 0)
            return(NA_character_)

        # Each molecule has potentially several names. Since we must return only one name for each molecule, we choose the first one.
        # An empty names cell splits into zero elements, hence the length check.
        first.names <- strsplit(as.character(names[[1]]), ';', fixed = TRUE)[[1]]
        if (length(first.names) == 0)
            return(NA_character_)

        return(first.names[[1]])
    })

    # Get molecule names
    # molid     A vector of molecule IDs.
    # Returns a character vector containing the names of the molecule IDs, in the same order as the input vector.
    MsFileDb$methods( getMoleculeName = function(molid) {

        if (is.null(molid))
            return(NA_character_)

        # Init db
        .self$.init.db()

        # Get database
        db <- .self$.db[, c(MSDB.TAG.MOLID, MSDB.TAG.MOLNAMES)]

        # Remove duplicates
        db <- db[ ! duplicated(db[[MSDB.TAG.MOLID]]), ]

        # Look for ids
        names <- vapply(molid, function(i) .self$.get.name.from.id(db, i), FUN.VALUE = '')

        return(names)
    })

    # Init name to id {{{1

    # Builds, on first use, the name -> ID table (upper-cased names, sorted by
    # name) searched by .get.id.from.name().
    MsFileDb$methods( .init.name.to.id = function() {

        if (is.null(.self$.name.to.id)) {

            # Init db
            .self$.init.db()

            # Get database subset (columns name and id only).
            db <- .self$.db[, c(MSDB.TAG.MOLID, MSDB.TAG.MOLNAMES)]

            # Remove duplicate IDs
            db <- db[ ! duplicated(db[[MSDB.TAG.MOLID]]), ]

            # One small data frame per molecule, bound together in one go.
            # seq_len(nrow(db)) is used because seq() on a single numeric ID
            # would expand to 1:ID.
            pieces <- lapply(seq_len(nrow(db)), function(i) {
                i.names <- split.str(db[i, MSDB.TAG.MOLNAMES], ';', unlist = TRUE)
                data.frame(name = toupper(i.names), id = rep(db[i, MSDB.TAG.MOLID], length(i.names)), stringsAsFactors = FALSE)
            })
            index <- data.frame(name = character(), id = character(), stringsAsFactors = FALSE)
            index <- do.call(rbind, c(list(index), pieces))

            # Order by name
            .name.to.id <<- index[order(index[['name']]), ]
        }
    })

    # Get id from name {{{1

    # Returns the ID of the molecule named `name` (case-insensitive), or NA.
    MsFileDb$methods( .get.id.from.name = function(name) {

        # Initialize name.to.id search tree
        .self$.init.name.to.id()

        # Search for name
        i <- binary.search(toupper(name), .self$.name.to.id[['name']])

        # Get ID
        id <- if (is.na(i)) NA_character_ else as.character(.self$.name.to.id[i, 'id'])

        return(id)
    })

    # Find by name {{{1

    # Find a molecule by name
    # name      A vector of molecule names to search for.
    # Returns a list of the same length as name, containing the found molecule IDs, in the same order.
    MsFileDb$methods( findByName = function(name) {

        if (is.null(name))
            return(NA_character_)

        # Look for molecules with this name
        ids <- unname(lapply(name, function(n) .self$.get.id.from.name(n)))

        return(ids)
    })

    # Get chromatographic columns {{{1

    # Returns a data frame (id, title) of the distinct chromatographic columns.
    # molid  When not NULL, only the columns of these molecules are listed.
    MsFileDb$methods( getChromCol = function(molid = NULL) {

        # Init db
        .self$.init.db()

        # Get database
        db <- .self$.db[, c(MSDB.TAG.MOLID, MSDB.TAG.COL)]

        # Filter on molecule IDs
        if ( ! is.null(molid))
            db <- db[db[[MSDB.TAG.MOLID]] %in% molid, ]

        # Get distinct column names
        cols <- unique(db[[MSDB.TAG.COL]])

        # Make data frame
        cols <- data.frame(id = cols, title = cols, stringsAsFactors = FALSE)

        return(cols)
    })

    # Get nb peaks {{{1

    # Get the number of distinct theoretical m/z values stored inside the database.
    # molid     The ID(s) of the molecule(s); NA means all molecules.
    # type      The MS type; NA means both.
    MsFileDb$methods( getNbPeaks = function(molid = NA_integer_, type = NA_character_) {

        # Init db
        .self$.init.db()

        # Get database
        db <- .self$.db[, c(MSDB.TAG.MOLID, MSDB.TAG.MODE, MSDB.TAG.MZTHEO)]

        # Filter on mode (%in% so that NA modes are dropped, not turned into NA rows)
        if (.msfiledb.has.value(type))
            db <- db[db[[MSDB.TAG.MODE]] %in% (if (type == MSDB.TAG.POS) .self$.modes$pos else .self$.modes$neg), ]

        # Filter on molecule IDs
        if (.msfiledb.has.value(molid))
            db <- db[db[[MSDB.TAG.MOLID]] %in% molid, ]

        # Count number of unique m/z values
        return(length(unique(db[[MSDB.TAG.MZTHEO]])))
    })

    # Search {{{1

    # Returns the database rows whose theoretical m/z lies in [mz.low, mz.high],
    # after the optional filters below.
    # mode      MS mode tag; NULL/NA disables the filter.
    # rt.low, rt.high  Retention time bounds, compared with the stored values
    #           multiplied by 60 when the database unit is minutes.
    # col       Chromatographic column(s).
    # attribs   Attribution(s).
    # molids    Molecule ID(s).
    MsFileDb$methods( .do.search.for.mz.rt.bounds = function(mode, mz.low, mz.high, rt.low = NULL, rt.high = NULL, col = NULL, attribs = NULL, molids = NULL) {

        # Init db
        .self$.init.db()
        db <- .self$.db

        use.col <- .msfiledb.has.value(col)
        use.rt <- .msfiledb.has.value(rt.low) && .msfiledb.has.value(rt.high)

        # Filter on mode
        if (.msfiledb.has.value(mode))
            db <- db[db[[MSDB.TAG.MODE]] %in% (if (mode == MSDB.TAG.POS) .self$.modes$pos else .self$.modes$neg), ]

        # Filter on molecule IDs
        if ( ! is.null(molids))
            db <- db[db[[MSDB.TAG.MOLID]] %in% molids, ]

        # Filter on attributions
        if (.msfiledb.has.value(attribs))
            db <- db[db[[MSDB.TAG.ATTR]] %in% attribs, ]

        # Filter on columns
        if (use.col)
            db <- db[db[[MSDB.TAG.COL]] %in% col, ]

        # Filter on retention time. which() drops rows with a missing
        # retention time instead of returning them as all-NA rows.
        if (use.rt) {
            scale <- if (.self$getRtUnit() == MSDB.RTUNIT.MIN) 60 else 1
            rt.values <- db[[MSDB.TAG.COLRT]] * scale
            db <- db[which(rt.values >= rt.low & rt.values <= rt.high), ]
        }

        # Remove retention times and column information
        if ( ! use.col || ! use.rt) {
            db <- db[, ! (colnames(db) %in% c(MSDB.TAG.COL, MSDB.TAG.COLRT))]

            # Remove duplicates
            db <- db[ ! duplicated(db), ]
        }

        # Filter on mz
        db <- db[which(db[[MSDB.TAG.MZTHEO]] >= mz.low & db[[MSDB.TAG.MZTHEO]] <= mz.high), ]

        return(db)
    })

    # Get mz values {{{1

    # Returns a vector of the distinct theoretical masses stored inside the database.
    # mode         MS mode tag; NULL/NA means both modes.
    # max.results  Maximum number of values to return; NA means no limit.
    MsFileDb$methods( getMzValues = function(mode = NULL, max.results = NA_integer_) {

        # Init db
        .self$.init.db()
        db <- .self$.db

        # Filter on mode
        if (.msfiledb.has.value(mode)) {
            mode.tag <- if (mode == MSDB.TAG.POS) .self$.modes$pos else .self$.modes$neg
            db <- db[.self$.get(db, col = MSDB.TAG.MODE) %in% mode.tag, ]
        }

        # Get distinct masses
        mz <- unique(.self$.get(db, col = MSDB.TAG.MZTHEO))

        # Apply cut-off; head() does not pad with NA when fewer values exist
        if ( ! is.na(max.results))
            mz <- head(mz, max.results)

        return(mz)
    })

    # Get retention times {{{1

    # Get the retention times of a molecule.
    # molid  The molecule ID.
    # col    Chromatographic column(s) to keep; NA means all columns.
    # Returns a list of numeric vectors. The list has for keys/names the columns, and for values vectors of numerics (the retention times). If no retention times are registered for this molecule, then returns an empty list.
    # Values are multiplied by 60 when the database unit is minutes.
    MsFileDb$methods( getRetentionTimes = function(molid, col = NA_character_) {

        if ( ! .msfiledb.has.value(molid))
            return(list())

        # Init db
        .self$.init.db()
        db <- .self$.db[, c(MSDB.TAG.MOLID, MSDB.TAG.COL, MSDB.TAG.COLRT)]

        # Filter on molecule ID
        db <- db[db[[MSDB.TAG.MOLID]] %in% molid, ]

        # Filter on chromatographic columns
        if (.msfiledb.has.value(col))
            db <- db[db[[MSDB.TAG.COL]] %in% col, ]

        # Remove duplicates
        db <- db[ ! duplicated(db), ]

        # Build retention time list
        rt <- list()
        for (chrom.col in unique(db[[MSDB.TAG.COL]]))
            rt[chrom.col] <- list(db[db[[MSDB.TAG.COL]] %in% chrom.col, MSDB.TAG.COLRT])

        # rt is a list: the conversion has to be applied element by element
        if (.self$getRtUnit() == MSDB.RTUNIT.MIN)
            rt <- lapply(rt, function(v) v * 60)

        return(rt)
    })

} # end of load safe guard
# MsPeakForestDb.R
#
# MS database backed by the PeakForest web services. Every query goes
# through .get.url(), which prefixes the base URL and appends the token.

if ( ! exists('MsPeakForestDb')) { # Do not load again if already loaded

    library(methods)
    source('MsDb.R')
    source('UrlRequestScheduler.R')

    # Class declaration {{{1

    MsPeakForestDb <- setRefClass("MsPeakForestDb", contains = "MsDb", fields = list(.url = "character", .url.scheduler = "ANY", .token = "character"))

    # Constructor {{{1

    # url        Base URL of the web services (mandatory).
    # useragent  Passed to the URL request scheduler.
    # token      Appended to every request when not NA.
    MsPeakForestDb$methods( initialize = function(url = NA_character_, useragent = NA_character_, token = NA_character_, ...) {

        callSuper(...)

        # Check URL
        if (is.null(url) || is.na(url))
            stop("No URL defined for new MsPeakForestDb instance.")

        # Strip trailing slashes: .get.url() joins base URL and path with '/'
        .url <<- sub('/+$', '', url)
        .url.scheduler <<- UrlRequestScheduler$new(n = 3, useragent = useragent)
        .self$.url.scheduler$setVerbose(1L)
        .token <<- token
        .rt.unit <<- MSDB.RTUNIT.MIN
    })

    # Get url {{{1

    # Runs a web service request and parses its answer.
    # url       Path of the service, relative to the base URL.
    # params    Named query parameters, or NULL.
    # ret.type  'json' to parse the content as JSON; 'integer' to read a bare
    #           integer (falling back to JSON parsing). Any other value makes
    #           the method return NULL.
    MsPeakForestDb$methods( .get.url = function(url, params = NULL, ret.type = 'json') {

        res <- NULL

        # Add url prefix
        if (substring(url, 1, 1) == '/')
            url <- substring(url, 2)
        url <- paste(.self$.url, url, sep = '/')

        # Add token
        if ( ! is.na(.self$.token))
            params <- c(params, token = .self$.token)

        # Get URL
        content <- .self$.url.scheduler$getUrl(url = url, params = params)

        if (ret.type == 'json') {

            res <- jsonlite::fromJSON(content, simplifyDataFrame = FALSE)

            if (is.null(res)) {
                # Rebuild the query string for the message: one '?', then
                # name=value pairs separated by '&'.
                param.str <- ''
                if ( ! is.null(params)) {
                    pairs <- vapply(names(params), function(p) paste(p, params[[p]], sep = '='), FUN.VALUE = '')
                    param.str <- paste0('?', paste(pairs, collapse = '&'))
                }
                stop(paste0("Failed to run web service. URL was \"", url, param.str, "\"."))
            }

        } else if (ret.type == 'integer') {

            if (grepl('^[0-9]+$', content, perl = TRUE))
                res <- as.integer(content)
            else
                res <- jsonlite::fromJSON(content, simplifyDataFrame = FALSE)
        }

        return(res)
    })

    # Get molecule ids {{{1

    # Returns the molecule IDs as a character vector.
    # max.results  Maximum number of IDs to return; NA means no limit.
    MsPeakForestDb$methods( getMoleculeIds = function(max.results = NA_integer_) {

        ids <- as.character(.self$.get.url(url = 'compounds/all/ids'))

        if ( ! is.na(max.results))
            ids <- head(ids, max.results)

        return(ids)
    })

    # Get nb molecules {{{1

    MsPeakForestDb$methods( getNbMolecules = function() {

        n <- .self$.get.url(url = 'compounds/all/count', ret.type = 'integer')

        return(n)
    })

    # Get chromatographic columns {{{1

    # Returns a data frame (id, title) of the chromatographic columns.
    # molid  When not NULL, sent to the service as the `molids` parameter.
    MsPeakForestDb$methods( getChromCol = function(molid = NULL) {

        # Set URL
        params <- NULL
        if ( ! is.null(molid))
            params <- list(molids = paste(molid, collapse = ','))

        # Call webservice
        wscols <- .self$.get.url(url = 'metadata/lc/list-code-columns', params = params)

        # Build data frame: one row per column code, bound in one go
        rows <- lapply(names(wscols), function(id) {
            title <- wscols[[id]]$name
            if (is.null(title))
                title <- NA_character_
            data.frame(id = id, title = title, stringsAsFactors = FALSE)
        })
        cols <- data.frame(id = character(), title = character(), stringsAsFactors = FALSE)
        cols <- do.call(rbind, c(list(cols), rows))

        return(cols)
    })

    # Get retention times {{{1

    # Returns a list, keyed by column code, of the distinct retention times
    # (middle of RTmin and RTmax, in seconds) of one molecule.
    # molid  A single molecule ID.
    # col    Column code(s) to keep; NA means all columns.
    MsPeakForestDb$methods( getRetentionTimes = function(molid, col = NA_character_) {

        # The length test comes first so that is.na() only sees a scalar
        if (is.null(molid) || length(molid) != 1 || is.na(molid))
            stop("The parameter molid must consist only in a single value.")

        rt <- list()
        all.cols <- all(is.na(col))

        # Call webservice
        params <- list(molids = paste(molid, collapse = ','))
        spectra <- .self$.get.url(url = 'spectra/lcms/search', params = params)

        if (is.list(spectra) && length(spectra) > 0) {
            for (s in spectra) {

                # Skip spectra lacking the information needed below
                code <- s$liquidChromatography$columnCode
                if (is.null(code) || is.null(s$RTmin) || is.null(s$RTmax))
                    next

                if (all.cols || code %in% col) {
                    ret.time <- (s$RTmin + s$RTmax) / 2
                    ret.time <- ret.time * 60 # Retention time are in minutes in Peakforest, but we want them in seconds
                    if (code %in% names(rt)) {
                        if ( ! ret.time %in% rt[[code]])
                            rt[[code]] <- c(rt[[code]], ret.time)
                    } else
                        rt[[code]] <- ret.time
                }
            }
        }

        return(rt)
    })

    # Get molecule name {{{1

    # Returns the names of the molecules, in the order of molid. NA IDs are
    # left untouched in the result.
    MsPeakForestDb$methods( getMoleculeName = function(molid) {

        # NOTE(review): nothing below calls RJSONIO directly (JSON parsing is
        # done with jsonlite in .get.url); kept so that the package is still
        # attached as before -- confirm whether it can be dropped.
        library(RJSONIO)

        if (is.null(molid))
            return(NA_character_)

        # Initialize names
        names <- as.character(molid)

        # Get non NA values
        known <- ! is.na(molid)

        if (any(known)) {
            # Set URL
            params <- c(molids = paste(molid[known], collapse = ','))

            # Call webservice
            names[known] <- .self$.get.url(url = 'compounds/all/names', params = params)
        }

        return(names)
    })

    # Find by name {{{1

    # Returns a list of the same length as name; each element holds the IDs
    # of the compounds found for that name (NA for an NA name).
    MsPeakForestDb$methods( findByName = function(name) {

        if (is.null(name))
            return(NA_character_)

        ids <- list()

        for (n in name) {

            if (is.na(n))
                ids <- c(ids, NA_character_)

            else {
                compounds <- .self$.get.url(url = paste0('search/compounds/name/', curlEscape(n)))$compoundNames
                ids <- c(ids, list(vapply(compounds, function(cmp) as.character(cmp$compound$id), FUN.VALUE = '')))
            }
        }

        return(ids)
    })

    # Get nb peaks {{{1

    # Returns the number of peaks reported by the service.
    # molid  Molecule ID(s); NA means all molecules.
    # type   MS mode tag; NA means both modes.
    MsPeakForestDb$methods( getNbPeaks = function(molid = NA_integer_, type = NA_character_) {

        # Build URL
        # NOTE(review): mode is sent as 'pos'/'neg' here but as
        # 'positive'/'negative' in getMzValues() -- verify against the API.
        params <- NULL
        if ( ! is.na(type))
            params <- c(params, mode = if (type == MSDB.TAG.POS) 'pos' else 'neg')
        if ( ! is.null(molid) && (length(molid) > 1 || ! is.na(molid)))
            params <- c(params, molids = paste(molid, collapse = ','))

        # Run request
        n <- .self$.get.url(url = 'spectra/lcms/count-peaks', params = params, ret.type = 'integer')

        return(sum(n))
    })

    # Get mz values {{{1

    # Returns the m/z values listed by the service.
    # mode         MS mode tag; NULL/NA means both modes.
    # max.results  Maximum number of values to return; NA means no limit.
    MsPeakForestDb$methods( getMzValues = function(mode = NULL, max.results = NA_integer_) {

        # Query params
        params <- NULL
        if ( ! is.null(mode) && ! is.na(mode))
            params <- c(params, mode = if (mode == MSDB.TAG.POS) 'positive' else 'negative')

        # Get MZ values
        mz <- .self$.get.url(url = 'spectra/lcms/peaks/list-mz', params = params)

        # Apply cut-off; head() does not pad with NA when fewer values exist
        if ( ! is.na(max.results))
            mz <- head(mz, max.results)

        return(mz)
    })

    # Do search for mz rt bounds {{{1

    # Searches the peaks whose m/z lies in [mz.low, mz.high] and, when both
    # retention time bounds are given (in seconds), merges them with the
    # spectra found in that retention time range.
    # NOTE(review): mode, attribs and molids are accepted but not used to
    # build any request here -- confirm whether filtering is expected.
    MsPeakForestDb$methods( .do.search.for.mz.rt.bounds = function(mode, mz.low, mz.high, rt.low = NULL, rt.high = NULL, col = NULL, attribs = NULL, molids = NULL) {

        # Field extractors for a list of compounds: NA when the key is absent
        chr.field <- function(compounds, key) {
            vapply(compounds, function(cmp) if (key %in% names(cmp) && ! is.null(cmp[[key]])) as.character(cmp[[key]]) else NA_character_, FUN.VALUE = '')
        }
        scalar.or.na <- function(x, key, convert, na) {
            if (key %in% names(x) && ! is.null(x[[key]])) convert(x[[key]]) else na
        }

        # Build URL for mz search
        url <- paste0('spectra/lcms/peaks/get-range/', mz.low, '/', mz.high)

        # Get spectra
        spectra <- .self$.get.url(url = url)

        # Build result data frame. Columns temporarily carry the NAME of the
        # tag constant; they get the constant's value at the end.
        results <- data.frame(MSDB.TAG.MOLID = character(), MSDB.TAG.MOLNAMES = character(), MSDB.TAG.MOLMASS = numeric(), MSDB.TAG.MZTHEO = numeric(), MSDB.TAG.COMP = character(), MSDB.TAG.ATTR = character(), MSDB.TAG.INCHI = character(), MSDB.TAG.INCHIKEY = character(), MSDB.TAG.CHEBI = character(), MSDB.TAG.HMDB = character(), MSDB.TAG.KEGG = character(), MSDB.TAG.PUBCHEM = character(), stringsAsFactors = FALSE)
        pieces <- list()
        for (x in spectra) {

            # Only peaks having a source with a list of compounds are usable
            if ( ! ('source' %in% names(x) && is.list(x$source)))
                next
            if ( ! 'listOfCompounds' %in% names(x$source))
                next
            compounds <- x$source$listOfCompounds

            mztheo <- scalar.or.na(x, 'mz', as.numeric, NA_real_)
            comp <- scalar.or.na(x, 'composition', identity, NA_character_)
            attr <- scalar.or.na(x, 'attribution', identity, NA_character_)

            peak.molids <- chr.field(compounds, 'id')
            molnames <- vapply(compounds, function(cmp) if ('names' %in% names(cmp) && ! is.null(cmp$names)) paste(cmp$names, collapse = MSDB.MULTIVAL.FIELD.SEP) else NA_character_, FUN.VALUE = '')
            mass <- vapply(compounds, function(cmp) if ( ! 'averageMass' %in% names(cmp) || is.null(cmp$averageMass)) NA_real_ else as.double(cmp$averageMass), FUN.VALUE = 0.0)
            inchi <- chr.field(compounds, 'inChI')
            inchikey <- chr.field(compounds, 'inChIKey')
            chebi <- chr.field(compounds, 'ChEBI')
            chebi[chebi == 'CHEBI:null'] <- NA_character_
            hmdb <- chr.field(compounds, 'HMDB')
            hmdb[hmdb == 'HMDBnull'] <- NA_character_
            kegg <- chr.field(compounds, 'KEGG')
            pubchem <- chr.field(compounds, 'PubChemCID')

            if (length(peak.molids) > 0)
                pieces[[length(pieces) + 1]] <- data.frame(MSDB.TAG.MOLID = peak.molids, MSDB.TAG.MOLNAMES = molnames, MSDB.TAG.MOLMASS = mass, MSDB.TAG.MZTHEO = mztheo, MSDB.TAG.COMP = comp, MSDB.TAG.ATTR = attr, MSDB.TAG.INCHI = inchi, MSDB.TAG.INCHIKEY = inchikey, MSDB.TAG.CHEBI = chebi, MSDB.TAG.HMDB = hmdb, MSDB.TAG.KEGG = kegg, MSDB.TAG.PUBCHEM = pubchem, stringsAsFactors = FALSE)
        }
        results <- do.call(rbind, c(list(results), pieces))

        # RT search
        if ( ! is.null(rt.low) && ! is.na(rt.low) && ! is.null(rt.high) && ! is.na(rt.high)) {

            rt.res <- data.frame(MSDB.TAG.MOLID = character(), MSDB.TAG.COL = character(), MSDB.TAG.COLRT = numeric(), stringsAsFactors = FALSE)

            if (nrow(results) > 0) {

                # Build URL for rt search (bounds are divided by 60 for the service)
                url <- paste0('spectra/lcms/range-rt-min/', rt.low / 60, '/', rt.high / 60)
                params <- NULL
                if ( ! is.null(col) && ! all(is.na(col)))
                    params <- c(columns = paste(col, collapse = ','))

                # Run query
                rtspectra <- .self$.get.url(url = url, params = params)

                # Get compound/molecule IDs
                rt.pieces <- list()
                for (x in rtspectra) {
                    if ( ! all(c('listOfCompounds', 'liquidChromatography') %in% names(x)))
                        next
                    rt.molids <- chr.field(x$listOfCompounds, 'id')
                    if (length(rt.molids) == 0)
                        next
                    col.code <- scalar.or.na(x$liquidChromatography, 'columnCode', as.character, NA_character_)
                    rtmin <- scalar.or.na(x, 'RTmin', as.double, NA_real_)
                    rtmax <- scalar.or.na(x, 'RTmax', as.double, NA_real_)
                    colrt <- (rtmin + rtmax) / 2
                    rt.pieces[[length(rt.pieces) + 1]] <- data.frame(MSDB.TAG.MOLID = rt.molids,
                                                                      MSDB.TAG.COL = col.code,
                                                                      MSDB.TAG.COLRT = colrt * 60,
                                                                      stringsAsFactors = FALSE)
                }
                rt.res <- do.call(rbind, c(list(rt.res), rt.pieces))
            }

            # Add retention times and column info
            results <- merge(results, rt.res)
        }

        # Rename columns with proper names: each column is named after a tag
        # constant, whose value is looked up with get().
        colnames(results) <- vapply(colnames(results), function(s) get(s), FUN.VALUE = '', USE.NAMES = FALSE)

        return(results)
    })
}
--- a/MsXlsDb.R Wed Apr 19 10:00:05 2017 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,852 +0,0 @@ -if ( ! exists('MsXlsDb')) { # Do not load again if already loaded - - library('methods') - library('stringr') - source('msdb-common.R') - source('MsDb.R') - source('strhlp.R', chdir = TRUE) - source('dfhlp.R', chdir = TRUE) - source('search.R', chdir = TRUE) - source('excelhlp.R', chdir = TRUE) - - ############# - # CONSTANTS # - ############# - - .THIS.FILE.PATH <- getwd() # We suppose that the file has been sourced with the option chdir = TRUE - - .XLS_PEAKS_ROW_OFFSET <- 8 - .XLS_PEAKS_RT_COL_START <- 11 - .XLS_MSPOS_TAB <- 'MS_POS' - .XLS_MSNEG_TAB <- 'MS_NEG' - .XLS_MZ_COL <- 1 - .XLS_INTENSITY_COL <- 2 - .XLS_RELATIVE_COL <- 3 - .XLS_THEORETICAL_MZ_COL <- 5 - .XLS_COMPOSITION_COL <- 8 - .XLS_ATTRIBUTION_COL <- 9 - - ##################### - # CLASS DECLARATION # - ##################### - - MsXlsDb <- setRefClass("MsXlsDb", contains = "MsDb", fields = list(.mz.index = "ANY", .name_index = "ANY", .db_dir = "character", .limit = "numeric", .files = "ANY", .cache_dir = "character", .db = "ANY")) - - ############### - # CONSTRUCTOR # - ############### - - MsXlsDb$methods( initialize = function(db_dir = NA_character_, limit = NA_integer_, cache_dir = NA_character_, cache = FALSE, ...) { - - # Initialize members - # TODO check that db_dir is not null neither na, and tests that it exists and is a directory. - .db_dir <<- if ( ! is.null(db_dir)) db_dir else NA_character_ - .limit <<- if ( ! is.null(limit) && ! is.na(limit) && limit > 0) limit else NA_integer_ - cache_dir <- if (cache && is.na(cache_dir) && ! is.na(db_dir)) file.path(db_dir, 'cache') else cache_dir - .cache_dir <<- if ( cache || ! is.null(cache_dir)) cache_dir else NA_character_ - .files <<- NULL - .db <<- NULL - .mz.index <<- NULL - .name_index <<- NULL - - callSuper(...) 
- }) - - #################### - # GET MOLECULE IDS # - #################### - - MsXlsDb$methods( getMoleculeIds = function(max.results = NA_integer_) { - - # Init file list - .self$.init.file.list() - - # Get IDs - mol.ids <- as.integer(which( ! is.na(.self$.files))) - - # Cut - if ( ! is.na(max.results) && length(mol.ids) > max.results) - mol.ids <- mol.ids[max.results, ] - - return(mol.ids) - }) - - #################### - # GET NB MOLECULES # - #################### - - # Returns a list of all molecule names - MsXlsDb$methods( getNbMolecules = function() { - return(length(.self$getMoleculeIds())) - }) - - ##################### - # GET MOLECULE NAME # - ##################### - - MsXlsDb$methods( getMoleculeName = function(molid) { - return(vapply(molid, function(m) .self$.get.mol.name(m), FUN.VALUE = "")) - }) - - ############################### - # GET CHROMATOGRAPHIC COLUMNS # - ############################### - - # Returns a list of all chromatographic columns used - MsXlsDb$methods( getChromCol = function(molid = NULL) { - - cn <- character() - - # If no molecule IDs provided, then look at all molecules - if (is.null(molid)) - molid <- .self$getMoleculeIds() - - # Loop on molecules - for (mid in molid) { - - rt <- .self$getRetentionTimes(mid) - - if ( ! is.null(rt)) - cn <- c(cn, names(rt)) - } - - # Remove duplicates - cn <- cn[ ! duplicated(cn)] - - # Make data frame - cn <- data.frame(id = cn, title = cn, stringsAsFactors = FALSE) - - return(cn) - }) - - ################ - # FIND BY NAME # - ################ - - MsXlsDb$methods( findByName = function(name) { - - # NULL entry - if (is.null(name)) - return(NA_integer_) - - # Initialize output list - ids <- NULL - - for (n in name) { - - id <- NA_integer_ - - if ( ! 
is.na(n)) { - - # Get index - index <- .self$.get.name.index() - - # Search for name in index - i <- binary.search(toupper(n), index[['name']]) - - id <- if (is.na(i)) NA_integer_ else index[i, 'id'] - } - - ids <- c(ids, id) - } - - return(ids) - }) - - ####################### - # GET RETENTION TIMES # - ####################### - - MsXlsDb$methods( getRetentionTimes = function(molid, col = NA_character_) { - - if (is.null(molid) || is.na(molid)) - return(NULL) - - # Find it in memory - rt <- .self$.mem.get(molid, 'rt') - - if (is.null(rt)) { - - # Call observers - if ( ! is.null(.self$.observers)) - for (obs in .self$.observers) - obs$progress(paste0("Loading retention times of file", .self$.get.file(molid), "."), level = 2) - - rt <- NULL - - # Load from cache file - cache_file <- NA_character_ - if ( ! is.na(.self$.get.cache.dir())) { - cache_file <- file.path(.self$.get.cache.dir(), paste0('rt-', molid, '.bin')) - if (file.exists(cache_file)) - load(file = cache_file) # load rt - } - - if (is.null(rt)) { - - # Get retention times of both positive and negative mode tabs - mspos_rt <- .self$.parse_retention_times(molid, .XLS_MSPOS_TAB) - msneg_rt <- .self$.parse_retention_times(molid, .XLS_MSNEG_TAB) - - # Retention times stored in negative and positive modes - if ( ! is.null(mspos_rt) && ! is.null(msneg_rt)) { - - # Warn observers when both retention time lists are not identical - if ( ! identical(mspos_rt, msneg_rt)) - for (obs in .self$.observers) - obs$warning(paste0("Retention times in negative and positive modes are different in file ", .self$.get.file(molid), ".")) - - # Merge both lists - rt <- mspos_rt - for (c in names(msneg_rt)) - if (c %in% names(rt)) { - v <- c(rt[[c]], msneg_rt[[c]]) - rt[[c]] <- v[ ! duplicated(v)] - } - else - rt[[c]] <- msneg_rt[[c]] - } - else - # Set retention times - rt <- if (is.null(mspos_rt)) msneg_rt else mspos_rt - - if (is.null(rt)) rt <- list() - - # Write in cache - if ( ! 
is.na(cache_file)) { - - # Call observers - if ( ! is.null(.self$.observers)) - for (obs in .self$.observers) - obs$progress(paste0("Caching retention times of file ", .self$.get.file(molid), ".")) - - save(rt, file = cache_file) - } - } - - # Store in memory - .self$.mem.set(rt, molid, 'rt') - } - - # Select only one column if asked - if ( ! is.na(col)) rt <- rt[[col]] - - return(rt) - }) - - ################# - # GET NB PEAKS # - ################# - - MsXlsDb$methods( getNbPeaks = function(molid = NA_integer_, type = NA_character_) { - - # Initialize parameters - if (is.null(molid) || (length(molid) == 1 && is.na(molid))) - molid <- .self$getMoleculeIds() - if (is.na(type)) - type <- c(MSDB.TAG.POS, MSDB.TAG.NEG) - - return(sum(vapply(molid, function(m) { if (is.na(m)) 0 else sum(vapply(type, function(t) { peaks <- .self$.get.peaks(m, t) ; if (is.null(peaks)) 0 else nrow(peaks) }, FUN.VALUE = 1)) }, FUN.VALUE = 1))) - }) - - ################## - # GET PEAK TABLE # - ################## - - MsXlsDb$methods( getPeakTable = function(molid = NA_integer_, mode = NA_character_) { - - peaks <- NULL - - # Set default molecule IDs - if (is.null(molid) || (length(molid) == 1 && is.na(molid))) - molid <- .self$getMoleculeIds() - - # Set default modes - if (is.null(mode) || (length(mode) == 1 && is.na(mode))) - mode <- c(MSDB.TAG.POS, MSDB.TAG.NEG) - - # Loop on all molecules - for (mol in molid) { - - # Loop on all modes - for (mod in mode) { - m.peaks <- .self$.get.peaks(mol, mod) - if ( ! is.null(m.peaks) && nrow(m.peaks) > 0) { - m.peaks[[MSDB.TAG.MOLID]] <- mol - m.peaks[[MSDB.TAG.MODE]] <- mod - peaks <- if (is.null(peaks)) m.peaks else rbind(peaks, m.peaks) - peaks <- df.move.col.first(peaks, c(MSDB.TAG.MOLID, MSDB.TAG.MODE)) - } - } - } - - return(peaks) - }) - - ################# - # GET MZ VALUES # - ################# - - # Returns a numeric vector of all masses stored inside the database. 
- MsXlsDb$methods( getMzValues = function(mode = NULL, max.results = NA_integer_) { - - mz <- numeric() - - # Get all mz values of all molecules - for(molid in .self$getMoleculeIds()) - for (m in (if (is.null(mode) || is.na(mode)) c(MSDB.TAG.POS, MSDB.TAG.NEG) else mode)) - mz <- c(mz, .self$.get.peaks(molid, m)[[MSDB.TAG.MZTHEO]]) - - # Remove duplicated - mz <- mz[ ! duplicated(mz)] - - # Apply cut-off - if ( ! is.na(max.results)) - mz <- mz[1:max.results] - - return(mz) - }) - - ############# - # GET PEAKS # - ############# - - MsXlsDb$methods( .get.peaks = function(molid, mode) { - - tab <- if (mode == MSDB.TAG.POS) .XLS_MSPOS_TAB else .XLS_MSNEG_TAB - - # Find it in memory - peak_df <- .self$.mem.get(molid, 'peaks', mode) - - if (is.null(peak_df)) { - # Call observers - if ( ! is.null(.self$.observers)) - for (obs in .self$.observers) - obs$progress(paste0("Loading peaks of tab ", tab, " of file ", .self$.get.file(molid), "."), level = 2) - - peak_df <- NULL - - # Load from cache file - cache_file <- NA_character_ - if ( ! is.na(.self$.get.cache.dir())) { - cache_file <- file.path(.self$.get.cache.dir(), paste0('peaks-', molid, '-', tab, '.csv')) - if (file.exists(cache_file)) - peak_df <- read.csv(cache_file, header = TRUE, stringsAsFactors = FALSE) - } - - # Read from XLS file, if not in cache - if (is.null(peak_df)) { - - # Load tab (peaks start at row 8) - if (.self$.tab.exists(.self$.get.file(molid), tab)) { - - peaks <- read.excel(.self$.get.file(molid), tab, start.row = .XLS_PEAKS_ROW_OFFSET, stringsAsFactors = FALSE) - if ( ! is.null(peaks)) - peaks <- peaks[ ! is.na(peaks[.XLS_MZ_COL]), , drop = FALSE] # Remove rows where m/z is not defined. TODO maybe call observer for notify a line with non NA values but without m/z value. - - # Instantiate peaks - if ( ! 
is.null(peaks) && nrow(peaks) > 0) { - peak_df <- peaks[1:length(peaks[[.XLS_MZ_COL]]), c(.XLS_MZ_COL, .XLS_THEORETICAL_MZ_COL, .XLS_INTENSITY_COL, .XLS_RELATIVE_COL, .XLS_COMPOSITION_COL, .XLS_ATTRIBUTION_COL), drop = FALSE] - colnames(peak_df) <- c(MSDB.TAG.MZEXP, MSDB.TAG.MZTHEO, MSDB.TAG.INT, MSDB.TAG.REL, MSDB.TAG.COMP, MSDB.TAG.ATTR) - } - - # Set default data frame (important for cache file writing, because we need a correct header to be written in order for loading) - else { - peak_df <- data.frame(stringsAsFactors = FALSE) - peak_df[MSDB.TAG.MZEXP] <- numeric() - peak_df[MSDB.TAG.MZTHEO] <- numeric() - peak_df[MSDB.TAG.INT] <- numeric() - peak_df[MSDB.TAG.REL] <- numeric() - peak_df[MSDB.TAG.COMP] <- character() - peak_df[MSDB.TAG.ATTR] <- character() - } - - if (is.null(peak_df)) peak_df <- data.frame() - - # Write in cache - if ( ! is.na(cache_file)) { - - # Call observers - if ( ! is.null(.self$.observers)) - for (obs in .self$.observers) - obs$progress(paste0("Caching peaks of tab ", tab, " of file ", .self$.get.file(molid), ".")) - - write.csv(peak_df, cache_file, row.names = FALSE) - } - } - } - - # Store in memory - .self$.mem.set(peak_df, molid, 'peaks', mode) - } - - return(peak_df) - }) - - ############################## - # GET FULL MS PEAK M/Z INDEX # - ############################## - - # Get mz index for full ions, creating it if necessary. - MsXlsDb$methods( .get.mz.index = function(mode) { - - if (is.null(.self$.mz.index[[mode]])) { - - # Initialize data frame - mzi <- data.frame(stringsAsFactors = FALSE) - mzi[MSDB.TAG.MZTHEO] <- numeric() - mzi[MSDB.TAG.MOLID] <- character() - mzi[MSDB.TAG.COMP] <- character() - mzi[MSDB.TAG.ATTR] <- character() - - # Loop on all molecules - for(molid in .self$getMoleculeIds()) { - - # Get all peaks of this molecule - peaks <- .self$.get.peaks(molid, mode) - - # Remove rows whose mz is NA. - peaks <- peaks[ ! 
is.na(peaks[[MSDB.TAG.MZTHEO]]), ] - - if (nrow(peaks) > 0) { - - # Add id column - peaks[MSDB.TAG.MOLID] <- molid - - # Append peaks - r <- nrow(mzi) + 1 - rows <- r:(r+nrow(peaks)-1) - mzi[rows, ] <- peaks[colnames(mzi)] - } - } - - # Sort by M/Z - sorted_indices <- order(mzi[[MSDB.TAG.MZTHEO]]) - - # Group in a data frame - .self$.mz.index[[mode]] <- mzi[sorted_indices, ] - } - - return(.self$.mz.index[[mode]]) - }) - - ###################### - # SEARCH FOR MZ & RT # - ###################### - - MsXlsDb$methods( .do.search.for.mz.rt.bounds = function(mode, mz.low, mz.high, rt.low = NULL, rt.high = NULL, col = NULL, attribs = NULL, molids = NULL) { - - # Search for m/z - results <- .self$.do.search.for.mz(mode, mz.low, mz.high) - - # Filter on attributions - if ( ! is.null(attribs)) { - results <- results[results[[MSDB.TAG.ATTR]] %in% attribs, ] - } - - # Filer on molecule IDs - if ( ! is.null(molids)) { - results <- results[results[[MSDB.TAG.MOLID]] %in% molids, ] - } - - # Use retention time - if ( ! is.null(col) && ! is.null(rt.low) && ! is.null(rt.high)) { - - # Get list of unique IDs - ids <- results[[MSDB.TAG.MOLID]] - ids <- ids[ ! 
duplicated(ids)] - rt <- .self$.search.for.rt(mols = ids, rt.low = rt.low, rt.high = rt.high, col = col) - results <- results[results[[MSDB.TAG.MOLID]] %in% rt[[MSDB.TAG.MOLID]], ] - results <- merge(results, rt) - } - - return(results) - }) - - ############################## - # SEARCH FOR M/Z IN MS PEAKS # - ############################## - - MsXlsDb$methods( .do.search.for.mz = function(mode, mz.low, mz.high) { - - results <- data.frame(stringsAsFactors = FALSE) - results[MSDB.TAG.MZTHEO] <- numeric() - results[MSDB.TAG.MOLID] <- character() - results[MSDB.TAG.MOLNAMES] <- character() - results[MSDB.TAG.COMP] <- character() - results[MSDB.TAG.ATTR] <- character() - - # Create m/z index - mz_index <- .self$.get.mz.index(mode) - - # Find molecules - low_bound <- binary.search(mz.low, mz_index[[MSDB.TAG.MZTHEO]], lower = FALSE) - high_bound <- binary.search(mz.high, mz_index[[MSDB.TAG.MZTHEO]], lower = TRUE) - - # Get results - if ( ! is.na(high_bound) && ! is.na(low_bound) && low_bound <= high_bound) - results <- mz_index[low_bound:high_bound,] - - # Remove row names - rownames(results) <- NULL - - return(results) - }) - - ################ - # GET MOL NAME # - ################ - - MsXlsDb$methods( .get.mol.name = function(molid) { - - if (is.na(molid)) - return(NA_character_) - - # Find it in memory - name <- .self$.mem.get(molid, 'name') - - if (is.null(name)) { - - # Load molecule - mol <- .self$.load.molecule(molid) - - # Look for name in tabs - for (tab in c(.XLS_MSPOS_TAB, .XLS_MSNEG_TAB)) { - hdr <- mol[[tab]][['header']] - if ( ! is.null(hdr)) - name <- hdr[[1]] - if ( ! is.null(name) && ! is.na(name)) break - } - - # Store in memory - if (is.null(name)) name <- NA_character_ - .self$.mem.set(name, molid, 'name') - } - - return(name) - }) - - ################## - # GET NAME INDEX # - ################## - - # Get name index. 
- MsXlsDb$methods( .get.name.index = function() { - - if (is.null(.self$.name_index)) { - - # Get names - names <- vapply(.self$getMoleculeIds(), function(id) toupper(.self$getMoleculeName(id)), FUN.VALUE = "") - - # Get molecule IDs - id <- .self$getMoleculeIds() - - # Sort by names - sorted_indices <- order(names) - - # Group in a data frame - .self$.name_index <- data.frame(name = rbind(names)[, sorted_indices], - id = rbind(id)[, sorted_indices], - stringsAsFactors = FALSE) - } - - return(.self$.name_index) - }) - - ################## - # INIT FILE LIST # - ################## - - MsXlsDb$methods( .init.file.list = function() { - - if (is.null(.self$.files)) { - - # List all files - files <- Sys.glob(file.path(.self$.db_dir, '*.xls')) - - # Limit the size of the database - if ( ! is.na(.self$.limit)) - files <- head(files, .self$.limit) - - # Get IDs - ids <- vapply(files, function(f) .extract_molecule_id_from_filename(f), FUN.VALUE = 1) - - # Use ids as indices to build the vector of files - .files <<- rep(NA_character_, max(ids)) - .files[ids] <<- files - } - }) - - ################# - # GET CACHE DIR # - ################# - - MsXlsDb$methods( .get.cache.dir = function() { - - if ( ! is.na(.self$.cache_dir) && ! file.exists(.self$.cache_dir)) - dir.create(.self$.cache_dir) - - return(.self$.cache_dir) - }) - - ################# - # LOAD MOLECULE # - ################# - - MsXlsDb$methods( .load.molecule = function(molid) { - - # Init local variables - mol <- NULL - cache_file <- NA_character_ - excel_file <- .self$.get.file(molid) - - # Call observers - if ( ! is.null(.self$.observers)) - for (obs in .self$.observers) - obs$progress(paste0("Loading molecule ", molid, "."), level = 2) - - # Load from cache - if ( ! 
is.na(.self$.get.cache.dir())) { - cache_file <- file.path(.self$.get.cache.dir(), paste0(molid, '.bin')) - if (file.exists(cache_file)) - load(file = cache_file) # load mol variable - } - - # Load from Excel file & write to cache - if (is.null(mol) && ! is.na(excel_file)) { - - source(file.path(.THIS.FILE.PATH, 'excelhlp.R'), chdir = TRUE) # we use the path set when sourcing the file, since when calling this method, the current path could be different. - - # Load from Excel file - for(tab in c(.XLS_MSPOS_TAB, .XLS_MSNEG_TAB)) { - - # Test that tab exists - if (.self$.tab.exists(excel_file, tab)) { - header <- read.excel(excel_file, tab, start.row = 1, end.row = .XLS_PEAKS_ROW_OFFSET - 1, header = FALSE, stringsAsFactors = FALSE, trim.values = TRUE, col.index = c(1))[[1]] - peaks <- read.excel(excel_file, tab, start.row = .XLS_PEAKS_ROW_OFFSET) - mol[[tab]] <- list(header = header, peaks = peaks) - } - - # Missing tab - else { - for (obs in .self$.observers) - obs$warning(paste0("No excel tab ", tab, " in file ", excel_file, ".")) - } - } - - # Write in cache - if ( ! is.na(cache_file)) { - - # Call observers - if ( ! is.null(.self$.observers)) - for (obs in .self$.observers) - obs$progress(paste0("Caching file ", excel_file, ".")) - - save(mol, file = cache_file) - } - } - - return(mol) - }) - - ######################## - # DOES EXCEL TAB EXIST # - ######################## - - MsXlsDb$methods( .tab.exists = function(file, tab) { - - source(file.path(.THIS.FILE.PATH, 'excelhlp.R'), chdir = TRUE) # we use the path set when sourcing the file, since when calling this method, the current path could be different. - - if ( ! 
tab.exists(file, tab)) { - - # Warn observers - for (obs in .self$.observers) - obs$warning(paste0("No excel tab ", tab, " in file ", file, ".")) - - return(FALSE) - } - - return(TRUE) - }) - - ######################### - # PARSE RETENTION TIMES # - ######################### - - MsXlsDb$methods( .parse_retention_times = function(id, tab) { - - rt <- NULL - - if (.self$.tab.exists(.self$.get.file(id), tab)) { - peaks <- read.excel(.self$.get.file(id), tab, start.row = .XLS_PEAKS_ROW_OFFSET) - - # Get retention times - if ( ! is.null(peaks) && length(peaks) > 0 && ! is.na(peaks[[1]][[1]])) - for (c in .XLS_PEAKS_RT_COL_START:length(names(peaks))) - if ( ! is.na(peaks[[c]][[1]])) { - - # Check retention times of all different m/z peaks for the same column. - .self$.check_retention_times(id, tab, names(peaks)[[c]], peaks[[c]], sum( ! is.na(peaks[[1]]))) - - # Add retention time - # TODO The column names are transformed through the read.xlsx call. For instance: - # HPLC (C18) 25mn QTOF (Bis) --> HPLC..C18..25mn.QTOF..Bis. - # ZICpHILIC 150*5*2.1 Shimadzu-Exactive-42mn --> ZICpHILIC.150.5.2.1.Shimadzu.Exactive.42mn - # This can be an issue, since we loose the formating. - col_id <- names(peaks)[[c]] - time <- peaks[[c]][[1]] * 60 # Read and convert retention time in seconds. - if (is.null(rt) || ! col_id %in% names(rt)) - rt[[col_id]] <- list(time) - else - rt[[col_id]] <- c(rt[[col_id]], time) - } - } - - return(rt) - }) - - ######################### - # CHECK RETENTION TIMES # - ######################### - - MsXlsDb$methods( .check_retention_times = function(id, tab_name, column_name, rt, n) { - - if (n >= 1 && ! is.null(.self$.observers) && length(.self$.observers) > 0) - - # Check column only if there is at least one value inside - if (sum( ! 
is.na(rt)) > 0) - - # Loop on all values - for(i in 1:n) { - - # Check that it's defined - if (i > 1 && is.na(rt[[i]])) - for (obs in .self$.observers) - obs$warning(paste0("Retention times undefined for column ", column_name, " at row ", i + .XLS_PEAKS_ROW_OFFSET, " of tab ", tab_name, " in file ", .self$.get.file(id), ".")) - - else if (i > 1) - # Check the value (it must be constant) - if (rt[[i-1]] != rt[[i]]) - for (obs in .self$.observers) - obs$error(paste0("Retention times not constant for column ", column_name, " between row ", i - 1 + .XLS_PEAKS_ROW_OFFSET, " and row ", i + .XLS_PEAKS_ROW_OFFSET, "o tab", tab_name, "in file", .self$.get.file(id))) - } - }) - - #################### - # GET FILE FROM ID # - #################### - - MsXlsDb$methods( .get.file = function(id) { - - # List files - .self$.init.file.list() - - return( if (id > 0 && id <= length(.self$.files)) .self$.files[id] else NA_character_) - }) - - ########### - # MEM GET # - ########### - - # Get database data from memory - MsXlsDb$methods( .mem.get = function(molid, field, second.field = NA_character_) { - - data <- .self$.db[[as.character(molid)]][[field]] - - if ( ! is.na(second.field)) - data <- data[[second.field]] - - return(data) - }) - - ########### - # MEM SET # - ########### - - # Set database data into memory - MsXlsDb$methods( .mem.set = function(data, molid, field, second.field = NA_character_) { - - id <- as.character(molid) - - # Create db - if (is.null(.self$.db)) - .db <<- list() - - # Create first level - if (is.null(.self$.db[[id]])) - .self$.db[[id]] <- list() - - # Create second level - if ( ! is.na(second.field) && is.null(.self$.db[[id]][[field]])) - .self$.db[[id]][[field]] <- list() - - # Store data - if (is.na(second.field)) { - .self$.db[[id]][[field]] <- data - } else { - .self$.db[[id]][[field]][[second.field]] <- data - } - }) - - ################# - # SEARCH FOR RT # - ################# - - # Find molecules matching a certain retention time. 
- # col A list of chromatographic columns to use. - # rt.low The lower bound of the rt value. - # rt.high The higher bound of the rt value. - # mols A list of molecule IDs to process. If unset, then take all molecules. - # Return a data frame with the following columns: id, col, colrt. - MsXlsDb$methods( .search.for.rt = function(col, rt.low, rt.high, mols = NULL) { - - # Use all molecules if no list is provided - if (is.null(mols)) - mols <- .self$getMoleculeIds() - - results <- data.frame(id = integer(), col = character(), colrt = double(), stringsAsFactors = FALSE) - colnames(results) <- c(MSDB.TAG.MOLID, MSDB.TAG.COL, MSDB.TAG.COLRT) - - # Loop on all molecules - for (molid in mols) { - no.col <- TRUE - for (c in col) { - molrts <- .self$getRetentionTimes(molid, c) - if ( ! is.null(molrts)) { - no.col <- FALSE - for (molrt in molrts) { - if (molrt >= rt.low && molrt <= rt.high) { - r <- nrow(results) + 1 - results[r, ] <- c(id = molid, col = c, colrt = molrt) - } - } - } - } - - if (no.col) { - r <- nrow(results) + 1 - results[r, c(MSDB.TAG.MOLID)] <- c(id = molid) - } - } - - return(results) - }) - - ############################ - # EXTRACT ID FROM FILENAME # - ############################ - - .extract_molecule_id_from_filename <- function(filename) { - - id <- NA_integer_ - - if ( ! is.na(filename)) { - g <- str_match(filename, "N(\\d+)[._-]") - if ( ! is.na(g[1,1])) - id <- as.numeric(g[1,2]) - } - - return(id) - } - -} # end of load safe guard
--- a/PeakforestConn.R Wed Apr 19 10:00:05 2017 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,176 +0,0 @@ -##################### -# CLASS DECLARATION # -##################### -#'A class to connect to peakforest -#'@export -#'@field .url An urel to the database -PeakforestConn <- methods::setRefClass("PeakforestConn", contains = c("RemotedbConn","MassdbConn"), fields = list( .url = "character" )) # TODO Inherits also from MassdbConn - -########################## -# GET ENTRY CONTENT TYPE # -########################## - -PeakforestConn$methods( getEntryContentType = function(type) { - return(BIODB.JSON) -}) - -##################### -# GET ENTRY CONTENT # -##################### - -PeakforestConn$methods( getEntryContent = function(id) { - - - # Initialize return values - content <- rep(NA_character_, length(id)) - # Request - - url <- get.entry.url(BIODB.PEAKFOREST, id[i], BIODB.JSON,token = .self$.token) - jsonstr <- .self$.get.url(url) - if(startsWith("<html>", jsonstr) ){ - next - } - - return(content) -}) - - -########################################## -# SEARCH FOR SPECTRA IN GIVEN MASS RANGE # -########################################## - -PeakforestConn$methods( searchMzRange = function(mzmin, mzmax, rtype = c("object","spec","peak")){ - - rtype <- match.arg(rtype) - if(mzmin>mzmax){ - stop("mzmin shloud be inferior to mzmax in searchMzRange.") - } - - url <- paste0("https://rest.peakforest.org/spectra/lcms/peaks/get-range/",mzmin,"/",mzmax) - - contents <- .self$.get.url(url) - - jsontree <- fromJSON(contents) - - ###No match form the output. - if( length(jsontree)==0 ) return(NULL) - - # Getting a list of all the id. 
- lid <- sapply(jsontree,function(x){ - x$source$id - }) - - # Returning the content for all the spectra - contents <- .self$getEntryContent(lid) - - entries <- .self$createEntry(contents) - - # Checking the return type - if( rtype=="object" ){ - return( entries ) - } - - ### XXXX See if we don't want to reduce the output and factorize this shit. - toreturn <- NULL - if( rtype=="spec" ){ - toreturn <- sapply(entries,function(x){ - x$getFieldsAsDataFrame() - }) - } - if( rtype=="peak" ){ - toreturn <- lapply(entries,function(x){ - temp <- as.data.frame( x$getFieldValue( BIODB.PEAKS )) - temp$accession = x$getFieldValue( BIODB.ACCESSION) - return(temp) - - }) - } - ###Trying to convert in data.frame - if(!is.data.frame(toreturn)){ - temp <- colnames(toreturn[[1]]) - toreturn <- do.call("rbind.fill",toreturn) - colnames(toreturn) <- temp - } - - return(toreturn) -}) - - -################################################# -# SEARCH FOR SPECTRA IN A TOLERANCE AROUND A MZ # -################################################# - -PeakforestConn$methods( searchMzTol = function(mz, tol, tolunit=BIODB.MZTOLUNIT.VALS, - rtype = c("object","spec","peak")){ - - rtype <- match.arg(rtype) - tolunit <- match.arg(tolunit) - - if( tolunit == BIODB.MZTOLUNIT.PPM){ - tol <- tol * mz * 10^-6 - } - - mzmin <- mz - tol - mzmax <- mz + tol - - return(.self$searchMzRange(mzmin, mzmax, rtype = rtype)) - -}) - -################################################## -# SEARCH FOR MSMS SPECTRA PRECUSOR AROUND A MASS # -################################################## - - -PeakforestConn$methods( - searchSpecPrecTol = function(mz, - tol, - tolunit = "plain", - mode = NULL) { - #TODO handle the units - #tolunit <- match.arg(tolunit) - - strmode <- '' - - if (!is.null(mode)) { - if (mode %in% c(BIODB.MSMODE.NEG, BIODB.MSMODE.POS)) { - strmode <- paste0('?polarity=', mode) - } - - } - - if (tolunit == BIODB.MZTOLUNIT.PPM) { - tol <- tol * mz * 10 ^ -6 - } - - ##Request which return peak and not 
spectra. - url <- - paste0( - "https://rest.peakforest.org/spectra/lcms/search-naive/", - mz, - "/", - tol, - strmode - ) - contents <- .self$.get.url(url) - entries <- .self$createReducedEntry(contents, drop = TRUE) - return(entries) - } -) - - -################ -# CREATE ENTRY # -################ - -# Creates a Spectrum instance from file content. -# content A file content, downloaded from the public database. -# RETURN A spectrum instance. -PeakforestConn$methods( createEntry = function(content, drop = TRUE) { - return(createPeakforestSpectraFromJSON(content, drop = drop)) -}) - -PeakforestConn$methods( createReducedEntry = function(content , drop = TRUE){ - entries <- createReducedSpectraFromJSON(content, drop = drop) - return(entries) -})
--- a/PeakforestEntry.R Wed Apr 19 10:00:05 2017 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,250 +0,0 @@ -##################### -# CLASS DECLARATION # -##################### - -# TODO Create class PeakforestCompoundEntry -PeakForestSpectrumEntry <- methods::setRefClass("PeakForestSpectrumEntry", contains = "BiodbEntry") - -PeakForestCompoundEntry <- methods::setRefClass("PeakForestCompoundEntry", contains = "BiodbEntry") - - -########### -# FACTORY # -########### - - -###Arg is jcontent ot indicate that the content is already a json. -createPeakforestCompoundFromJSON <- function(contents, drop = FALSE) { - - if(is.character(contents)) - contents <- jsonlite::fromJSON(contents, simplifyDataFrame=FALSE) - - jsonfields <- list() - jsonfields[[BIODB.ACCESSION]] <- "id" - jsonfields[[BIODB.PUBCHEMCOMP.ID]] <- "PubChemCID" - jsonfields[[BIODB.CHEBI.ID]] <- "ChEBI" - jsonfields[[BIODB.HMDB.ID]] <- "HMDB" - jsonfields[[BIODB.KEGG.ID]] <- "KEGG" - jsonfields[[BIODB.FORMULA]] <- "formula" - jsonfields[[BIODB.SMILES]] <- "canSmiles" - jsonfields[[BIODB.AVERAGE.MASS]] <- "averageMass" - jsonfields[[BIODB.MONOISOTOPIC.MASS]] <- "monoisotopicMass" - jsonfields[[BIODB.INCHI]] <- "inChI" - jsonfields[[BIODB.INCHIKEY]] <- "inchiIKey" - jsonfields[[BIODB.NAME]] <- "mainName" - - entries <- vector(length(contents),mode="list") - - for (i in seq_along(contents)){ - - jsontree <- contents[[i]] - entry <- PeakForestCompoundEntry$new() - - - for(field in names(jsonfields)){ - - tosearch <- jsonfields[[field]] - value <- jsontree$tosearch - entry$setField(field,value) - } - - entries[[i]] <- entry - } - - - if (drop && length(contents) == 1) - entries <- entries[[1]] - - entries -} - -createPeakforestSpectraFromJSON <- function(contents, drop = FALSE, checkSub = TRUE) { - - entries <- vector(length(contents),mode="list") - jsonfields <- character() - jsonfields[[BIODB.ACCESSION]] <- "id" # TODO Use BIODB.ACCESSION instead - jsonfields[[BIODB.MSMODE]] <- "polarity" - - - 
###Checking that it's a list. - if(length(contents) == 1){ - if(startsWith(contents[[1]], "<html>") ){ - return(NULL) - }else{ - contents <- jsonlite::fromJSON(contents[[1]],simplifyDataFrame=FALSE) - - } - } - - for (i in seq_along(contents)){ - - content <- contents[[i]] - jsontree <- NULL - if(typeof(content) == "character"){ - if(startsWith(content, "<html>")|content=="null"){ - entries[[i]] <- NULL - next - } - jsontree <- jsonlite::fromJSON(content,simplifyDataFrame=FALSE) - }else{ - jsontree <- content - } - cnames <- c(BIODB.PEAK.MZ, BIODB.PEAK.RELATIVE.INTENSITY, BIODB.PEAK.FORMULA, BIODB.PEAK.MZTHEO, BIODB.PEAK.ERROR.PPM) - - entry <- PeakForestSpectrumEntry$new() - #####Setting thz mass analyzer - entry$setField(BIODB.MSDEV,jsontree$analyzerMassSpectrometerDevice$instrumentName) - entry$setField(BIODB.MSDEVTYPE,jsontree$analyzerMassSpectrometerDevice$ionAnalyzerType) - - - - for(field in names(jsonfields)){ - - tosearch <- jsonfields[[field]] - value <- jsontree$tosearch - entry$setField(field,value) - } - - ###################### - # TREATING THE PEAKS # - ###################### - - entry$setField(BIODB.NB.PEAKS,length(jsontree$peaks)) - peaks <- data.frame( matrix( 0,ncol = length(cnames), nrow = 0)) - colnames(peaks) <- cnames - ###Parsing peaks. - if(length(jsontree$peaks) != 0){ - peaks <- sapply(jsontree$peaks,function(x){ - return(list(as.double(x$mz), - as.integer(x$ri), - as.character(x$composition), - as.double(x$theoricalMass), - as.double(x$deltaPPM) - )) - }) - ###Removing all whitespaces from the formule. - peaks[3,]<-vapply(peaks[3,],function(x){ - gsub(" ","",trimws(x)) - },FUN.VALUE = NA_character_) - - peaks<-t(peaks) - colnames(peaks)<-cnames - } - - entry$setField(BIODB.PEAKS,peaks) - - ################################## - # TREATING THE LIST OF COMPOUNDS # - ################################## - - entry$setField(BIODB.NB.COMPOUNDS,length(jsontree$listOfCompounds)) - compounds <- list() - - ###Parsing compounds. 
- if( length( jsontree$listOfCompounds) != 0){ - compounds <- lapply( jsontree$listOfCompounds, function(x){ - createPeakforestCompoundFromJSON(x) - }) - } - - entry$setField(BIODB.COMPOUNDS, compounds) - - - entries[[i]] <- entry - } - - - if (drop && length(contents) == 1) - entries <- entries[[1]] - - entries -} - - -####TDO CLEAN THIS - -createReducedSpectraFromJSON <- function(contents, - drop = FALSE, - checkSub = TRUE) { - entries <- vector(length(contents), mode = "list") - jsonfields <- character() - # jsonfields[[BIODB.ACCESSION]] <- - # "id" # TODO Use BIODB.ACCESSION instead - - - ###Checking that it's a list. - if (length(contents) == 1) { - if (startsWith(contents[[1]], "<html>")) { - return(NULL) - } else{ - contents <- jsonlite::fromJSON(contents[[1]], simplifyDataFrame=FALSE) - - } - } - - for (i in seq_along(contents)) { - content <- contents[[i]] - jsontree <- NULL - if (typeof(content) == "character") { - if (startsWith(content, "<html>") | content == "null") { - entries[[i]] <- NULL - next - } - jsontree <- jsonlite::fromJSON(content, simplifyDataFrame=FALSE) - } else{ - jsontree <- content - } - - - cnames <- - c( - BIODB.PEAK.MZ, - BIODB.PEAK.RELATIVE.INTENSITY, - BIODB.PEAK.FORMULA, - BIODB.PEAK.MZTHEO, - BIODB.PEAK.ERROR.PPM - ) - - entry <- PeakForestSpectrumEntry$new() - entry$setField(BIODB.ACCESSION, jsontree$id) - - ###################### - # TREATING THE PEAKS # - ###################### - - entry$setField(BIODB.NB.PEAKS, length(jsontree$peaks)) - peaks <- data.frame(matrix(0, ncol = length(cnames), nrow = 0)) - colnames(peaks) <- cnames - ###Parsing peaks. - if (length(jsontree$peaks) != 0) { - peaks <- sapply(jsontree$peaks, function(x) { - return( - list( - as.double(x$mz), - as.integer(x$ri), - as.character(x$composition), - as.double(x$theoricalMass), - as.double(x$deltaPPM) - ) - ) - }) - ###Removing all whitespaces from the formule. 
- peaks[3, ] <- vapply(peaks[3, ], function(x) { - gsub(" ", "", trimws(x)) - }, FUN.VALUE = NA_character_) - - peaks <- as.data.frame(t(peaks)) - colnames(peaks) <- cnames - } - - entry$setField(BIODB.PEAKS, peaks) - - entries[[i]] <- entry - } - - - if (drop && length(contents) == 1) - entries <- entries[[1]] - - entries -}
--- a/README.md Wed Apr 19 10:00:05 2017 -0400 +++ b/README.md Fri Feb 22 16:04:22 2019 -0500 @@ -9,28 +9,41 @@ For more information, see the galaxy tool page, help section, available inside `galaxy/lcmsmatching.xml`. -## search-mz +## lcmsmatching script -This is the script, included in this repository, that allows run on command line an MZ matching on one of the available database types. +This is the script, included in this repository, that lets you run an MZ matching from the command line on one of the available database types. -Please run `search-mz -h` for a help page listing all options and presenting some examples. +Please run `lcmsmatching -h` for a help page listing all options and presenting some examples. ## Dependencies - * `libssl-dev`. - * `libcurl4-openssl-dev`. - * `libxml2-dev`. - * `R` version `3.2.2`. + * `R` version `3.5.1`. * `R` packages: - `getopt` >= `1.20.0`. - - `stringr` >= `1.0.0`. - - `plyr` >= `1.8.3`. - - `XML` >= `3.98`. - - `bitops` >= `1.0_6`. - - `RCurl` >= `1.95`. - - `jsonlite` >= `1.1`. + - `biodb` >= `1.2.0rc2`. + +## Changelog + +### 4.0.2 + + * Increase getopt version to 1.20.2. + +### 4.0.1 + + * Downgrade to Galaxy 18.05. Test in both 18.05 and 18.09. -## Updates +### 4.0.0 + + * Switch to biodb R library (<http://github.com/pkrog/biodb>). + * Remove Excel and 4TabSql databases from script. + * Remove all dynamic fields in XML (i.e.: fields computed using python scripts, like the list of chromatographic columns). + * Now use a single field for in-house file databases column names, whose value is a comma-separated list of key/value pairs. + * Update Peakforest URL. + +### 3.4.3 + + * Returns empty match in case of NA values in mz.low and mz.high. + * Speed up HTML output writing. ### 3.3.1
# File: UrlRequestScheduler.R (listed in this changeset as removed: "+++ /dev/null").
#
# Reference class that sends URL requests through RCurl while limiting the
# request rate to `n` requests per `t` seconds, retrying failed requests.

if ( ! exists('UrlRequestScheduler')) { # Do not load again if already loaded

  #############
  # CONSTANTS #
  #############

  RLIB.GET <- 'GET'
  RLIB.POST <- 'POST'

  #####################
  # CLASS DECLARATION #
  #####################

  # Fields:
  #   .n                    Number of connections allowed per time window.
  #   .t                    Length of the time window, in seconds.
  #   .time.of.last.request -1 until a request has been made, then a POSIXct time stamp.
  #   .useragent            User agent string handed to curl.
  #   .ssl.verifypeer       Whether curl must verify the peer SSL certificate.
  #   .nb.max.tries         Maximum number of attempts for one request.
  #   .verbose              Verbosity level; a retry notice is printed when > 0.
  #
  # The scheduler restricts the number of connections to n per t seconds.
  UrlRequestScheduler <- methods::setRefClass("UrlRequestScheduler", fields = list(.n = "numeric", .t = "numeric", .time.of.last.request = "ANY", .useragent = "character", .ssl.verifypeer = "logical", .nb.max.tries = "integer", .verbose = "integer"))

  ###############
  # CONSTRUCTOR #
  ###############

  UrlRequestScheduler$methods( initialize = function(n = 1, t = 1, useragent = NA_character_, ssl.verifypeer = TRUE, ...) {
    .n <<- n
    .t <<- t
    .time.of.last.request <<- -1
    .useragent <<- useragent
    .nb.max.tries <<- 10L
    .ssl.verifypeer <<- ssl.verifypeer
    .verbose <<- 0L
    callSuper(...) # calls super-class initializer with remaining parameters
  })

  ##################
  # SET USER AGENT #
  ##################

  UrlRequestScheduler$methods( setUserAgent = function(useragent) {
    .useragent <<- useragent
  })

  ###############
  # SET VERBOSE #
  ###############

  UrlRequestScheduler$methods( setVerbose = function(verbose) {
    # The field is declared "integer": coerce so that a plain numeric (e.g. 1) is accepted.
    .verbose <<- as.integer(verbose)
  })

  ##################
  # WAIT AS NEEDED #
  ##################

  # Sleep, if needed, so that at least t / n seconds separate two requests,
  # then record the current time as the time of the last request.
  UrlRequestScheduler$methods( .wait.as.needed = function() {

    # Minimum waiting time, in seconds, between two URL requests
    waiting_time <- .self$.t / .self$.n

    # Wait, if needed, between previous URL request and this new URL request.
    # The field only holds a time stamp once a first request has been made.
    if (inherits(.self$.time.of.last.request, 'POSIXt')) {
      # Force seconds: a bare `Sys.time() - x` picks its own units (mins, hours, ...)
      # as soon as the gap exceeds one minute, which made the comparison below meaningless.
      spent_time <- as.numeric(difftime(Sys.time(), .self$.time.of.last.request, units = 'secs'))
      if (spent_time < waiting_time)
        Sys.sleep(waiting_time - spent_time)
    }

    # Store current time
    .time.of.last.request <<- Sys.time()
  })

  ####################
  # GET CURL OPTIONS #
  ####################

  # Build the curl options shared by all requests. `url` is accepted for
  # interface compatibility and is not used.
  UrlRequestScheduler$methods( .get_curl_opts = function(url) {
    # ssl.verifypeer is set here so that form requests honour it too.
    opts <- curlOptions(useragent = .self$.useragent, timeout.ms = 60000, verbose = FALSE, ssl.verifypeer = .self$.ssl.verifypeer)
    return(opts)
  })

  ###########
  # GET URL #
  ###########

  # Send one request, without scheduling or retry.
  #   url     The URL to request.
  #   params  Form parameters; NULL, empty or all-NA means "no form".
  #   method  RLIB.GET or RLIB.POST; only used when params are supplied.
  # Returns the content returned by RCurl. Errors raised by RCurl, or by an
  # unknown method, propagate to the caller.
  UrlRequestScheduler$methods( .doGetUrl = function(url, params = NULL, method = RLIB.GET) {

    content <- NA_character_

    # `is.na()` is vectorised: reduce it with all() so that a multi-element
    # params vector/list yields a single logical for the condition.
    has.params <- length(params) > 0 && ! all(is.na(params))

    # Use form to send URL request
    if (has.params)
      content <- switch(method,
        GET = getForm(url, .opts = .self$.get_curl_opts(url), .params = params),
        POST = postForm(url, .opts = .self$.get_curl_opts(url), .params = params),
        stop(paste0('Unknown method "', method, '".'))
      )

    # Get URL normally
    else
      content <- getURL(url, .opts = .self$.get_curl_opts(url))

    return(content)
  })

  # Send a request, waiting as required by the rate limit and retrying on
  # error up to .nb.max.tries times.
  # Returns the content, or NA_character_ if every attempt failed.
  UrlRequestScheduler$methods( getUrl = function(url, params = NULL, method = RLIB.GET) {

    # Load library here and not inside .doGetUrl() since it is called from inside a try/catch clause, hence if library is missing the error will be ignored.
    library(bitops)
    library(RCurl)

    content <- NA_character_

    # Run query
    for (i in seq_len(.self$.nb.max.tries)) {

      # Wait required time before each attempt: a retry is a connection too.
      .self$.wait.as.needed()

      content <- tryCatch(.self$.doGetUrl(url, params = params, method = method),
        error = function(e) {
          if (.self$.verbose > 0)
            print("Retry connection to server...")
          NA_character_
        })

      if ( ! is.na(content))
        break
    }

    return(content)
  })
}
# File: biodb-common.R (listed in this changeset as removed: "+++ /dev/null").
#
# Shared constants (cache modes, content types, database names, entry field
# names and attributes) and helpers (entry URL building, debug message
# printing, environment variable lookup).

if ( ! exists('BIODB.XML')) { # Do not load again if already loaded

  ###############
  # CACHE MODES #
  ###############

  BIODB.CACHE.READ.ONLY <- 'read-only'
  BIODB.CACHE.READ.WRITE <- 'read-write'
  BIODB.CACHE.WRITE.ONLY <- 'write-only'

  #######################
  # ENTRY CONTENT TYPES #
  #######################

  BIODB.HTML <- 'html'
  BIODB.TXT <- 'txt'
  BIODB.XML <- 'xml'
  BIODB.CSV <- 'csv'
  BIODB.DATAFRAME <- 'dataframe'
  BIODB.JSON <- 'json'

  #############
  # DATABASES #
  #############

  BIODB.CHEBI <- 'chebi'
  BIODB.KEGG <- 'kegg'
  BIODB.PUBCHEMCOMP <- 'pubchemcomp' # Compound database
  BIODB.PUBCHEMSUB <- 'pubchemsub' # Substance database
  BIODB.HMDB <- 'hmdb'
  BIODB.CHEMSPIDER <- 'chemspider'
  BIODB.ENZYME <- 'enzyme'
  BIODB.LIPIDMAPS <- 'lipidmaps'
  BIODB.MIRBASE <- 'mirbase'
  BIODB.NCBIGENE <- 'ncbigene'
  BIODB.NCBICCDS <- 'ncbiccds'
  BIODB.UNIPROT <- 'uniprot'
  BIODB.MASSBANK <- 'massbank'
  BIODB.MASSFILEDB <- 'massfiledb'
  BIODB.PEAKFOREST <- 'peakforest'

  BIODB.DATABASES <- c(BIODB.CHEBI, BIODB.KEGG, BIODB.PUBCHEMCOMP, BIODB.PUBCHEMSUB, BIODB.HMDB, BIODB.CHEMSPIDER, BIODB.ENZYME, BIODB.LIPIDMAPS, BIODB.MIRBASE, BIODB.NCBIGENE, BIODB.NCBICCDS, BIODB.UNIPROT, BIODB.MASSBANK, BIODB.MASSFILEDB, BIODB.PEAKFOREST)

  ##########
  # FIELDS #
  ##########

  BIODB.ACCESSION <- 'accession'
  BIODB.DESCRIPTION <- 'description'
  BIODB.PROTEIN.DESCRIPTION <- 'protdesc'
  BIODB.NAME <- 'name'
  BIODB.COMP.IUPAC.NAME.ALLOWED <- 'comp.iupac.name.allowed'
  BIODB.COMP.IUPAC.NAME.TRAD <- 'comp.iupac.name.trad'
  BIODB.COMP.IUPAC.NAME.SYST <- 'comp.iupac.name.syst'
  BIODB.COMP.IUPAC.NAME.PREF <- 'comp.iupac.name.pref'
  BIODB.COMP.IUPAC.NAME.CAS <- 'comp.iupac.name.cas'
  BIODB.FULLNAMES <- 'fullnames'
  BIODB.SYNONYMS <- 'synonyms'
  BIODB.SYMBOL <- 'symbol'
  BIODB.GENE.SYMBOLS <- 'genesymbols'
  BIODB.CHEBI.ID <- 'chebiid'
  BIODB.LIPIDMAPS.ID <- 'lipidmapsid'
  BIODB.KEGG.ID <- 'keggid'
  BIODB.HMDB.ID <- 'hmdbid'
  BIODB.ENZYME.ID <- 'enzymeid'
  BIODB.NCBI.CCDS.ID <- 'ncbiccdsid'
  BIODB.NCBI.GENE.ID <- 'ncbigeneid'
  BIODB.PUBCHEMCOMP.ID <- 'pubchemcompid'
  BIODB.PUBCHEMSUB.ID <- 'pubchemsubid'
  BIODB.CHEMSPIDER.ID <- 'chemspiderid'
  BIODB.UNIPROT.ID <- 'uniprotid'
  BIODB.CAS.ID <- 'casid'
  BIODB.PEAKFOREST.ID <- 'peakforestid'
  BIODB.SMILES <- 'smiles'
  BIODB.INCHI <- 'inchi'
  BIODB.INCHIKEY <- 'inchikey'
  BIODB.MSDEV <- 'msdev'
  BIODB.MSDEVTYPE <- 'msdevtype'
  BIODB.MSTYPE <- 'mstype'
  BIODB.MSMODE <- 'msmode'
  BIODB.MSPRECMZ <- 'msprecmz' # numeric
  BIODB.MSPRECANNOT <- 'msprecannot'
  BIODB.FORMULA <- 'formula'
  BIODB.SUPER.CLASS <- 'superclass'
  BIODB.MASS <- 'mass'
  BIODB.AVERAGE.MASS <- 'averagemass'
  BIODB.MONOISOTOPIC.MASS <- 'monoisotopicmass'
  BIODB.SEQUENCE <- 'sequence'
  BIODB.LOCATION <- 'location'
  BIODB.LENGTH <- 'length'
  BIODB.NB.PEAKS <- 'nbpeaks'
  BIODB.PEAKS <- 'peaks'
  BIODB.COMPOUNDS <- 'compounds'
  BIODB.NB.COMPOUNDS <- 'nbcompounds'
  BIODB.COMPOUND.ID <- 'compoundid'
  BIODB.COMPOUND.MASS <- 'compoundmass'
  BIODB.COMPOUND.COMP <- 'compoundcomp'
  BIODB.CHROM.COL <- 'chromcol' # Chromatographic column
  BIODB.CHROM.COL.RT <- 'chromcolrt' # Retention time measured on chromatographic column
  BIODB.ID <- 'id'
  BIODB.TITLE <- 'title'
  BIODB.PEAK.MZ <- 'mz'
  BIODB.PEAK.RT <- 'rt'
  BIODB.PEAK.MZEXP <- 'mzexp'
  BIODB.PEAK.MZTHEO <- 'mztheo'
  BIODB.PEAK.FORMULA <- 'formula'
  BIODB.PEAK.FORMULA.COUNT <- 'formula.count'
  BIODB.PEAK.COMP <- 'peakcomp' # Peak composition
  BIODB.PEAK.ATTR <- 'peakattr' # Peak attribution
  BIODB.PEAK.MASS <- 'mass'
#  BIODB.PEAK.ATTR <- 'attr'
  BIODB.PEAK.ERROR.PPM <- 'error.ppm'
  BIODB.PEAK.INTENSITY <- 'intensity'
  BIODB.PEAK.RELATIVE.INTENSITY <- 'relative.intensity'

  # Mode values
  BIODB.MSMODE.NEG <- 'neg'
  BIODB.MSMODE.POS <- 'pos'

  # Tolerance values
  BIODB.TOL <- 'mztol'
  BIODB.MZTOLUNIT.PPM <- 'ppm'
  BIODB.MZTOLUNIT.PLAIN <- 'plain' # same as mz: mass-to-charge ratio
  BIODB.MZTOLUNIT.VALS <- c(BIODB.MZTOLUNIT.PPM, BIODB.MZTOLUNIT.PLAIN)

  ########################
  # MS-MS MEASURE VALUES #
  ########################

  BIODB.MSMS.DIST.COS <- "cosine"
  BIODB.MSMS.DIST.WCOSINE <- "wcosine"
  BIODB.MSMS.DIST.PKERNEL <- "pkernel"
  BIODB.MSMS.DIST <- c(BIODB.MSMS.DIST.COS, BIODB.MSMS.DIST.WCOSINE, BIODB.MSMS.DIST.PKERNEL)

  #################
  # CARDINALITIES #
  #################

  BIODB.CARD.ONE <- '1'
  BIODB.CARD.MANY <- '*'

  #######################
  # INTENSITY NOTATIONS #
  #######################

  BIODB.GROUP.INTENSITY <- c(BIODB.PEAK.INTENSITY, BIODB.PEAK.RELATIVE.INTENSITY)

  ##########################
  # ENTRY FIELD ATTRIBUTES #
  ##########################

  # One row per entry field, read row-wise from the flat vector below.
  #   FIELD NAME                   CLASS         CARDINALITY      TYPE
  BIODB.FIELDS <- data.frame(matrix(c(
    BIODB.ACCESSION,               'character',  BIODB.CARD.ONE,  'none',
    BIODB.DESCRIPTION,             'character',  BIODB.CARD.ONE,  'none',
    BIODB.NAME,                    'character',  BIODB.CARD.ONE,  'name',
    BIODB.COMP.IUPAC.NAME.ALLOWED, 'character',  BIODB.CARD.ONE,  'name',
    BIODB.COMP.IUPAC.NAME.TRAD,    'character',  BIODB.CARD.ONE,  'name',
    BIODB.COMP.IUPAC.NAME.SYST,    'character',  BIODB.CARD.ONE,  'name',
    BIODB.COMP.IUPAC.NAME.PREF,    'character',  BIODB.CARD.ONE,  'name',
    BIODB.COMP.IUPAC.NAME.CAS,     'character',  BIODB.CARD.ONE,  'name',
    BIODB.FULLNAMES,               'character',  BIODB.CARD.MANY, 'name',
    BIODB.SYNONYMS,                'character',  BIODB.CARD.MANY, 'name',
    BIODB.PROTEIN.DESCRIPTION,     'character',  BIODB.CARD.ONE,  'none',
    BIODB.SYMBOL,                  'character',  BIODB.CARD.ONE,  'none',
    BIODB.GENE.SYMBOLS,            'character',  BIODB.CARD.MANY, 'none',
    BIODB.NB.COMPOUNDS,            'integer',    BIODB.CARD.ONE,  'none',
    BIODB.COMPOUNDS,               'object',     BIODB.CARD.MANY, 'none',
    BIODB.CHEBI.ID,                'character',  BIODB.CARD.ONE,  'none',
    BIODB.LIPIDMAPS.ID,            'character',  BIODB.CARD.ONE,  'none',
    BIODB.KEGG.ID,                 'character',  BIODB.CARD.ONE,  'none',
    BIODB.HMDB.ID,                 'character',  BIODB.CARD.ONE,  'none',
    BIODB.ENZYME.ID,               'character',  BIODB.CARD.ONE,  'none',
    BIODB.PUBCHEMCOMP.ID,          'character',  BIODB.CARD.ONE,  'none',
    BIODB.PUBCHEMSUB.ID,           'character',  BIODB.CARD.ONE,  'none',
    BIODB.PEAKFOREST.ID,           'character',  BIODB.CARD.ONE,  'none',
    BIODB.UNIPROT.ID,              'character',  BIODB.CARD.ONE,  'none',
    BIODB.NCBI.CCDS.ID,            'character',  BIODB.CARD.ONE,  'none',
    BIODB.NCBI.GENE.ID,            'character',  BIODB.CARD.ONE,  'none',
    BIODB.INCHI,                   'character',  BIODB.CARD.ONE,  'none',
    BIODB.INCHIKEY,                'character',  BIODB.CARD.ONE,  'none',
    BIODB.MSDEV,                   'character',  BIODB.CARD.ONE,  'none',
    BIODB.MSDEVTYPE,               'character',  BIODB.CARD.ONE,  'none',
    BIODB.MSTYPE,                  'character',  BIODB.CARD.ONE,  'none',
    BIODB.MSMODE,                  'character',  BIODB.CARD.ONE,  'none',
    BIODB.MSPRECMZ,                'double',     BIODB.CARD.ONE,  'none',
    BIODB.PEAK.MZTHEO,             'double',     BIODB.CARD.ONE,  'none',
    BIODB.MSPRECANNOT,             'character',  BIODB.CARD.ONE,  'none',
    BIODB.FORMULA,                 'character',  BIODB.CARD.ONE,  'none',
    BIODB.SUPER.CLASS,             'character',  BIODB.CARD.ONE,  'none',
    BIODB.MASS,                    'double',     BIODB.CARD.ONE,  'none',
    BIODB.AVERAGE.MASS,            'double',     BIODB.CARD.ONE,  'none',
    BIODB.MONOISOTOPIC.MASS,       'double',     BIODB.CARD.ONE,  'none',
    BIODB.SEQUENCE,                'character',  BIODB.CARD.ONE,  'none',
    BIODB.LENGTH,                  'integer',    BIODB.CARD.ONE,  'none',
    BIODB.LOCATION,                'character',  BIODB.CARD.ONE,  'none',
    BIODB.NB.PEAKS,                'integer',    BIODB.CARD.ONE,  'none',
    BIODB.PEAKS,                   'data.frame', BIODB.CARD.ONE,  'none',
    BIODB.SMILES,                  'character',  BIODB.CARD.ONE,  'none',
    BIODB.CHEMSPIDER.ID,           'character',  BIODB.CARD.ONE,  'none',
    BIODB.CAS.ID,                  'character',  BIODB.CARD.ONE,  'none'
    ), byrow = TRUE, ncol = 4), stringsAsFactors = FALSE)
  colnames(BIODB.FIELDS) <- c('name', 'class', 'cardinality', 'type')

  #########################
  # GET DATABASE ID FIELD #
  #########################

  # Return the name of the entry field holding the ID of `database`
  # (the database name suffixed with 'id').
  # Returns NA_character_ when `database` is not in BIODB.DATABASES, and stops
  # when the database is known but BIODB.FIELDS declares no such field.
  biodb.get.database.id.field <- function(database) {

    id.field <- NA_character_

    if (database %in% BIODB.DATABASES) {
      id.field <- paste0(database, 'id')
      if ( ! id.field %in% BIODB.FIELDS[['name']])
        stop(paste0('No ID field defined for database ', database, '.'))
    }

    return(id.field)
  }

  #####################
  # COMPUTABLE FIELDS #
  #####################

  # Maps a field name to a vector of database names.
  BIODB.FIELD.COMPUTING <- list()
  BIODB.FIELD.COMPUTING[[BIODB.INCHI]] <- c(BIODB.CHEBI)
  BIODB.FIELD.COMPUTING[[BIODB.INCHIKEY]] <- c(BIODB.CHEBI)
  BIODB.FIELD.COMPUTING[[BIODB.SEQUENCE]] <- c(BIODB.NCBICCDS)

  ####################
  # PEAKS DATA FRAME #
  ####################

  # Example: empty data frame with the expected peak columns.
  # `formula.count = integer()` must use `=`: with `<-` the call created a
  # stray `formula.count` variable in the enclosing environment.
  BIODB.PEAK.DF.EXAMPLE <- data.frame(mz = double(), int = double(), rel.int = integer(), formula = character(), formula.count = integer(), mass = double(), error = double(), stringsAsFactors = FALSE)
  colnames(BIODB.PEAK.DF.EXAMPLE) <- c(BIODB.PEAK.MZ, BIODB.PEAK.INTENSITY, BIODB.PEAK.RELATIVE.INTENSITY, BIODB.PEAK.FORMULA, BIODB.PEAK.FORMULA.COUNT, BIODB.PEAK.MASS, BIODB.PEAK.ERROR.PPM)

  #################
  # GET ENTRY URL #
  #################

  # TODO Let the choice to use either jp or eu
  BIODB.MASSBANK.JP.WS.URL <- "http://www.massbank.jp/api/services/MassBankAPI/"
  BIODB.MASSBANK.EU.WS.URL <- "http://massbank.eu/api/services/MassBankAPI/"

  # Build the URL of one or several entries.
  #   class         Database name (one of the BIODB.* database constants).
  #   accession     Entry accession id(s); several ids are only accepted for
  #                 massbank, chemspider, pubchemcomp, pubchemsub and peakforest.
  #   content.type  Wanted content type (BIODB.HTML, BIODB.XML, ...).
  #   base.url      Alternative base URL, used for massbank only.
  #   token         Access token, used for chemspider and peakforest.
  # Returns the URL, or NULL when the database or the content type has no
  # URL defined here.
  .do.get.entry.url <- function(class, accession, content.type = BIODB.HTML, base.url = NA_character_, token = NA_character_) {

    # Only certain databases can handle multiple accession ids
    if ( ! class %in% c(BIODB.MASSBANK, BIODB.CHEMSPIDER, BIODB.PUBCHEMCOMP, BIODB.PUBCHEMSUB, BIODB.PEAKFOREST) && length(accession) > 1)
      stop(paste0("Cannot build a URL for getting multiple entries for class ", class, "."))

    # Get URL
    url <- switch(class,
      chebi = if (content.type == BIODB.HTML) paste0('https://www.ebi.ac.uk/chebi/searchId.do?chebiId=', accession) else NULL,
      chemspider = {
        token.param <- if (is.na(token)) '' else paste('&token', token, sep = '=')
        switch(content.type,
          html = paste0('http://www.chemspider.com/Chemical-Structure.', accession, '.html'),
          xml = paste0('http://www.chemspider.com/MassSpecAPI.asmx/GetExtendedCompoundInfoArray?', paste(paste0('CSIDs=', accession), collapse = '&'), token.param),
          NULL)
      },
      enzyme = if (content.type == BIODB.TXT) paste0('http://enzyme.expasy.org/EC/', accession, '.txt') else NULL,
      hmdb = switch(content.type,
        xml = paste0('http://www.hmdb.ca/metabolites/', accession, '.xml'),
        html = paste0('http://www.hmdb.ca/metabolites/', accession),
        NULL),
      kegg = switch(content.type,
        txt = paste0('http://rest.kegg.jp/get/', accession),
        html = paste0('http://www.genome.jp/dbget-bin/www_bget?cpd:', accession),
        NULL),
      lipidmaps = if (content.type == BIODB.CSV) paste0('http://www.lipidmaps.org/data/LMSDRecord.php?Mode=File&LMID=', accession, '&OutputType=CSV&OutputQuote=No') else NULL,
      massbank = if (content.type == BIODB.TXT) paste0((if (is.na(base.url)) BIODB.MASSBANK.EU.WS.URL else base.url), 'getRecordInfo?ids=', paste(accession, collapse = ',')) else NULL,
      mirbase = if (content.type == BIODB.HTML) paste0('http://www.mirbase.org/cgi-bin/mature.pl?mature_acc=', accession) else NULL,
      pubchemcomp = switch(content.type,
        xml = paste0('https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/cid/', paste(accession, collapse = ','), '/XML'),
        html = paste0('http://pubchem.ncbi.nlm.nih.gov/compound/', accession),
        NULL),
      pubchemsub = switch(content.type,
        xml = paste0('https://pubchem.ncbi.nlm.nih.gov/rest/pug/substance/sid/', paste(accession, collapse = ','), '/XML'),
        html = paste0('http://pubchem.ncbi.nlm.nih.gov/substance/', accession),
        NULL),
      ncbigene = if (content.type == BIODB.XML) paste0('https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=gene&id=', accession, '&rettype=xml&retmode=text') else NULL,
      ncbiccds = if (content.type == BIODB.HTML) paste0('https://www.ncbi.nlm.nih.gov/CCDS/CcdsBrowse.cgi?REQUEST=CCDS&GO=MainBrowse&DATA=', accession),
      uniprot = if (content.type == BIODB.XML) paste0('http://www.uniprot.org/uniprot/', accession, '.xml'),
      peakforest = switch(content.type,
        html = paste0('https://peakforest.org/home?PFs=', accession),
        # `collapse`, not `sep`: several ids must be joined into ONE
        # comma-separated URL, as done for massbank and pubchem above.
        json = paste0('https://peakforest-alpha.inra.fr/rest/spectra/lcms/ids/', paste(accession, collapse = ','), '?token=', token),
        NULL
      )
    )
    return(url)
  }

  # Build an entry URL, optionally limited in length.
  #   max.length  Maximum number of characters of the URL; 0 means no limit.
  #   Other parameters: see .do.get.entry.url().
  # Returns NULL when `accession` is empty. With max.length == 0, returns the
  # URL itself. Otherwise returns list(url = , n = ) where `n` is the number
  # of leading accession ids included in `url`.
  get.entry.url <- function(class, accession, content.type = BIODB.HTML, max.length = 0, base.url = NA_character_, token = NA_character_) {

    if (length(accession) == 0)
      return(NULL)

    full.url <- .do.get.entry.url(class, accession, content.type = content.type, base.url = base.url, token = token)
    if (max.length == 0 || nchar(full.url) <= max.length)
      return(if (max.length == 0) full.url else list(url = full.url, n = length(accession)))

    # Find max size URL: bisect on the number of leading accession ids.
    a <- 1
    b <- length(accession)
    while (a < b) {
      m <- as.integer((a + b) / 2)
      url <- .do.get.entry.url(class, accession[1:m], content.type = content.type, base.url = base.url, token = token)
      if (nchar(url) <= max.length && m != a)
        a <- m
      else
        b <- m
    }
    url <- .do.get.entry.url(class, accession[1:a], content.type = content.type, base.url = base.url, token = token)

    return(list( url = url, n = a))
  }

  #################
  # PRINT MESSAGE #
  #################

  BIODB.DEBUG <- 1
  BIODB.LEVEL.NAMES <- c('DEBUG')

  # Write "<LEVEL>[, <class>]: <msg>" on the standard error stream.
  .print.msg <- function(msg, level = BIODB.DEBUG, class = NA_character_) {
    cat(paste0(BIODB.LEVEL.NAMES[[level]], if (is.na(class)) '' else paste0(", ", class), ": ", msg, "\n"), file = stderr())
  }

  #####################
  # BIODB GET ENV VAR #
  #####################

  # Return the value of environment variable BIODB_<V>, where <V> is `v` in
  # upper case (elements of `v` joined with '_'), or NA_character_ if unset.
  .biodb.get.env.var <- function(v) {

    # Get all env vars
    env <- Sys.getenv()

    # Make env var name
    env.var <- paste(c('BIODB', toupper(v)), collapse = '_')

    # Look if this env var exists
    if (env.var %in% names(env))
      return(env[[env.var]])

    return(NA_character_)
  }
}
--- a/build.xml Wed Apr 19 10:00:05 2017 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,396 +0,0 @@ -<project name="w4m.tool.lcmsmatching" default="all"> - - <dirname property="this.dir" file="${ant.file.w4m.tool.lcmsmatching}"/> - <property name="conda.dir" value="${user.home}/w4m-conda"/> - <property name="planemo.dir" value="${user.home}/.planemo"/> - - <!--~~~~~~~~~~~~~~~~~ - ~ PUBLIC PROPERTIES ~ - ~~~~~~~~~~~~~~~~~~--> - - <!-- These properties can be set when calling Ant: `ant -DPROP=value ...`. --> - - <property name="TIMESTAMP" value="true"/> - <property name="VERSION" value="true"/> - <property name="DIST.TEST" value="true"/> - <property name="TOOL.PREFIX" value="$__tool_directory__/"/> - <property name="PKG.PREFIX" value="w4m-tool-lcmsmatching"/> - - <!--~~~~~~~~~~~~~~~~~~~ - ~ INTERNAL PROPERTIES ~ - ~~~~~~~~~~~~~~~~~~~~--> - - <!-- Version --> - <property name="version" value="2.1.3"/> - <condition property="version.suffix" value="" else="-${version}"> - <isfalse value="${VERSION}"/> - </condition> - - <!-- Distribution directories --> - <property name="dist.dir" value="dist"/> - <property name="dist.code.dir" value="${dist.dir}/code"/> - <property name="dist.test.dir" value="${dist.dir}/test"/> - - <!-- Tool XML paths --> - <property name="tool.xml" value="lcmsmatching.xml"/> - <property name="orig.tool.xml" value="${tool.xml}"/> - <property name="dest.tool.xml" value="${dist.code.dir}/${tool.xml}"/> - - <!-- Time stamp --> - <tstamp/> - <property name="timestamp" value="${DSTAMP}-${TSTAMP}"/> - <condition property="timestamp.suffix" value="" else="-${timestamp}"> - <isfalse value="${TIMESTAMP}"/> - </condition> - - <!-- Package --> - <property name="pkg.ext" value="tar.gz"/> - <property name="pkg.name" value="${PKG.PREFIX}${version.suffix}${timestamp.suffix}"/> - <property name="pkg.path" value="${dist.dir}/${pkg.name}.${pkg.ext}"/> - - <!--~~~ - ~ ALL ~ - ~~~~~--> - - <target name="all"/> - - <!--~~~~ - ~ DIST ~ - ~~~~~--> - - <target 
name="dist" depends="dist.code,dist.tar,dist.test"/> - - <!--~~~~~~~~ - ~ DIST W4M ~ - ~~~~~~~~~--> - - <target name="dist.w4m" depends="w4m.code,dist.tar,dist.test"/> - - <!--~~~~~~~~~ - ~ DIST TEST ~ - ~~~~~~~~~~--> - - <target name="dist.test" if="${DIST.TEST}"> - - <!-- Make temp dir --> - <delete dir="${dist.test.dir}"/> - <mkdir dir="${dist.test.dir}"/> - - <!-- Extract package in temp dir --> - <untar src="${pkg.path}" dest="${dist.test.dir}" compression="gzip"/> - <chmod file="${dist.test.dir}/search-mz" perm="u+x"/> <!-- This file should be already executable, since it has been put executable inside the tar. It seems the untar task of Ant does not handle the file permissions. --> - - <!-- Run search-mz on sample input file --> - <exec executable="${dist.test.dir}/search-mz" failonerror="true"> - <arg value="-d"/> - <arg value="file"/> - <arg value="--url"/> - <arg value="test/filedb.tsv"/> - <arg value="-m"/> - <arg value="pos"/> - <arg value="-i"/> - <arg value="test/mzrt-input.tsv"/> - <arg value="-o"/> - <arg value="mzrt-output.tsv"/> - </exec> - - </target> - - <!--~~~~~~~~ - ~ W4M CODE ~ - ~~~~~~~~~--> - - <target name="w4m.code" depends="dist.code"> - - <!-- Copy and transform tool XML file. --> - <copy file="${orig.tool.xml}" tofile="${dest.tool.xml}"/> - - <!-- Copy python script. --> - <copy todir="${dist.code.dir}"> - <fileset dir="." includes="*.py"/> - </copy> - </target> - - <!--~~~~~~~~~ - ~ DIST CODE ~ - ~~~~~~~~~~--> - - <target name="dist.code"> - - <!-- Clean directory --> - <delete dir="${dist.code.dir}"/> - <mkdir dir="${dist.code.dir}"/> - - <!-- Copy R code --> - <copy todir="${dist.code.dir}"> - <fileset dir="." 
includes="search-mz,*.R"/> - </copy> - - </target> - - <!--~~~~~~~~ - ~ DIST TAR ~ - ~~~~~~~~~--> - - <target name="dist.tar"> - - <!-- Build tar file --> - <tar destfile="${pkg.path}" compression="gzip"> - - <!-- Include script with execution rights --> - <tarfileset dir="${dist.code.dir}" filemode="755"> - <include name="search-mz"/> - </tarfileset> - - <!-- Include remaining code and data files --> - <tarfileset dir="${dist.code.dir}"> - <include name="**"/> - <exclude name="search-mz"/> - </tarfileset> - </tar> - </target> - - <!--~~~~~ - ~ CLEAN ~ - ~~~~~~--> - - <target name="clean"> - <delete dir="${dist.dir}"/> - <delete dir="${conda.dir}"/> - <delete dir="${planemo.dir}"/> - </target> - - <!--~~~~~~~~~~~ - ~ GALAXY TEST ~ - ~~~~~~~~~~~~--> - - <target name="galaxy.test" depends="planemo.lint,planemo.test"/> - - <!--~~~~~~~~~~~~ - ~ PLANEMO LINT ~ - ~~~~~~~~~~~~~--> - - <target name="planemo.lint" depends="planemo.env"> - <exec executable="planemo" dir="${dist.code.dir}" failonerror="true"> - <arg value="lint"/> - <arg value="--no_xsd"/> - <arg value="${tool.xml}"/> - </exec> - </target> - - <!--~~~~~~~~~~~~ - ~ PLANEMO TEST ~ - ~~~~~~~~~~~~~--> - - <target name="planemo.test" depends="planemo.conda.install"> - <exec executable="planemo" dir="${dist.code.dir}" failonerror="true"> - <arg value="test"/> - <arg value="--conda_prefix"/> - <arg value="${conda.dir}"/> - <arg value="--galaxy_branch"/> - <arg value="release_16.01"/> - <arg value="--conda_dependency_resolution"/> - <arg value="${tool.xml}"/> - </exec> - </target> - - <!--~~~~~~~~~~~~~~~~~~~~~ - ~ PLANEMO CONDA INSTALL ~ - ~~~~~~~~~~~~~~~~~~~~~~--> - - <target name="planemo.conda.install" depends="planemo.conda.init,planemo.env"> - <exec executable="planemo" dir="${dist.code.dir}" failonerror="true"> - <arg value="conda_install"/> - <arg value="--conda_prefix"/> - <arg value="${conda.dir}"/> - <arg value="${tool.xml}"/> - </exec> - </target> - - <!--~~~~~~~~~~~~~~~~~~ - ~ PLANEMO CONDA INIT ~ - 
~~~~~~~~~~~~~~~~~~~--> - - <target name="planemo.conda.init"> - <exec executable="planemo" failonerror="true"> - <arg value="conda_init"/> - <arg value="--conda_prefix"/> - <arg value="${conda.dir}"/> - </exec> - </target> - - <!--~~~~~~~~~~~~~~~~~ - ~ PLANEMO SHED LINT ~ - ~~~~~~~~~~~~~~~~~~--> - - <target name="planemo.shed.lint" depends="planemo.env"> - <exec executable="planemo" dir="${dist.code.dir}" failonerror="true"> - <arg value="shed_lint"/> - <arg value="--tools"/> - </exec> - </target> - - <!--~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - ~ PLANEMO TEST TOOLSHED CREATE ~ - ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~--> - - <target name="planemo.testtoolshed.create" depends="planemo.env"> - <exec executable="planemo" dir="${dist.code.dir}" failonerror="true"> - <arg value="shed_create"/> - <arg value="--shed_target"/> - <arg value="testtoolshed"/> - </exec> - </target> - - <!--~~~~~~~~~~~~~~~~~~~~~~~~~~ - ~ PLANEMO TEST TOOLSHED DIFF ~ - ~~~~~~~~~~~~~~~~~~~~~~~~~~~--> - - <target name="planemo.testtoolshed.diff" depends="planemo.env"> - <exec executable="planemo" dir="${dist.code.dir}" failonerror="true"> - <arg value="shed_diff"/> - <arg value="--shed_target"/> - <arg value="testtoolshed"/> - </exec> - </target> - - <!--~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - ~ PLANEMO TEST TOOLSHED UPDATE ~ - ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~--> - - <target name="planemo.testtoolshed.update" depends="planemo.env"> - <exec executable="planemo" dir="${dist.code.dir}" failonerror="true"> - <arg value="shed_update"/> - <arg value="--check_diff"/> - <arg value="--shed_target"/> - <arg value="testtoolshed"/> - </exec> - </target> - - <!--~~~~~~~~~~~~~~~~~~~~~~~~~~ - ~ PLANEMO TEST TOOLSHED TEST ~ - ~~~~~~~~~~~~~~~~~~~~~~~~~~~--> - - <target name="planemo.testtoolshed.test" depends="planemo.env"> - <exec executable="planemo" dir="${dist.code.dir}" failonerror="true"> - <arg value="shed_test"/> - <arg value="--shed_target"/> - <arg value="testtoolshed"/> - <arg value="--install_galaxy"/> - <arg 
value="--galaxy_branch"/> - <arg value="release_16.01"/> - </exec> - </target> - - <!--~~~~~~~~~~~~~~~~~~~~~~~ - ~ PLANEMO TOOLSHED CREATE ~ - ~~~~~~~~~~~~~~~~~~~~~~~~--> - - <target name="planemo.toolshed.create" depends="planemo.env"> - <exec executable="planemo" dir="${dist.code.dir}" failonerror="true"> - <arg value="shed_create"/> - <arg value="--shed_target"/> - <arg value="toolshed"/> - </exec> - </target> - - <!--~~~~~~~~~~~~~~~~~~~~~ - ~ PLANEMO TOOLSHED DIFF ~ - ~~~~~~~~~~~~~~~~~~~~~~--> - - <target name="planemo.toolshed.diff" depends="planemo.env"> - <exec executable="planemo" dir="${dist.code.dir}" failonerror="true"> - <arg value="shed_diff"/> - <arg value="--shed_target"/> - <arg value="toolshed"/> - </exec> - </target> - - <!--~~~~~~~~~~~~~~~~~~~~~~~ - ~ PLANEMO TOOLSHED UPDATE ~ - ~~~~~~~~~~~~~~~~~~~~~~~~--> - - <target name="planemo.toolshed.update" depends="planemo.env"> - <exec executable="planemo" dir="${dist.code.dir}" failonerror="true"> - <arg value="shed_update"/> - <arg value="--check_diff"/> - <arg value="--shed_target"/> - <arg value="toolshed"/> - </exec> - </target> - - <!--~~~~~~~~~~~ - ~ PLANEMO ENV ~ - ~~~~~~~~~~~~--> - - <target name="planemo.env" depends="w4m.code"> - <chmod file="${dist.code.dir}/search-mz" perm="u+x"/> - <ant dir="test" target="input.files"/> - <mkdir dir="${dist.code.dir}/test-data"/> - <copy todir="${dist.code.dir}/test-data"> - <fileset dir="test" includes="filedb.tsv"/> - <fileset dir="test" includes="mz-input-small.tsv"/> - <fileset dir="test/res" includes="filedb-small-mz-match-*"/> - </copy> - <copy file="shed.yml" tofile="${dist.code.dir}/.shed.yml"/> - </target> - - <!--************************************************ - ******************** DEPRECATED ******************** - *************************************************--> - - <!--~~~~~~~~~~~~~ - ~ UPDATE W4M VM ~ - ~~~~~~~~~~~~~~--> - - <!-- This task is used when developping, for updating quickly the tool inside the local W4M virtual machine. 
--> - <target name="update.w4m.vm" depends="clean,dist"> - - <property name="w4m.login" value="galaxy@w4m"/> - <property name="tool.path" value="galaxy-pfem/tools/metabolomics/annotation/lcmsmatching"/> - - <!-- Stop Galaxy --> - <exec executable="ssh" failonerror="true"> - <arg value="${w4m.login}"/> - <arg value="/sbin/service galaxy stop"/> - </exec> - - <!-- Remove current tool version --> - <exec executable="ssh" failonerror="true"> - <arg value="${w4m.login}"/> - <arg value="rm -rf ${tool.path}"/> - </exec> - - <!-- Remove old packages on W4M instance --> - <exec executable="ssh" failonerror="true"> - <arg value="${w4m.login}"/> - <arg value="rm -f ${PKG.PREFIX}-*.${pkg.ext}"/> - </exec> - - <!-- Copy new package on W4M instance --> - <exec executable="scp" failonerror="true"> - <arg value="${dist.dir}/${pkg.name}.${pkg.ext}"/> - <arg value="${w4m.login}:."/> - </exec> - - <!-- Make tool directory --> - <exec executable="ssh" failonerror="true"> - <arg value="${w4m.login}"/> - <arg value="mkdir -p ${tool.path}"/> - </exec> - - <!-- Install new tool version --> - <exec executable="ssh" failonerror="true"> - <arg value="${w4m.login}"/> - <arg value="tar -xzf ${pkg.name}.${pkg.ext} -C ${tool.path}"/> - </exec> - - <!-- Restart Galaxy --> - <exec executable="ssh" failonerror="true"> - <arg value="${w4m.login}"/> - <arg value="/sbin/service galaxy start"/> - </exec> - - </target> - -</project>
# File: dfhlp.R (listed in this changeset as removed: "+++ /dev/null").
#
# Data frame helpers: column renaming and reordering, removal of all-NA rows,
# CSV reading, TSV writing and cleaning.

if ( ! exists('remove.na.rows')) { # Do not load again if already loaded

  # NOTE(review): trim(), used by df.clean() below, is not defined in this
  # file; presumably it comes from strhlp.R -- confirm.
  source('strhlp.R')

  #################
  # RENAME COLUMN #
  #################

  # Rename columns of a data frame.
  # df   The data frame.
  # cur  The current column names.
  # new  The new column names, in the same order as `cur`.
  # A name of `cur` that does not match exactly one column is ignored.
  rename.col <- function(df, cur, new) {

    for (k in seq_along(cur)) {
      i <- which(cur[k] == colnames(df))
      if (length(i) == 1)
        colnames(df)[i] <- new[k]
    }

    return(df)
  }

  ##################
  # REMOVE NA ROWS #
  ##################

  # Remove the rows that contain only NA values.
  remove.na.rows <- function(df) {
    na.rows <- apply(is.na(df), MARGIN = 1, all)
    return(df[ ! na.rows, , drop = FALSE])
  }

  ######################
  # MOVE COLUMNS FIRST #
  ######################

  # Reorder columns so that `cols` come first, the others keeping their order.
  df.move.col.first <- function(df, cols) {
    not.cols <- setdiff(names(df), cols)
    df[c(cols, not.cols)]
  }

  #####################
  # MOVE COLUMNS LAST #
  #####################

  # Reorder columns so that `cols` come last, the others keeping their order.
  df.move.col.last <- function(df, cols) {
    not.cols <- setdiff(names(df), cols)
    df[c(not.cols, cols)]
  }

  #################
  # READ CSV FILE #
  #################

  # Read CSV file and return a data.frame.
  # file              The path to the CSV file.
  # header            If TRUE, use first line as header line.
  # check.names       Passed to read.csv(): if TRUE, column names are corrected to be valid R names.
  # stringsAsFactors  If TRUE, replace string values by factors.
  # trim.header       If TRUE, remove whitespaces at beginning and end of header titles.
  # trim.values       If TRUE, remove whitespaces at beginning and end of string values.
  # remove.na.rows    If TRUE, remove all lines that contain only NA values.
  df.read.csv <- function(file, header = TRUE, remove.na.rows = TRUE, check.names = TRUE, stringsAsFactors = TRUE, trim.header = FALSE, trim.values = FALSE) {

    # Call built-in read.csv()
    df <- read.csv(file, header = header, check.names = check.names, stringsAsFactors = stringsAsFactors)

    # Clean data frame
    df <- df.clean(df, trim.colnames = trim.header, trim.values = trim.values, remove.na.rows = remove.na.rows)

    return(df)
  }

  ##################
  # WRITE TSV FILE #
  ##################

  # Write a data frame into a tab-separated file.
  df.write.tsv <- function(df, file, row.names = FALSE, col.names = TRUE) {
    write.table(df, file = file, row.names = row.names, col.names = col.names, sep = "\t")
  }

  ####################
  # CLEAN DATA FRAME #
  ####################

  # Clean a data frame.
  # trim.colnames   If TRUE, apply trim() to the column names.
  # trim.values     If TRUE, apply trim() to the columns of type character.
  # remove.na.rows  If TRUE, remove all rows that contain only NA values.
  df.clean <- function(df, trim.colnames = FALSE, trim.values = FALSE, remove.na.rows = FALSE) {

    # Remove NA lines. The logical parameter has the same name as the
    # function: in a call, R skips non-function bindings, so the function
    # defined above is the one called.
    if (remove.na.rows)
      df <- remove.na.rows(df)

    # Trim header
    if (trim.colnames)
      colnames(df) <- trim(colnames(df))

    # Trim values. seq_len() rather than 1:ncol(df), which would iterate
    # over c(1, 0) and fail on a data frame without columns.
    if (trim.values)
      for (j in seq_len(ncol(df)))
        if (typeof(df[[j]]) == 'character')
          df[[j]] <- trim(df[[j]])

    return(df)
  }

} # end of load safe guard
# excelhlp.R: helpers for reading Excel (xlsx) files.
# Diff header of this hunk: "--- a/excelhlp.R Wed Apr 19 10:00:05 2017 -0400",
# "+++ /dev/null", "@@ -1,83 +0,0 @@" (the file is deleted by this changeset).
if ( ! exists('read.excel')) { # Do not load again if already loaded

    source('strhlp.R')
    source('dfhlp.R') # provides df.clean(), used by read.excel()

    ###############
    # GET NB ROWS #
    ###############

    # Get the index of the last row of a sheet that is not made only of NA
    # values, as seen in the data frame returned by read.xlsx().
    # file  The path to the Excel file.
    # tab   The sheet, forwarded as-is to read.xlsx().
    # Returns the row index, or 0 if no row contains any value.
    get.nbrows <- function(file, tab) {

        library(rJava)
        library(xlsxjars)
        library(xlsx, quietly = TRUE)

        df <- read.xlsx(file, tab)
        na_rows <- apply(is.na(df), MARGIN = 1, FUN = all) # look for rows that contain only NA values.
        filled_rows <- which( ! na_rows)

        # Without this guard an all-NA sheet would give integer(0), and the
        # row checks in read.excel() would then be zero-length conditions.
        if (length(filled_rows) == 0)
            return(0L)

        return(tail(filled_rows, n = 1))
    }

    ##############
    # READ EXCEL #
    ##############

    # Read Excel xlsx file
    # file              The path to the Excel file.
    # sheet             The sheet to read, forwarded as-is to read.xlsx().
    # start.row         First row to read (read.xlsx() startRow), or NULL.
    # end.row           Last row to read (read.xlsx() endRow), or NULL.
    # header            If TRUE, use first line as header line.
    # check.names       If TRUE, correct header (column) names in the data frame (forwarded to read.xlsx()).
    # stringsAsFactors  If TRUE, replace string values by factors.
    # trim.header       If TRUE, remove whitespaces at beginning and end of header titles.
    # trim.values       If TRUE, remove whitespaces at beginning and end of string values.
    # remove.na.rows    If TRUE, remove all lines that contain only NA values.
    # col.index         Columns to read (read.xlsx() colIndex), or NULL.
    # Returns a data frame, or NULL if start.row or end.row is greater than
    # the number of rows reported by get.nbrows().
    read.excel <- function(file, sheet, start.row = NULL, end.row = NULL, header = TRUE, remove.na.rows = TRUE, check.names = TRUE, stringsAsFactors = TRUE, trim.header = FALSE, trim.values = FALSE, col.index = NULL) {

        library(rJava)
        library(xlsxjars)
        library(xlsx, quietly = TRUE)

        # Check that start row and end row exist
        if ( ! is.null(start.row) || ! is.null(end.row)) {
            nb_rows <- get.nbrows(file, sheet)
            if ( ! is.null(start.row) && start.row > nb_rows)
                return(NULL)
            if ( ! is.null(end.row) && end.row > nb_rows)
                return(NULL)
        }

        # Call xlsx package
        df <- read.xlsx(file, sheet, startRow = start.row, endRow = end.row, header = header, check.names = check.names, stringsAsFactors = stringsAsFactors, colIndex = col.index)

        # Remove column default names if header was set to false
        if ( ! header)
            colnames(df) <- NULL

        # Clean data frame
        df <- df.clean(df, trim.colnames = trim.header, trim.values = trim.values, remove.na.rows = remove.na.rows)

        return(df)
    }

    #######################
    # CHECK IF TAB EXISTS #
    #######################

    # Test whether a sheet named `tab` exists inside an Excel file.
    # Returns FALSE if `file` or `tab` is NULL or NA.
    tab.exists <- function(file, tab) {

        if (is.null(file) || is.na(file) || is.null(tab) || is.na(tab))
            return(FALSE)

        library(rJava)
        library(xlsxjars)
        library(xlsx, quietly = TRUE)

        wb <- loadWorkbook(file)
        sheets <- getSheets(wb)
        return(tab %in% names(sheets))
    }

} # end of load safe guard
# fshlp.R: file path helpers.
# Diff header of this hunk: "--- a/fshlp.R Wed Apr 19 10:00:05 2017 -0400",
# "+++ /dev/null", "@@ -1,20 +0,0 @@" (the file is deleted by this changeset).
if ( ! exists('extname')) { # Do not load again if already loaded

    source('strhlp.R')

    ###########
    # EXTNAME #
    ###########

    # Get the extension of a file path, without the leading dot.
    # path  A character vector of file paths.
    # Returns a character vector of the same length. The value is '' for a
    # path whose last component has no dot (e.g. 'README' or 'dir.d/file'),
    # instead of the whole path. NA values are left as NA.
    extname <- function(path) {

        # An extension is a final dot followed by characters that contain
        # neither another dot nor a path separator.
        has.ext <- grepl('\\.[^./\\\\]*$', path)

        ext <- sub('^.*\\.([^.]*)$', '\\1', path, perl = TRUE)
        ext[ ! has.ext & ! is.na(path)] <- ''

        return(ext)
    }

    ##############
    # REMOVE EXT #
    ##############

    # Remove the extension (final dot and what follows) from a file path.
    # A dot located in a directory name is not taken for an extension
    # separator: 'dir.d/file' is returned unchanged.
    remove.ext <- function(path) {
        return(sub('\\.[^./\\\\]*$', '', path))
    }
}
# htmlhlp.R: minimal HTML writer.
# Diff header of this hunk: "--- a/htmlhlp.R Wed Apr 19 10:00:05 2017 -0400",
# "+++ /dev/null", "@@ -1,114 +0,0 @@" (the file is deleted by this changeset).
if ( ! exists('HtmlWriter')) { # Do not load again if already loaded

    library(methods)

    #####################
    # CLASS DECLARATION #
    #####################

    # Writes HTML text into a file.
    # .file         Path of the output file.
    # .auto.indent  Current automatic indentation level (number of tabs), or NA
    #               when automatic indentation is disabled.
    HtmlWriter <- setRefClass("HtmlWriter", fields = list(.file = "character", .auto.indent = "numeric"))


    ###############
    # CONSTRUCTOR #
    ###############

    # file         Path of the output file. It is created or truncated.
    # auto.indent  If TRUE, indentation follows the nesting of opened tags.
    HtmlWriter$methods( initialize = function(file = NA_character_, auto.indent = TRUE, ...) {

        .file <<- file
        .auto.indent <<- if (auto.indent) 0 else NA_integer_

        # Create empty file
        cat('', file = .self$.file, append = FALSE)

        callSuper(...) # calls super-class initializer with remaining parameters
    })

    #####################
    # FORMAT ATTRIBUTES #
    #####################

    # Private helper: build the attribute part of an opening tag.
    # attr  A named vector of attribute values, or NA for no attribute.
    # Returns '' when there is no attribute, otherwise the attributes formatted
    # as ' name="value" ...' with a leading space. NA values are skipped, so
    # any number of attributes is supported.
    HtmlWriter$methods( .formatAttr = function(attr) {

        if (is.null(attr))
            return('')

        attr <- attr[ ! is.na(attr)]
        if (length(attr) == 0)
            return('')

        pairs <- vapply(names(attr), function(a) paste0(a, '="', attr[[a]], '"'), FUN.VALUE = '')

        return(paste0(' ', paste(pairs, collapse = ' ')))
    })

    #########
    # WRITE #
    #########

    # Append text to the output file.
    # text     The text to write.
    # indent   Number of tabs to write before the text. If NA, use the current
    #          automatic indentation (0 if automatic indentation is disabled).
    # newline  If TRUE, write a newline after the text.
    # escape   Accepted but currently not applied: the text is written as is.
    #          NOTE(review): callers may rely on raw HTML passing through
    #          unescaped -- confirm before implementing escaping.
    HtmlWriter$methods( write = function(text, indent = NA_integer_, newline = TRUE, escape = FALSE) {

        # Compute indentation
        if (is.na(indent))
            indent <- if (is.na(.self$.auto.indent)) 0 else .self$.auto.indent

        cat(rep("\t", indent), text, if (newline) "\n" else "", sep = '', file = .self$.file, append = TRUE)
    })

    #############
    # WRITE TAG #
    #############

    # Write a whole element: a self-closing tag (<tag/>) if `text` is NA,
    # otherwise <tag>text</tag> on a single line.
    # attr  A named vector of attribute values, or NA for no attribute.
    HtmlWriter$methods( writeTag = function(tag, attr = NA_character_, text = NA_character_, indent = NA_integer_, newline = TRUE) {

        if (is.na(text)) {
            .self$write(paste0("<", tag, .self$.formatAttr(attr), "/>"), indent = indent, newline = newline, escape = FALSE)
        }
        else {
            # Text and closing tag follow the opening tag on the same line,
            # hence indent = 0 for both.
            .self$writeBegTag(tag, attr = attr, indent = indent, newline = FALSE)
            .self$write(text, escape = TRUE , indent = 0, newline = FALSE)
            .self$writeEndTag(tag, indent = 0, newline = newline)
        }
    })

    ###################
    # WRITE BEGIN TAG #
    ###################

    # Write an opening tag, then increase the automatic indentation level.
    HtmlWriter$methods( writeBegTag = function(tag, attr = NA_character_, indent = NA_integer_, newline = TRUE) {

        # Write opening tag
        .self$write(paste0("<", tag, .self$.formatAttr(attr), ">"), indent = indent, newline = newline, escape = FALSE)

        # Increment auto-indent
        if ( ! is.na(.self$.auto.indent))
            .auto.indent <<- .self$.auto.indent + 1
    })

    #################
    # WRITE END TAG #
    #################

    # Decrease the automatic indentation level, then write a closing tag.
    HtmlWriter$methods( writeEndTag = function(tag, indent = NA_integer_, newline = TRUE) {

        # Decrement auto-indent
        if ( ! is.na(.self$.auto.indent))
            .auto.indent <<- .self$.auto.indent - 1

        # Write closing tag
        .self$write(paste0("</", tag, ">"), indent = indent, newline = newline, escape = FALSE)
    })

    ###############
    # WRITE TABLE #
    ###############

    # Write a data frame or matrix as an HTML table, with one <th> per column
    # name and one <td> per cell. An NA cell is written as an empty <td/>.
    HtmlWriter$methods( writeTable = function(x, indent = NA_integer_, newline = TRUE) {

        .self$writeBegTag('table', indent = indent, newline = newline)

        # Write table header
        if ( ! is.null(colnames(x))) {
            .self$writeBegTag('tr', indent = indent + 1, newline = newline)
            for (field in colnames(x))
                .self$writeTag('th', text = field, indent = indent + 2, newline = newline)
            .self$writeEndTag('tr', indent = indent + 1, newline = newline)
        }

        # Write values
        if (nrow(x) > 0 && ncol(x) > 0)
            for (i in seq_len(nrow(x))) {
                .self$writeBegTag('tr', indent = indent + 1, newline = newline)
                for (j in seq_len(ncol(x))) {
                    value <- x[i, j]
                    # cat() prints the integer code of a factor, not its label.
                    if (is.factor(value))
                        value <- as.character(value)
                    .self$writeTag('td', text = value, indent = indent + 2, newline = newline)
                }
                .self$writeEndTag('tr', indent = indent + 1, newline = newline)
            }
        .self$writeEndTag('table', indent = indent, newline = newline)
    })


} # end of load safe guard
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/lcmsmatching Fri Feb 22 16:04:22 2019 -0500 @@ -0,0 +1,866 @@ +#!/usr/bin/env Rscript +# vi: ft=r fdm=marker + +args <- commandArgs(trailingOnly = F) +script.path <- sub("--file=","",args[grep("--file=",args)]) + +library(getopt) +library(methods) +library(biodb) + +# HTML Writer {{{1 +################################################################ + +HtmlWriter <- methods::setRefClass("HtmlWriter", fields = list(.con = "ANY", .auto.indent = "numeric")) + +# Constructor {{{2 +################################################################ + +HtmlWriter$methods( initialize = function(auto.indent = TRUE, ...) { + + .auto.indent <<- if (auto.indent) 0 else NA_integer_ + .con <<- NULL + + callSuper(...) # calls super-class initializer with remaining parameters +}) + +# Open {{{2 +################################################################################ + +HtmlWriter$methods( file.opn = function(file) { + .con <<- file(file, open = "w") +}) + +# Close {{{2 +################################################################################ + +HtmlWriter$methods( file.close = function() { + close(.self$.con) +}) + +# Write {{{2 +################################################################ + +HtmlWriter$methods( write = function(text, indent = NA_integer_, newline = TRUE, escape = FALSE) { + + # Compute indentation + if (is.na(indent)) + indent <- if (is.na(.self$.auto.indent)) 0 else .self$.auto.indent + + cat(rep("\t", indent), text, if (newline) "\n" else "", sep = '', file = .self$.con) +}) + +# Write tag {{{2 +################################################################ + +HtmlWriter$methods( writeTag = function(tag, attr = NA_character_, text = NA_character_, indent = NA_integer_, newline = TRUE) { + + if (is.na(text)) { + attributes <- if (is.na(attr)) '' else paste0(' ', paste(vapply(names(attr), function(a) paste0(a, '="', attr[[a]], '"'), FUN.VALUE=''), collapse = ' ')) + .self$write(paste0("<", 
tag, attributes, "/>"), indent = indent, newline = newline, escape = FALSE) + } + else { + .self$writeBegTag(tag, attr = attr, indent = indent, newline = FALSE) + .self$write(text, escape = TRUE , indent = 0, newline = FALSE) + .self$writeEndTag(tag, indent = 0, newline = newline) + } +}) + +# Write begin tag {{{2 +################################################################################### + +HtmlWriter$methods( writeBegTag = function(tag, attr = NA_character_, indent = NA_integer_, newline = TRUE) { + + # Write opening tag + attributes <- if (is.na(attr)) '' else paste0(' ', paste(vapply(names(attr), function(a) paste0(a, '="', attr[[a]], '"'), FUN.VALUE=''), collapse = ' ')) + .self$write(paste0("<", tag, attributes, ">"), indent = indent, newline = newline, escape = FALSE) + + # Increment auto-indent + if ( ! is.na(.self$.auto.indent)) + .auto.indent <<- .self$.auto.indent + 1 +}) + +# Write end tag {{{2 +################################################################ + +HtmlWriter$methods( writeEndTag = function(tag, indent = NA_integer_, newline = TRUE) { + + # Decrement auto-indent + if ( ! is.na(.self$.auto.indent)) + .auto.indent <<- .self$.auto.indent - 1 + + # Write closing tag + .self$write(paste0("</", tag, ">"), indent = indent, newline = newline, escape = FALSE) +}) + +# Write table {{{2 +################################################################ + +HtmlWriter$methods( writeTable = function(x, indent = NA_integer_, newline = TRUE) { + + .self$writeBegTag('table', indent = indent, newline = newline) + + # Write table header + if ( ! 
is.null(colnames(x))) { + .self$writeBegTag('tr', indent = indent + 1, newline = newline) + for (field in colnames(x)) + .self$writeTag('th', text = field, indent = indent + 2, newline = newline) + .self$writeEndTag('tr', indent = indent + 1, newline = newline) + } + + # Write values + if (nrow(x) > 0 && ncol(x) > 0) + for (i in 1:nrow(x)) { + .self$writeBegTag('tr', indent = indent + 1, newline = newline) + for (j in 1:ncol(x)) + .self$writeTag('td', text = (if (j == 1 && is.na(x[i, j])) 'NA' else x[i, j]), indent = indent + 2, newline = newline) + .self$writeEndTag('tr', indent = indent + 1, newline = newline) + } + .self$writeEndTag('table', indent = indent, newline = newline) +}) + +# Split key/value list {{{1 +################################################################ + +split.kv.list <- function(s, sep = ',', kvsep = '=') { + + # Split + kvs <- strsplit(strsplit(s, sep)[[1]], kvsep) + + # Get keys + k <- vapply(kvs, function(x) x[[1]], FUN.VALUE = '') + v <- vapply(kvs, function(x) x[[2]], FUN.VALUE = '') + + # Set names + names(v) <- k + + return(v) +} + +# Concat key/value list {{{1 +################################################################ + +concat.kv.list <- function(x, sep = ',', kvsep = '=') { + + k <- names(x) + + s = paste(paste(names(x), x, sep = kvsep), collapse = sep) + + return(s) +} + +# Constants {{{1 +################################################################ + +PROG <- sub('^.*/([^/]+)$', '\\1', commandArgs()[4], perl = TRUE) +USERAGENT <- 'W4M lcmsmatching ; pk.roger@icloud.com' + +# Field tags +MSDB.TAG.MZ <- 'mz' +MSDB.TAG.MZEXP <- 'mzexp' +MSDB.TAG.MZTHEO <- 'mztheo' +MSDB.TAG.RT <- 'rt' +MSDB.TAG.MODE <- 'msmode' +MSDB.TAG.MOLID <- 'compoundid' +MSDB.TAG.COL <- 'chromcol' +MSDB.TAG.COLRT <- 'chromcolrt' +MSDB.TAG.ATTR <- 'peakattr' +MSDB.TAG.INT <- 'intensity' +MSDB.TAG.REL <- 'relative.intensity' +MSDB.TAG.COMP <- 'peakcomp' +MSDB.TAG.MOLNAMES <- 'fullnames' +MSDB.TAG.MOLCOMP <- 'compoundcomp' +MSDB.TAG.MOLMASS <- 
'compoundmass' +MSDB.TAG.INCHI <- 'inchi' +MSDB.TAG.INCHIKEY <- 'inchikey' +MSDB.TAG.PUBCHEM <- 'pubchemcompid' +MSDB.TAG.CHEBI <- 'chebiid' +MSDB.TAG.HMDB <- 'hmdbid' +MSDB.TAG.KEGG <- 'keggid' + +# Authorized database types +MSDB.VALS <- c('file', 'peakforest') + +# Authorized mode values +MSDB.TAG.POS <- 'pos' +MSDB.TAG.NEG <- 'neg' +POS_MODE <- 'pos' +NEG_MODE <- 'neg' +MSDB.MODE.VALS <- c(POS_MODE, NEG_MODE) + +# Authorized mz tolerance unit values +MSDB.MZTOLUNIT.PPM <- 'ppm' +MSDB.MZTOLUNIT.PLAIN <- 'plain' # same as mz: mass-to-charge ratio +MSDB.MZTOLUNIT.VALS <- c(MSDB.MZTOLUNIT.PPM, MSDB.MZTOLUNIT.PLAIN) + +# Authorized rt units +MSDB.RTUNIT.SEC <- 'sec' +MSDB.RTUNIT.MIN <- 'min' +MSDB.RTUNIT.VALS <- c(MSDB.RTUNIT.SEC ,MSDB.RTUNIT.MIN) + +# Default values +MSDB.DFT.PREC <- list() +MSDB.DFT.PREC[[MSDB.TAG.POS]] <- c("[(M+H)]+", "[M+H]+", "[(M+Na)]+", "[M+Na]+", "[(M+K)]+", "[M+K]+") +MSDB.DFT.PREC[[MSDB.TAG.NEG]] <- c("[(M-H)]-", "[M-H]-", "[(M+Cl)]-", "[M+Cl]-") +MSDB.DFT.MATCH.FIELDS <- list( molids = 'molid', molnames = 'molnames') +MSDB.DFT.MATCH.SEP <- '|' +MSDB.DFT.MODES <- list( pos = 'POS', neg = 'NEG') +MSDB.DFT.MZTOLUNIT <- MSDB.MZTOLUNIT.PPM + +# Get default db fields +################################################################ + +msdb.get.dft.db.fields <- function () { + + dft.fields <- list() + + for (f in c(MSDB.TAG.MZTHEO, MSDB.TAG.COLRT, MSDB.TAG.MOLID, MSDB.TAG.COL, MSDB.TAG.MODE, MSDB.TAG.ATTR, MSDB.TAG.COMP, MSDB.TAG.MOLNAMES, MSDB.TAG.MOLCOMP, MSDB.TAG.MOLMASS, MSDB.TAG.INCHI, MSDB.TAG.INCHIKEY, MSDB.TAG.PUBCHEM, MSDB.TAG.CHEBI, MSDB.TAG.HMDB, MSDB.TAG.KEGG)) + dft.fields[[f]] <- f + + return(dft.fields) +} + +# Default +MSDB.DFT <- list() +MSDB.DFT[['mzshift']] <- 0 # in ppm +MSDB.DFT[['mzprec']] <- 5 # in ppm +MSDB.DFT[['mztolunit']] <- MSDB.DFT.MZTOLUNIT +MSDB.DFT[['precursor-rt-tol']] <- 5 +MSDB.DFT[['molids-sep']] <- MSDB.DFT.MATCH.SEP +MSDB.DFT[['db-fields']] <- concat.kv.list(msdb.get.dft.db.fields()) 
+MSDB.DFT[['db-ms-modes']] <- concat.kv.list(MSDB.DFT.MODES) +MSDB.DFT[['pos-prec']] <- paste(MSDB.DFT.PREC[[MSDB.TAG.POS]], collapse = ',') +MSDB.DFT[['neg-prec']] <- paste(MSDB.DFT.PREC[[MSDB.TAG.NEG]], collapse = ',') +MSDB.DFT[['db-rt-unit']] <- MSDB.RTUNIT.SEC +MSDB.DFT[['rtunit']] <- MSDB.RTUNIT.SEC +DEFAULT.ARG.VALUES <- MSDB.DFT +DEFAULT.ARG.VALUES[['input-col-names']] <- 'mz=mz,rt=rt' + +# Get default input fields {{{1 +################################################################ + +msdb.get.dft.input.fields <- function () { + + dft.fields <- list() + + for(f in c(MSDB.TAG.MZ, MSDB.TAG.RT)) + dft.fields[[f]] <- f + + return(dft.fields) +} + +# Print help {{{1 +################################################################ + +print.help <- function() { + + cat("USAGE:\n") + prog.mz.match <- paste(PROG, ' -d (', paste(MSDB.VALS, collapse = '|'), ') --url (file|dir|database URL) -i <file> -m (', paste(MSDB.MODE.VALS, collapse = '|'), ") -p <mz precision> -s <mz shift> -u (", paste(MSDB.MZTOLUNIT.VALS, collapse = '|'), ") -o <file>", sep = '') + cat("\t(1) ", prog.mz.match, " ...\n", sep = '') + cat("\n") + cat("\t(2) ", prog.mz.match, "(--all-cols|-c <cols>) -x <X RT tolerance> -y <Y RT tolerance>", " ...\n", sep = '') + cat("\n") + cat("\t(3) ", PROG, ' -d (', paste(MSDB.VALS, collapse = '|'), ") --url (file|dir|database URL) --list-cols\n", sep = '') + + cat("\nDETAILS:\n") + cat("Form (1) is for running an MZ match on a database.\n") + cat("Form (2) is for running an MZ/RT match on a database.\n") + cat("Form (3) is for getting a list of available chromatographic columns in a database.\n") + + cat("\nOPTIONS:\n") + spec <- matrix(make.getopt.spec(), byrow = TRUE, ncol = 5) + max.length.opt.cols <- max(nchar(spec[,1])) + 1 + sections <- list(database = "Database setting", input = "Input file", output = "Output files", mz = "M/Z matching", rt = "RT matching", precursor = "Precursor matching", misc = "Miscellaneous") + for (section in names(sections)) { 
+ cat("\n\t", sections[[section]], ":\n", sep = '') + spec <- matrix(make.getopt.spec(section), byrow = TRUE, ncol = 5) + for (i in seq(nrow(spec))) { + opt <- '' + if ( ! is.na(spec[i,2])) + opt <- paste('-', spec[i,2], '|', sep = '') + opt <- paste(opt, '--', spec[i, 1], sep = '') + nb.space.padding <- max.length.opt.cols - nchar(opt) + 6 + padding <- paste(rep(' ', nb.space.padding), sep = '') + cat("\t\t", opt, padding, "\t", spec[i, 5], "\n", sep = '') + } + } + + cat("\nEXAMPLES:\n") + + cat("\nSimple M/Z matching with a file database:\n") + cat("\t./", PROG, " -d file --url mydbfile.tsv -i input.tsv -m pos -o output.tsv\n", sep = '') + + cat("\nFile database with M/Z tolerance:\n") + cat("\t./", PROG, " -d file --url mydbfile.tsv -i input.tsv -m pos -o output.tsv -p 0.5 -s 0\n", sep = '') + + cat("\nFile database with M/Z tolerance unit:\n") + cat("\t./", PROG, " -d file --url mydbfile.tsv -i input.tsv -m pos -o output.tsv -p 1 -s 0.5 -u plain\n", sep = '') + + cat("\nPeakforest database:\n") + cat("\t./", PROG, " -d peakforest --url https://metabohub.peakforest.org/rest/ --db-token <your Peakforest token> -i input.tsv -m pos -o output.tsv\n", sep = '') +} + +# Set default argument values {{{1 +################################################################ + +set.dft.arg.val <-function(opt) { + + for (f in names(MSDB.DFT)) + if (is.null(opt[[f]])) + opt[[f]] <- MSDB.DFT[[f]] + + if ( ! is.null(opt$rtcol) && opt$rtcol == '') + opt$rtcol <- NULL + + return(opt) +} + +# Parse argument values {{{1 +################################################################ + +parse.arg.val <- function(opt) { + + # Parse input column names + if ( ! is.null(opt[['db-fields']])) { + cust <- split.kv.list(opt[['db-fields']]) + cust <- cust[cust != 'NA'] + opt[['db-fields']] <- split.kv.list(MSDB.DFT[['db-fields']]) + for (x in names(cust)) { + if ( ! is.na(cust[[x]]) && cust[[x]] != 'NA') + opt[['db-fields']][[x]] <- cust[[x]] + } + } + + # Parse MS modes + if ( ! 
is.null(opt[['db-ms-modes']])) { + cust <- split.kv.list(opt[['db-ms-modes']]) + opt[['db-ms-modes']] <- split.kv.list(MSDB.DFT[['db-ms-modes']]) + opt[['db-ms-modes']][names(cust)] <- cust + } + + # Parse retention time columns + if ( ! is.null(opt$rtcol)) + opt$rtcol <- strsplit(opt$rtcol, ',')[[1]] + + # Parse input column names + if (is.null(opt[['input-col-names']])) { + opt[['input-col-names']] <- msdb.get.dft.input.fields() + } + else { + custcols <- split.kv.list(opt[['input-col-names']]) + custcols <- custcols[custcols != 'NA'] + dftcols <- msdb.get.dft.input.fields() + opt[['input-col-names']] <- c(custcols, dftcols[ ! names(dftcols) %in% names(custcols)]) + } + + # Parse lists of precursors + if ( ! is.null(opt[['pos-prec']])) + opt[['pos-prec']] <- unlist(strsplit(opt[['pos-prec']], ',')) + if ( ! is.null(opt[['neg-prec']])) + opt[['neg-prec']] <- unlist(strsplit(opt[['neg-prec']], ',')) + + return(opt) +} + +# Make getopt specifications {{{1 +################################################################ + +make.getopt.spec <- function(sections = NULL) { + + spec <- character(0) + + if (is.null(sections) || 'input' %in% sections) + spec <- c(spec, + 'input-file', 'i', 1, 'character', 'Set input file.', + 'input-col-names', 'j', 1, 'character', paste0('Set the input column names. Default is "', DEFAULT.ARG.VALUES[['input-col-names']], '".') + ) + + if (is.null(sections) || 'mz' %in% sections) + spec <- c(spec, + 'mode', 'm', 1, 'character', paste0('MS mode. Possible values are:', paste(MSDB.MODE.VALS, collapse = ", "), '.'), + 'mzshift', 's', 1, 'numeric', paste0('Shift on m/z. Default is ', MSDB.DFT$mzshift,'.'), + 'mzprec', 'p', 1, 'numeric', paste0('Tolerance on m/z. Default is ', MSDB.DFT$mzprec,'.'), + 'mztolunit', 'u', 1, 'character', paste0('Unit used for tolerance values (options -s and -p) on M/Z. 
Default is ', MSDB.DFT$mztolunit,'.') + ) + + # Retention time + if (is.null(sections) || 'rt' %in% sections) + spec <- c(spec, + 'all-cols', 'A', 0, 'logical', 'Use all available chromatographic columns to match retention times.', + 'rtcol', 'c', 1, 'character', paste0('Chromatographic column to use. Unset by default. If set, use the corresponding column to filter on retention times, if retention times are provided.'), + 'check-cols', 'k', 0, 'logical', 'Check that the chromatographic column names specified with option -c really exist.', + 'list-cols', 'l', 0, 'logical', 'List all chromatographic columns present in the database. Write list inside the file specified by -o option.', + 'rttol', 'r', 1, 'numeric', paste0('Tolerance on retention times, in seconds. Unset by default.'), + 'rttolx', 'x', 1, 'numeric', paste0('Tolerance on retention times, in seconds. Unset by default.'), + 'rttoly', 'y', 1, 'numeric', paste0('Tolerance on retention times. Unset by default.'), + 'rtunit', 'v', 1, 'character', paste0('Retention time unit for the input file. Default is ', MSDB.DFT$rtunit, '. Allowed values are:', paste(MSDB.RTUNIT.VALS, collapse = ", "), '.') + ) + + if (is.null(sections) || 'precursor' %in% sections) + spec <- c(spec, + 'precursor-match', 'Q', 0, 'logical', 'Remove peaks whose molecule precursor peak has not been matched. Unset by default.', + 'precursor-rt-tol', 'R', 1, 'numeric', paste0('Precursor retention time tolerance. Only used when precursor-match is enabled. Default is ', MSDB.DFT[['precursor-rt-tol']], '.'), + 'pos-prec', 'Y', 1, 'character', paste0('Set the list of precursors to use in positive mode. Default is "', MSDB.DFT[['pos-prec']], '".'), + 'neg-prec', 'Z', 1, 'character', paste0('Set the list of precursors to use in negative mode. Default is "', MSDB.DFT[['neg-prec']], '".') + ) + + if (is.null(sections) || 'output' %in% sections) + spec <- c(spec, + 'output-file', 'o', 1, 'character', 'Set file to use for the main output. 
If undefined, standard output will be used.', + 'peak-output-file', 'O', 1, 'character', 'If set and if --same-rows is set, then output all matches inside the specified file, with one mz match per line. The output columns are: mz, rt, id, col, colrt, composition, attribution. This means that if an mz value is matched several times, then it will repeated on several lines, with one match description per line.', + 'html-output-file', 'H', 1, 'character', 'Set file to use for the HTML output.', + 'no-main-table-in-html-output', 't', 0, 'logical', 'Do not display main table in HTML output.', + 'same-rows', 'a', 0, 'logical', 'If set, output exactly the same number of rows as the input. This means that in case of multiple matches for one mz, then only one line is output (i.e.: the mz value is not duplicated on several lines). In the main output file, an "ms.matching" column is output with inside, for each mz, a comma separated list of matched component/molecule IDs. If unset, then only the main output file is used, and one single is written to it with one line per peak match, and eventual mz line duplicated if there are multiple matches for this mz.', + 'same-cols', 'b', 0, 'logical', 'If set, output the same columns as inside the input. All input columns are copied to the output.', + 'molids-sep', 'S', 1, 'character', paste0('Set character separator used to when concatenating molecule IDs in output. Default is "', MSDB.DFT[['molids-sep']] , '".'), + 'first-val', '1', 0, 'logical', 'Keep only the first value in multi-value fields. Unset by default.', + 'excel2011comp', 'X', 0, 'logical', 'Excel 2011 compatiblity mode. Output ASCII text files instead of UTF-8 files, where greek letters are replaced with their latin names, plusminus sign is replaced with +- and apostrophe is replaced with \"prime\". All other non-ASCII characters are repladed with underscore.' 
+ ) + + # Database + if (is.null(sections) || 'database' %in% sections) + spec <- c(spec, + 'database', 'd', 1, 'character', paste0('Set database to use: "file" for a single file database and "peakforest" for a connection to PeakForest database.'), + 'url', 'W', 1, 'character', 'URL of database. For "peakforest" database it is the HTTP URL, for the "xls" database it is the path to the directory containing the Excel files, for the "file" database it is the path to the file database and for the "4tabsql" database it is the IP address of the server.', + 'db-name', 'N', 1, 'character', 'Name of the database. Used by the "4tabsql" database.', + 'db-user', 'U', 1, 'character', 'User of the database. Used by the "4tabsql" database.', + 'db-password', 'P', 1, 'character', 'Password of the database user. Used by the "4tabsql" database.', + 'db-ms-modes', 'M', 1, 'character', paste0('Comma separated key/value list giving the MS modes to be used in the single file database. Default is "', MSDB.DFT[['db-ms-modes']], '".'), + 'db-rt-unit', 'V', 1, 'character', paste0('Retention time unit for the database, used in the single file database. Default is "', MSDB.DFT[['db-rt-unit']], '". Allowed values are:', paste(MSDB.RTUNIT.VALS, collapse = ", "), '.'), + 'db-token', 'T', 1, 'character', 'Database token. Used by Peakforest database.', + 'db-fields', 'F', 1, 'character', paste0('Comma separated key/value list giving the field names to be used in the single file database. Default is "', MSDB.DFT[['db-fields']], '".') + ) + + if (is.null(sections) || 'misc' %in% sections) + spec <- c(spec, + 'help', 'h', 0, 'logical', 'Print this help.', + 'debug', 'g', 0, 'logical', 'Set debug mode.', + 'quiet', 'q', 0, 'logical', 'Quiet mode.', + 'log-to-stdout', 'G', 0, 'logical', 'Send log messages to stdout instead of stderr.' 
+ ) + + return(spec) +} + +# Read args {{{1 +################################################################ + +read.args <- function() { + + # Get options + opt <- getopt(matrix(make.getopt.spec(), byrow = TRUE, ncol = 5)) + + # help + if ( ! is.null(opt$help)) { + print.help() + quit() + } + + opt <- set.dft.arg.val(opt) # Set default values + opt <- parse.arg.val(opt) # Parse list values + + return(opt) +} + +# Check args {{{1 +################################################################ + +check.args <- function(opt) { + + # Check database type + if (is.null(opt$database)) + stop("You must provide a database type through --database option.") + if ( ! opt$database %in% MSDB.VALS) + stop(paste0("Invalid value \"", opt$database, "\" for --database option.")) + + # Check filedb database + if (opt$database == 'file') { + if (is.null(opt$url)) + stop("When using single file database, you must specify the location of the database file with option --url.") + if ( ! file.exists(opt$url)) + stop(paste0("The file path \"", opt$url,"\" specified with --db-file option is not valid.")) + } + + # Check Peakforest database + if (opt$database == 'peakforest') { + if (is.null(opt$url)) + stop("When using PeakForest database, you must specify the URL of the PeakForest server with option --url.") + } + + if (is.null(opt[['list-cols']])) { + + if (is.null(opt[['output-file']])) + stop("You must set a path for the output file.") + + if (is.null(opt[['input-file']])) + stop("You must provide an input file.") + + if (is.null(opt$mode) || ( ! opt$mode %in% MSDB.MODE.VALS)) + stop("You must specify a mode through the --mode option.") + + if (is.null(opt$mzprec)) + stop("You must set a precision in MZ with the --mzprec option.") + + if ( ( ! is.null(opt$rtcol) || ! is.null(opt[['all-cols']])) && (is.null(opt$rttolx) || is.null(opt$rttoly))) + stop("When chromatographic columns are set, you must provide values for --rttolx and -rttoly.") + + if (is.null(opt$mztolunit) || ( ! 
opt$mztolunit %in% MSDB.MZTOLUNIT.VALS)) + stop("You must specify an M/Z tolerance unit through the --mztolunit option.") + } +} + +# Output HTML {{{1 +################################################################ + +output.html <- function(biodb, peaks, file) { + + # Replace public database IDs by URLs + if ( ! is.null(peaks)) { + # Loop on all dbs + for (extdb in c('kegg.compound', 'hmdb.metabolites', 'chebi', 'ncbi.pubchem.comp')) { + conn <- biodb$getFactory()$createConn(extdb, fail.if.exists = FALSE) + col.name <- paste('lcmsmatching', extdb, 'id', sep = '.') + if (col.name %in% colnames(peaks)) + peaks[[col.name]] <- vapply(peaks[[col.name]], function(id) if (is.na(id)) '' else paste0('<a href="', conn$getEntryPageUrl(id), '">', id, '</a>'), FUN.VALUE = '') + } + } + + # Write HTML + html <- HtmlWriter() + html$file.opn(file = file) + html$writeBegTag('html') + html$writeBegTag('header') + html$writeTag('meta', attr = c(charset = "UTF-8")) + html$writeTag('title', text = "LC/MS matching results") + html$writeBegTag('style') + html$write('table, th, td { border-collapse: collapse; }') + html$write('table, th { border: 1px solid black; }') + html$write('td { border-left: 1px solid black; border-right: 1px solid black; }') + html$write('th, td { padding: 5px; }') + html$write('th { background-color: LightBlue; }') + html$write('tr:nth-child(even) { background-color: LemonChiffon; }') + html$write('tr:nth-child(odd) { background-color: LightGreen; }') + html$writeEndTag('style') + html$writeEndTag('header') + html$writeBegTag('body') + + # Write results + results <- FALSE + if ( ! is.null(peaks) && nrow(peaks) > 0) { + html$writeTag('h3', text = "Matched peaks") + html$writeTable(peaks) + results <- TRUE + } + if ( ! 
results) + html$writeTag('p', 'None.') + + html$writeEndTag('body') + html$writeEndTag('html') + html$file.close() +} + +# Load input file {{{1 +################################################################ + +load.input.file <- function(file, col.names) { + + if ( ! is.null(file) && ! file.exists(file)) + stop(paste0("Input file \"", file, "\" does not exist.")) + + # Empty file + if (file.info(file)$size == 0) { + input <- data.frame() + input[[col.names[['mz']]]] <- double() + input[[col.names[['rt']]]] <- double() + } + + # Non-empty file + else { + # Load file into data frame + input <- read.table(file = file, header = TRUE, sep = "\t", stringsAsFactor = FALSE, check.names = FALSE, comment.char = '') + } + + return(input) +} + +# Check input column names {{{1 +################################################################ + +check.input.colnames <- function(col.names, input, needs.rt) { + + # Loop on all fields + for (field in names(col.names)) { + + # Is the column not inside the input? + if ( ! col.names[[field]] %in% colnames(input)) { + + # Is the column name an index? + if (length(grep('^[0-9]+$', col.names[[field]])) > 0) { + # Convert each column that is identified by a number into a name + col.index <- as.integer(col.names[[field]]) + if (col.index < 1 || col.index > length(colnames(input))) + stop(paste0("No column n°", col.index, " for input field ", field, ".")) + col.names[[field]] <- colnames(input)[[col.index]] + } + + # Unknown column + else + if (field == 'mz' || (needs.rt && field == 'rt')) + stop(paste("Column ", col.names[[field]], ' for ', field, ' values cannot be found inside input file.', sep = '')) + } + } + + return(col.names) +} + +# Restrict input to essential columns {{{1 +################################################################ + +restrict.input.cols <- function(input, col.names, same.cols, keep.rt) { + + # Restrict to essential columns + if ( ! 
same.cols) { + + # Get selected column names + cols <- unlist(col.names) + names(cols) <- NULL + + # Only keep columns present in input + cols <- cols[cols %in% names(input)] + + # Remove retention time column + if ( ! keep.rt) + cols <- cols[cols != col.names$rt] + + # Restrict input columns + input <- input[, cols, drop = FALSE] + } + + return(input) +} + +# Create Biodb instance {{{1 +################################################################ + +create.biodb.instance <- function(quiet = FALSE, ms.modes = NULL) { + + biodb <- NULL + + if (is.null(quiet)) + quiet <- FALSE + + # Create biodb instance + if (quiet) + biodb <- biodb::Biodb$new(logger = FALSE) + else { + log.stream = (if (is.null(opt[['log-to-stdout']])) stderr() else stdout()) + logger = biodb::BiodbLogger$new(file = log.stream) + if ( ! is.null(opt$debug)) + logger$includeMsgType('debug') + biodb <- biodb::Biodb$new(logger = FALSE, observers = logger) + } + + # Configure cache + biodb$getConfig()$disable('cache.system') + #biodb$getConfig()$disable('factory.cache') + biodb$getConfig()$disable('cache.subfolders') + biodb$getConfig()$disable('cache.all.requests') + biodb$getConfig()$set('useragent', USERAGENT) + + # Set MS mode values + if ( ! 
is.null(ms.modes)) + for (k in names(ms.modes)) + biodb$getEntryFields()$get('ms.mode')$addAllowedValue(k, ms.modes[[k]]) + + return(biodb) +} + +# Get database connector {{{1 +################################################################ + +get.db.conn <- function(biodb, db.name, url, token, fields, pos.prec, neg.prec) { + + # Set biodb database name + if (db.name == 'file') + biodb.db.name <- 'mass.csv.file' + else if (db.name == 'peakforest') + biodb.db.name <- 'peakforest.mass' + else + stop(paste0('Unknown database "', db.name, '".')) + + # Set URL & token + if (is.null(url)) + url <- NA_character_ + if (is.null(token)) + token <- NA_character_ + + # Create connector + conn <- biodb$getFactory()$createConn(biodb.db.name, url = url, token = token) + + # Set up file database + if (db.name == 'file') { + + # Set fields + if ( ! MSDB.TAG.MODE %in% names(fields)) + stop("MS mode field is not defined for file database.") + if ( ! MSDB.TAG.MZTHEO %in% names(fields)) + stop("M/Z field is not defined for file database.") + if ( 'accession' %in% names(fields)) + accession <- fields[['accession']] + else { + cols <- character() + for (c in c(MSDB.TAG.MOLID, MSDB.TAG.MODE, MSDB.TAG.COL, MSDB.TAG.COLRT)) + if (c %in% names(fields)) + cols <- c(cols, fields[[c]]) + accession <- cols + } + conn$setField('accession', accession) + conn$setField('ms.mode', fields[[MSDB.TAG.MODE]]) + conn$setField('peak.mztheo', fields[[MSDB.TAG.MZTHEO]]) + conn$setField('fullnames', fields[[MSDB.TAG.MOLNAMES]], ignore.if.missing = TRUE) + conn$setField('compound.id', fields[[MSDB.TAG.MOLID]], ignore.if.missing = TRUE) + conn$setField('chrom.col.rt', fields[[MSDB.TAG.COLRT]], ignore.if.missing = TRUE) + conn$setField('chrom.col', fields[[MSDB.TAG.COL]], ignore.if.missing = TRUE) + conn$setField('peak.attr', fields[[MSDB.TAG.ATTR]], ignore.if.missing = TRUE) + conn$setField('peak.comp', fields[[MSDB.TAG.COMP]], ignore.if.missing = TRUE) + conn$setField('formula', fields[[MSDB.TAG.MOLCOMP]], 
ignore.if.missing = TRUE) + conn$setField('molecular.mass', fields[[MSDB.TAG.MOLMASS]], ignore.if.missing = TRUE) + conn$setField('inchi', fields[[MSDB.TAG.INCHI]], ignore.if.missing = TRUE) + conn$setField('inchikey', fields[[MSDB.TAG.INCHIKEY]], ignore.if.missing = TRUE) + conn$setField('chebi.id', fields[[MSDB.TAG.CHEBI]], ignore.if.missing = TRUE) + conn$setField('ncbi.pubchem.comp.id', fields[[MSDB.TAG.PUBCHEM]], ignore.if.missing = TRUE) + conn$setField('hmdb.metabolites.id', fields[[MSDB.TAG.HMDB]], ignore.if.missing = TRUE) + conn$setField('kegg.compound.id', fields[[MSDB.TAG.KEGG]], ignore.if.missing = TRUE) + + # Set MS level + if ( ! conn$hasField('ms.level')) + conn$addField('ms.level', 1) + + # Set precursor formulae + if ( ! is.null(pos.prec)) + conn$setPrecursorFormulae(c(pos.prec, neg.prec)) + } + + return(conn) +} + +# Print chrom cols {{{1 +################################################################ + +print.chrom.cols <- function(conn, output = NULL) { + file <- if (is.null(output)) stdout() else output + write.table(conn$getChromCol(), file = file, row.names = FALSE, sep = "\t") +} + +# Get chrom cols {{{1 +################################################################ + +get.chrom.cols <- function(conn, check.cols, chrom.cols, all.cols) { + + # Get all chromatopgrahic columns + if (all.cols) + chrom.cols <- conn$getChromCol()[['id']] + + # Check chromatographic columns + else if (check.cols && ! is.null(chrom.cols)) { + dbcols <- conn$getChromCol()[['id']] + unknown.cols <- chrom.cols[ ! 
chrom.cols %in% dbcols] + if (length(unknown.cols) > 0) + stop(paste0("unknown chromatographic column", (if (length(unknown.cols) > 1) 's' else ''), ': ', paste(unknown.cols, collapse = ', '), ".\nallowed chromatographic column names are:\n", paste(dbcols, collapse = "\n"))) + } + + return(chrom.cols) +} + +# Search {{{1 +################################################################ + +search <- function(conn, input.file, input.colnames, same.cols, mz.tol, mz.tol.unit, mz.shift, ms.mode, main.output, peaks.output, html.output, chrom.cols, rt.unit, rt.tol, rt.tol.exp, results.sep, precursor, precursor.rt.tol) { + + rt.search <- ! is.null(chrom.cols) && ! all(is.na(chrom.cols)) + + # Load input file + input <- load.input.file(input.file, col.names = input.colnames) + + # Check input column names + input.colnames <- check.input.colnames(input.colnames, input = input, needs.rt = rt.search) + + # Restrict input to essential columns + input <- restrict.input.cols(input, col.names = input.colnames, same.cols = same.cols, keep.rt = rt.search) + + # Update RT search flag + rt.search <- rt.search && 'rt' %in% names(input.colnames) && input.colnames$rt %in% names(input) + + # Run MZ/RT matching + rt.unit <- if (rt.search) (if (rt.unit == MSDB.RTUNIT.SEC) 's' else 'min') else NA_character_ + rt.tol <- if (rt.search && ! is.null(rt.tol)) rt.tol else NA_real_ + rt.tol.exp <- if (rt.search && ! 
is.null(rt.tol.exp)) rt.tol.exp else NA_real_ + + # Force type for input columns + input[[input.colnames$mz]] <- as.numeric(input[[input.colnames$mz]]) + if (rt.search) + input[[input.colnames$rt]] <- as.numeric(input[[input.colnames$rt]]) + + peaks <- conn$searchMsPeaks(input.df = input, mz.shift = mz.shift, mz.tol = mz.tol, mz.tol.unit = mz.tol.unit, ms.mode = ms.mode, chrom.col.ids = chrom.cols, rt.unit = rt.unit, rt.tol = rt.tol, rt.tol.exp = rt.tol.exp, precursor = precursor, precursor.rt.tol = precursor.rt.tol, insert.input.values = TRUE, compute = FALSE, prefix.on.result.cols = 'lcmsmatching.', input.df.colnames = c(mz = input.colnames$mz, rt = input.colnames$rt), match.rt = rt.search) + + # Build outputs + main <- NULL + if ( ! is.null(peaks)) + main <- conn$collapseResultsDataFrame(results.df = peaks, sep = results.sep, mz.col = input.colnames$mz, rt.col = input.colnames$rt) + + # Write main output + if ( ! is.null(main.output)) + write.table(main, file = main.output, row.names = FALSE, sep = "\t", quote = FALSE) + + # Write peaks output + if ( ! is.null(peaks.output)) + write.table(peaks, file = peaks.output, row.names = FALSE, sep = "\t", quote = FALSE) + + # Write HTML output + if ( ! is.null(html.output)) + output.html(biodb = conn$getBiodb(), peaks = peaks, file = html.output) +} + +# MAIN {{{1 +################################################################ + +# Read command line arguments +opt <- read.args() + +# Set error function for debugging +if (is.null(opt$debug)) { + options(error = function() { quit(status = 1) }, warn = 0 ) +} + +# Create Biodb instance +biodb <- create.biodb.instance(quiet = opt$quiet, ms.modes = opt[['db-ms-modes']]) + +# Get database connector +conn <- get.db.conn(biodb, db.name = opt$database, url = opt$url, token = opt[['db-token']], fields = opt[['db-fields']], pos.prec = opt[['pos-prec']], neg.prec = opt[['neg-prec']]) + +# Print columns +if ( ! 
is.null(opt[['list-cols']])) { + print.chrom.cols(conn, opt[['output-file']]) + quit(status = 0) +} + +# MS mode +ms.mode <- (if (opt$mode == POS_MODE) MSDB.TAG.POS else MSDB.TAG.NEG) + +# Set RT unit +rt.search <- ! is.null(opt$rtcol) || ! is.null(opt[['all-cols']]) +if ( rt.search && opt$database == 'file' && ! conn$hasField('chrom.rt.unit')) + conn$addField('chrom.rt.unit', (if (opt[['db-rt-unit']] == MSDB.RTUNIT.SEC) 's' else 'min')) + +# Select chromatographic columns +chrom.cols <- get.chrom.cols(conn, check.cols = ! is.null(opt[['check-cols']]), chrom.cols = opt$rtcol, all.cols = ! is.null(opt[['all-cols']])) + +# Search +search(conn, input.file = opt[['input-file']], input.colnames = opt[['input-col-names']], same.cols = ! is.null(opt[['same-cols']]), mz.tol = opt$mzprec, mz.tol.unit = opt$mztolunit, mz.shift = - opt$mzshift, + ms.mode = ms.mode, + chrom.cols = chrom.cols, rt.unit = opt$rtunit, rt.tol = (if (is.null(opt$rttol)) opt$rttolx else opt$rttol), rt.tol.exp = opt$rttoly, + results.sep = opt[['molids-sep']], precursor = ! is.null(opt[['precursor-match']]), precursor.rt.tol = opt[['precursor-rt-tol']], + main.output = opt[['output-file']], peaks.output = opt[['peak-output-file']], html.output = opt[['html-output-file']]) + +# Terminate Biodb instance +biodb$terminate()
--- a/lcmsmatching.xml Wed Apr 19 10:00:05 2017 -0400 +++ b/lcmsmatching.xml Fri Feb 22 16:04:22 2019 -0500 @@ -1,43 +1,38 @@ -<tool id="lcmsmatching" name="LC/MS matching" version="3.3.1" profile="16.01"> +<!-- vi: se fdm=marker : --> +<tool id="lcmsmatching" name="LCMS matching" version="4.0.2" profile="18.05"> - <description>Annotation of MS peaks using matching on a spectra database.</description> + <description>Annotation of LCMS peaks using matching on a in-house spectra database or on PeakForest spectra database.</description> + <!-- Requirements {{{1 --> + <!-- **************************************************************** --> <requirements> - <!--<requirement type="package" version="3.3.3">r</requirement>--> - <requirement type="package" version="7.0">readline</requirement> <!-- Try readline 7.0 --> - <requirement type="package" version="1.20.0">r-getopt</requirement> - <requirement type="package" version="1.0.0">r-stringr</requirement> - <requirement type="package" version="1.8.3">r-plyr</requirement> - <requirement type="package" version="3.98">r-xml</requirement> - <requirement type="package" version="1.0_6">r-bitops</requirement> - <requirement type="package" version="1.95">r-rcurl</requirement> - <requirement type="package" version="1.1">r-jsonlite</requirement> + <requirement type="package" version="1.2.2">r-biodb</requirement> + <requirement type="package" version="1.20.2">r-getopt</requirement> + <requirement type="package" version="0.2_15">r-codetools</requirement> <!-- R_VERSION="0.2-15" IMPORTANT Do not remove, used by travis_install_deps.sh script. --> <!-- codetools package is needed because of the following error when running Galaxy on Travis-CI in planemo tests: "code for methods in class “HtmlWriter” was not checked for suspicious field assignments (recommended package ‘codetools’ not available?)". 
--> + </requirements> - <code file="list-chrom-cols.py"/> - <code file="list-file-cols.py"/> - <code file="list-ms-mode-values.py"/> - - <!--======= - = COMMAND = - ========--> + <!-- Command {{{1 --> + <!-- **************************************************************** --> <command> <![CDATA[ ## @@@BEGIN_CHEETAH@@@ - $__tool_directory__/search-mz + $__tool_directory__/lcmsmatching + + --log-to-stdout ## Input file -i "$mzrtinput" - --input-col-names "mz=$inputmzfield,rt=$inputrtfield" + --input-col-names "$inputfields" --rtunit "$inputrtunit" ## Database #if $db.dbtype == "inhouse" -d file - --db-fields "mztheo=$db.dbmzreffield,chromcolrt=$db.dbchromcolrtfield,compoundid=$db.dbspectrumidfield,chromcol=$db.dbchromcolfield,msmode=$db.dbmsmodefield,peakattr=$db.dbpeakattrfield,pubchemcompid=$db.dbpubchemcompidfield,chebiid=$db.dbchebiidfield,hmdbid=$db.dbhmdbidfield,keggid=$db.dbkeggidfield" - --db-ms-modes "pos=$db.dbmsposmode,neg=$db.dbmsnegmode" - --db-rt-unit $db.dbrtunit + --db-fields "$db.dbfields" + --db-ms-modes "$db.dbmsmodes" + --db-rt-unit "$db.dbrtunit" #end if #if $db.dbtype == "peakforest" -d peakforest @@ -46,7 +41,7 @@ --url "$db.dburl" ## M/Z matching - -m $mzmode -p $mzprec -s $mzshift + -m $mzmode -p $mzprec -s $mzshift -u $mztolunit ## Precursor matching #if $prec.match == "true" @@ -72,14 +67,13 @@ ## @@@END_CHEETAH@@@ ]]></command> - <!--====== - = INPUTS = - =======--> + <!-- Inputs {{{1 --> + <!-- **************************************************************** --> <inputs> - <!-- DATABASE --> - + <!-- Database {{{2 --> + <!-- **************************************************************** --> <conditional name="db"> <param name="dbtype" label="Database" type="select" refresh_on_change="true"> @@ -87,25 +81,17 @@ <option value="peakforest">Peakforest</option> </param> + <!-- In-house database parameters {{{3 --> + <!-- **************************************************************** --> <when value="inhouse"> <!-- Database file --> <param 
name="dburl" label="Database file" type="data" format="tabular,tsv" refresh_on_change="true" help="Decimal: '.', missing: NA, mode: character and numerical, sep: tabular. Retention time values must be in seconds."/> <!-- File database field names --> - <param name="dbspectrumidfield" type="select" label="Database file Spectrum ID column name" dynamic_options="get_file_cols(file = db['dburl'], preferred = 'spectrumid,accession,compoundid,molid')" help="Select the Spectrum ID column of the database file."/> - <param name="dbmzreffield" type="select" label="Database file Reference MZ column name" dynamic_options="get_file_cols(file = db['dburl'], preferred = 'mztheo,mzexp,mz')" help="Select the Reference MZ column of the database file."/> - <param name="dbchromcolfield" type="select" label="Database file Chromatographic Column Name column name" dynamic_options="get_file_cols(file = db['dburl'], preferred = 'chromcol,col')" help="Select the Chromatographic Column Name column of the database file." refresh_on_change="true"/> - <param name="dbchromcolrtfield" type="select" label="Database file Chromatographic Column Retention Time column name" dynamic_options="get_file_cols(file = db['dburl'], preferred = 'chromcolrt,colrt,rt')" help="Select the Chromatographic Column Retention Time column of the database file."/> - <param name="dbmsmodefield" type="select" label="Database file MS Mode column name" dynamic_options="get_file_cols(file = db['dburl'], preferred = 'msmode,mode')" help="Select the MS Mode column of the database file." 
refresh_on_change="true"/> - <param name="dbpeakattrfield" type="select" label="Database file Peak Attribution column name" dynamic_options="get_file_cols(file = db['dburl'], preferred = 'peakattr,attr')" help="Select the Peak Attribution column of the database file."/> - <param name="dbpubchemcompidfield" type="select" label="Database file PubChem Compound ID column name" dynamic_options="get_file_cols(file = db['dburl'], preferred = 'pubchemcompid,pubchemid,pubchemcomp,pubchem')" help="Select the PubChem Compound ID column of the database file."/> - <param name="dbchebiidfield" type="select" label="Database file ChEBI ID column name" dynamic_options="get_file_cols(file = db['dburl'], preferred = 'chebiid,chebi')" help="Select the ChEBI ID column of the database file."/> - <param name="dbhmdbidfield" type="select" label="Database file HMDB Metabolite ID column name" dynamic_options="get_file_cols(file = db['dburl'], preferred = 'hmdbid,hmdb')" help="Select the HMDB Metabolite ID column of the database file."/> - <param name="dbkeggidfield" type="select" label="Database file KEGG Compound ID column name" dynamic_options="get_file_cols(file = db['dburl'], preferred = 'keggid,kegg')" help="Select the KEGG Compound ID column of the database file."/> - + <param name="dbfields" label="Column names" type="text" size="256" value="mztheo=mztheo,chromcolrt=chromcolrt,compoundid=compoundid,chromcol=chromcol,msmode=msmode,peakattr=peakattr,pubchemcompid=pubchemcompid,chebiid=chebiid,hmdbid=hmdbid,keggid=keggid" help="The list of column names of your database in-house file, as a coma separated list of key/value pairs."/> + <!-- File database MS modes --> - <param name="dbmsposmode" label="File database MS Positive mode" type="select" dynamic_options="get_ms_mode_value(file = db['dburl'], col = db['dbmsmodefield'], preferred = 'POS,pos,+')" help="Select the value used to identify the positive MS mode."/> - <param name="dbmsnegmode" label="File database MS Negative mode" 
type="select" dynamic_options="get_ms_mode_value(file = db['dburl'], col = db['dbmsmodefield'], preferred = 'NEG,neg,-')" help="Select the value used to identify the negitive MS mode."/> + <param name="dbmsmodes" label="MS modes" help="Values used for the file database MS modes, as a coma separated list of key/value pairs." type="text" size="64" value="pos=pos,neg=neg"/> <!-- File database RT unit --> <param name="dbrtunit" label="Retention time unit" type="select" display="radio" multiple="false" help=""> @@ -116,8 +102,10 @@ <param name="dbtoken" type="text" size="32" value="" hidden="true"/> </when> + <!-- PeakForest database parameters {{{3 --> + <!-- **************************************************************** --> <when value="peakforest"> - <param name="dburl" type="text" size="128" value="https://peakforest-alpha.inra.fr/rest" refresh_on_change="true"/> + <param name="dburl" type="text" size="128" value="https://metabohub.peakforest.org/rest/" refresh_on_change="true"/> <param name="dbtoken" label="Peakforest security token" type="text" size="32" value="" refresh_on_change="true" help="If you do not have yet a Peakforest token, go to Peakforest website and request one from your account."/> @@ -125,44 +113,51 @@ </when> </conditional> - <!-- INPUT --> + <!-- Input file {{{2 --> + <!-- **************************************************************** --> - <!-- Input file --> - <param name="mzrtinput" label="Input file - MZ(/RT) values" type="data" format="tabular,tsv" refresh_on_change="true" help="Decimal: '.', missing: NA, mode: character and numerical, sep: tabular. RT values must be in seconds."/> + <!-- Input file --> + <param name="mzrtinput" label="Input file - MZ(/RT) values" type="data" format="tabular,tsv" refresh_on_change="true" help="Decimal: '.', missing: NA, mode: character and numerical, sep: tabular. 
RT values must be in seconds."/> + + <!-- Input field field names --> + <param name="inputfields" type="text" label="Input column names" size="64" help="Input file column names, as a coma separated list of key/value pairs." value="mz=mz,rt=rt"/> - <!-- Input field field names --> - <param name="inputmzfield" type="select" label="Input file MZ column name" dynamic_options="get_file_cols(file = mzrtinput, preferred = 'mzmed,mz')" help="Select the MZ column of the input file."/> - <param name="inputrtfield" type="select" label="Input file RT column name" dynamic_options="get_file_cols(file = mzrtinput, preferred = 'rtmed,rt')" help="Select the RT column of the input file."/> + <!-- Input file RT unit --> + <param name="inputrtunit" label="Retention time unit" type="select" display="radio" multiple="false" help=""> + <option value="sec">Seconds</option> + <option value="min">Minutes</option> + </param> - <!-- Input file RT unit --> - <param name="inputrtunit" label="Retention time unit" type="select" display="radio" multiple="false" help=""> - <option value="sec">Seconds</option> - <option value="min">Minutes</option> - </param> - - <!-- M/Z MATCHING --> + <!-- M/Z matching {{{2 --> + <!-- **************************************************************** --> - <!-- Mode --> - <param name="mzmode" label="MS mode" type="select" display="radio" multiple="false" help=""> - <option value="pos">Positive</option> - <option value="neg">Negative</option> - </param> + <!-- Mode --> + <param name="mzmode" label="MS mode" type="select" display="radio" multiple="false" help=""> + <option value="pos">Positive</option> + <option value="neg">Negative</option> + </param> - <!-- MZ matching parameters --> - <param name="mzprec" label="M/Z precision (in ppm)" type="float" help="" value="5"/> - <param name="mzshift" label="M/Z shift (in ppm)" type="float" help="" value="0"/> + <!-- MZ matching parameters --> + <param name="mzprec" label="M/Z precision" type="float" help="" value="5"/> + 
<param name="mzshift" label="M/Z shift" type="float" help="" value="0"/> + <param name="mztolunit" label="M/Z tolerance unit" type="select" display="radio" multiple="false" help=""> + <option value="ppm">PPM</option> + <option value="plain">Plain</option> + </param> - <!-- RETENTION TIME PARAMETERS --> + <!-- RT matching {{{2 --> + <!-- **************************************************************** --> - <!-- List of chromatographic columns --> - <param name="chromcols" type="select" label="Chromatographic columns" multiple="true" dynamic_options="get_chrom_cols(dbtype = db['dbtype'], dburl = db['dburl'], dbtoken = db['dbtoken'], col_field = db['dbchromcolfield'])" help="Select here the set of chromatographic columns against which the retention time matching will be run."/> + <!-- List of chromatographic columns --> + <param name="chromcols" type="text" label="Chromatographic columns" size="2048" value=""/> - <!-- Tolerances --> - <param name="tolx" label="RTX retention time tolerance, parameter x (in seconds)" type="float" help="" value="5"/> - <param name="toly" label="RTY retention time tolerance, parameter y" type="float" help="" value="0.8"/> - <param name="tolz" label="RTZ retention time tolerance, used when precursor matching is enabled." type="float" help="" value="5"/> + <!-- Tolerances --> + <param name="tolx" label="RTX" help="The retention time tolerance X parameter (in seconds)." type="float" value="5"/> + <param name="toly" label="RTY" help="The retention time tolerance Y parameter (no unit)." type="float" value="0.8"/> + <param name="tolz" label="RTZ" help="The retention time tolerance used when precursor matching is enabled." 
type="float" value="5"/> - <!-- PRECURSOR MATCH --> + <!-- Precursor matching {{{2 --> + <!-- **************************************************************** --> <conditional name="prec"> <param name="match" label="Precursor match" type="select"> @@ -198,9 +193,11 @@ </when> </conditional> - <!-- OUTPUT --> + <!-- Output format {{{2 --> + <!-- **************************************************************** --> + <!-- Molecule IDs separator character --> - <param name="molidssep" label="Molecule IDs separator character" type="text" size="3" value="|" help=""> + <param name="molidssep" label="Multiple matches separator character" type="text" size="3" value="|" help=""> <sanitizer> <valid initial="string.printable"> <remove value='"'/> @@ -213,60 +210,77 @@ </inputs> - <!--======= - = OUTPUTS = - ========--> + <!-- Outputs {{{1 --> + <!-- **************************************************************** --> <outputs> - <!-- Output file --> <data name="mainoutput" label="lcmsmatch_${mzrtinput.name}" format="tabular"/> <data name="peaksoutput" label="lcmsmatch_${mzrtinput.name}_peaks" format="tabular"/> <data name="htmloutput" label="lcmsmatch_${mzrtinput.name}.html" format="html"/> </outputs> - <!--===== - = TESTS = - ======--> + <!-- Tests {{{1 --> + <!-- **************************************************************** --> <tests> - <!-- File database test --> + <!-- Test 1, MZ only {{{2 --> + <!-- **************************************************************** --> + <test> + <param name="dbtype" value="inhouse"/> + <param name="dburl" value="filedb.tsv"/> + <param name="mzrtinput" value="mz-input-small.tsv"/> + <param name="inputfields" value="mz=mz"/> + <param name="mzmode" value="pos"/> + <output name="mainoutput" file="test_1_main_output.tsv"/> + <output name="peaksoutput" file="test_1_peaks_output.tsv"/> + <output name="htmloutput" file="test_1_peaks_output.html"/> + </test> + + <!-- Test 2, MZ & RT {{{2 --> + <!-- 
**************************************************************** --> <test> <param name="dbtype" value="inhouse"/> <param name="dburl" value="filedb.tsv"/> - <param name="dbfields" value=""/> - <param name="dbmsmodes" value=""/> - <param name="mzrtinput" value="mz-input-small.tsv"/> - <param name="inputmzfield" value="mzmed"/> - <param name="inputrtfield" value="rtmed"/> + <param name="mzrtinput" value="mzrt-input-small.tsv"/> + <param name="inputfields" value="mz=mz,rt=rt"/> <param name="mzmode" value="pos"/> - <output name="mainoutput" file="filedb-small-mz-match-output.tsv"/> - <output name="peaksoutput" file="filedb-small-mz-match-peaks-output.tsv"/> - <output name="htmloutput" file="filedb-small-mz-match-html-output.html"/> + <param name="dbrtunit" value="min"/> + <param name="chromcols" value="col12"/> + <param name="tolx" value="5"/> + <param name="toly" value="0.8"/> + <output name="mainoutput" file="test_2_main_output.tsv"/> + <output name="peaksoutput" file="test_2_peaks_output.tsv"/> + <output name="htmloutput" file="test_2_peaks_output.html"/> </test> - <!-- File database test --> -<!-- + <!-- Test 3, MZ & RT with precursor match {{{2 --> + <!-- **************************************************************** --> <test> - <param name="dbtype" value="peakforest"/> - <param name="dbtoken" value="@PEAKFOREST_TOKEN@"/> - <param name="mzrtinput" value="mz-input-small.tsv"/> - <param name="inputfields" value=""/> + <param name="dbtype" value="inhouse"/> + <param name="dburl" value="filedb.tsv"/> + <param name="mzrtinput" value="mzrt-input-small.tsv"/> + <param name="inputfields" value="mz=mz,rt=rt"/> <param name="mzmode" value="pos"/> - <output name="mainoutput"> - <assert_contents> - <has_text text="mz"/> - </assert_contents> - </output> + <param name="dbrtunit" value="min"/> + <param name="chromcols" value="col12"/> + <param name="tolx" value="5"/> + <param name="toly" value="0.8"/> + <param name="match" value="true"/> + <param name="neg" 
value="[(M-H)]-,[M-H]-"/> + <param name="pos" value="[(M+H)]+,[M+H]+"/> + <param name="tolz" value="60"/> + <output name="mainoutput" file="test_3_main_output.tsv"/> + <output name="peaksoutput" file="test_3_peaks_output.tsv"/> + <output name="htmloutput" file="test_3_peaks_output.html"/> </test> ---> + </tests> - <!--==== - = HELP = - =====--> + <!-- Help {{{1 --> + <!-- **************************************************************** --> <help> <!-- @@@BEGIN_RST@@@ --> @@ -296,13 +310,14 @@ Single file database ==================== -The database used is provided as a single file, in tabular format, through the *Database file* field. This file must contain a list of MS peaks, with possibly retention times. +In this case, the database used is provided as a single file by the user, in tabular format, through the *Database file* field. This file must contain a list of MS peaks, with possibly retention times. Peaks are "duplicated" as much as necessary. For instance if 3 retention times are available on a compound with 10 peaks in positive mode, then there will be 30 lines for this compound in positive mode. -The file must contain a header with the column names. The names are free, but must be provided through the different fields named *Database file ... column name*. -Then you must provide the values used to identify the MS modes (positive and negative). +The file must contain a header with the column names. The names are free, but must be provided through the *Column names* field as a comma separated list of key/value pairs. See default value as an example. Of course it is much easier if your database file uses the default column names used in the default value of the *Column names* field. The column names shown in the default values, are only the ones used by the algorithm. You can provide any additional columns in your database file, they will be copied in the output. 
-A last information about the single file database is the unit of the retention times, either in seconds or in minutes. +Then you must provide the values used to identify the MS modes (positive and negative), using the field *MS modes*. + +A last piece of information about the single file database is the unit of the retention times, either in seconds or in minutes. Use the field "Retention time unit" to provide this information. Example of database file (totally fake, no meaning): @@ -332,13 +347,18 @@ | A10 | "POS" | 145.097154 | "P92Z6W413 O2" | "[(M+H)-(H2)]+" | "hcoltt" | 0.8 | "J114L6M62O2" | 146.10553 | "Blablaine" | +-------+-------+------------+--------------------+-------------------------+-----------+-------+---------------+-----------+--------------+ +The corresponding value of the *Column names* field for this database file would be: +**mztheo=mz,chromcolrt=rt,compoundid=molid,chromcol=col,msmode=mode,peakattr=attribution**. + +And the value of the *MS modes* field would be: **pos=POS,neg=NEG**. + MZ/RT input file ================ -The input to provide is a file, in a tabular format (or TSV: Tab Seperated Values), containing the list of M/Z values, with possibly also RT values. +The input to provide is a dataset in a tabular format (or TSV: Tab Separated Values), containing the list of M/Z values, with possibly also RT values. The dataset is chosen through the field *Input file - MZ(/RT) values*. -The column names for the M/Z and RT values must be provided through the fields *Input file MZ column name* and *Input file RT column name*. -As a consequence, the file must contain a header line. +The column names for the M/Z and RT values must be provided through the field *Input column names*, as a comma separated list of key/value pairs. +The file/dataset must contain a header line with the same names specified in the field *Input column names*. The unit of the retention time has to be provided with the field *Retention time unit*. 
@@ -364,27 +384,29 @@ M/Z matching ------------ -In the simplest form of the algorithm only the *m/z* values are matched against the database peaks. This happens if both *Retention time match* and *Precursor match* are off. +In the simplest form of the algorithm only the *M/Z* values are matched against the database peaks. This happens if both *Retention time match* and *Precursor match* are off. The first parameter is the MS mode, specified through the *MS mode* parameter. -The parameters *M/Z precision* and *M/Z shift* are used by the algorithm in the following formula in order to match an *m/z* value: +The parameters *M/Z precision* and *M/Z shift* are used by the algorithm in the following formula in order to match an *M/Z* value: + + mz - shift - precision < mzref < mz - shift + precision - mz (1 + (- shift - precision) / 10^6) < mzref < mz (1 + (- shift - precision) / 10^6) +Where *mzref* is the M/Z of reference from the database peak that is tested. If this double inequality is true, then the *M/Z* value is matched with this peak. -Where *mzref* is the M/Z of reference from the database peak that is tested. If this double inequality is true, then the *m/z* value is matched with this peak. +The parameters *shift* and *precision* can be input in either PPM values of M/Z or in plain values. Use the field *M/Z tolerance unit* to set the unit. -------------------- Retention time match -------------------- -If at least one column is selected inside the *Chromatographic columns* parameter section, then retention time is also matched, in addition to the *m/z* value, according to the following formula: +If at least one column is selected inside the *Chromatographic columns* parameter section, then retention time is also matched, in addition to the *M/Z* value, according to the following formula: rt - x - rt^y < colrt < rt + x + rt^y Where *x* is the value of the parameter *RTX* and *y* the value of the parameter *RTY*. 
-If for a reference compound the database does not contain retention time for at least one of the specified columns, then only the *m/z* value is matched against the peaks of the reference compound. This means that in the results you can find compounds that do no match the provided retention time value. +If for a reference compound the database does not contain retention time for at least one of the specified columns, then only the *M/Z* value is matched against the peaks of the reference compound. This means that in the results you can find compounds that do no match the provided retention time value. The *RTZ* parameter is used in the *Precursor match* algorithm (see below). @@ -394,7 +416,7 @@ If the "Precursor match" option is enabled inside the parameters section, then a more sophisticated version of the algorithm, which is executed in two steps, is used. -This algorithm takes two more parameters, one for each MS mode. These are the lists of precursors. Since the matching is run for one MS mode only, only one of the two parameters is used. Inside the single file database, all the peaks whose **attr** column value is equal to one of the precursor listed in *List of negative precursors* or *List of positive precursors*, depending on the mode, are considered as precursor peaks. +This algorithm takes two more parameters, one for each MS mode. These are the lists of precursors. Since the matching is run for one MS mode only, only one of the two parameters is used. Inside the single file database, all the peaks whose **peakattr** column value is equal to one of the precursor listed in *List of negative precursors* or *List of positive precursors*, depending on the mode, are considered as precursor peaks. 
M/Z matching using precursor matching ===================================== @@ -414,7 +436,7 @@ Output settings --------------- -The *Molecule IDs separator character* is used to customize the character used to separate the molecule IDs of the **molid** column inside the *main* output file. +The *Multiple matches separator character* is used to customize the character used to separate the multiple values inside each row in the *main* output dataset. The *main* output contains as much rows as the MZ/RT input dataset, thus when for one MZ/RT value the algorithm finds more than one match, it concatenates the matches using this separator character. Output files ============ @@ -424,18 +446,25 @@ +-------------+--------------------------------------+--------------------------------------------------------+ | Outputs | File name | Description | +-------------+--------------------------------------+--------------------------------------------------------+ -| Main output | lcmsmatching_{input_file_name} | Contains the list of compounds that have been matched. | +| Main output | lcmsmatching_{input_file_name} | Contains the same data as the input dataset, with | +| | | match result included on each row. If more than one | +| | | match is found for a row, the different values of the | +| | | match are concatenated using the provided separator | +| | | character. | +-------------+--------------------------------------+--------------------------------------------------------+ -| Peak list | lcmsmatching_peaks_{input_file_name} | Contains all matched database peaks. | +| Peak list | lcmsmatching_{input_file_name}_peaks | Contains the same data as the input dataset, with | +| | | match result included on each row. If more than one | +| | | match is found for a row, then the row is duplicated. | +| | | Hence there is either no match for a row, or one | +| | | single match. 
| +-------------+--------------------------------------+--------------------------------------------------------+ -| HTML output | lcmsmatching_{input_file_name}.html | Contains the two tables on one page. | +| HTML output | lcmsmatching_{input_file_name}.html | Contains the same table as *Peak list* but in HTML | +| | | format and with links to external databases if columns | +| | | for PubChem Compound, ChEBI, HMDB Metabolites or KEGG | +| | | Compounds are provided. | +-------------+--------------------------------------+--------------------------------------------------------+ -The **main** output is identical to the input file, to which is added an *msmatching* column. This column contains a list of IDs of the compounds that have been matched for this couple of (m/z, rt) values. - -The **peak list** output contains all database peaks that have been matched, for each (m/z, rt) input couple. Thus for each (m/z, rt) couple, there will be zero, one or more matched peaks output. The columns output are *mz*, *rt*, *id*, *mztheo*, *col*, *colrt*, *attribution* and *composition*, where *id* is the compound ID, *mztheo* is the theoretical mass of the fragment, *col* is the matched column and *colrt* is the retention time measured on the column for the reference compound. - -The **HTML** output contains the peak table with links toward HMDB, KEGG, ChEBI and PubChem public databases, when IDs are available. +The match results are output as new columns appended to the columns provided inside the MZ/RT input dataset, and prefixed with "lcmsmatching.". ===== About @@ -455,15 +484,30 @@ .. class:: infomark **Please cite** - R Core Team (2013). R: A language and Environment for Statistical Computing. http://www.r-project.org + R Core Team (2013). R: A language and Environment for Statistical Computing. http://www.r-project.org. + +============== +Changelog/News +============== + +**Version 4.0.0 - 02/01/2019** + +- NEW: Use of R biodb library. 
Connection to databases and matching have been moved to biodb library, which is maintained separately at http://github.com/pkrog/biodb. <!-- @@@END_RST@@@ --> </help> - <!--========= - = CITATIONS = - ==========--> + <!-- Citations {{{1 --> + <!-- **************************************************************** --> - <citations/> + <citations> + <citation type="bibtex">@unpublished{FGiacomoni2017, + title = {PeakForest [Internet], a spectral data portal for Metabolomics community - storing, curating and annotation services for metabolic profiles of biological matrix.}, + author = {Franck Giacomoni, Nils Paulhe}, + institution = {INRA / MetaboHUB}, + year = {2017}, + note = {Unpublished paper, available from: https://peakforest.org/.} + }</citation> + </citations> </tool>
#!/usr/bin/env python
# vi: fdm=marker
# Recovered from diff hunk: --- a/list-chrom-cols.py (Wed Apr 19 10:00:05 2017 -0400) +++ /dev/null, @@ -1,62 +0,0 @@

import argparse
import subprocess
import re
import json
import csv

# urlopen lives in different modules in Python 2 and Python 3.
try:
    from urllib.request import urlopen  # Python 3
except ImportError:
    from urllib2 import urlopen  # Python 2

# Get chrom cols {{{1
################################################################

def get_chrom_cols(dbtype, dburl, dbtoken = None, col_field = 'chromcol'):
    """List the chromatographic columns available in a database.

    dbtype    -- 'peakforest' (REST service) or 'inhouse' (single TSV file).
                 Any other value yields an empty list.
    dburl     -- base URL of the PeakForest service, or path to the TSV file
                 (or an object exposing get_file_name()).
    dbtoken   -- optional PeakForest token, appended to the request URL.
    col_field -- name of the column holding the chromatographic column in the
                 TSV file. None is treated as the default 'chromcol'.

    Returns a list of (label, value, is_default) tuples in which only the
    first entry is flagged as default.
    """

    # The command line passes None when -f is omitted; fall back to the
    # documented default instead of silently returning nothing.
    if col_field is None:
        col_field = 'chromcol'

    cols = []

    if dbtype == 'peakforest':
        url = dburl + ( '' if dburl.endswith('/') else '/' ) + 'metadata/lc/list-code-columns'
        if dbtoken is not None:
            url += '?token=' + dbtoken
        result = urlopen(url).read()
        v = json.loads(result.decode('utf-8'))
        # enumerate() supplies the index: the former "++i" was a no-op in
        # Python, which flagged every column as the default one.
        for i, (colid, coldesc) in enumerate(v.items()):
            s = coldesc['name'] + ' - ' + coldesc['constructor'] + ' - L' + str(coldesc['length']) + ' - diam. ' + str(coldesc['diameter']) + ' - part. ' + str(coldesc['particule_size']) + ' - flow ' + str(coldesc['flow_rate'])
            cols.append( (s , colid, i == 0) )

    elif dbtype == 'inhouse':

        # Get all column names from file
        with open(dburl if isinstance(dburl, str) else dburl.get_file_name(), 'r') as dbfile:
            reader = csv.reader(dbfile, delimiter = "\t", quotechar='"')
            header = next(reader, [])  # empty file -> no header -> no columns
            if col_field in header:
                idx = header.index(col_field)
                allcols = []
                for row in reader:
                    if len(row) <= idx:
                        continue  # skip truncated/blank rows
                    col = row[idx]
                    if col not in allcols:
                        allcols.append(col)
                for i, c in enumerate(allcols):
                    cols.append( (c, c, i == 0) )

    return cols

# Main {{{1
################################################################

if __name__ == '__main__':

    # Parse command line arguments
    parser = argparse.ArgumentParser(description='Script for getting chromatographic columns of an RMSDB database for Galaxy tool lcmsmatching.')
    parser.add_argument('-d', help = 'Database type', dest = 'dbtype', required = True)
    parser.add_argument('-u', help = 'Database URL', dest = 'dburl', required = True)
    parser.add_argument('-t', help = 'Database token', dest = 'dbtoken', required = False)
    parser.add_argument('-f', help = 'Chromatogrphic column field name', dest = 'col_field', required = False)
    args = parser.parse_args()
    args_dict = vars(args)

    print(get_chrom_cols(**args_dict))
#!/usr/bin/env python
# vi: fdm=marker
# Recovered from diff hunk: --- a/list-file-cols.py (Wed Apr 19 10:00:05 2017 -0400) +++ /dev/null, @@ -1,61 +0,0 @@

import csv
import re
import argparse

# Get file cols {{{1
################################################################

def get_file_cols(file, preferred):
    """List the column names of a TAB separated file.

    file      -- path to the file, or an object exposing get_file_name().
    preferred -- comma separated list of preferred column names. Each name is
                 compared for equality first (perfect match) and is otherwise
                 used as a regular expression anchored at the start of the
                 column name (partial match).

    Returns a list of (label, value, is_default) tuples: perfect matches
    first, then partial matches, then the remaining header columns, then the
    pseudo column 'NA'. The first entry is the default when at least one
    column matched, otherwise 'NA' is the default.
    """

    cols = []

    with open(file if isinstance(file, str) else file.get_file_name(), 'r') as f:

        # Read file header (an empty file is treated as having no columns)
        reader = csv.reader(f, delimiter = "\t", quotechar='"')
        header = next(reader, [])

        # Empty tokens are dropped: as a regex an empty string would match
        # every column.
        preferred = [p for p in preferred.split(',') if p != '']

        # Determine default value
        perfect_matches = []
        partial_matches = []
        for p in preferred:
            for c in header:
                if c == p:
                    perfect_matches.append(c) # Perfect match !
                elif re.match(p, c):
                    partial_matches.append(c) # Keep this partial match in case we find no perfect match

        # Build the ordered list without duplicates: a column may be a perfect
        # match for one preferred name and a partial match for another.
        ordered_cols = []
        for c in perfect_matches + partial_matches + header:
            if c not in ordered_cols:
                ordered_cols.append(c)
        ordered_cols.append('NA')

        default = 0
        if len(perfect_matches) + len(partial_matches) == 0:
            default = len(ordered_cols) - 1

        # Build list of cols
        for i, c in enumerate(ordered_cols):
            cols.append( (c, c, i == default) )

    return cols

# Main {{{1
################################################################

if __name__ == '__main__':

    # Parse command line arguments
    parser = argparse.ArgumentParser(description='Script for getting column names in a csv file.')
    parser.add_argument('-f', help = 'CSV File (separator must be TAB)', dest = 'file', required = True)
    parser.add_argument('-p', help = 'List (comma separated values) of preferred column names for default one.', dest = 'preferred', required = True)
    args = parser.parse_args()
    args_dict = vars(args)

    print(get_file_cols(**args_dict))
#!/usr/bin/env python
# vi: fdm=marker
# Recovered from diff hunk: --- a/list-ms-mode-values.py (Wed Apr 19 10:00:05 2017 -0400) +++ /dev/null, @@ -1,60 +0,0 @@

import csv
import re
import argparse

# Get MS mode values {{{1
################################################################

def get_ms_mode_value(file, col, preferred):
    """List the distinct values found in the MS mode column of a TSV file.

    file      -- path to the file, or an object exposing get_file_name().
    col       -- name of the MS mode column.
    preferred -- comma separated list of values, by decreasing priority, used
                 to choose the default entry.

    Returns a list of (label, value, is_default) tuples in order of first
    appearance. The default is the first preferred value present in the file,
    or the first entry when none is present. An empty list is returned when
    the column is missing or the file cannot be parsed.
    """

    modes = []
    cols = []
    preferred = preferred.split(',')

    with open(file if isinstance(file, str) else file.get_file_name(), 'r') as f:

        # Read file header
        reader = csv.reader(f, delimiter = "\t", quotechar='"')
        try:
            header = next(reader)
            index = header.index(col)
            for row in reader:
                v = row[index]
                if v not in modes:
                    modes.append(v)

            # Find default value: stop at the first preferred value that is
            # present, even when it sits at index 0 (testing "default != 0"
            # would let a lower-priority value override it).
            default = 0
            for p in preferred:
                if p in modes:
                    default = modes.index(p)
                    break

            # Build list of cols
            for i, c in enumerate(modes):
                cols.append( (c, c, i == default) )
        except (StopIteration, ValueError, IndexError, csv.Error):
            # Best effort: empty file, unknown column, truncated row or
            # malformed CSV all yield an empty list.
            cols = []

    return cols

# Main {{{1
################################################################

if __name__ == '__main__':

    # Parse command line arguments
    parser = argparse.ArgumentParser(description='Script for getting column names in a csv file.')
    parser.add_argument('-f', help = 'CSV File (separator must be TAB)', dest = 'file', required = True)
    parser.add_argument('-c', help = 'MS mode column name.', dest = 'col', required = True)
    parser.add_argument('-p', help = 'List (comma separated values) of preferred column names for default one.', dest = 'preferred', required = True)
    args = parser.parse_args()
    args_dict = vars(args)

    print(get_ms_mode_value(**args_dict))
# Recovered from diff hunk: --- a/msdb-common.R (Wed Apr 19 10:00:05 2017 -0400) +++ /dev/null, @@ -1,212 +0,0 @@
#
# Constants and helpers shared by the MS database classes and the search-mz
# script: field tags, MS mode tags, units, default values, input data frame
# construction and chromatographic column description parsing.
if ( ! exists('.parse_chrom_col_desc')) { # Do not load again if already loaded

  library('stringr')
  source('strhlp.R', chdir = TRUE)
  source('biodb-common.R', chdir = TRUE)

  #############
  # CONSTANTS #
  #############

  # Field tags
  MSDB.TAG.MZ <- 'mz'
  MSDB.TAG.MZEXP <- 'mzexp'
  MSDB.TAG.MZTHEO <- 'mztheo'
  MSDB.TAG.RT <- 'rt'
  MSDB.TAG.MODE <- 'msmode'
  MSDB.TAG.MOLID <- 'compoundid'
  MSDB.TAG.COL <- 'chromcol'
  MSDB.TAG.COLRT <- 'chromcolrt'
  MSDB.TAG.ATTR <- 'peakattr'
  MSDB.TAG.INT <- 'intensity'
  MSDB.TAG.REL <- 'relative.intensity'
  MSDB.TAG.COMP <- 'peakcomp'
  MSDB.TAG.MOLNAMES <- 'fullnames'
  # Composition and mass tags previously held each other's value.
  MSDB.TAG.MOLCOMP <- 'compoundcomp'
  MSDB.TAG.MOLMASS <- 'compoundmass'
  MSDB.TAG.INCHI <- 'inchi'
  MSDB.TAG.INCHIKEY <- 'inchikey'
  MSDB.TAG.PUBCHEM <- 'pubchemcompid'
  MSDB.TAG.CHEBI <- 'chebiid'
  MSDB.TAG.HMDB <- 'hmdbid'
  MSDB.TAG.KEGG <- 'keggid'

  # Mode tags. They must agree with the keys of MSDB.DFT.MODES below
  # ('pos' / 'neg'); they were previously swapped.
  MSDB.TAG.POS <- 'pos'
  MSDB.TAG.NEG <- 'neg'

  # Fields containing multiple values
  MSDB.MULTIVAL.FIELDS <- c(MSDB.TAG.MOLNAMES)
  MSDB.MULTIVAL.FIELD.SEP <- ';'

  # Authorized mz tolerance unit values
  MSDB.MZTOLUNIT.PPM <- 'ppm'
  MSDB.MZTOLUNIT.PLAIN <- 'plain' # same as mz: mass-to-charge ratio
  MSDB.MZTOLUNIT.VALS <- c(MSDB.MZTOLUNIT.PPM, MSDB.MZTOLUNIT.PLAIN)

  # Authorized rt units
  MSDB.RTUNIT.SEC <- 'sec'
  MSDB.RTUNIT.MIN <- 'min'
  MSDB.RTUNIT.VALS <- c(MSDB.RTUNIT.SEC ,MSDB.RTUNIT.MIN)

  # Default values
  MSDB.DFT.PREC <- list()
  MSDB.DFT.PREC[[MSDB.TAG.POS]] <- c("[(M+H)]+", "[M+H]+", "[(M+Na)]+", "[M+Na]+", "[(M+K)]+", "[M+K]+")
  MSDB.DFT.PREC[[MSDB.TAG.NEG]] <- c("[(M-H)]-", "[M-H]-", "[(M+Cl)]-", "[M+Cl]-")
  MSDB.DFT.OUTPUT.MULTIVAL.FIELD.SEP <- MSDB.MULTIVAL.FIELD.SEP
  MSDB.DFT.MATCH.FIELDS <- list( molids = 'molid', molnames = 'molnames')
  MSDB.DFT.MATCH.SEP <- ','
  MSDB.DFT.MODES <- list( pos = 'POS', neg = 'NEG')
  MSDB.DFT.MZTOLUNIT <- MSDB.MZTOLUNIT.PPM

  ############################
  # GET DEFAULT INPUT FIELDS #
  ############################

  # Returns a named list mapping each input field tag (mz, rt) to its default
  # column name, which is the tag itself.
  msdb.get.dft.input.fields <- function () {

    dft.fields <- list()

    for(f in c(MSDB.TAG.MZ, MSDB.TAG.RT))
      dft.fields[[f]] <- f

    return(dft.fields)
  }

  #########################
  # GET DEFAULT DB FIELDS #
  #########################

  # Returns a named list mapping each database field tag to its default
  # column name, which is the tag itself.
  msdb.get.dft.db.fields <- function () {

    dft.fields <- list()

    for (f in c(MSDB.TAG.MZTHEO, MSDB.TAG.COLRT, MSDB.TAG.MOLID, MSDB.TAG.COL, MSDB.TAG.MODE, MSDB.TAG.ATTR, MSDB.TAG.COMP, MSDB.TAG.MOLNAMES, MSDB.TAG.MOLCOMP, MSDB.TAG.MOLMASS, MSDB.TAG.INCHI, MSDB.TAG.INCHIKEY, MSDB.TAG.PUBCHEM, MSDB.TAG.CHEBI, MSDB.TAG.HMDB, MSDB.TAG.KEGG))
      dft.fields[[f]] <- f

    return(dft.fields)
  }

  ##################
  # MAKE DB FIELDS #
  ##################

  # Completes a custom tag -> column name list with the default entry of
  # every tag it does not define.
  msdb.make.db.fields <- function(fields) {

    # Merge with default fields
    dft.fields <- msdb.get.dft.db.fields()
    absent <- ! names(dft.fields) %in% names(fields)
    if (any(absent))
      fields <- c(fields, dft.fields[absent])

    return(fields)
  }

  #########################
  # MAKE INPUT DATA FRAME #
  #########################

  # Builds an input data frame with an M/Z column and, when rt is not NULL, a
  # retention time column.
  #   mz      Numeric vector of M/Z values.
  #   rt      Numeric vector of retention times, or NULL.
  #   rtunit  Unit of rt (MSDB.RTUNIT.SEC or MSDB.RTUNIT.MIN); minutes are
  #           converted to seconds.
  msdb.make.input.df <- function(mz, rt = NULL, rtunit = MSDB.RTUNIT.SEC) {

    field <- msdb.get.dft.input.fields()

    x <- data.frame()

    # Set mz
    if (length(mz) > 0)
      x[seq_along(mz), field[[MSDB.TAG.MZ]]] <- mz
    else
      x[, field[[MSDB.TAG.MZ]]] <- numeric()

    # Set rt
    if ( ! is.null(rt)) {
      # Convert the values, not the unit label (multiplying the unit string
      # raised "non-numeric argument to binary operator").
      if (rtunit == MSDB.RTUNIT.MIN)
        rt <- rt * 60
      if (length(rt) > 0)
        x[seq_along(rt), field[[MSDB.TAG.RT]]] <- rt
      else
        x[, field[[MSDB.TAG.RT]]] <- numeric()
    }

    return(x)
  }

  ###############################
  # GET EMPTY RESULT DATA FRAME #
  ###############################

  # Returns a zero-row data frame with the result columns; the retention time
  # columns are added only when rt is TRUE.
  .get.empty.result.df <- function(rt = FALSE) {

    df <- data.frame(stringsAsFactors = FALSE)
    df[MSDB.TAG.MOLID] <- character()
    df[MSDB.TAG.MOLNAMES] <- character()
    df[MSDB.TAG.MZ] <- numeric()
    df[MSDB.TAG.MZTHEO] <- numeric()
    df[MSDB.TAG.ATTR] <- character()
    df[MSDB.TAG.COMP] <- character()
    if (rt) {
      df[MSDB.TAG.RT] <- numeric()
      df[MSDB.TAG.COL] <- character()
      df[MSDB.TAG.COLRT] <- numeric()
    }

    return(df)
  }

  ############################
  # PARSE COLUMN DESCRIPTION #
  ############################

  # Parses a free-text chromatographic column description into a list with
  # elements type, stationary_phase, time (integer) and msdevice. Stops with
  # an error when the description does not match the expected pattern.
  .parse_chrom_col_desc <- function(desc) {

    # Clean string
    s <- desc
    s <- gsub('\\.+', ' ', s, perl = TRUE) # Replace '.' characters by spaces
    s <- gsub('[*-]', ' ', s, perl = TRUE) # Replace dashes and asterisks by spaces
    s <- gsub('[)(]', '', s, perl = TRUE) # Remove paranthesis
    s <- trim(s)
    s <- tolower(s) # put in lowercase

    # str_match() returns the whole match in column 1, so capture group k is
    # in column k + 1: type = col 2, stationary phase = col 4, time = col 6
    # (before the MS device) or col 10 (after it), MS device = col 8.
    pattern <- "^(uplc|hsf5|hplc|zicphilic)( (c8|c18|150 5 2 1))?( (\\d+)mn)?( (orbitrap|exactive|qtof|shimadzu exactive))?( (\\d+)mn)?( (bis|ter))?( 1)?$"
    g <- str_match(s, pattern)
    if (is.na(g[1, 1]))
      stop(paste0("Impossible to parse column description \"", desc, "\"."))

    type <- g[1, 2]
    stationary_phase <- if ( ! is.na(g[1, 4]) && nchar(g[1, 4]) > 0) g[1, 4] else NA_character_
    msdevice <- if ( ! is.na(g[1, 8]) && nchar(g[1, 8]) > 0) g[1, 8] else NA_character_
    time <- if ( ! is.na(g[1,6]) && nchar(g[1, 6]) > 0) as.integer(g[1, 6]) else ( if ( ! is.na(g[1, 10]) && nchar(g[1, 10]) > 0) as.integer(g[1, 10]) else NA_integer_ )

    # Correct values
    if ( ! is.na(stationary_phase) && stationary_phase == '150 5 2 1') stationary_phase <- '150*5*2.1'
    if ( ! is.na(msdevice)) msdevice <- gsub(' ', '', msdevice) # remove spaces

    return(list( type = type, stationary_phase = stationary_phase, time = time, msdevice = msdevice))

  }

  #########################
  # NORMALIZE COLUMN NAME #
  #########################

  # Builds a normalized name "type[-phase][-<time>min][-msdevice]" from a
  # column description, skipping the parts that are NA.
  .normalize_column_name <- function(desc) {

    lst <- .parse_chrom_col_desc(desc)

    v <- c(lst$type)
    if ( ! is.na(lst$stationary_phase))
      v <- c(v, lst$stationary_phase)
    if ( ! is.na(lst$time))
      v <- c(v, paste0(lst$time, "min"))
    if ( ! is.na(lst$msdevice))
      v <- c(v, lst$msdevice)

    return(paste(v, collapse = '-'))
  }

} # end of load safe guard
# Recovered from diff hunk: --- a/nethlp.R (Wed Apr 19 10:00:05 2017 -0400) +++ /dev/null, @@ -1,24 +0,0 @@
if ( ! exists('extract.address')) {

  ###################
  # EXTRACT ADDRESS #
  ###################

  # Extracts the host part (name or IP address) of a "host[:port]" string.
  # An optional "scheme://" prefix is ignored and hyphens are accepted in
  # host names. Vectorized over url.
  extract.address <- function(url) {

    # Drop the scheme, otherwise "http://host" would yield "http"
    addr <- sub('^[A-Za-z][A-Za-z0-9+.-]*://', '', url, perl = TRUE)

    # Keep the leading run of host characters (letters, digits, '.', '-')
    addr <- sub('^([0-9A-Za-z.-]+).*$', '\\1', addr, perl = TRUE)

    return(addr)
  }

  ################
  # EXTRACT PORT #
  ################

  # Extracts the trailing ":<port>" of a string as an integer. Returns
  # NA_integer_, without any coercion warning, when the string does not end
  # with a port. Vectorized over url.
  extract.port <- function(url) {

    port <- rep(NA_integer_, length(url))
    has.port <- grepl(':[0-9]+$', url, perl = TRUE)
    port[has.port] <- as.integer(sub('^.*:([0-9]+)$', '\\1', url[has.port], perl = TRUE))

    return(port)
  }
}
--- a/search-mz Wed Apr 19 10:00:05 2017 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,517 +0,0 @@ -#!/usr/bin/env Rscript -# vi: ft=R fdm=marker -args <- commandArgs(trailingOnly = F) -script.path <- sub("--file=","",args[grep("--file=",args)]) -library(getopt) -source(file.path(dirname(script.path), 'msdb-common.R'), chdir = TRUE) -source(file.path(dirname(script.path), 'MsDbLogger.R'), chdir = TRUE) -source(file.path(dirname(script.path), 'MsDbInputDataFrameStream.R'), chdir = TRUE) -source(file.path(dirname(script.path), 'MsDbOutputDataFrameStream.R'), chdir = TRUE) -source(file.path(dirname(script.path), 'htmlhlp.R'), chdir = TRUE) -source(file.path(dirname(script.path), 'strhlp.R'), chdir = TRUE) -source(file.path(dirname(script.path), 'fshlp.R'), chdir = TRUE) -source(file.path(dirname(script.path), 'biodb-common.R'), chdir = TRUE) -source(file.path(dirname(script.path), 'nethlp.R'), chdir = TRUE) - -# Missing paste0() function in R 2.14.1 -if (as.integer(R.Version()$major) == 2 && as.numeric(R.Version()$minor) < 15) - paste0 <- function(...) 
paste(..., sep = '') - -# Constants {{{1 -################################################################ - -PROG <- sub('^.*/([^/]+)$', '\\1', commandArgs()[4], perl = TRUE) -USERAGENT <- 'search-mz ; pierrick.roger@gmail.com' - -# Authorized database types -MSDB.XLS <- 'xls' -MSDB.4TABSQL <- '4tabsql' -MSDB.FILE <- 'file' -MSDB.PEAKFOREST <- 'peakforest' -MSDB.VALS <- c(MSDB.XLS, MSDB.4TABSQL, MSDB.FILE, MSDB.PEAKFOREST) -DB.SRC.FILE <- list () -DB.SRC.FILE[[MSDB.FILE]] <- 'MsFileDb.R' -DB.SRC.FILE[[MSDB.PEAKFOREST]] <- 'MsPeakForestDb.R' -DB.SRC.FILE[[MSDB.XLS]] <- 'MsXlsDb.R' -DB.SRC.FILE[[MSDB.4TABSQL]] <- 'Ms4TabSqlDb.R' - -# Authorized mode values -POS_MODE <- 'pos' -NEG_MODE <- 'neg' -MSDB.MODE.VALS <- c(POS_MODE, NEG_MODE) - -# Default -MSDB.DFT <- list() -MSDB.DFT[['mzshift']] <- 0 # in ppm -MSDB.DFT[['mzprec']] <- 5 # in ppm -MSDB.DFT[['mztolunit']] <- MSDB.DFT.MZTOLUNIT -MSDB.DFT[['precursor-rt-tol']] <- 5 -MSDB.DFT[['molids-sep']] <- MSDB.DFT.MATCH.SEP -MSDB.DFT[['db-fields']] <- concat.kv.list(msdb.get.dft.db.fields()) -MSDB.DFT[['db-ms-modes']] <- concat.kv.list(MSDB.DFT.MODES) -MSDB.DFT[['pos-prec']] <- paste(MSDB.DFT.PREC[[MSDB.TAG.POS]], collapse = ',') -MSDB.DFT[['neg-prec']] <- paste(MSDB.DFT.PREC[[MSDB.TAG.NEG]], collapse = ',') -MSDB.DFT[['db-rt-unit']] <- MSDB.RTUNIT.SEC -MSDB.DFT[['rtunit']] <- MSDB.RTUNIT.SEC -DEFAULT.ARG.VALUES <- MSDB.DFT -DEFAULT.ARG.VALUES[['input-col-names']] <- concat.kv.list(msdb.get.dft.input.fields()) - -# Print help {{{1 -################################################################ - -print.help <- function() { - - cat("USAGE:\n") - prog.mz.match <- paste(PROG, ' -d (', paste(MSDB.VALS, collapse = '|'), ') --url (file|dir|database URL) -i <file> -m (', paste(MSDB.MODE.VALS, collapse = '|'), ") -p <mz precision> -s <mz shift> -u (", paste(MSDB.MZTOLUNIT.VALS, collapse = '|'), ") -o <file>", sep = '') - cat("\t(1) ", prog.mz.match, " ...\n", sep = '') - cat("\n") - cat("\t(2) ", prog.mz.match, "(--all-cols|-c 
<cols>) -x <X RT tolerance> -y <Y RT tolerance>", " ...\n", sep = '') - cat("\n") - cat("\t(3) ", PROG, ' -d (', paste(MSDB.VALS, collapse = '|'), ") --url (file|dir|database URL) --list-cols\n", sep = '') - - cat("\nDETAILS:\n") - cat("Form (1) is for running an MZ match on a database.\n") - cat("Form (2) is for running an MZ/RT match on a database.\n") - cat("Form (3) is for getting a list of available chromatographic columns in a database.\n") - - cat("\nOPTIONS:\n") - spec <- matrix(make.getopt.spec(), byrow = TRUE, ncol = 5) - max.length.opt.cols <- max(nchar(spec[,1])) + 1 - sections <- list(database = "Database setting", input = "Input file", output = "Output files", mz = "M/Z matching", rt = "RT matching", precursor = "Precursor matching", misc = "Miscellaneous") - for (section in names(sections)) { - cat("\n\t", sections[[section]], ":\n", sep = '') - spec <- matrix(make.getopt.spec(section), byrow = TRUE, ncol = 5) - for (i in seq(nrow(spec))) { - opt <- '' - if ( ! is.na(spec[i,2])) - opt <- paste('-', spec[i,2], '|', sep = '') - opt <- paste(opt, '--', spec[i, 1], sep = '') - nb.space.padding <- max.length.opt.cols - nchar(opt) + 6 - padding <- paste(rep(' ', nb.space.padding), sep = '') - cat("\t\t", opt, padding, "\t", spec[i, 5], "\n", sep = '') - } - } - - cat("\nEXAMPLES:\n") - - cat("\nSimple M/Z matching with a file database:\n") - cat("\t./", PROG, " -d file --url mydbfile.tsv -i input.tsv -m pos -o output.tsv\n", sep = '') - - cat("\nFile database with M/Z tolerance:\n") - cat("\t./", PROG, " -d file --url mydbfile.tsv -i input.tsv -m pos -o output.tsv -p 0.5 -s 0\n", sep = '') - - cat("\nFile database with M/Z tolerance unit:\n") - cat("\t./", PROG, " -d file --url mydbfile.tsv -i input.tsv -m pos -o output.tsv -p 1 -s 0.5 -u plain\n", sep = '') - - cat("\nPeakforest database:\n") - cat("\t./", PROG, " -d peakforest --url https://rest.peakforest.org/ --db-token <your Peakforest token> -i input.tsv -m pos -o output.tsv\n", sep = '') -} - -# Set 
default argument values {{{1 -################################################################ - -set.dft.arg.val <-function(opt) { - - for (f in names(MSDB.DFT)) - if (is.null(opt[[f]])) - opt[[f]] <- MSDB.DFT[[f]] - - # Set default values - if ( opt$database == MSDB.XLS && ! is.null(opt$url) && is.null(opt[['cache-dir']])) - opt[['cache-dir']] <- file.path(opt$url, 'cache') - - if ( ! is.null(opt$rtcol) && opt$rtcol == '') - opt$rtcol <- NULL - - return(opt) -} - -# Parse argument values {{{1 -################################################################ - -parse.arg.val <- function(opt) { - - # Parse input column names - if ( ! is.null(opt[['db-fields']])) { - cust <- split.kv.list(opt[['db-fields']]) - cust <- cust[cust != 'NA'] - opt[['db-fields']] <- split.kv.list(MSDB.DFT[['db-fields']]) - cust <- cust[names(cust) %in% names(opt[['db-fields']])] - opt[['db-fields']][names(cust)] <- cust - } - - # Parse MS modes - if ( ! is.null(opt[['db-ms-modes']])) { - cust <- split.kv.list(opt[['db-ms-modes']]) - opt[['db-ms-modes']] <- split.kv.list(MSDB.DFT[['db-ms-modes']]) - opt[['db-ms-modes']][names(cust)] <- cust - } - - # Parse retention time columns - if ( ! is.null(opt$rtcol)) - opt$rtcol <- strsplit(opt$rtcol, ',')[[1]] - - # Parse input column names - if (is.null(opt[['input-col-names']])) { - opt[['input-col-names']] <- msdb.get.dft.input.fields() - } - else { - custcols <- split.kv.list(opt[['input-col-names']]) - custcols <- custcols[custcols != 'NA'] - dftcols <- msdb.get.dft.input.fields() - opt[['input-col-names']] <- c(custcols, dftcols[ ! names(dftcols) %in% names(custcols)]) - } - - # Parse lists of precursors - if ( ! is.null(opt[['pos-prec']])) - opt[['pos-prec']] <- split.str(opt[['pos-prec']], unlist = TRUE) - if ( ! 
is.null(opt[['neg-prec']])) - opt[['neg-prec']] <- split.str(opt[['neg-prec']], unlist = TRUE) - - return(opt) -} - -# Make getopt specifications {{{1 -################################################################ - -make.getopt.spec <- function(sections = NULL) { - - spec <- character(0) - - if (is.null(sections) || 'input' %in% sections) - spec <- c(spec, - 'input-file', 'i', 1, 'character', 'Set input file.', - 'input-col-names', 'j', 1, 'character', paste0('Set the input column names. Default is "', DEFAULT.ARG.VALUES[['input-col-names']], '".') - ) - - if (is.null(sections) || 'mz' %in% sections) - spec <- c(spec, - 'mode', 'm', 1, 'character', paste0('MS mode. Possible values are:', paste(MSDB.MODE.VALS, collapse = ", "), '.'), - 'mzshift', 's', 1, 'numeric', paste0('Shift on m/z. Default is ', MSDB.DFT$mzshift,'.'), - 'mzprec', 'p', 1, 'numeric', paste0('Tolerance on m/z. Default is ', MSDB.DFT$mzprec,'.'), - 'mztolunit', 'u', 1, 'character', paste0('Unit used for tolerance values (options -s and -p) on M/Z. Default is ', MSDB.DFT$mztolunit,'.') - ) - - if (is.null(sections) || 'rt' %in% sections) - spec <- c(spec, - 'all-cols', 'A', 0, 'logical', 'Use all available chromatographic columns to match retention times.', - 'rtcol', 'c', 1, 'character', paste0('Chromatographic column to use. Unset by default. If set, use the corresponding column to filter on retention times, if retention times are provided.'), - 'check-cols', 'k', 0, 'logical', 'Check that the chromatographic column names specified with option -c really exist.', - 'list-cols', 'l', 0, 'logical', 'List all chromatographic columns present in the database. Write list inside the file specified by -o option.', - 'rttol', 'r', 1, 'numeric', paste0('Tolerance on retention times. Unset by default.'), - 'rttolx', 'x', 1, 'numeric', paste0('Tolerance on retention times. Unset by default.'), - 'rttoly', 'y', 1, 'numeric', paste0('Tolerance on retention times. 
Unset by default.'), - 'rtunit', 'v', 1, 'character', paste0('Retention time unit for the input file. Default is ', MSDB.DFT$rtunit, '. Allowed values are:', paste(MSDB.RTUNIT.VALS, collapse = ", "), '.') - ) - - if (is.null(sections) || 'precursor' %in% sections) - spec <- c(spec, - 'precursor-match', 'Q', 0, 'logical', 'Remove peaks whose molecule precursor peak has not been matched. Unset by default.', - 'precursor-rt-tol', 'R', 1, 'numeric', paste0('Precursor retention time tolerance. Only used when precursor-match is enabled. Default is ', MSDB.DFT[['precursor-rt-tol']], '.'), - 'pos-prec', 'Y', 1, 'character', paste0('Set the list of precursors to use in positive mode. Default is "', MSDB.DFT[['pos-prec']], '".'), - 'neg-prec', 'Z', 1, 'character', paste0('Set the list of precursors to use in negative mode. Default is "', MSDB.DFT[['neg-prec']], '".') - ) - - if (is.null(sections) || 'output' %in% sections) - spec <- c(spec, - 'output-file', 'o', 1, 'character', 'Set file to use for the main output.', - 'peak-output-file', 'O', 1, 'character', 'If set and if --same-rows is set, then output all matches inside the specified file, with one mz match per line. The output columns are: mz, rt, id, col, colrt, composition, attribution. This means that if an mz value is matched several times, then it will repeated on several lines, with one match description per line.', - 'html-output-file', 'H', 1, 'character', 'Set file to use for the HTML output.', - 'no-main-table-in-html-output', 't', 0, 'logical', 'Do not display main table in HTML output.', - 'same-rows', 'a', 0, 'logical', 'If set, output exactly the same number of rows as the input. This means that in case of multiple matches for one mz, then only one line is output (i.e.: the mz value is not duplicated on several lines). In the main output file, an "ms.matching" column is output with inside, for each mz, a comma separated list of matched component/molecule IDs. 
If unset, then only the main output file is used, and one single is written to it with one line per peak match, and eventual mz line duplicated if there are multiple matches for this mz.', - 'same-cols', 'b', 0, 'logical', 'If set, output the same columns as inside the input. All input columns are copied to the output.', - 'molids-sep', 'S', 1, 'character', paste0('Set character separator used to when concatenating molecule IDs in output. Default is "', MSDB.DFT[['molids-sep']] , '".'), - 'first-val', '1', 0, 'logical', 'Keep only the first value in multi-value fields. Unset by default.', - 'excel2011comp', 'X', 0, 'logical', 'Excel 2011 compatiblity mode. Output ASCII text files instead of UTF-8 files, where greek letters are replaced with their latin names, plusminus sign is replaced with +- and apostrophe is replaced with \"prime\". All other non-ASCII characters are repladed with underscore.' - ) - - if (is.null(sections) || 'database' %in% sections) - spec <- c(spec, - 'database', 'd', 1, 'character', paste0('Set database to use: "xls" for an Excel database, "file" for a single file database, "4tabsql" for a 4Tab SQL database, and "peakforest" for a connection to PeakForest database.'), - 'url', 'W', 1, 'character', 'URL of database. For "peakforest" database it is the HTTP URL, for the "xls" database it is the path to the directory containing the Excel files, for the "file" database it is the path to the file database and for the "4tabsql" database it is the IP address of the server.', - 'cache-dir', 'C', 1, 'character', 'Path to directory where to store cache files. Only used when database flag is set to "xls".', - 'db-name', 'N', 1, 'character', 'Name of the database. Used by the "4tabsql" database.', - 'db-user', 'U', 1, 'character', 'User of the database. Used by the "4tabsql" database.', - 'db-password', 'P', 1, 'character', 'Password of the database user. 
Used by the "4tabsql" database.', - 'db-ms-modes', 'M', 1, 'character', paste0('Comma separated key/value list giving the MS modes to be used in the single file database. Default is "', MSDB.DFT[['db-ms-modes']], '".'), - 'db-rt-unit', 'V', 1, 'character', paste0('Retention time unit for the database, used in the single file database. Default is "', MSDB.DFT[['db-rt-unit']], '". Allowed values are:', paste(MSDB.RTUNIT.VALS, collapse = ", "), '.'), - 'db-token', 'T', 1, 'character', 'Database token. Used by Peakforest database.', - 'db-fields', 'F', 1, 'character', paste0('Comma separated key/value list giving the field names to be used in the single file database. Default is "', MSDB.DFT[['db-fields']], '".') - ) - - if (is.null(sections) || 'misc' %in% sections) - spec <- c(spec, - 'help', 'h', 0, 'logical', 'Print this help.', - 'debug', 'g', 0, 'logical', 'Set debug mode.' - ) - - return(spec) -} - -# Read args {{{1 -################################################################ - -read_args <- function() { - - # Get options - opt <- getopt(matrix(make.getopt.spec(), byrow = TRUE, ncol = 5)) - - # help - if ( ! is.null(opt$help)) { - print.help() - quit() - } - - opt <- set.dft.arg.val(opt) # Set default values - opt <- parse.arg.val(opt) # Parse list values - - # Check values - error <- check.args(opt) - - return(opt) -} - -# Check args {{{1 -################################################################ - -check.args <- function(opt) { - - # Check database type - if (is.null(opt$database)) - stop("You must provide a database type through --database option.") - if ( ! opt$database %in% MSDB.VALS) - stop(paste0("Invalid value \"", opt$database, "\" for --database option.")) - - # Check filedb database - if (opt$database == MSDB.FILE) { - if (is.null(opt$url)) - stop("When using single file database, you must specify the location of the database file with option --url.") - if ( ! 
file.exists(opt$url)) - stop(paste0("The file path \"", opt$url,"\" specified with --db-file option is not valid.")) - } - - # Check Excel database - if (opt$database == MSDB.XLS) { - if (is.null(opt$url)) - stop("When using Excel database, you must specify the location of the Excel files directory with option --url.") - if ( ! file.exists(opt$url)) - stop(paste0("The directory path \"", opt$url,"\" specified with --xls-dir option is not valid.")) - } - - # Check 4 tab database - if (opt$database == MSDB.4TABSQL) { - if (is.null(opt$url)) - stop("When using 4Tab SQL database, you must specify the URL of the SQL server with option --url.") - if (is.null(opt[['db-name']])) - stop("When using 4Tab SQL database, you must specify the database name through the --db-name option.") - if (is.null(opt[['db-user']])) - stop("When using 4Tab SQL database, you must specify the database user through the --db-user option.") - if (is.null(opt[['db-password']])) - stop("When using 4Tab SQL database, you must specify the database user password through the --db-password option.") - } - - # Check Peakforest database - if (opt$database == MSDB.PEAKFOREST) { - if (is.null(opt$url)) - stop("When using PeakForest database, you must specify the URL of the PeakForest server with option --url.") - } - - if (is.null(opt[['list-cols']])) { - - if (is.null(opt[['output-file']])) - stop("You must set a path for the output file.") - - if (is.null(opt[['input-file']])) - stop("You must provide an input file.") - - if (is.null(opt$mode) || ( ! opt$mode %in% MSDB.MODE.VALS)) - stop("You must specify a mode through the --mode option.") - - if (is.null(opt$mzprec)) - stop("You must set a precision in MZ with the --mzprec option.") - - if ( ( ! is.null(opt$rtcol) || ! is.null(opt[['all-cols']])) && (is.null(opt$rttolx) || is.null(opt$rttoly))) - stop("When chromatographic columns are set, you must provide values for --rttolx and -rttoly.") - - if (is.null(opt$mztolunit) || ( ! 
opt$mztolunit %in% MSDB.MZTOLUNIT.VALS)) - stop("You must specify an M/Z tolerance unit through the --mztolunit option.") - } -} - -# Load database {{{1 -################################################################ - -.load.db <- function(opt) { - - if (is.null(opt[['pos-prec']]) && is.null(opt[['neg-prec']])) { - precursors <- NULL - } else { - precursors <- list() - precursors[[MSDB.TAG.POS]] <- opt[['pos-prec']] - precursors[[MSDB.TAG.NEG]] <- opt[['neg-prec']] - } - - db <- switch(opt$database, - peakforest = MsPeakForestDb$new(url = opt$url, useragent = USERAGENT, token = opt[['db-token']]), - xls = MsXlsDb$new(db_dir = opt$url, cache_dir = opt[['cache-dir']]), - '4tabsql' = Ms4TabSqlDb$new(host = extract.address(opt$url), port = extract.port(opt$url), dbname = opt[['db-name']], user = opt[['db-user']], password = opt[['db-password']]), - file = MsFileDb$new(file = opt$url), - NULL) - db$setPrecursors(precursors) - if (db$areDbFieldsSettable()) - db$setDbFields(opt[['db-fields']]) - if (db$areDbMsModesSettable()) - db$setDbMsModes(opt[['db-ms-modes']]) - db$addObservers(MsDbLogger$new()) - - return(db) -} - -# Output HTML {{{1 -################################################################ - -output.html <- function(db, peaks, file) { - - # Replace public database IDs by URLs - if ( ! is.null(peaks)) { - # Conversion from extdb id field to extdb name - extdb2classdb = list() - extdb2classdb[MSDB.TAG.KEGG] = BIODB.KEGG - extdb2classdb[MSDB.TAG.HMDB] = BIODB.HMDB - extdb2classdb[MSDB.TAG.CHEBI] = BIODB.CHEBI - extdb2classdb[MSDB.TAG.PUBCHEM] = BIODB.PUBCHEMCOMP - - # Loop on all dbs - for (extdb in c(MSDB.TAG.KEGG, MSDB.TAG.HMDB, MSDB.TAG.CHEBI, MSDB.TAG.PUBCHEM)) { - if ( ! 
is.null(peaks) && extdb %in% colnames(peaks)) - peaks[[extdb]] <- vapply(peaks[[extdb]], function(id) if (is.na(id)) '' else paste0('<a href="', get.entry.url(class = extdb2classdb[[extdb]], accession = id, content.type = BIODB.HTML), '">', id, '</a>'), FUN.VALUE = '') - } - } - - # Write HTML - html <- HtmlWriter(file = file) - html$writeBegTag('html') - html$writeBegTag('header') - html$writeTag('meta', attr = c(charset = "UTF-8")) - html$writeTag('title', text = "LC/MS matching results") - html$writeBegTag('style') - html$write('table, th, td { border-collapse: collapse; }') - html$write('table, th { border: 1px solid black; }') - html$write('td { border-left: 1px solid black; border-right: 1px solid black; }') - html$write('th, td { padding: 5px; }') - html$write('th { background-color: LightBlue; }') - html$write('tr:nth-child(even) { background-color: LemonChiffon; }') - html$write('tr:nth-child(odd) { background-color: LightGreen; }') - html$writeEndTag('style') - html$writeEndTag('header') - html$writeBegTag('body') - - # Write results - results <- FALSE - if ( ! is.null(peaks) && nrow(peaks) > 0) { - html$writeTag('h3', text = "Matched peaks") - html$writeTable(peaks) - results <- TRUE - } - if ( ! results) - html$writeTag('p', 'None.') - - html$writeEndTag('body') - html$writeEndTag('html') -} - -# MAIN {{{1 -################################################################ - -# Read command line arguments -opt <- read_args() - -if (is.null(opt$debug)) { - options(error = function() { quit(status = 1) }, warn = 0 ) -} - -# Load database -source(file.path(dirname(script.path), DB.SRC.FILE[[opt$database]]), chdir = TRUE) -db <- .load.db(opt) - -# Print columns -if ( ! is.null(opt[['list-cols']])) { - cols <- db$getChromCol() - df.write.tsv(cols, file = if (is.null(opt[['output-file']])) stdout() else opt[['output-file']]) - q(status = 0) -} - -# Read input -if ( ! is.null(opt[['input-file']]) && ! 
file.exists(opt[['input-file']])) - stop(paste0("Input file \"", opt[['input-file']], "\" does not exist.")) -if (file.info(opt[['input-file']])$size > 0) { - - # Load file into data frame - input <- read.table(file = opt[['input-file']], header = TRUE, sep = "\t", stringsAsFactor = FALSE, check.names = FALSE, comment.char = '') - - # Convert each column that is identified by a number into a name - for (field in names(opt[['input-col-names']])) { - if ( ! opt[['input-col-names']][[field]] %in% colnames(input) && length(grep('^[0-9]+$', opt[['input-col-names']][[field]])) > 0) { - col.index <- as.integer(opt[['input-col-names']][[field]]) - if (col.index < 1 || col.index > length(colnames(input))) - stop(paste0("No column n°", col.index, " for input field ", field, ".")) - opt[['input-col-names']][[field]] <- colnames(input)[[col.index]] - } - } -} else { - input <- data.frame() - input[[opt[['input-col-names']][['mz']]]] <- double() - input[[opt[['input-col-names']][['rt']]]] <- double() -} - -# Check mz column -if ( ! opt[['input-col-names']][['mz']] %in% colnames(input)) - stop(paste0('No column named "', opt[['input-col-names']][['mz']], '" in input file.')) - -# Set columns 'all-cols' specified -if ( ! is.null(opt[['all-cols']])) - opt$rtcol <- db$getChromCol()[['id']] - -# Check chrom columns -if ( ! is.null(opt[['check-cols']]) && ! is.null(opt$rtcol)) { - dbcols <- db$getChromCol()[['id']] - unknown.cols <- opt$rtcol[ ! opt$rtcol %in% dbcols] - if (length(unknown.cols) > 0) { - stop(paste0("Unknown chromatographic column", (if (length(unknown.cols) > 1) 's' else ''), ': ', paste(unknown.cols, collapse = ', '), ".\nAllowed chromatographic column names are:\n", paste(dbcols, collapse = "\n"))) - } -} - -# Check that an RT column exists when using MZ/RT matching -if ( ! is.null(opt$rtcol) && ! 
opt[['input-col-names']][['rt']] %in% colnames(input)) - stop(paste0("You are running an MZ/RT match run on your input data, but no retention time column named '", opt[['input-col-names']][['rt']],"' can be found inside your input file.")) - -# Set output col names -output.col.names <- opt[['input-col-names']] - -# Set streams -input.stream <- MsDbInputDataFrameStream$new(df = input, input.fields = opt[['input-col-names']], rtunit = opt[['rtunit']]) -main.output <- MsDbOutputDataFrameStream$new(keep.unused = ! is.null(opt[['same-cols']]), output.fields = output.col.names, one.line = ! is.null(opt[['same-rows']]), match.sep = opt[['molids-sep']], first.val = ! is.null(opt[['first-val']]), ascii = ! is.null(opt[['excel2011comp']]), nogreek = ! is.null(opt[['excel2011comp']]), noapostrophe = ! is.null(opt[['excel2011comp']]), noplusminus = ! is.null(opt[['excel2011comp']]), rtunit = opt[['rtunit']]) -peaks.output <- MsDbOutputDataFrameStream$new(keep.unused = ! is.null(opt[['same-cols']]), output.fields = output.col.names, first.val = ! is.null(opt[['first-val']]), ascii = ! is.null(opt[['excel2011comp']]), nogreek = ! is.null(opt[['excel2011comp']]), noapostrophe = ! is.null(opt[['excel2011comp']]), noplusminus = ! is.null(opt[['excel2011comp']]), rtunit = opt[['rtunit']]) -invisible(db$setInputStream(input.stream)) -db$addOutputStreams(c(main.output, peaks.output)) - -# Set database units -db$setMzTolUnit(opt$mztolunit) -if ( ! is.null(opt[['db-rt-unit']]) && opt$database == 'file') - db$setRtUnit(opt[['db-rt-unit']]) - -# Search database -mode <- if (opt$mode == POS_MODE) MSDB.TAG.POS else MSDB.TAG.NEG -db$searchForMzRtList(mode = mode, shift = opt$mzshift, prec = opt$mzprec, rt.tol = opt$rttol, rt.tol.x = opt$rttolx, rt.tol.y = opt$rttoly, col = opt$rtcol, precursor.match = ! 
is.null(opt[['precursor-match']]), precursor.rt.tol = opt[['precursor-rt-tol']]) - -# Write output -main.output$moveColumnsToBeginning(colnames(input)) -peaks.output$moveColumnsToBeginning(colnames(input)) -# TODO Create a class MsDbOutputCsvFileStream -df.write.tsv(main.output$getDataFrame(), file = opt[['output-file']], row.names = FALSE) -if ( ! is.null(opt[['peak-output-file']])) - # TODO Create a class MsDbOutputCsvFileStream - df.write.tsv(peaks.output$getDataFrame(), file = opt[['peak-output-file']], row.names = FALSE) -if ( ! is.null(opt[['html-output-file']])) - # TODO Create a class MsDbOutputHtmlFileStream - output.html(db = db, peaks = peaks.output$getDataFrame(), file = opt[['html-output-file']])
--- a/search.R Wed Apr 19 10:00:05 2017 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,64 +0,0 @@ -if ( ! exists('binary.search')) { # Do not load again if already loaded - - # Run a binary search on a sorted array. - # val The value to search. - # tab The array of values, sorted in ascending order. - # lower If set to NA, then search for the first value found by the binary search. If set to TRUE, find the value with the lowest index in the array. If set to FALSE, find the value with the highest index in the array. - # first The index of the array from which to start (1 by default). - # last The index of the array where to stop searching (end of the array by default). - # Returns the index of the found value, or NA. - binary.search <- function(val, tab, lower = NA, first = 1L, last = length(tab)) - { - # Check array & value - if (is.null(tab)) - stop('Argument "tab" is NULL.') - if (is.null(val)) - stop('Argument "val" is NULL.') - - # Wrong arguments - if (is.na(val) || last < first || length(tab) == 0) - return(NA_integer_) - - # Find value - l <- first - h <- last - while (h >= l) { - - # Take middle point - m <- (h + l) %/% 2 - # Found value - if (tab[m] == val) { - if (is.na(lower)) - return(m) - if (lower && m > first) { - for (i in (m-1):first) - if (tab[i] != val) - return(i+1) - } - else if ( ! lower && m < last) - for (i in (m+1):last) - if (tab[i] != val) - return(i-1) - return(m) - } - - # Decrease higher bound - else if (tab[m] > val) h <- m - 1 - - # Increase lower bound - else l <- m + 1 - } - - # Value not found - if ( ! is.na(lower)) { - # Look for lower or higher bound - if (lower) - return(if (h < first) NA_integer_ else h) - else - return(if (l > last) NA_integer_ else l) - } - - return(NA_integer_) - } - -} # end of load safe guard
--- a/strhlp.R Wed Apr 19 10:00:05 2017 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,72 +0,0 @@ -if ( ! exists('trim')) { # Do not load again if already loaded - - ####################### - # WHITESPACE TRIMMING # - ####################### - - # Trim leading whitespaces - trim.leading <- function (x) sub("^\\s+", "", x) - - # Trim trailing whitespaces - trim.trailing <- function (x) sub("\\s+$", "", x) - - # Trim leading and trailing whitespaces - trim <- function (x) gsub("^\\s+|\\s+$", "", x) - - ############# - # SPLITTING # - ############# - - # s The string to split. - # sep The separator on which to split. - # trim Trim whitespaces for the resulting elements. - # unlist Unlist the result, So that for a single string (i.e.: s has length 1), it returns a vector of strings instead of a list of vectors of strings. - # RETURN A list of strings. - split.str <- function(s, sep = ',', trim = TRUE, unlist = FALSE) { - v <- strsplit(s, sep) - if (trim) v <- lapply(v, trim) - if (unlist) v <- unlist(v) - return(v) - } - - ######################## - # SPLIT KEY/VALUE LIST # - ######################## - - split.kv.list <- function(s, sep = ',', kvsep = '=') { - - # Split - kvs <- strsplit(strsplit(s, sep)[[1]], kvsep) - - # Get keys - k <- vapply(kvs, function(x) x[[1]], FUN.VALUE = '') - v <- vapply(kvs, function(x) x[[2]], FUN.VALUE = '') - - # Set names - names(v) <- k - - return(v) - } - - ######################### - # CONCAT KEY/VALUE LIST # - ######################### - - concat.kv.list <- function(x, sep = ',', kvsep = '=') { - - k <- names(x) - - s = paste(paste(names(x), x, sep = kvsep), collapse = sep) - - return(s) - } - - ################# - # REMOVE QUOTES # - ################# - - remove.quotes <- function(s) { - return(sub('^["\']?([^\'"]*)["\']?$', '\\1', s, perl = TRUE)) - } - -} # end of load safe guard
--- a/test-data/filedb.tsv Wed Apr 19 10:00:05 2017 -0400 +++ b/test-data/filedb.tsv Fri Feb 22 16:04:22 2019 -0500 @@ -19,7 +19,7 @@ A10 "POS" 84.080775 "P9Z5W410 O0" "[(M+H)-(NH3)-(HCOOH)]+" "colzz" 5.69 "J114L6M62O2" 146.10553 "Blablaine" A10 "POS" 84.080775 "P9Z5W410 O0" "[(M+H)-(NH3)-(HCOOH)]+" "colzz3" 4.54 "J114L6M62O2" 146.10553 "Blablaine" A10 "POS" 84.080775 "P9Z5W410 O0" "[(M+H)-(NH3)-(HCOOH)]+" "colpp" 0.89 "J114L6M62O2" 146.10553 "Blablaine" -A10 "POS" 148.116159 "U513P92ZW415 O2" "[(M+H)]+ (13C)" "hcoltt" 0.8 "J114L6M62O2" 146.10553 "Blablaine" +A10 "POS" 148.116159 "U513P92ZW415 O2" "[(M+H)]+ (13C)" "coltt" 0.8 "J114L6M62O2" 146.10553 "Blablaine" A10 "POS" 148.116159 "U513P92ZW415 O2" "[(M+H)]+ (13C)" "colzz" 5.69 "J114L6M62O2" 146.10553 "Blablaine" A10 "POS" 145.097154 "P92Z6W413 O2" "[(M+H)-(H2)]+" "somecol" 8.97 "J114L6M62O2" 146.10553 "Blablaine" A10 "POS" 148.116159 "U513P92ZW415 O2" "[(M+H)]+ (13C)" "colpp" 0.89 "J114L6M62O2" 146.10553 "Blablaine"
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/mz-input-small_with_nas.tsv Fri Feb 22 16:04:22 2019 -0500 @@ -0,0 +1,31 @@ +mz +80.04959021 +82.04819461 +83.01343941 +84.05585475 +87.05536392 +89.50682004 +90.97680734 +NA +94.57331384 +97.07602789 +99.5429594 +101.0708987 +102.066292 +NA +104.0034256 +104.5317528 +105.4460999 +105.7271343 +106.0231437 +106.2399954 +106.5116177 +106.7629705 +NA +107.2424051 +107.4569385 +107.6884734 +107.9272908 +108.1575604 +109.0777249 +NA
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/mzrt-input-small.tsv Fri Feb 22 16:04:22 2019 -0500 @@ -0,0 +1,32 @@ +mz rt +80.04959021 339.9725632 +82.04819461 1593.540123 +83.01343941 654.9535891 +84.05585475 4.748268943 +87.05536392 3.480291112 +89.50682004 39.62335341 +90.97680734 1598.991244 +92.98092987 46.13716368 +94.57331384 44.37587921 +97.07602789 655.2993307 +99.5429594 42.19533608 +101.0708987 733.3084926 +102.066292 52.02654598 +102.2845376 1601.345355 +104.0034256 48.82052248 +104.5317528 1602.886534 +105.4460999 1611.919675 +105.7271343 1611.835039 +106.0231437 64.49318885 +106.2399954 1612.325904 +106.5116177 1612.17329 +106.7629705 1611.850322 +106.9814579 1611.648399 +107.2424051 1611.574767 +107.4569385 1611.778713 +107.6884734 1611.621904 +107.9272908 1611.145653 +108.1575604 1611.664677 +109.0777249 3.299196943 +110.0599023 3.456417112 +147.112804 48
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/test_1_main_output.tsv Fri Feb 22 16:04:22 2019 -0500 @@ -0,0 +1,31 @@ +mz lcmsmatching.accession lcmsmatching.chrom.col.id lcmsmatching.chrom.col.name lcmsmatching.chrom.rt lcmsmatching.compound.id lcmsmatching.formula lcmsmatching.mass.csv.file.id lcmsmatching.molecular.mass lcmsmatching.ms.level lcmsmatching.ms.mode lcmsmatching.msprecmz lcmsmatching.name lcmsmatching.peak.attr lcmsmatching.peak.comp lcmsmatching.peak.mz lcmsmatching.peak.mztheo +80.04959021 U761.pos.col12.1.32|U761.pos.colpp.0.95|U761.pos.colzz2.4.24|U761.pos.colzz3.4.3|U761.pos.hcoltt.2.5 col12|colpp|colzz2|colzz3|hcoltt col12|colpp|colzz2|colzz3|hcoltt 1.32|0.95|4.24|4.3|2.5 U761 J16L6M62O U761.pos.col12.1.32|U761.pos.colpp.0.95|U761.pos.colzz2.4.24|U761.pos.colzz3.4.3|U761.pos.hcoltt.2.5 122.04801 1 pos 123.055289 Coquelicol;Paquerettol [(M+H)-(NHCO)]+ P9Z5W46 O0 80.049475 80.049475 +82.04819461 NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA +83.01343941 NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA +84.05585475 NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA +87.05536392 NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA +89.50682004 NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA +90.97680734 NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA +92.98092987 NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA +94.57331384 A10.pos.col12.0.8|A10.pos.colAA.1.58|A10.pos.somecol.8.97 col12|colAA|somecol col12|colAA|somecol 0.8|1.58|8.97 A10 J114L6M62O2 A10.pos.col12.0.8|A10.pos.colAA.1.58|A10.pos.somecol.8.97 146.10553 1 pos NA Blablaine|Blablaine';Blablaine|Blablaine [(M+2H)+(CH3CN)]++ P93Z8W419 O2 94.5733145 94.5733145 +97.07602789 NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA +99.5429594 NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA +101.0708987 NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA +102.066292 NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA +102.2845376 NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA +104.0034256 NA NA NA NA 
NA NA NA NA NA NA NA NA NA NA NA NA +104.5317528 NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA +105.4460999 NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA +105.7271343 NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA +106.0231437 NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA +106.2399954 NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA +106.5116177 NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA +106.7629705 NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA +106.9814579 NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA +107.2424051 NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA +107.4569385 NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA +107.6884734 NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA +107.9272908 NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA +108.1575604 NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA +109.0777249 NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA +110.0599023 NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/test_1_peaks_output.html Fri Feb 22 16:04:22 2019 -0500 @@ -0,0 +1,723 @@ +<html> + <header> + <meta charset="UTF-8"/> + <title>LC/MS matching results</title> + <style> + table, th, td { border-collapse: collapse; } + table, th { border: 1px solid black; } + td { border-left: 1px solid black; border-right: 1px solid black; } + th, td { padding: 5px; } + th { background-color: LightBlue; } + tr:nth-child(even) { background-color: LemonChiffon; } + tr:nth-child(odd) { background-color: LightGreen; } + </style> + </header> + <body> + <h3>Matched peaks</h3> + <table> + <tr> + <th>mz</th> + <th>lcmsmatching.accession</th> + <th>lcmsmatching.chrom.col.id</th> + <th>lcmsmatching.chrom.col.name</th> + <th>lcmsmatching.chrom.rt</th> + <th>lcmsmatching.compound.id</th> + <th>lcmsmatching.formula</th> + <th>lcmsmatching.mass.csv.file.id</th> + <th>lcmsmatching.molecular.mass</th> + <th>lcmsmatching.ms.level</th> + <th>lcmsmatching.ms.mode</th> + <th>lcmsmatching.msprecmz</th> + <th>lcmsmatching.name</th> + <th>lcmsmatching.peak.attr</th> + <th>lcmsmatching.peak.comp</th> + <th>lcmsmatching.peak.mz</th> + <th>lcmsmatching.peak.mztheo</th> + </tr> + <tr> + <td>80.04959</td> + <td>U761.pos.col12.1.32</td> + <td>col12</td> + <td>col12</td> + <td>1.32</td> + <td>U761</td> + <td>J16L6M62O</td> + <td>U761.pos.col12.1.32</td> + <td>122.048</td> + <td>1</td> + <td>pos</td> + <td>123.0553</td> + <td>Coquelicol;Paquerettol</td> + <td>4</td> + <td>P9Z5W46 O0</td> + <td>80.04948</td> + <td>80.04948</td> + </tr> + <tr> + <td>80.04959</td> + <td>U761.pos.colpp.0.95</td> + <td>colpp</td> + <td>colpp</td> + <td>0.95</td> + <td>U761</td> + <td>J16L6M62O</td> + <td>U761.pos.colpp.0.95</td> + <td>122.048</td> + <td>1</td> + <td>pos</td> + <td>123.0553</td> + <td>Coquelicol;Paquerettol</td> + <td>4</td> + <td>P9Z5W46 O0</td> + <td>80.04948</td> + <td>80.04948</td> + </tr> + <tr> + <td>80.04959</td> + <td>U761.pos.colzz2.4.24</td> + 
<td>colzz2</td> + <td>colzz2</td> + <td>4.24</td> + <td>U761</td> + <td>J16L6M62O</td> + <td>U761.pos.colzz2.4.24</td> + <td>122.048</td> + <td>1</td> + <td>pos</td> + <td>123.0553</td> + <td>Coquelicol;Paquerettol</td> + <td>4</td> + <td>P9Z5W46 O0</td> + <td>80.04948</td> + <td>80.04948</td> + </tr> + <tr> + <td>80.04959</td> + <td>U761.pos.colzz3.4.3</td> + <td>colzz3</td> + <td>colzz3</td> + <td>4.3</td> + <td>U761</td> + <td>J16L6M62O</td> + <td>U761.pos.colzz3.4.3</td> + <td>122.048</td> + <td>1</td> + <td>pos</td> + <td>123.0553</td> + <td>Coquelicol;Paquerettol</td> + <td>4</td> + <td>P9Z5W46 O0</td> + <td>80.04948</td> + <td>80.04948</td> + </tr> + <tr> + <td>80.04959</td> + <td>U761.pos.hcoltt.2.5</td> + <td>hcoltt</td> + <td>hcoltt</td> + <td>2.5</td> + <td>U761</td> + <td>J16L6M62O</td> + <td>U761.pos.hcoltt.2.5</td> + <td>122.048</td> + <td>1</td> + <td>pos</td> + <td>123.0553</td> + <td>Coquelicol;Paquerettol</td> + <td>4</td> + <td>P9Z5W46 O0</td> + <td>80.04948</td> + <td>80.04948</td> + </tr> + <tr> + <td>82.04819</td> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + </tr> + <tr> + <td>83.01344</td> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + </tr> + <tr> + <td>84.05585</td> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + </tr> + <tr> + <td>87.05536</td> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + </tr> + <tr> + <td>89.50682</td> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + </tr> + <tr> + <td>90.97681</td> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + 
</tr> + <tr> + <td>92.98093</td> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + </tr> + <tr> + <td>94.57331</td> + <td>A10.pos.col12.0.8</td> + <td>col12</td> + <td>col12</td> + <td>0.8</td> + <td>A10</td> + <td>J114L6M62O2</td> + <td>A10.pos.col12.0.8</td> + <td>146.1055</td> + <td>1</td> + <td>pos</td> + <td/> + <td>Blablaine</td> + <td>31</td> + <td>P93Z8W419 O2</td> + <td>94.57331</td> + <td>94.57331</td> + </tr> + <tr> + <td>94.57331</td> + <td>A10.pos.colAA.1.58</td> + <td>colAA</td> + <td>colAA</td> + <td>1.58</td> + <td>A10</td> + <td>J114L6M62O2</td> + <td>A10.pos.colAA.1.58</td> + <td>146.1055</td> + <td>1</td> + <td>pos</td> + <td/> + <td>Blablaine';Blablaine</td> + <td>31</td> + <td>P93Z8W419 O2</td> + <td>94.57331</td> + <td>94.57331</td> + </tr> + <tr> + <td>94.57331</td> + <td>A10.pos.somecol.8.97</td> + <td>somecol</td> + <td>somecol</td> + <td>8.97</td> + <td>A10</td> + <td>J114L6M62O2</td> + <td>A10.pos.somecol.8.97</td> + <td>146.1055</td> + <td>1</td> + <td>pos</td> + <td/> + <td>Blablaine</td> + <td>31</td> + <td>P93Z8W419 O2</td> + <td>94.57331</td> + <td>94.57331</td> + </tr> + <tr> + <td>97.07603</td> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + </tr> + <tr> + <td>99.54296</td> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + </tr> + <tr> + <td>101.0709</td> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + </tr> + <tr> + <td>102.0663</td> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + </tr> + <tr> + <td>102.2845</td> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + 
<td/> + </tr> + <tr> + <td>104.0034</td> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + </tr> + <tr> + <td>104.5318</td> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + </tr> + <tr> + <td>105.4461</td> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + </tr> + <tr> + <td>105.7271</td> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + </tr> + <tr> + <td>106.0231</td> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + </tr> + <tr> + <td>106.24</td> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + </tr> + <tr> + <td>106.5116</td> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + </tr> + <tr> + <td>106.763</td> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + </tr> + <tr> + <td>106.9815</td> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + </tr> + <tr> + <td>107.2424</td> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + </tr> + <tr> + <td>107.4569</td> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + </tr> + <tr> + <td>107.6885</td> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + </tr> + <tr> + <td>107.9273</td> + 
<td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + </tr> + <tr> + <td>108.1576</td> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + </tr> + <tr> + <td>109.0777</td> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + </tr> + <tr> + <td>110.0599</td> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + </tr> + </table> + </body> +</html>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/test_1_peaks_output.tsv Fri Feb 22 16:04:22 2019 -0500 @@ -0,0 +1,37 @@ +mz lcmsmatching.accession lcmsmatching.chrom.col.id lcmsmatching.chrom.col.name lcmsmatching.chrom.rt lcmsmatching.compound.id lcmsmatching.formula lcmsmatching.mass.csv.file.id lcmsmatching.molecular.mass lcmsmatching.ms.level lcmsmatching.ms.mode lcmsmatching.msprecmz lcmsmatching.name lcmsmatching.peak.attr lcmsmatching.peak.comp lcmsmatching.peak.mz lcmsmatching.peak.mztheo +80.04959021 U761.pos.col12.1.32 col12 col12 1.32 U761 J16L6M62O U761.pos.col12.1.32 122.04801 1 pos 123.055289 Coquelicol;Paquerettol [(M+H)-(NHCO)]+ P9Z5W46 O0 80.049475 80.049475 +80.04959021 U761.pos.colpp.0.95 colpp colpp 0.95 U761 J16L6M62O U761.pos.colpp.0.95 122.04801 1 pos 123.055289 Coquelicol;Paquerettol [(M+H)-(NHCO)]+ P9Z5W46 O0 80.049475 80.049475 +80.04959021 U761.pos.colzz2.4.24 colzz2 colzz2 4.24 U761 J16L6M62O U761.pos.colzz2.4.24 122.04801 1 pos 123.055289 Coquelicol;Paquerettol [(M+H)-(NHCO)]+ P9Z5W46 O0 80.049475 80.049475 +80.04959021 U761.pos.colzz3.4.3 colzz3 colzz3 4.3 U761 J16L6M62O U761.pos.colzz3.4.3 122.04801 1 pos 123.055289 Coquelicol;Paquerettol [(M+H)-(NHCO)]+ P9Z5W46 O0 80.049475 80.049475 +80.04959021 U761.pos.hcoltt.2.5 hcoltt hcoltt 2.5 U761 J16L6M62O U761.pos.hcoltt.2.5 122.04801 1 pos 123.055289 Coquelicol;Paquerettol [(M+H)-(NHCO)]+ P9Z5W46 O0 80.049475 80.049475 +82.04819461 NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA +83.01343941 NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA +84.05585475 NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA +87.05536392 NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA +89.50682004 NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA +90.97680734 NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA +92.98092987 NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA +94.57331384 A10.pos.col12.0.8 col12 col12 0.8 A10 J114L6M62O2 A10.pos.col12.0.8 146.10553 1 pos NA Blablaine [(M+2H)+(CH3CN)]++ P93Z8W419 O2 
94.5733145 94.5733145 +94.57331384 A10.pos.colAA.1.58 colAA colAA 1.58 A10 J114L6M62O2 A10.pos.colAA.1.58 146.10553 1 pos NA Blablaine';Blablaine [(M+2H)+(CH3CN)]++ P93Z8W419 O2 94.5733145 94.5733145 +94.57331384 A10.pos.somecol.8.97 somecol somecol 8.97 A10 J114L6M62O2 A10.pos.somecol.8.97 146.10553 1 pos NA Blablaine [(M+2H)+(CH3CN)]++ P93Z8W419 O2 94.5733145 94.5733145 +97.07602789 NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA +99.5429594 NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA +101.0708987 NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA +102.066292 NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA +102.2845376 NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA +104.0034256 NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA +104.5317528 NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA +105.4460999 NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA +105.7271343 NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA +106.0231437 NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA +106.2399954 NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA +106.5116177 NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA +106.7629705 NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA +106.9814579 NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA +107.2424051 NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA +107.4569385 NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA +107.6884734 NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA +107.9272908 NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA +108.1575604 NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA +109.0777249 NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA +110.0599023 NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/test_2_main_output.tsv Fri Feb 22 16:04:22 2019 -0500 @@ -0,0 +1,32 @@ +mz rt lcmsmatching.accession lcmsmatching.chrom.col.id lcmsmatching.chrom.col.name lcmsmatching.chrom.rt lcmsmatching.chrom.rt.unit lcmsmatching.compound.id lcmsmatching.formula lcmsmatching.mass.csv.file.id lcmsmatching.molecular.mass lcmsmatching.ms.level lcmsmatching.ms.mode lcmsmatching.name lcmsmatching.peak.attr lcmsmatching.peak.comp lcmsmatching.peak.mz lcmsmatching.peak.mztheo +80.04959021 339.9725632 NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA +82.04819461 1593.540123 NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA +83.01343941 654.9535891 NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA +84.05585475 4.748268943 NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA +87.05536392 3.480291112 NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA +89.50682004 39.62335341 NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA +90.97680734 1598.991244 NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA +92.98092987 46.13716368 NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA +94.57331384 44.37587921 A10.pos.col12.0.8 col12 col12 0.8 min A10 J114L6M62O2 A10.pos.col12.0.8 146.10553 1 pos Blablaine [(M+2H)+(CH3CN)]++ P93Z8W419 O2 94.5733145 94.5733145 +97.07602789 655.2993307 NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA +99.5429594 42.19533608 NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA +101.0708987 733.3084926 NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA +102.066292 52.02654598 NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA +102.2845376 1601.345355 NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA +104.0034256 48.82052248 NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA +104.5317528 1602.886534 NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA +105.4460999 1611.919675 NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA +105.7271343 1611.835039 NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA +106.0231437 64.49318885 NA NA NA NA NA NA NA NA NA NA NA NA 
NA NA NA NA +106.2399954 1612.325904 NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA +106.5116177 1612.17329 NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA +106.7629705 1611.850322 NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA +106.9814579 1611.648399 NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA +107.2424051 1611.574767 NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA +107.4569385 1611.778713 NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA +107.6884734 1611.621904 NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA +107.9272908 1611.145653 NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA +108.1575604 1611.664677 NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA +109.0777249 3.299196943 NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA +110.0599023 3.456417112 NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA +147.112804 48 A10.pos.col12.0.8 col12 col12 0.8 min A10 J114L6M62O2 A10.pos.col12.0.8 146.10553 1 pos Blablaine [(M+H)]+ P92Z6W415 O2 147.112804 147.112804
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/test_2_peaks_output.html Fri Feb 22 16:04:22 2019 -0500 @@ -0,0 +1,660 @@ +<html> + <header> + <meta charset="UTF-8"/> + <title>LC/MS matching results</title> + <style> + table, th, td { border-collapse: collapse; } + table, th { border: 1px solid black; } + td { border-left: 1px solid black; border-right: 1px solid black; } + th, td { padding: 5px; } + th { background-color: LightBlue; } + tr:nth-child(even) { background-color: LemonChiffon; } + tr:nth-child(odd) { background-color: LightGreen; } + </style> + </header> + <body> + <h3>Matched peaks</h3> + <table> + <tr> + <th>mz</th> + <th>rt</th> + <th>lcmsmatching.accession</th> + <th>lcmsmatching.chrom.col.id</th> + <th>lcmsmatching.chrom.col.name</th> + <th>lcmsmatching.chrom.rt</th> + <th>lcmsmatching.chrom.rt.unit</th> + <th>lcmsmatching.compound.id</th> + <th>lcmsmatching.formula</th> + <th>lcmsmatching.mass.csv.file.id</th> + <th>lcmsmatching.molecular.mass</th> + <th>lcmsmatching.ms.level</th> + <th>lcmsmatching.ms.mode</th> + <th>lcmsmatching.name</th> + <th>lcmsmatching.peak.attr</th> + <th>lcmsmatching.peak.comp</th> + <th>lcmsmatching.peak.mz</th> + <th>lcmsmatching.peak.mztheo</th> + </tr> + <tr> + <td>80.04959</td> + <td>339.9726</td> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + </tr> + <tr> + <td>82.04819</td> + <td>1593.54</td> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + </tr> + <tr> + <td>83.01344</td> + <td>654.9536</td> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + </tr> + <tr> + <td>84.05585</td> + <td>4.748269</td> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + </tr> + <tr> + <td>87.05536</td> + 
<td>3.480291</td> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + </tr> + <tr> + <td>89.50682</td> + <td>39.62335</td> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + </tr> + <tr> + <td>90.97681</td> + <td>1598.991</td> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + </tr> + <tr> + <td>92.98093</td> + <td>46.13716</td> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + </tr> + <tr> + <td>94.57331</td> + <td>44.37588</td> + <td>A10.pos.col12.0.8</td> + <td>col12</td> + <td>col12</td> + <td>0.8</td> + <td>min</td> + <td>A10</td> + <td>J114L6M62O2</td> + <td>A10.pos.col12.0.8</td> + <td>146.1055</td> + <td>1</td> + <td>pos</td> + <td>Blablaine</td> + <td>19</td> + <td>P93Z8W419 O2</td> + <td>94.57331</td> + <td>94.57331</td> + </tr> + <tr> + <td>97.07603</td> + <td>655.2993</td> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + </tr> + <tr> + <td>99.54296</td> + <td>42.19534</td> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + </tr> + <tr> + <td>101.0709</td> + <td>733.3085</td> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + </tr> + <tr> + <td>102.0663</td> + <td>52.02655</td> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + </tr> + <tr> + <td>102.2845</td> + <td>1601.345</td> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + </tr> + <tr> + 
<td>104.0034</td> + <td>48.82052</td> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + </tr> + <tr> + <td>104.5318</td> + <td>1602.887</td> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + </tr> + <tr> + <td>105.4461</td> + <td>1611.92</td> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + </tr> + <tr> + <td>105.7271</td> + <td>1611.835</td> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + </tr> + <tr> + <td>106.0231</td> + <td>64.49319</td> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + </tr> + <tr> + <td>106.24</td> + <td>1612.326</td> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + </tr> + <tr> + <td>106.5116</td> + <td>1612.173</td> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + </tr> + <tr> + <td>106.763</td> + <td>1611.85</td> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + </tr> + <tr> + <td>106.9815</td> + <td>1611.648</td> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + </tr> + <tr> + <td>107.2424</td> + <td>1611.575</td> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + </tr> + <tr> + <td>107.4569</td> + <td>1611.779</td> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + </tr> 
+ <tr> + <td>107.6885</td> + <td>1611.622</td> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + </tr> + <tr> + <td>107.9273</td> + <td>1611.146</td> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + </tr> + <tr> + <td>108.1576</td> + <td>1611.665</td> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + </tr> + <tr> + <td>109.0777</td> + <td>3.299197</td> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + </tr> + <tr> + <td>110.0599</td> + <td>3.456417</td> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + </tr> + <tr> + <td>147.1128</td> + <td>48</td> + <td>A10.pos.col12.0.8</td> + <td>col12</td> + <td>col12</td> + <td>0.8</td> + <td>min</td> + <td>A10</td> + <td>J114L6M62O2</td> + <td>A10.pos.col12.0.8</td> + <td>146.1055</td> + <td>1</td> + <td>pos</td> + <td>Blablaine</td> + <td>34</td> + <td>P92Z6W415 O2</td> + <td>147.1128</td> + <td>147.1128</td> + </tr> + </table> + </body> +</html>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/test_2_peaks_output.tsv Fri Feb 22 16:04:22 2019 -0500 @@ -0,0 +1,32 @@ +mz rt lcmsmatching.accession lcmsmatching.chrom.col.id lcmsmatching.chrom.col.name lcmsmatching.chrom.rt lcmsmatching.chrom.rt.unit lcmsmatching.compound.id lcmsmatching.formula lcmsmatching.mass.csv.file.id lcmsmatching.molecular.mass lcmsmatching.ms.level lcmsmatching.ms.mode lcmsmatching.name lcmsmatching.peak.attr lcmsmatching.peak.comp lcmsmatching.peak.mz lcmsmatching.peak.mztheo +80.04959021 339.9725632 NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA +82.04819461 1593.540123 NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA +83.01343941 654.9535891 NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA +84.05585475 4.748268943 NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA +87.05536392 3.480291112 NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA +89.50682004 39.62335341 NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA +90.97680734 1598.991244 NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA +92.98092987 46.13716368 NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA +94.57331384 44.37587921 A10.pos.col12.0.8 col12 col12 0.8 min A10 J114L6M62O2 A10.pos.col12.0.8 146.10553 1 pos Blablaine [(M+2H)+(CH3CN)]++ P93Z8W419 O2 94.5733145 94.5733145 +97.07602789 655.2993307 NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA +99.5429594 42.19533608 NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA +101.0708987 733.3084926 NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA +102.066292 52.02654598 NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA +102.2845376 1601.345355 NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA +104.0034256 48.82052248 NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA +104.5317528 1602.886534 NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA +105.4460999 1611.919675 NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA +105.7271343 1611.835039 NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA +106.0231437 64.49318885 NA NA NA NA NA NA NA NA NA NA NA NA 
NA NA NA NA +106.2399954 1612.325904 NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA +106.5116177 1612.17329 NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA +106.7629705 1611.850322 NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA +106.9814579 1611.648399 NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA +107.2424051 1611.574767 NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA +107.4569385 1611.778713 NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA +107.6884734 1611.621904 NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA +107.9272908 1611.145653 NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA +108.1575604 1611.664677 NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA +109.0777249 3.299196943 NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA +110.0599023 3.456417112 NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA +147.112804 48 A10.pos.col12.0.8 col12 col12 0.8 min A10 J114L6M62O2 A10.pos.col12.0.8 146.10553 1 pos Blablaine [(M+H)]+ P92Z6W415 O2 147.112804 147.112804
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/test_3_main_output.tsv Fri Feb 22 16:04:22 2019 -0500 @@ -0,0 +1,32 @@ +mz rt lcmsmatching.accession lcmsmatching.chrom.col.id lcmsmatching.chrom.col.name lcmsmatching.chrom.rt lcmsmatching.chrom.rt.unit lcmsmatching.compound.id lcmsmatching.formula lcmsmatching.mass.csv.file.id lcmsmatching.molecular.mass lcmsmatching.ms.level lcmsmatching.ms.mode lcmsmatching.msprecmz lcmsmatching.name lcmsmatching.peak.attr lcmsmatching.peak.comp lcmsmatching.peak.mz lcmsmatching.peak.mztheo +80.04959021 339.9725632 NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA +82.04819461 1593.540123 NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA +83.01343941 654.9535891 NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA +84.05585475 4.748268943 NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA +87.05536392 3.480291112 NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA +89.50682004 39.62335341 NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA +90.97680734 1598.991244 NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA +92.98092987 46.13716368 NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA +94.57331384 44.37587921 A10.pos.col12.0.8 col12 col12 0.8 min A10 J114L6M62O2 A10.pos.col12.0.8 146.10553 1 pos 147.112804 Blablaine [(M+2H)+(CH3CN)]++ P93Z8W419 O2 94.5733145 94.5733145 +97.07602789 655.2993307 NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA +99.5429594 42.19533608 NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA +101.0708987 733.3084926 NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA +102.066292 52.02654598 NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA +102.2845376 1601.345355 NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA +104.0034256 48.82052248 NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA +104.5317528 1602.886534 NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA +105.4460999 1611.919675 NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA +105.7271343 1611.835039 NA NA NA NA NA NA NA NA NA 
NA NA NA NA NA NA NA NA +106.0231437 64.49318885 NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA +106.2399954 1612.325904 NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA +106.5116177 1612.17329 NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA +106.7629705 1611.850322 NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA +106.9814579 1611.648399 NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA +107.2424051 1611.574767 NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA +107.4569385 1611.778713 NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA +107.6884734 1611.621904 NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA +107.9272908 1611.145653 NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA +108.1575604 1611.664677 NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA +109.0777249 3.299196943 NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA +110.0599023 3.456417112 NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA +147.112804 48 A10.pos.col12.0.8 col12 col12 0.8 min A10 J114L6M62O2 A10.pos.col12.0.8 146.10553 1 pos 147.112804 Blablaine [(M+H)]+ P92Z6W415 O2 147.112804 147.112804
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/test_3_peaks_output.html Fri Feb 22 16:04:22 2019 -0500 @@ -0,0 +1,692 @@ +<html> + <header> + <meta charset="UTF-8"/> + <title>LC/MS matching results</title> + <style> + table, th, td { border-collapse: collapse; } + table, th { border: 1px solid black; } + td { border-left: 1px solid black; border-right: 1px solid black; } + th, td { padding: 5px; } + th { background-color: LightBlue; } + tr:nth-child(even) { background-color: LemonChiffon; } + tr:nth-child(odd) { background-color: LightGreen; } + </style> + </header> + <body> + <h3>Matched peaks</h3> + <table> + <tr> + <th>mz</th> + <th>rt</th> + <th>lcmsmatching.accession</th> + <th>lcmsmatching.chrom.col.id</th> + <th>lcmsmatching.chrom.col.name</th> + <th>lcmsmatching.chrom.rt</th> + <th>lcmsmatching.chrom.rt.unit</th> + <th>lcmsmatching.compound.id</th> + <th>lcmsmatching.formula</th> + <th>lcmsmatching.mass.csv.file.id</th> + <th>lcmsmatching.molecular.mass</th> + <th>lcmsmatching.ms.level</th> + <th>lcmsmatching.ms.mode</th> + <th>lcmsmatching.msprecmz</th> + <th>lcmsmatching.name</th> + <th>lcmsmatching.peak.attr</th> + <th>lcmsmatching.peak.comp</th> + <th>lcmsmatching.peak.mz</th> + <th>lcmsmatching.peak.mztheo</th> + </tr> + <tr> + <td>80.04959</td> + <td>339.9726</td> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + </tr> + <tr> + <td>82.04819</td> + <td>1593.54</td> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + </tr> + <tr> + <td>83.01344</td> + <td>654.9536</td> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + </tr> + <tr> + <td>84.05585</td> + <td>4.748269</td> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + 
<td/> + <td/> + <td/> + </tr> + <tr> + <td>87.05536</td> + <td>3.480291</td> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + </tr> + <tr> + <td>89.50682</td> + <td>39.62335</td> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + </tr> + <tr> + <td>90.97681</td> + <td>1598.991</td> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + </tr> + <tr> + <td>92.98093</td> + <td>46.13716</td> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + </tr> + <tr> + <td>94.57331</td> + <td>44.37588</td> + <td>A10.pos.col12.0.8</td> + <td>col12</td> + <td>col12</td> + <td>0.8</td> + <td>min</td> + <td>A10</td> + <td>J114L6M62O2</td> + <td>A10.pos.col12.0.8</td> + <td>146.1055</td> + <td>1</td> + <td>pos</td> + <td>147.1128</td> + <td>Blablaine</td> + <td>19</td> + <td>P93Z8W419 O2</td> + <td>94.57331</td> + <td>94.57331</td> + </tr> + <tr> + <td>97.07603</td> + <td>655.2993</td> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + </tr> + <tr> + <td>99.54296</td> + <td>42.19534</td> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + </tr> + <tr> + <td>101.0709</td> + <td>733.3085</td> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + </tr> + <tr> + <td>102.0663</td> + <td>52.02655</td> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + </tr> + <tr> + <td>102.2845</td> + <td>1601.345</td> + <td/> + 
<td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + </tr> + <tr> + <td>104.0034</td> + <td>48.82052</td> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + </tr> + <tr> + <td>104.5318</td> + <td>1602.887</td> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + </tr> + <tr> + <td>105.4461</td> + <td>1611.92</td> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + </tr> + <tr> + <td>105.7271</td> + <td>1611.835</td> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + </tr> + <tr> + <td>106.0231</td> + <td>64.49319</td> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + </tr> + <tr> + <td>106.24</td> + <td>1612.326</td> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + </tr> + <tr> + <td>106.5116</td> + <td>1612.173</td> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + </tr> + <tr> + <td>106.763</td> + <td>1611.85</td> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + </tr> + <tr> + <td>106.9815</td> + <td>1611.648</td> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + </tr> + <tr> + <td>107.2424</td> + <td>1611.575</td> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> 
+ <td/> + <td/> + <td/> + <td/> + </tr> + <tr> + <td>107.4569</td> + <td>1611.779</td> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + </tr> + <tr> + <td>107.6885</td> + <td>1611.622</td> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + </tr> + <tr> + <td>107.9273</td> + <td>1611.146</td> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + </tr> + <tr> + <td>108.1576</td> + <td>1611.665</td> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + </tr> + <tr> + <td>109.0777</td> + <td>3.299197</td> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + </tr> + <tr> + <td>110.0599</td> + <td>3.456417</td> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + <td/> + </tr> + <tr> + <td>147.1128</td> + <td>48</td> + <td>A10.pos.col12.0.8</td> + <td>col12</td> + <td>col12</td> + <td>0.8</td> + <td>min</td> + <td>A10</td> + <td>J114L6M62O2</td> + <td>A10.pos.col12.0.8</td> + <td>146.1055</td> + <td>1</td> + <td>pos</td> + <td>147.1128</td> + <td>Blablaine</td> + <td>34</td> + <td>P92Z6W415 O2</td> + <td>147.1128</td> + <td>147.1128</td> + </tr> + </table> + </body> +</html>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/test_3_peaks_output.tsv Fri Feb 22 16:04:22 2019 -0500 @@ -0,0 +1,32 @@ +mz rt lcmsmatching.accession lcmsmatching.chrom.col.id lcmsmatching.chrom.col.name lcmsmatching.chrom.rt lcmsmatching.chrom.rt.unit lcmsmatching.compound.id lcmsmatching.formula lcmsmatching.mass.csv.file.id lcmsmatching.molecular.mass lcmsmatching.ms.level lcmsmatching.ms.mode lcmsmatching.msprecmz lcmsmatching.name lcmsmatching.peak.attr lcmsmatching.peak.comp lcmsmatching.peak.mz lcmsmatching.peak.mztheo +80.04959021 339.9725632 NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA +82.04819461 1593.540123 NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA +83.01343941 654.9535891 NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA +84.05585475 4.748268943 NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA +87.05536392 3.480291112 NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA +89.50682004 39.62335341 NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA +90.97680734 1598.991244 NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA +92.98092987 46.13716368 NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA +94.57331384 44.37587921 A10.pos.col12.0.8 col12 col12 0.8 min A10 J114L6M62O2 A10.pos.col12.0.8 146.10553 1 pos 147.112804 Blablaine [(M+2H)+(CH3CN)]++ P93Z8W419 O2 94.5733145 94.5733145 +97.07602789 655.2993307 NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA +99.5429594 42.19533608 NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA +101.0708987 733.3084926 NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA +102.066292 52.02654598 NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA +102.2845376 1601.345355 NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA +104.0034256 48.82052248 NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA +104.5317528 1602.886534 NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA +105.4460999 1611.919675 NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA +105.7271343 1611.835039 NA NA NA NA NA NA NA NA NA 
NA NA NA NA NA NA NA NA +106.0231437 64.49318885 NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA +106.2399954 1612.325904 NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA +106.5116177 1612.17329 NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA +106.7629705 1611.850322 NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA +106.9814579 1611.648399 NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA +107.2424051 1611.574767 NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA +107.4569385 1611.778713 NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA +107.6884734 1611.621904 NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA +107.9272908 1611.145653 NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA +108.1575604 1611.664677 NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA +109.0777249 3.299196943 NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA +110.0599023 3.456417112 NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA +147.112804 48 A10.pos.col12.0.8 col12 col12 0.8 min A10 J114L6M62O2 A10.pos.col12.0.8 146.10553 1 pos 147.112804 Blablaine [(M+H)]+ P92Z6W415 O2 147.112804 147.112804