Mercurial > repos > prog > lcmsmatching
changeset 5:fb9c0409d85c draft
planemo upload for repository https://github.com/workflow4metabolomics/lcmsmatching.git commit 608d9e59a0d2dcf85a037968ddb2c61137fb9bce
author | prog |
---|---|
date | Wed, 19 Apr 2017 10:00:05 -0400 |
parents | b34c14151f25 |
children | f86fec07f392 |
files | BiodbConn.R BiodbEntry.R BiodbLogger.R BiodbObserver.R ChebiConn.R ChebiEntry.R ChemspiderConn.R ChemspiderEntry.R EnzymeConn.R EnzymeEntry.R HmdbConn.R HmdbEntry.R KeggConn.R KeggEntry.R LipidmapsConn.R LipidmapsEntry.R MassFiledbConn.R MirbaseConn.R MirbaseEntry.R MsBioDb.R MsDb.R MsDbInputDataFrameStream.R MsDbOutputDataFrameStream.R MsDbOutputStream.R MsFileDb.R MsPeakForestDb.R MsXlsDb.R NcbiccdsConn.R NcbiccdsEntry.R NcbigeneConn.R NcbigeneEntry.R PubchemConn.R PubchemEntry.R README.md RemotedbConn.R UniprotConn.R UniprotEntry.R UrlRequestScheduler.R biodb-package.R build.xml chem.R dfhlp.R hshhlp.R lcmsmatching.xml list-chrom-cols.py list-file-cols.py list-ms-mode-values.py massdb-helper.R msdb-common.R mysql.R search-mz spec-dist.R test-data/filedb-small-mz-match-html-output.html test-data/filedb-small-mz-match-output.tsv test-data/filedb-small-mz-match-peaks-output.tsv test-data/filedb.tsv todf.R tolst.R tostr.R |
diffstat | 59 files changed, 1185 insertions(+), 3827 deletions(-) [+] |
line wrap: on
line diff
--- a/BiodbConn.R Tue Mar 14 12:40:22 2017 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,80 +0,0 @@ -##################### -# CLASS DECLARATION # -##################### - -BiodbConn <- methods::setRefClass("BiodbConn", contains = "BiodbObject", fields = list( .debug = "logical" )) - -############### -# CONSTRUCTOR # -############### - -BiodbConn$methods( initialize = function(debug = FALSE, ...) { - .debug <<- debug -}) - -####################### -# PRINT DEBUG MESSAGE # -####################### - -BiodbConn$methods( .print.debug.msg = function(msg) { - if (.self$.debug) - .print.msg(msg = msg, class = class(.self)) -}) - -########################## -# GET ENTRY CONTENT TYPE # -########################## - -BiodbConn$methods( getEntryContentType = function() { - .self$.abstract.method() -}) - -############# -# GET ENTRY # -############# - -BiodbConn$methods( getEntry = function(id, drop = TRUE) { - content <- .self$getEntryContent(id) - return(.self$createEntry(content, drop = drop)) -}) - -##################### -# GET ENTRY CONTENT # -##################### - -# Download entry content from the public database. -# type The entry type. -# id The ID of the entry to get. -# RETURN An entry content downloaded from database. -BiodbConn$methods( getEntryContent = function(id) { - .self$.abstract.method() -}) - -############################# -# CREATE ENTRY FROM CONTENT # -############################# - -# Creates a Compound instance from file content. -# content A file content, downloaded from the public database. -# RETURN A compound instance. -BiodbConn$methods( createEntry = function(content, drop = TRUE) { - .self$.abstract.method() -}) - -################# -# GET ENTRY IDS # -################# - -# Get a list of IDs of all entries contained in this database. -BiodbConn$methods( getEntryIds = function(max.results = NA_integer_) { - .self$.abstract.method() -}) - -################## -# GET NB ENTRIES # -################## - -# Get the number of entries contained in this database. -BiodbConn$methods( getNbEntries = function() { - .self$.abstract.method() -})
--- a/BiodbEntry.R Tue Mar 14 12:40:22 2017 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,182 +0,0 @@ -############# -# CONSTANTS # -############# - -BIODB.BASIC.CLASSES <- c('character', 'integer', 'double', 'logical') - -######################## -# ENTRY ABSTRACT CLASS # -######################## - -BiodbEntry <- methods::setRefClass("BiodbEntry", fields = list(.fields ='list', .factory = "ANY")) - -############### -# CONSTRUCTOR # -############### - -BiodbEntry$methods( initialize = function(...) { - - .fields <<- list() - .factory <<- NULL - - callSuper(...) # calls super-class initializer with remaining parameters -}) - -################### -# SET FIELD VALUE # -################### - -BiodbEntry$methods( setFieldValue = function(field, value) { - - class = .self$getFieldClass(field) - - # Secific case to handle objects. - if ( class ==" object" & !(isS4(value) & methods::is(value, "refClass"))) - stop(paste0('Cannot set a non RC instance to field "', field, '" in BiodEntry.')) - - # Check cardinality - if (class != 'data.frame' && .self$getFieldCardinality(field) == BIODB.CARD.ONE && length(value) > 1) - stop(paste0('Cannot set more that one value to single value field "', field, '" in BiodEntry.')) - - # Check value class - value <- switch(class, - 'character' = as.character(value), - 'double' = as.double(value), - 'integer' = as.integer(value), - 'logical' = as.logical(value), - value) - # TODO check value class - - .self$.fields[[field]] <- value -}) - -################### -# GET FIELD NAMES # -################### - -BiodbEntry$methods( getFieldNames = function(field) { - return(names(.self$.fields)) -}) - -############# -# HAS FIELD # -############# - -BiodbEntry$methods( hasField = function(field) { - return(field %in% names(.self$.fields)) -}) - -################### -# GET FIELD CLASS # -################### - -BiodbEntry$methods( getFieldClass = function(field) { - - if ( ! field %in% BIODB.FIELDS[['name']]) - stop(paste0('Unknown field "', field, '" in BiodEntry.')) - - field.class <- BIODB.FIELDS[which(field == BIODB.FIELDS[['name']]), 'class'] - - return(field.class) -}) - -######################### -# FIELD HAS BASIC CLASS # -######################### - -BiodbEntry$methods( fieldHasBasicClass = function(field) { - return(.self$getFieldClass(field) %in% BIODB.BASIC.CLASSES) -}) - -######################### -# GET FIELD CARDINALITY # -######################### - -BiodbEntry$methods( getFieldCardinality = function(field) { - - if ( ! field %in% BIODB.FIELDS[['name']]) - stop(paste0('Unknown field "', field, '" in BiodEntry.')) - - field.card <- BIODB.FIELDS[which(field == BIODB.FIELDS[['name']]), 'cardinality'] - - return(field.card) -}) - -################### -# GET FIELD VALUE # -################### - -BiodbEntry$methods( getFieldValue = function(field, compute = TRUE) { - - if ( ! field %in% BIODB.FIELDS[['name']]) - stop(paste0('Unknown field "', field, '" in BiodEntry.')) - - if (field %in% names(.self$.fields)) - return(.self$.fields[[field]]) - else if (compute && .self$.compute.field(field)) - return(.self$.fields[[field]]) - - # Return NULL or NA - class = .self$getFieldClass(field) - return(if (class %in% BIODB.BASIC.CLASSES) as.vector(NA, mode = class) else NULL) -}) - -################# -# COMPUTE FIELD # -################## - -BiodbEntry$methods( .compute.field = function(field) { - - if ( ! is.null(.self$.factory) && field %in% names(BIODB.FIELD.COMPUTING)) { - for (db in BIODB.FIELD.COMPUTING[[field]]) { - db.id <- .self$getField(paste0(db, 'id')) - if ( ! is.na(db.id)) { - db.entry <- .self$.factory$createEntry(db, id = db.id) - if ( ! is.null(db.entry)) { - .self$setField(field, db.entry$getField(field)) - return(TRUE) - } - } - } - } - - return(FALSE) -}) - -############################ -# GET FIELDS AS DATA FRAME # -############################ -###TODO add a limiting option to get some fields. -BiodbEntry$methods( getFieldsAsDataFrame = function() { - df <- data.frame() - # Loop on all fields - for (f in names(.self$.fields)) - - # If field class is a basic type - if (.self$getFieldClass(f) %in% c('character', 'logical', 'integer', 'double') & - length(.self$getFieldValue(f)) == 1) - df[1, f] <- .self$getFieldValue(f) - - return(df) -}) - -########### -# FACTORY # -########### - -BiodbEntry$methods( setFactory = function(factory) { - is.null(factory) || inherits(factory, "BiodbFactory") || stop("The factory instance must inherit from BiodbFactory class.") - .factory <<- factory -}) - -############## -# DEPRECATED # -############## - -BiodbEntry$methods( getField = function(field) { - return(.self$getFieldValue(field)) -}) - -BiodbEntry$methods( setField = function(field, value) { - .self$setFieldValue(field, value) -})
--- a/BiodbLogger.R Tue Mar 14 12:40:22 2017 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,47 +0,0 @@ -# vi: fdm=marker - -########################## -# CLASS DECLARATION {{{1 # -########################## - -BiodbLogger <- methods::setRefClass("BiodbLogger", contains = 'BiodbObserver', fields = list(.verbose.level = 'integer', .debug.level = 'integer', .file = 'ANY', .fail.on.error = 'logical', .signal.warnings = 'logical')) - -#################### -# CONSTRUCTOR {{{1 # -#################### - -BiodbLogger$methods( initialize = function(verbose.level = 1, debug.level = 1, file = NULL, ...) { - - .verbose.level <<- if ( ! is.null(verbose.level) && ! is.na(verbose.level)) verbose.level else 1 - .debug.level <<- if ( ! is.null(debug.level) && ! is.na(debug.level)) debug.level else 1 - .file <<- if ( ! is.null(file) && ! is.na(file)) file else stderr() - .fail.on.error <<- TRUE - .signal.warnings <<- TRUE - - callSuper(...) -}) - -################ -# MESSAGE {{{1 # -################ - -BiodbLogger$methods( message = function(type = MSG.INFO, msg, level = 1) { - type %in% biodb::MSG.TYPES || .self$message(biodb::MSG.ERROR, paste0("Unknown message type ", type, ".")) - - display = TRUE - if (type == biodb::MSG.INFO && .self$.verbose.level < level) - display = FALSE - if (type == biodb::MSG.DEBUG && .self$.debug.level < level) - display = FALSE - - if (display) - cat(type, ': ', msg, "\n", sep = '', file = .self$.file) - - # Raise error - if (type == biodb::MSG.ERROR && .self$.fail.on.error) - stop(msg) - - # Raise warning - if (type == biodb::MSG.WARNING && .self$.signal.warnings) - warning(msg) -})
--- a/BiodbObserver.R Tue Mar 14 12:40:22 2017 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,25 +0,0 @@ -# vi: fdm=marker - -########################## -# CLASS DECLARATION {{{1 # -########################## - -BiodbObserver <- methods::setRefClass("BiodbObserver", fields = list()) - -################### -# CONSTANTS {{{ 1 # -################### - -MSG.INFO <- 'INFO' -MSG.DEBUG <- 'DEBUG' -MSG.WARNING <- 'WARNING' -MSG.ERROR <- 'ERROR' - -.MSG.TYPES <- c(MSG.ERROR, MSG.WARNING, MSG.DEBUG, MSG.INFO) - -################ -# MESSAGE {{{1 # -################ - -BiodbObserver$methods( message = function(type = MSG.INFO, msg, level = 1) { -})
--- a/ChebiConn.R Tue Mar 14 12:40:22 2017 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,59 +0,0 @@ -##################### -# CLASS DECLARATION # -##################### - -ChebiConn <- methods::setRefClass("ChebiConn", contains = "RemotedbConn") - -########################## -# GET ENTRY CONTENT TYPE # -########################## - -ChebiConn$methods( getEntryContentType = function() { - return(BIODB.HTML) -}) - -##################### -# GET ENTRY CONTENT # -##################### - -ChebiConn$methods( getEntryContent = function(id) { - - # Initialize return values - content <- rep(NA_character_, length(id)) - - # Request - content <- vapply(id, function(x) .self$.get.url(get.entry.url(BIODB.CHEBI, x)), FUN.VALUE = '') - - return(content) -}) - -################ -# CREATE ENTRY # -################ - -ChebiConn$methods( createEntry = function(content, drop = TRUE) { - return(createChebiEntryFromHtml(content, drop = drop)) -}) - -################## -# GET NB ENTRIES # -################## - -ChebiConn$methods( getNbEntries = function() { - return(NA_integer_) -}) - -################# -# GET ENTRY IDS # -################# - -ChebiConn$methods( getEntryIds = function(max.results = NA_integer_) { - request <- "<?xml version=\"1.0\" encoding=\"UTF-8\" standalone=\"no\"?><SOAP-ENV:Envelope xmlns:SOAP-ENV=\"http://schemas.xmlsoap.org/soap/envelope/\" xmlns:tns=\"http://www.ebi.ac.uk/webservices/chebi\" xmlns:xsd=\"http://www.w3.org/2001/XMLSchema\" xmlns:soap=\"http://schemas.xmlsoap.org/wsdl/soap/\" xmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance\" ><SOAP-ENV:Body><tns:getLiteEntity xmlns:tns=\"http://www.ebi.ac.uk/webservices/chebi\"><tns:search>1*</tns:search><tns:searchCategory>CHEBI ID</tns:searchCategory><tns:maximumResults>100</tns:maximumResults><tns:stars></tns:stars></tns:getLiteEntity></SOAP-ENV:Body></SOAP-ENV:Envelope>" - print('********************************************************************************') - print('********************************************************************************') - results <- .self$.scheduler$sendSoapRequest('http://www.ebi.ac.uk:80/webservices/chebi/2.0/webservice', request) - print(results) - print('********************************************************************************') - print('********************************************************************************') - return(NULL) -})
--- a/ChebiEntry.R Tue Mar 14 12:40:22 2017 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,57 +0,0 @@ -##################### -# CLASS DECLARATION # -##################### - -ChebiEntry <- methods::setRefClass("ChebiEntry", contains = "BiodbEntry") - -########### -# FACTORY # -########### - -createChebiEntryFromHtml <- function(contents, drop = TRUE) { - - entries <- list() - - # Define xpath expressions - xpath.expr <- character() -# xpath.expr[[BIODB.ACCESSION]] <- "//b[starts-with(., 'CHEBI:')]" - xpath.expr[[BIODB.INCHI]] <- "//td[starts-with(., 'InChI=')]" - xpath.expr[[BIODB.INCHIKEY]] <- "//td[text()='InChIKey']/../td[2]" - - for (content in contents) { - - # Create instance - entry <- ChebiEntry$new() - - if ( ! is.null(content) && ! is.na(content)) { - - # Parse HTML - xml <- XML::htmlTreeParse(content, asText = TRUE, useInternalNodes = TRUE) - - # Test generic xpath expressions - for (field in names(xpath.expr)) { - v <- XML::xpathSApply(xml, xpath.expr[[field]], XML::xmlValue) - if (length(v) > 0) - entry$setField(field, v) - } - - # Get accession - accession <- XML::xpathSApply(xml, "//b[starts-with(., 'CHEBI:')]", XML::xmlValue) - if (length(accession) > 0) { - accession <- sub('^CHEBI:([0-9]+)$', '\\1', accession, perl = TRUE) - entry$setField(BIODB.ACCESSION, accession) - } - } - - entries <- c(entries, entry) - } - - # Replace elements with no accession id by NULL - entries <- lapply(entries, function(x) if (is.na(x$getField(BIODB.ACCESSION))) NULL else x) - - # If the input was a single element, then output a single object - if (drop && length(contents) == 1) - entries <- entries[[1]] - - return(entries) -}
--- a/ChemspiderConn.R Tue Mar 14 12:40:22 2017 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,92 +0,0 @@ -##################### -# CLASS DECLARATION # -##################### - -ChemspiderConn <- methods::setRefClass("ChemspiderConn", contains = "RemotedbConn") - -########################## -# GET ENTRY CONTENT TYPE # -########################## - -ChemspiderConn$methods( getEntryContentType = function() { - return(BIODB.XML) -}) - -##################### -# GET ENTRY CONTENT # -##################### - -ChemspiderConn$methods( getEntryContent = function(ids) { - - # Debug - .self$.print.debug.msg(paste0("Get entry content(s) for ", length(ids)," id(s)...")) - - URL.MAX.LENGTH <- 2083 - - # Initialize return values - content <- rep(NA_character_, length(ids)) - - # Loop on all - n <- 0 - inc <- NA_integer_ - while (n < length(ids)) { - - # Get list of accession ids to retrieve - accessions <- ids[(n + 1):(if (is.na(inc)) length(ids) else (min(n + inc, length(ids))))] - - # Create URL request - x <- get.entry.url(class = BIODB.CHEMSPIDER, accession = accessions, content.type = BIODB.XML, max.length = URL.MAX.LENGTH, base.url = .self$.url, token = .self$.token) - - # Debug - .self$.print.debug.msg(paste0("Send URL request for ", x$n," id(s)...")) - - # Send request - xmlstr <- .self$.get.url(x$url) - - # Error : "Cannot convert WRONG to System.Int32.\r\nParameter name: type ---> Input string was not in a correct format.\r\n" - if (grepl('^Cannot convert .* to System\\.Int32\\.', xmlstr)) { - # One of the ids is incorrect - if (is.na(inc)) { - inc <- 1 - next - } - else - xmlstr <- NA_character_ - } - - # Increase number of entries retrieved - n <- n + x$n - - # Parse XML and get included XML - if ( ! is.na(xmlstr)) { - xml <- xmlInternalTreeParse(xmlstr, asText = TRUE) - ns <- c(csns = "http://www.chemspider.com/") - returned.ids <- xpathSApply(xml, "//csns:ExtendedCompoundInfo/csns:CSID", xmlValue, namespaces = ns) - content[match(returned.ids, ids)] <- vapply(getNodeSet(xml, "//csns:ExtendedCompoundInfo", namespaces = ns), saveXML, FUN.VALUE = '') - } - - # Debug - .self$.print.debug.msg(paste0("Now ", length(ids) - n," id(s) left to be retrieved...")) - } - - return(content) -}) - -################ -# CREATE ENTRY # -################ - -ChemspiderConn$methods( createEntry = function(content, drop = TRUE) { - return(createChemspiderEntryFromXml(content, drop = drop)) -}) - -############################ -# GET CHEMSPIDER IMAGE URL # -############################ - -get.chemspider.image.url <- function(id) { - - url <- paste0('http://www.chemspider.com/ImagesHandler.ashx?w=300&h=300&id=', id) - - return(url) -}
--- a/ChemspiderEntry.R Tue Mar 14 12:40:22 2017 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,103 +0,0 @@ -##################### -# CLASS DECLARATION # -##################### - -ChemspiderEntry <- methods::setRefClass("ChemspiderEntry", contains = "BiodbEntry") - -############################ -# CREATE COMPOUND FROM XML # -############################ - -createChemspiderEntryFromXml <- function(contents, drop = TRUE) { - - entries <- list() - - # Define xpath expressions - xpath.expr <- character() - xpath.expr[[BIODB.ACCESSION]] <- "//CSID" - xpath.expr[[BIODB.FORMULA]] <- "//MF" - xpath.expr[[BIODB.NAME]] <- "//CommonName" - xpath.expr[[BIODB.AVERAGE.MASS]] <- "//AverageMass" - xpath.expr[[BIODB.INCHI]] <- "//InChI" - xpath.expr[[BIODB.INCHIKEY]] <- "//InChIKey" - xpath.expr[[BIODB.SMILES]] <- "//SMILES" - - for (content in contents) { - - # Create instance - entry <- ChemspiderEntry$new() - - if ( ! is.null(content) && ! is.na(content) && content != 'NA') { - - # Parse XML - xml <- XML::xmlInternalTreeParse(content, asText = TRUE) - - # Test generic xpath expressions - for (field in names(xpath.expr)) { - v <- XML::xpathSApply(xml, xpath.expr[[field]], XML::xmlValue) - if (length(v) > 0) - entry$setField(field, v) - } - } - - entries <- c(entries, entry) - } - - # Replace elements with no accession id by NULL - entries <- lapply(entries, function(x) if (is.na(x$getField(BIODB.ACCESSION))) NULL else x) - - # If the input was a single element, then output a single object - if (drop && length(contents) == 1) - entries <- entries[[1]] - - return(entries) -} - -############################# -# CREATE COMPOUND FROM HTML # -############################# - -createChemspiderEntryFromHtml <- function(contents, drop = TRUE) { - - entries <- list() - - # Define xpath expressions - xpath.expr <- character() - - for (content in contents) { - - # Create instance - entry <- ChemspiderEntry$new() - - if ( ! is.null(content) && ! is.na(content)) { - - # Parse HTML - xml <- XML::htmlTreeParse(content, asText = TRUE, useInternalNodes = TRUE) - - # Test generic xpath expressions - for (field in names(xpath.expr)) { - v <- XML::xpathSApply(xml, xpath.expr[[field]], XML::xmlValue) - if (length(v) > 0) - entry$setField(field, v) - } - - # Get accession - accession <- XML::xpathSApply(xml, "//li[starts-with(., 'ChemSpider ID')]", XML::xmlValue) - if (length(accession) > 0) { - accession <- sub('^ChemSpider ID([0-9]+)$', '\\1', accession, perl = TRUE) - entry$setField(BIODB.ACCESSION, accession) - } - } - - entries <- c(entries, entry) - } - - # Replace elements with no accession id by NULL - entries <- lapply(entries, function(x) if (is.na(x$getField(BIODB.ACCESSION))) NULL else x) - - # If the input was a single element, then output a single object - if (drop && length(contents) == 1) - entries <- entries[[1]] - - return(entries) -}
--- a/EnzymeConn.R Tue Mar 14 12:40:22 2017 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,36 +0,0 @@ -##################### -# CLASS DECLARATION # -##################### - -EnzymeConn <- methods::setRefClass("EnzymeConn", contains = "RemotedbConn") - -########################## -# GET ENTRY CONTENT TYPE # -########################## - -EnzymeConn$methods( getEntryContentType = function() { - return(BIODB.TXT) -}) - -##################### -# GET ENTRY CONTENT # -##################### - -EnzymeConn$methods( getEntryContent = function(id) { - - # Initialize return values - content <- rep(NA_character_, length(id)) - - # Request - content <- vapply(id, function(x) .self$.get.url(get.entry.url(BIODB.ENZYME, accession = x, content.type = BIODB.TXT)), FUN.VALUE = '') - - return(content) -}) - -################ -# CREATE ENTRY # -################ - -EnzymeConn$methods( createEntry = function(content, drop = TRUE) { - return(createEnzymeEntryFromTxt(content, drop = drop)) -})
--- a/EnzymeEntry.R Tue Mar 14 12:40:22 2017 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,53 +0,0 @@ -##################### -# CLASS DECLARATION # -##################### - -EnzymeEntry <- methods::setRefClass("EnzymeEntry", contains = 'BiodbEntry') - -########### -# FACTORY # -########### - -createEnzymeEntryFromTxt <- function(contents, drop = TRUE) { - - entries <- list() - - # Define fields regex - regex <- character() - regex[[BIODB.ACCESSION]] <- "^ID\\s+([0-9.]+)$" - regex[[BIODB.DESCRIPTION]] <- "^DE\\s+(.+)$" - - for (text in contents) { - - # Create instance - entry <- EnzymeEntry$new() - - lines <- strsplit(text, "\n") - for (s in lines[[1]]) { - - # Test generic regex - parsed <- FALSE - for (field in names(regex)) { - g <- stringr::str_match(s, regex[[field]]) - if ( ! is.na(g[1,1])) { - entry$setField(field, g[1,2]) - parsed <- TRUE - break - } - } - if (parsed) - next - } - - entries <- c(entries, entry) - } - - # Replace elements with no accession id by NULL - entries <- lapply(entries, function(x) if (is.na(x$getField(BIODB.ACCESSION))) NULL else x) - - # If the input was a single element, then output a single object - if (drop && length(contents) == 1) - entries <- entries[[1]] - - return(entries) -}
--- a/HmdbConn.R Tue Mar 14 12:40:22 2017 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,36 +0,0 @@ -##################### -# CLASS DECLARATION # -##################### - -HmdbConn <- methods::setRefClass("HmdbConn", contains = "RemotedbConn") - -########################## -# GET ENTRY CONTENT TYPE # -########################## - -HmdbConn$methods( getEntryContentType = function() { - return(BIODB.XML) -}) - -##################### -# GET ENTRY CONTENT # -##################### - -HmdbConn$methods( getEntryContent = function(id) { - - # Initialize return values - content <- rep(NA_character_, length(id)) - - # Request - content <- vapply(id, function(x) .self$.get.url(get.entry.url(BIODB.HMDB, x, content.type = BIODB.XML)), FUN.VALUE = '') - - return(content) -}) - -################ -# CREATE ENTRY # -################ - -HmdbConn$methods( createEntry = function(content, drop = TRUE) { - return(createHmdbEntryFromXml(content, drop = drop)) -})
--- a/HmdbEntry.R Tue Mar 14 12:40:22 2017 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,59 +0,0 @@ -##################### -# CLASS DECLARATION # -##################### - -HmdbEntry <- methods::setRefClass("HmdbEntry", contains = "BiodbEntry") - -########### -# FACTORY # -########### - -createHmdbEntryFromXml <- function(contents, drop = FALSE) { - - entries <- list() - - # Define xpath expressions - xpath.expr <- character() - xpath.expr[[BIODB.ACCESSION]] <- "/metabolite/accession" - xpath.expr[[BIODB.KEGG.ID]] <- "//kegg_id" - xpath.expr[[BIODB.NAME]] <- "/metabolite/name" - xpath.expr[[BIODB.FORMULA]] <- "/metabolite/chemical_formula" - xpath.expr[[BIODB.SUPER.CLASS]] <- "//super_class" - xpath.expr[[BIODB.AVERAGE.MASS]] <- "//average_molecular_weight" - xpath.expr[[BIODB.MONOISOTOPIC.MASS]] <- "//monisotopic_moleculate_weight" - - for (content in contents) { - - # Create instance - entry <- HmdbEntry$new() - - if ( ! is.null(content) && ! is.na(content)) { - - # Parse XML - xml <- XML::xmlInternalTreeParse(content, asText = TRUE) - - # An error occured - if (length(XML::getNodeSet(xml, "//error")) == 0) { - - # Test generic xpath expressions - for (field in names(xpath.expr)) { - v <- XML::xpathSApply(xml, xpath.expr[[field]], XML::xmlValue) - if (length(v) > 0) - entry$setField(field, v) - } - - } - } - - entries <- c(entries, entry) - } - - # Replace elements with no accession id by NULL - entries <- lapply(entries, function(x) if (is.na(x$getField(BIODB.ACCESSION))) NULL else x) - - # If the input was a single element, then output a single object - if (drop && length(contents) == 1) - entries <- entries[[1]] - - return(entries) -}
--- a/KeggConn.R Tue Mar 14 12:40:22 2017 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,36 +0,0 @@ -##################### -# CLASS DECLARATION # -##################### - -KeggConn <- methods::setRefClass("KeggConn", contains = "RemotedbConn") - -########################## -# GET ENTRY CONTENT TYPE # -########################## - -KeggConn$methods( getEntryContentType = function() { - return(BIODB.TXT) -}) - -##################### -# GET ENTRY CONTENT # -##################### - -KeggConn$methods( getEntryContent = function(id) { - - # Initialize return values - content <- rep(NA_character_, length(id)) - - # Request - content <- vapply(id, function(x) .self$.get.url(get.entry.url(BIODB.KEGG, x, content.type = BIODB.TXT)), FUN.VALUE = '') - - return(content) -}) - -################ -# CREATE ENTRY # -################ - -KeggConn$methods( createEntry = function(content, drop = TRUE) { - return(createKeggEntryFromTxt(content, drop = drop)) -})
--- a/KeggEntry.R Tue Mar 14 12:40:22 2017 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,81 +0,0 @@ -##################### -# CLASS DECLARATION # -##################### - -KeggEntry <- methods::setRefClass("KeggEntry", contains = 'BiodbEntry') - -########### -# FACTORY # -########### - -createKeggEntryFromTxt <- function(contents, drop = TRUE) { - - entries <- list() - - # Define fields regex - regex <- character() - regex[[BIODB.NAME]] <- "^NAME\\s+([^,;]+)" - regex[[BIODB.CHEBI.ID]] <- "^\\s+ChEBI:\\s+(\\S+)" - regex[[BIODB.LIPIDMAPS.ID]] <- "^\\s+LIPIDMAPS:\\s+(\\S+)" - - for (text in contents) { - - # Create instance - entry <- KeggEntry$new() - - lines <- strsplit(text, "\n") - for (s in lines[[1]]) { - - # Test generic regex - parsed <- FALSE - for (field in names(regex)) { - g <- stringr::str_match(s, regex[[field]]) - if ( ! is.na(g[1,1])) { - entry$setField(field, g[1,2]) - parsed <- TRUE - break - } - } - if (parsed) - next - - # ACCESSION - { - # ENZYME ID - g <- stringr::str_match(s, "^ENTRY\\s+EC\\s+(\\S+)") - if ( ! is.na(g[1,1])){ - entry$setField(BIODB.ACCESSION, paste('ec', g[1,2], sep = ':')) - - # ENTRY ID - }else { - g <- stringr::str_match(s, "^ENTRY\\s+(\\S+)\\s+Compound") - if ( ! is.na(g[1,1])){ - entry$setField(BIODB.ACCESSION, paste('cpd', g[1,2], sep = ':')) - - # OTHER ID - }else { - g <- stringr::str_match(s, "^ENTRY\\s+(\\S+)") - if ( ! is.na(g[1,1])) - entry$setField(BIODB.ACCESSION, g[1,2]) - } - } - - # ORGANISM - g <- stringr::str_match(s, "^ORGANISM\\s+(\\S+)") - if ( ! is.na(g[1,1])) - entry$setField(BIODB.ACCESSION, paste(g[1,2], entry$getField(BIODB.ACCESSION), sep = ':')) - } - } - - entries <- c(entries, entry) - } - - # Replace elements with no accession id by NULL - entries <- lapply(entries, function(x) if (is.na(x$getField(BIODB.ACCESSION))) NULL else x) - - # If the input was a single element, then output a single object - if (drop && length(contents) == 1) - entries <- entries[[1]] - - return(entries) -}
--- a/LipidmapsConn.R Tue Mar 14 12:40:22 2017 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,46 +0,0 @@ -##################### -# CLASS DECLARATION # -##################### - -LipidmapsConn <- methods::setRefClass("LipidmapsConn", contains = "RemotedbConn") - -############### -# CONSTRUCTOR # -############### - -LipidmapsConn$methods( initialize = function(...) { - # From http://www.lipidmaps.org/data/structure/programmaticaccess.html: - # If you write a script to automate calls to LMSD, please be kind and do not hit our server more often than once per 20 seconds. We may have to kill scripts that hit our server more frequently. - callSuper(scheduler = UrlRequestScheduler$new(t = 20), ...) -}) - -########################## -# GET ENTRY CONTENT TYPE # -########################## - -LipidmapsConn$methods( getEntryContentType = function() { - return(BIODB.CSV) -}) - -##################### -# GET ENTRY CONTENT # -##################### - -LipidmapsConn$methods( getEntryContent = function(id) { - - # Initialize return values - content <- rep(NA_character_, length(id)) - - # Request - content <- vapply(id, function(x) .self$.get.url(get.entry.url(BIODB.LIPIDMAPS, x, content.type = BIODB.CSV)), FUN.VALUE = '') - - return(content) -}) - -################ -# CREATE ENTRY # -################ - -LipidmapsConn$methods( createEntry = function(content, drop = TRUE) { - return(createLipidmapsEntryFromCsv(content, drop = drop)) -})
--- a/LipidmapsEntry.R Tue Mar 14 12:40:22 2017 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,64 +0,0 @@ -##################### -# CLASS DECLARATION # -##################### - -LipidmapsEntry <- methods::setRefClass("LipidmapsEntry", contains = 'BiodbEntry') - -########### -# FACTORY # -########### - -createLipidmapsEntryFromCsv <- function(contents, drop = TRUE) { - - entries <- list() - - # Mapping column names - col2field <- list() - col2field[[BIODB.NAME]] <- 'COMMON_NAME' - col2field[[BIODB.ACCESSION]] <- 'LM_ID' - col2field[[BIODB.KEGG.ID]] <- 'KEGG_ID' - col2field[[BIODB.HMDB.ID]] <- 'HMDBID' - col2field[[BIODB.MASS]] <- 'MASS' - col2field[[BIODB.FORMULA]] <- 'FORMULA' - - for (text in contents) { - - # Create instance - entry <- LipidmapsEntry$new() - - # Split text in lines - lines <- split.str(text, sep = "\n", unlist = TRUE) - - # An error occured - if ( ! grepl("No record found", lines[[2]])) { - - # Keys on first line - keys <- split.str(lines[[1]], unlist = TRUE) - - # Values on second line - values <- split.str(lines[[2]], unlist = TRUE) - names(values) <- keys[seq(values)] - - # Get field values - for (field in names(col2field)) - if (values[[col2field[[field]]]] != '-') - entry$setField(field, values[[col2field[[field]]]]) - - # Set names - if (values[['SYNONYMS']] != '-') { - # TODO - } - } - - entries <- c(entries, entry) - } - - # Replace elements with no accession id by NULL - entries <- lapply(entries, function(x) if (is.na(x$getField(BIODB.ACCESSION))) NULL else x) - - # If the input was a single element, then output a single object - if (drop && length(contents) == 1) - entries <- entries[[1]] - - return(entries) -}
--- a/MassFiledbConn.R Tue Mar 14 12:40:22 2017 -0400 +++ b/MassFiledbConn.R Wed Apr 19 10:00:05 2017 -0400 @@ -61,7 +61,7 @@ if (is.null(.self$.db)) { # Load database - .db <<- read.table(.self$.file, sep = .self$.file.sep, .self$.file.quote, header = TRUE, stringsAsFactors = FALSE, row.names = NULL) + .db <<- read.table(.self$.file, sep = .self$.file.sep, .self$.file.quote, header = TRUE, stringsAsFactors = FALSE, row.names = NULL, comment.char = '') # Save column names .db.orig.colnames <<- colnames(.self$.db)
--- a/MirbaseConn.R Tue Mar 14 12:40:22 2017 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,54 +0,0 @@ -##################### -# CLASS DECLARATION # -##################### - -MirbaseConn <- methods::setRefClass("MirbaseConn", contains = "RemotedbConn") - -########################## -# GET ENTRY CONTENT TYPE # -########################## - -MirbaseConn$methods( getEntryContentType = function() { - return(BIODB.HTML) -}) - -##################### -# GET ENTRY CONTENT # -##################### - -MirbaseConn$methods( getEntryContent = function(ids) { - - # Initialize return values - content <- rep(NA_character_, length(ids)) - - # Request - content <- vapply(ids, function(x) .self$.get.url(get.entry.url(BIODB.MIRBASE, x, content.type = BIODB.HTML)), FUN.VALUE = '') - - return(content) -}) - -################ -# CREATE ENTRY # -################ - -MirbaseConn$methods( createEntry = function(content, drop = TRUE) { - return(createMirbaseEntryFromHtml(content, drop = drop)) -}) - -################### -# FIND ACCESSIONS # -################### - -MirbaseConn$methods( findAccessions = function(name) { - - # Get HTML - htmlstr <- .self$.get.url('http://www.mirbase.org/cgi-bin/query.pl', params = c(terms = name, submit = 'Search')) - - # Parse HTML - xml <- htmlTreeParse(htmlstr, asText = TRUE, useInternalNodes = TRUE) - - # Get accession number - acc <- unlist(xpathSApply(xml, "//a[starts-with(.,'MIMAT')]", xmlValue)) - - return(acc) -})
--- a/MirbaseEntry.R Tue Mar 14 12:40:22 2017 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,47 +0,0 @@ -##################### -# CLASS DECLARATION # -##################### - -MirbaseEntry <- methods::setRefClass("MirbaseEntry", contains = "BiodbEntry") - -########### -# FACTORY # -########### - -createMirbaseEntryFromHtml <- function(contents, drop = TRUE) { - - entries <- list() - - # Define fields regex - xpath.expr <- character() - xpath.expr[[BIODB.ACCESSION]] <- "//td[text()='Accession number']/../td[2]" - xpath.expr[[BIODB.NAME]] <- "//td[text()='ID']/../td[2]" - xpath.expr[[BIODB.SEQUENCE]] <- "//td[text()='Sequence']/..//pre" - - for (html in contents) { - - # Create instance - entry <- MirbaseEntry$new() - - # Parse HTML - xml <- XML::htmlTreeParse(html, asText = TRUE, useInternalNodes = TRUE) - - # Test generic xpath expressions - for (field in names(xpath.expr)) { - v <- XML::xpathSApply(xml, xpath.expr[[field]], XML::xmlValue) - if (length(v) > 0) - entry$setField(field, v) - } - - entries <- c(entries, entry) - } - - # Replace elements with no accession id by NULL - entries <- lapply(entries, function(x) if (is.na(x$getField(BIODB.ACCESSION))) NULL else x) - - # If the input was a single element, then output a single object - if (drop && length(contents) == 1) - entries <- entries[[1]] - - return(entries) -}
--- a/MsBioDb.R Tue Mar 14 12:40:22 2017 -0400 +++ b/MsBioDb.R Wed Apr 19 10:00:05 2017 -0400 @@ -2,8 +2,8 @@ library(methods) source('MsDb.R') - source(file.path('BiodbObject.R'), chdir = TRUE) - source(file.path('BiodbFactory.R'), chdir = TRUE) + source('BiodbObject.R', chdir = TRUE) + source('BiodbFactory.R', chdir = TRUE) ##################### # CLASS DECLARATION #
--- a/MsDb.R Tue Mar 14 12:40:22 2017 -0400 +++ b/MsDb.R Wed Apr 19 10:00:05 2017 -0400 @@ -9,21 +9,22 @@ # CLASS DECLARATION # ##################### - MsDb <- setRefClass("MsDb", fields = list(.observers = "ANY", .prec = "list", .output.streams = "ANY", .input.stream = "ANY", .mz.tol.unit = "character")) + MsDb <- setRefClass("MsDb", fields = list(.observers = "ANY", .prec = "list", .output.streams = "ANY", .input.stream = "ANY", .mz.tol.unit = "character", .rt.unit = "character")) ############### # CONSTRUCTOR # ############### MsDb$methods( initialize = function(...) { + + callSuper(...) .observers <<- NULL .output.streams <<- NULL .input.stream <<- NULL .prec <<- MSDB.DFT.PREC .mz.tol.unit <<- MSDB.DFT.MZTOLUNIT - - callSuper(...) + .rt.unit <<- MSDB.RTUNIT.SEC }) #################### @@ -126,6 +127,10 @@ stop("Method setDbMsModes() not implemented in concrete class.") }) + ################### + # SET MZ TOL UNIT # + ################### + MsDb$methods( setMzTolUnit = function(mztolunit) { if ( ! mztolunit %in% MSDB.MZTOLUNIT.VALS) @@ -134,6 +139,26 @@ .mz.tol.unit <<- mztolunit }) + ############### + # SET RT UNIT # + ############### + + MsDb$methods( setRtUnit = function(unit) { + + if ( ! unit %in% MSDB.RTUNIT.VALS) + stop(paste0("RT unit must be one of: ", paste(MSDB.RTUNIT.VALS, collapse = ', '), ".")) + + .rt.unit <<- unit + }) + + ############### + # GET RT UNIT # + ############### + + MsDb$methods( getRtUnit = function(unit) { + return(.self$.rt.unit) + }) + #################### # HANDLE COMPOUNDS # #################### @@ -294,8 +319,6 @@ # peaks <- rbind(peaks, peaks.na) # # # Sort -# print(colnames(peaks)) -# print(x.colnames) # peaks <- peaks[order(peaks[[x.colnames$mz]], peaks[[x.colnames$rt]], peaks[[MSDB.TAG.MOLID]], peaks[[MSDB.TAG.COL]]), ] # # # Remove rownames
--- a/MsDbInputDataFrameStream.R Tue Mar 14 12:40:22 2017 -0400 +++ b/MsDbInputDataFrameStream.R Wed Apr 19 10:00:05 2017 -0400 @@ -7,18 +7,19 @@ # CLASS DECLARATION # ##################### - MsDbInputDataFrameStream <- setRefClass("MsDbInputDataFrameStream", contains = 'MsDbInputStream', fields = list( .df = "ANY", .i = "integer")) + MsDbInputDataFrameStream <- setRefClass("MsDbInputDataFrameStream", contains = 'MsDbInputStream', fields = list( .df = "ANY", .i = "integer", .rtunit = 'character')) ############### # CONSTRUCTOR # ############### - MsDbInputDataFrameStream$methods( initialize = function(df = data.frame(), input.fields = msdb.get.dft.input.fields(), ...) { + MsDbInputDataFrameStream$methods( initialize = function(df = data.frame(), input.fields = msdb.get.dft.input.fields(), rtunit = MSDB.RTUNIT.SEC, ...) { + + callSuper(input.fields = input.fields, ...) .df <<- df .i <<- 0L - - callSuper(input.fields = input.fields, ...) + .rtunit <<- rtunit }) ########## @@ -39,10 +40,15 @@ MsDbInputDataFrameStream$methods( getRt = function() { - if (.self$.i > 0 && .self$.i <= nrow(.self$.df) && ! is.null(.self$.input.fields[[MSDB.TAG.RT]])) - return(.self$.df[.self$.i, .self$.input.fields[[MSDB.TAG.RT]]]) + rt <- NULL - return(NULL) + if (.self$.i > 0 && .self$.i <= nrow(.self$.df) && ! is.null(.self$.input.fields[[MSDB.TAG.RT]])) { + rt <- .self$.df[.self$.i, .self$.input.fields[[MSDB.TAG.RT]]] + if (.self$.rtunit == MSDB.RTUNIT.MIN) + rt <- rt * 60 + } + + return(rt) }) ###########
--- a/MsDbOutputDataFrameStream.R Tue Mar 14 12:40:22 2017 -0400 +++ b/MsDbOutputDataFrameStream.R Wed Apr 19 10:00:05 2017 -0400 @@ -8,17 +8,18 @@ # CLASS DECLARATION # ##################### - MsDbOutputDataFrameStream <- setRefClass("MsDbOutputDataFrameStream", contains = 'MsDbOutputStream', fields = list( .df = "ANY")) + MsDbOutputDataFrameStream <- setRefClass("MsDbOutputDataFrameStream", contains = 'MsDbOutputStream', fields = list( .df = "ANY", .output.fields = "ANY")) ############### # CONSTRUCTOR # ############### - MsDbOutputDataFrameStream$methods( initialize = function(keep.unused = FALSE, one.line = FALSE, match.sep = MSDB.DFT.MATCH.SEP, output.fields = msdb.get.dft.output.fields(), multval.field.sep = MSDB.DFT.OUTPUT.MULTIVAL.FIELD.SEP, first.val = FALSE, ascii = FALSE, noapostrophe = FALSE, noplusminus = FALSE, nogreek = FALSE, ...) { + MsDbOutputDataFrameStream$methods( initialize = function(keep.unused = FALSE, one.line = FALSE, match.sep = MSDB.DFT.MATCH.SEP, output.fields = NULL, multval.field.sep = MSDB.DFT.OUTPUT.MULTIVAL.FIELD.SEP, first.val = FALSE, ascii = FALSE, noapostrophe = FALSE, noplusminus = FALSE, nogreek = FALSE, ...) { + + callSuper(keep.unused = keep.unused, one.line = one.line, match.sep = match.sep, multval.field.sep = multval.field.sep, first.val = first.val, ascii = ascii, noapostrophe = noapostrophe, noplusminus = noplusminus, nogreek = nogreek, ...) .df <<- data.frame() - - callSuper(keep.unused = keep.unused, one.line = one.line, match.sep = match.sep, output.fields = output.fields, multval.field.sep = multval.field.sep, first.val = first.val, ascii = ascii, noapostrophe = noapostrophe, noplusminus = noplusminus, nogreek = nogreek, ...) + .output.fields <<- output.fields }) ################## @@ -57,19 +58,28 @@ if ( ! is.null(rt)) { x.rt <- data.frame(rt = rt) colnames(x.rt) <- MSDB.TAG.RT + if (.self$.rtunit == MSDB.RTUNIT.MIN) + x.rt[[MSDB.TAG.RT]] <- x.rt[[MSDB.TAG.RT]] / 60 x <- cbind(x, x.rt) } + # Merge input values with matched peaks if ( ! is.null(peaks)) { # No rows - if (nrow(peaks) == 0) + if (nrow(peaks) == 0) { # Add NA values peaks[1, ] <- NA # Process existing rows - else { + } else { + + # Convert RT + if (.self$.rtunit == MSDB.RTUNIT.MIN) + if (MSDB.TAG.COLRT %in% colnames(peaks)) + peaks[[MSDB.TAG.COLRT]] <- peaks[[MSDB.TAG.COLRT]] / 60 + # Process multi-value fields for (c in colnames(peaks)) if (c %in% MSDB.MULTIVAL.FIELDS) {
--- a/MsDbOutputStream.R Tue Mar 14 12:40:22 2017 -0400 +++ b/MsDbOutputStream.R Wed Apr 19 10:00:05 2017 -0400 @@ -7,7 +7,7 @@ # CLASS DECLARATION # ##################### - MsDbOutputStream <- setRefClass("MsDbOutputStream", fields = list(.keep.unused = "logical", .one.line = "logical", .match.sep = "character", .output.fields = "ANY", .multval.field.sep = "character", .first.val = "logical", .ascii = "logical", .noapostrophe = "logical", .noplusminus = "logical", .nogreek = "logical")) + MsDbOutputStream <- setRefClass("MsDbOutputStream", fields = list(.keep.unused = "logical", .one.line = "logical", .match.sep = "character", .multval.field.sep = "character", .first.val = "logical", .ascii = "logical", .noapostrophe = "logical", .noplusminus = "logical", .nogreek = "logical", .rtunit = 'character')) ############### # CONSTRUCTOR # @@ -20,20 +20,20 @@ #' @return #' @examples #' stream <- MsDbOutputDataFrameStream$new(one.line = TRUE) - MsDbOutputStream$methods( initialize = function(keep.unused = FALSE, one.line = FALSE, match.sep = MSDB.DFT.MATCH.SEP, output.fields = msdb.get.dft.output.fields(), multval.field.sep = MSDB.DFT.OUTPUT.MULTIVAL.FIELD.SEP, first.val = FALSE, ascii = FALSE, noapostrophe = FALSE, noplusminus = FALSE, nogreek = FALSE, ...) { + MsDbOutputStream$methods( initialize = function(keep.unused = FALSE, one.line = FALSE, match.sep = MSDB.DFT.MATCH.SEP, multval.field.sep = MSDB.DFT.OUTPUT.MULTIVAL.FIELD.SEP, first.val = FALSE, ascii = FALSE, noapostrophe = FALSE, noplusminus = FALSE, nogreek = FALSE, rtunit = MSDB.RTUNIT.SEC, ...) { + + callSuper(...) .keep.unused <<- keep.unused .one.line <<- one.line .match.sep <<- match.sep - .output.fields <<- output.fields .multval.field.sep <<- multval.field.sep .first.val <<- first.val .ascii <<- ascii .noapostrophe <<- noapostrophe .noplusminus <<- noplusminus .nogreek <<- nogreek - - callSuper(...) + .rtunit <<- rtunit }) #################
--- a/MsFileDb.R Tue Mar 14 12:40:22 2017 -0400 +++ b/MsFileDb.R Wed Apr 19 10:00:05 2017 -0400 @@ -82,7 +82,7 @@ if (is.null(.self$.db)) { # Load database - .db <<- read.table(.self$.file, sep = "\t", quote = "\"", header = TRUE, stringsAsFactors = FALSE, row.names = NULL) + .db <<- read.table(.self$.file, sep = "\t", quote = "\"", header = TRUE, stringsAsFactors = FALSE, row.names = NULL, check.names = FALSE, comment.char = '') # Check that colnames are unique dupcol <- duplicated(colnames(.self$.db)) @@ -395,8 +395,10 @@ db <- db[db[[MSDB.TAG.COL]] %in% col,] # Filter on retention time - if ( ! is.null(rt.low) && ! is.na(rt.low) && ! is.null(rt.high) && ! is.na(rt.high)) - db <- db[db[[MSDB.TAG.COLRT]] >= rt.low & db[[MSDB.TAG.COLRT]] <= rt.high, ] + if ( ! is.null(rt.low) && ! is.na(rt.low) && ! is.null(rt.high) && ! is.na(rt.high)) { + scale <- if (.self$getRtUnit() == MSDB.RTUNIT.MIN) 60 else 1 + db <- db[db[[MSDB.TAG.COLRT]] * scale >= rt.low & db[[MSDB.TAG.COLRT]] * scale <= rt.high, ] + } # Remove retention times and column information if (is.null(col) || is.na(col) || is.null(rt.low) || is.na(rt.low) || is.null(rt.high) || is.na(rt.high)) { @@ -409,19 +411,6 @@ # Filter on mz db <- db[db[[MSDB.TAG.MZTHEO]] >= mz.low & db[[MSDB.TAG.MZTHEO]] <= mz.high, ] - # Rename database fields -# conv <- c( mz = 'mztheo', rt = 'colrt') # solving mismatch of field names between database and output -# cols <- colnames(db) -# for (db.field in names(.self$.fields)) { -# output.field <- if (db.field %in% names(conv)) conv[[db.field]] else db.field -# if (.self$.fields[[db.field]] %in% cols && output.field %in% names(.self$.output.fields)) -# cols[cols %in% .self$.fields[[db.field]]] <- .self$.output.fields[[output.field]] -# } -# colnames(db) <- cols - - # Remove unwanted columns -# db <- db[, colnames(db) %in% .self$.output.fields] - return(db) }) @@ -487,6 +476,9 @@ rt[col] <- list(colrts) } + if (.self$getRtUnit() == MSDB.RTUNIT.MIN) + rt <- 60 * rt + return(rt) })
--- a/MsPeakForestDb.R Tue Mar 14 12:40:22 2017 -0400 +++ b/MsPeakForestDb.R Wed Apr 19 10:00:05 2017 -0400 @@ -2,7 +2,7 @@ library(methods) source('MsDb.R') - source(file.path('UrlRequestScheduler.R')) + source('UrlRequestScheduler.R') ##################### # CLASS DECLARATION # @@ -16,6 +16,8 @@ MsPeakForestDb$methods( initialize = function(url = NA_character_, useragent = NA_character_, token = NA_character_, ...) { + callSuper(...) + # Check URL if (is.null(url) || is.na(url)) stop("No URL defined for new MsPeakForestDb instance.") @@ -26,8 +28,7 @@ .url.scheduler <<- UrlRequestScheduler$new(n = 3, useragent = useragent) .self$.url.scheduler$setVerbose(1L) .token <<- token - - callSuper(...) + .rt.unit <<- MSDB.RTUNIT.MIN }) ########### @@ -46,18 +47,15 @@ # Add token if ( ! is.na(.self$.token)) params <- c(params, token = .self$.token) - param.str <- if (is.null(params)) '' else paste('?', vapply(names(params), function(p) paste(p, params[[p]], sep = '='), FUN.VALUE = ''), collapse = '&', sep = '') # Get URL content <- .self$.url.scheduler$getUrl(url = url, params = params) if (ret.type == 'json') { - library(RJSONIO) + res <- jsonlite::fromJSON(content, simplifyDataFrame = FALSE) - res <- fromJSON(content, nullValue = NA) - - if (class(res) == 'list' && 'success' %in% names(res) && res$success == FALSE) { + if (is.null(res)) { param.str <- if (is.null(params)) '' else paste('?', vapply(names(params), function(p) paste(p, params[[p]], sep = '='), FUN.VALUE = ''), collapse = '&', sep = '') stop(paste0("Failed to run web service. URL was \"", url, param.str, "\".")) } @@ -66,8 +64,7 @@ if (grepl('^[0-9]+$', content, perl = TRUE)) res <- as.integer(content) else { - library(RJSONIO) - res <- fromJSON(content, nullValue = NA) + res <- jsonlite::fromJSON(content, simplifyDataFrame = FALSE) } } } @@ -141,6 +138,7 @@ for (s in spectra) if (is.na(col) || s$liquidChromatography$columnCode %in% col) { ret.time <- (s$RTmin + s$RTmax) / 2 + ret.time <- ret.time * 60 # Retention time are in minutes in Peakforest, but we want them in seconds c <- s$liquidChromatography$columnCode if (c %in% names(rt)) { if ( ! ret.time %in% rt[[c]]) @@ -262,21 +260,21 @@ results <- data.frame(MSDB.TAG.MOLID = character(), MSDB.TAG.MOLNAMES = character(), MSDB.TAG.MOLMASS = numeric(), MSDB.TAG.MZTHEO = numeric(), MSDB.TAG.COMP = character(), MSDB.TAG.ATTR = character(), MSDB.TAG.INCHI = character(), MSDB.TAG.INCHIKEY = character(), MSDB.TAG.CHEBI = character(), MSDB.TAG.HMDB = character(), MSDB.TAG.KEGG = character(), MSDB.TAG.PUBCHEM = character()) for (x in spectra) { if ('source' %in% names(x) && is.list(x$source)) - mztheo <- if ('theoricalMass' %in% names(x)) as.numeric(x$theoricalMass) else NA_real_ - comp <- if ('composition' %in% names(x)) x$composition else NA_character_ - attr <- if ('attribution' %in% names(x)) x$attribution else NA_character_ + mztheo <- if ('mz' %in% names(x) && ! is.null(x$mz)) as.numeric(x$mz) else NA_real_ + comp <- if ('composition' %in% names(x) && ! is.null(x$composition)) x$composition else NA_character_ + attr <- if ('attribution' %in% names(x) && ! is.null(x$attribution)) x$attribution else NA_character_ if ('listOfCompounds' %in% names(x$source)) { - molids <- vapply(x$source$listOfCompounds, function(c) as.character(c$id), FUN.VALUE = '') - molnames <- vapply(x$source$listOfCompounds, function(c) paste(c$names, collapse = MSDB.MULTIVAL.FIELD.SEP), FUN.VALUE = '') - mass <- vapply(x$source$listOfCompounds, function(c) as.character(c$averageMass), FUN.VALUE = '') - inchi <- vapply(x$source$listOfCompounds, function(c) as.character(c$inChI), FUN.VALUE = '') - inchikey <- vapply(x$source$listOfCompounds, function(c) as.character(c$inChIKey), FUN.VALUE = '') - chebi <- vapply(x$source$listOfCompounds, function(c) as.character(c$ChEBI), FUN.VALUE = '') + molids <- vapply(x$source$listOfCompounds, function(c) if ('id' %in% names(c) && ! is.null(c$id)) as.character(c$id) else NA_character_, FUN.VALUE = '') + molnames <- vapply(x$source$listOfCompounds, function(c) if ('names' %in% names(c) && ! is.null(c$names)) paste(c$names, collapse = MSDB.MULTIVAL.FIELD.SEP) else NA_character_, FUN.VALUE = '') + mass <- vapply(x$source$listOfCompounds, function(c) if ( ! 'averageMass' %in% names(c) || is.null(c$averageMass)) NA_real_ else as.double(c$averageMass), FUN.VALUE = 0.0) + inchi <- vapply(x$source$listOfCompounds, function(c) if ( ! 'inChI' %in% names(c) || is.null(c$inChI)) NA_character_ else as.character(c$inChI), FUN.VALUE = '') + inchikey <- vapply(x$source$listOfCompounds, function(c) if ( ! 'inChIKey' %in% names(c) || is.null(c$inChIKey)) NA_character_ else as.character(c$inChIKey), FUN.VALUE = '') + chebi <- vapply(x$source$listOfCompounds, function(c) if ('ChEBI' %in% names(c) && ! is.null(c$ChEBI)) as.character(c$ChEBI) else NA_character_, FUN.VALUE = '') chebi[chebi == 'CHEBI:null'] <- NA_character_ - hmdb <- vapply(x$source$listOfCompounds, function(c) as.character(c$HMDB), FUN.VALUE = '') + hmdb <- vapply(x$source$listOfCompounds, function(c) if ('HMDB' %in% names(c) && ! is.null(c$HMDB)) as.character(c$HMDB) else NA_character_, FUN.VALUE = '') hmdb[hmdb == 'HMDBnull'] <- NA_character_ - kegg <- vapply(x$source$listOfCompounds, function(c) as.character(c$KEGG), FUN.VALUE = '') - pubchem <- vapply(x$source$listOfCompounds, function(c) as.character(c$PubChemCID), FUN.VALUE = '') + kegg <- vapply(x$source$listOfCompounds, function(c) if ( ! 'KEGG' %in% names(c) || is.null(c$KEGG)) NA_character_ else as.character(c$KEGG), FUN.VALUE = '') + pubchem <- vapply(x$source$listOfCompounds, function(c) if ( ! 'PubChemCID' %in% names(c) || is.null(c$PubChemCID)) NA_character_ else as.character(c$PubChemCID), FUN.VALUE = '') if (length(molids) > 0 && length(molids) == length(molnames)) results <- rbind(results, data.frame(MSDB.TAG.MOLID = molids, MSDB.TAG.MOLNAMES = molnames, MSDB.TAG.MOLMASS = mass, MSDB.TAG.MZTHEO = mztheo, MSDB.TAG.COMP = comp, MSDB.TAG.ATTR = attr, MSDB.TAG.INCHI = inchi, MSDB.TAG.INCHIKEY = inchikey, MSDB.TAG.CHEBI = chebi, MSDB.TAG.HMDB = hmdb, MSDB.TAG.KEGG = kegg, MSDB.TAG.PUBCHEM = pubchem, stringsAsFactors = FALSE)) } @@ -288,8 +286,9 @@ rt.res <- data.frame(MSDB.TAG.MOLID = character(), MSDB.TAG.COL = character(), MSDB.TAG.COLRT = numeric()) if (nrow(results) > 0) { + # Build URL for rt search - url <- paste0('spectra/lcms/range-rt-min/', rt.low, '/', rt.high) + url <- paste0('spectra/lcms/range-rt-min/', rt.low / 60, '/', rt.high / 60) params <- NULL if ( ! is.null(col)) params <- c(columns = paste(col, collapse = ',')) @@ -298,11 +297,20 @@ rtspectra <- .self$.get.url(url = url, params = params) # Get compound/molecule IDs - for (x in spectra) - rt.res <- rbind(rt.res, data.frame(MSDB.TAG.MOLID = vapply(x$listOfCompounds, function(c) as.character(c$id), FUN.VALUE = ''), - MSDB.TAG.COL = as.character(x$liquidChromatography$columnCode), - MSDB.TAG.COLRT = (as.numeric(x$RTmin) + as.numeric(x$RTmax)) / 2, - stringsAsFactors = FALSE)) + for (x in rtspectra) + if (all(c('listOfCompounds', 'liquidChromatography') %in% names(x))) { + molids <- vapply(x$listOfCompounds, function(c) if ('id' %in% names(c) && ! is.null(c$id)) as.character(c$id) else NA_character_, FUN.VALUE = '') + if (length(molids) > 0) { + col <- if ('columnCode' %in% names(x$liquidChromatography) && ! is.null(x$liquidChromatography$columnCode)) as.character(x$liquidChromatography$columnCode) else NA_character_ + rtmin <- if ('RTmin' %in% names(x) && ! is.null(x$RTmin)) as.double(x$RTmin) else NA_real_ + rtmax <- if ('RTmax' %in% names(x) && ! is.null(x$RTmax)) as.double(x$RTmax) else NA_real_ + colrt <- (rtmin + rtmax) / 2 + rt.res <- rbind(rt.res, data.frame(MSDB.TAG.MOLID = molids, + MSDB.TAG.COL = col, + MSDB.TAG.COLRT = colrt * 60, + stringsAsFactors = FALSE)) + } + } } # Add retention times and column info
--- a/MsXlsDb.R Tue Mar 14 12:40:22 2017 -0400 +++ b/MsXlsDb.R Wed Apr 19 10:00:05 2017 -0400 @@ -56,7 +56,7 @@ # GET MOLECULE IDS # #################### - MsXlsDb$methods( getMoleculeIds = function() { + MsXlsDb$methods( getMoleculeIds = function(max.results = NA_integer_) { # Init file list .self$.init.file.list() @@ -64,6 +64,10 @@ # Get IDs mol.ids <- as.integer(which( ! is.na(.self$.files))) + # Cut + if ( ! is.na(max.results) && length(mol.ids) > max.results) + mol.ids <- mol.ids[max.results, ] + return(mol.ids) }) @@ -801,6 +805,7 @@ mols <- .self$getMoleculeIds() results <- data.frame(id = integer(), col = character(), colrt = double(), stringsAsFactors = FALSE) + colnames(results) <- c(MSDB.TAG.MOLID, MSDB.TAG.COL, MSDB.TAG.COLRT) # Loop on all molecules for (molid in mols) {
--- a/NcbiccdsConn.R Tue Mar 14 12:40:22 2017 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,45 +0,0 @@ -##################### -# CLASS DECLARATION # -##################### - -NcbiccdsConn <- methods::setRefClass("NcbiccdsConn", contains = "RemotedbConn") - -############### -# CONSTRUCTOR # -############### - -NcbiccdsConn$methods( initialize = function(...) { - # From NCBI E-Utility manual: "In order not to overload the E-utility servers, NCBI recommends that users post no more than three URL requests per second and limit large jobs to either weekends or between 9:00 PM and 5:00 AM Eastern time during weekdays". - callSuper(scheduler = UrlRequestScheduler$new(n = 3), ...) -}) - -########################## -# GET ENTRY CONTENT TYPE # -########################## - -NcbiccdsConn$methods( getEntryContentType = function() { - return(BIODB.HTML) -}) - -##################### -# GET ENTRY CONTENT # -##################### - -NcbiccdsConn$methods( getEntryContent = function(id) { - - # Initialize return values - content <- rep(NA_character_, length(id)) - - # Request - content <- vapply(id, function(x) .self$.get.url(get.entry.url(BIODB.NCBICCDS, x, content.type = BIODB.HTML)), FUN.VALUE = '') - - return(content) -}) - -################ -# CREATE ENTRY # -################ - -NcbiccdsConn$methods( createEntry = function(content, drop = TRUE) { - return(createNcbiccdsEntryFromHtml(content, drop = drop)) -})
--- a/NcbiccdsEntry.R Tue Mar 14 12:40:22 2017 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,39 +0,0 @@ -##################### -# CLASS DECLARATION # -##################### - -NcbiccdsEntry <- methods::setRefClass("NcbiccdsEntry", contains = "BiodbEntry") - -########### -# FACTORY # -########### - -createNcbiccdsEntryFromHtml <- function(contents, drop = TRUE) { - - entries <- list() - - for (html in contents) { - - # Create instance - entry <- NcbiccdsEntry$new() - - # Parse HTML - xml <- XML::htmlTreeParse(html, asText = TRUE, useInternalNodes = TRUE) - - if (length(XML::getNodeSet(xml, "//*[starts-with(.,'No results found for CCDS ID ')]")) == 0) { - entry$setField(BIODB.ACCESSION, XML::xpathSApply(xml, "//input[@id='DATA']", XML::xmlGetAttr, "value")) - entry$setField(BIODB.SEQUENCE, XML::xpathSApply(xml, "//b[starts-with(.,'Nucleotide Sequence')]/../tt", XML::xmlValue)) - } - - entries <- c(entries, entry) - } - - # Replace elements with no accession id by NULL - entries <- lapply(entries, function(x) if (is.na(x$getField(BIODB.ACCESSION))) NULL else x) - - # If the input was a single element, then output a single object - if (drop && length(contents) == 1) - entries <- entries[[1]] - - return(entries) -}
--- a/NcbigeneConn.R Tue Mar 14 12:40:22 2017 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,45 +0,0 @@ -##################### -# CLASS DECLARATION # -##################### - -NcbigeneConn <- methods::setRefClass("NcbigeneConn", contains = "RemotedbConn") - -############### -# CONSTRUCTOR # -############### - -NcbigeneConn$methods( initialize = function(...) { - # From NCBI E-Utility manual: "In order not to overload the E-utility servers, NCBI recommends that users post no more than three URL requests per second and limit large jobs to either weekends or between 9:00 PM and 5:00 AM Eastern time during weekdays". - callSuper(scheduler = UrlRequestScheduler$new(n = 3), ...) -}) - -########################## -# GET ENTRY CONTENT TYPE # -########################## - -NcbigeneConn$methods( getEntryContentType = function() { - return(BIODB.XML) -}) - -##################### -# GET ENTRY CONTENT # -##################### - -NcbigeneConn$methods( getEntryContent = function(id) { - - # Initialize return values - content <- rep(NA_character_, length(id)) - - # Request - content <- vapply(id, function(x) .self$.get.url(get.entry.url(BIODB.NCBIGENE, x, content.type = BIODB.XML)), FUN.VALUE = '') - - return(content) -}) - -################ -# CREATE ENTRY # -################ - -NcbigeneConn$methods( createEntry = function(content, drop = TRUE) { - return(createNcbigeneEntryFromXml(content, drop = drop)) -})
--- a/NcbigeneEntry.R Tue Mar 14 12:40:22 2017 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,110 +0,0 @@ -##################### -# CLASS DECLARATION # -##################### - -NcbigeneEntry <- methods::setRefClass("NcbigeneEntry", contains = "BiodbEntry") - -########### -# FACTORY # -########### - -createNcbigeneEntryFromXml <- function(contents, drop = TRUE) { - - entries <- list() - - # Define xpath expressions - xpath.expr <- character() - xpath.expr[[BIODB.ACCESSION]] <- "//Gene-track_geneid" - xpath.expr[[BIODB.KEGG.ID]] <- "/Dbtag_db[text()='KEGG']/..//Object-id_str" - xpath.expr[[BIODB.UNIPROT.ID]] <- "//Gene-commentary_heading[text()='UniProtKB']/..//Dbtag_db[text()='UniProtKB/Swiss-Prot']/..//Object-id_str" - xpath.expr[[BIODB.LOCATION]] <- "//Gene-ref_maploc" - xpath.expr[[BIODB.PROTEIN.DESCRIPTION]] <- "//Gene-ref_desc" - xpath.expr[[BIODB.SYMBOL]] <- "//Gene-ref_locus" - xpath.expr[[BIODB.SYNONYMS]] <- "//Gene-ref_syn_E" - - for (content in contents) { - - # Create instance - entry <- NcbigeneEntry$new() - - # Parse HTML - xml <- XML::xmlInternalTreeParse(content, asText = TRUE) - - # An error occured - if (length(XML::getNodeSet(xml, "//Error")) == 0 && length(XML::getNodeSet(xml, "//ERROR")) == 0) { - - # Test generic xpath expressions - for (field in names(xpath.expr)) { - v <- XML::xpathSApply(xml, xpath.expr[[field]], XML::xmlValue) - if (length(v) > 0) { - - # Eliminate duplicates - v <- v[ ! duplicated(v)] - - # Set field - entry$setField(field, v) - } - } - - # CCDS ID - ccdsid <- .find.ccds.id(xml) - if ( ! is.na(ccdsid)) - entry$setField(BIODB.NCBI.CCDS.ID, ccdsid) - } - - entries <- c(entries, entry) - } - - # Replace elements with no accession id by NULL - entries <- lapply(entries, function(x) if (is.na(x$getField(BIODB.ACCESSION))) NULL else x) - - # If the input was a single element, then output a single object - if (drop && length(contents) == 1) - entries <- entries[[1]] - - return(entries) - - # Get data - -} - -################ -# FIND CCDS ID # -################ - -.find.ccds.id <- function(xml) { - - # 1) Get all CCDS tags. - ccds_elements <- XML::getNodeSet(xml, "//Dbtag_db[text()='CCDS']/..//Object-id_str") - - # 2) If all CCDS are the same, go to point 4. - ccds <- NA_character_ - for (e in ccds_elements) { - current_ccds <- XML::xmlValue(e) - if (is.na(ccds)) - ccds <- current_ccds - else { - if (current_ccds != ccds) { - ccds <- NA_character_ - break - } - } - } - - # 3) There are several CCDS values, we need to find the best one (i.e.: the most current one). - if (is.na(ccds)) { - # For each CCDS, look for the parent Gene-commentary tag. Then look for the text content of the Gene-commentary_label which is situed under. Ignore CCDS that have no Gene-commentary_label associated. Choose the CCDS that has the smallest Gene-commentary_label in alphabetical order. - version <- NA_character_ - for (e in ccds_elements) { - versions <- XML::xpathSApply(e, "ancestor::Gene-commentary/Gene-commentary_label", XML::xmlValue) - if (length(versions) < 1) next - current_version <- versions[[length(versions)]] - if (is.na(version) || current_version < version) { - version <- current_version - ccds <- XML::xmlValue(e) - } - } - } - - return(ccds) -}
--- a/PubchemConn.R Tue Mar 14 12:40:22 2017 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,96 +0,0 @@ -##################### -# CLASS DECLARATION # -##################### - -PubchemConn <- methods::setRefClass("PubchemConn", contains = "RemotedbConn", fields = list( .db = "character" )) - -############### -# CONSTRUCTOR # -############### - -PubchemConn$methods( initialize = function(db = BIODB.PUBCHEMCOMP, ...) { - .db <<- db - callSuper(...) -}) - -########################## -# GET ENTRY CONTENT TYPE # -########################## - -PubchemConn$methods( getEntryContentType = function() { - return(BIODB.XML) -}) - -##################### -# GET ENTRY CONTENT # -##################### - -PubchemConn$methods( getEntryContent = function(ids) { - - # Debug - .self$.print.debug.msg(paste0("Get entry content(s) for ", length(ids)," id(s)...")) - - URL.MAX.LENGTH <- 2083 - - # Initialize return values - content <- rep(NA_character_, length(ids)) - - # Loop on all - n <- 0 - while (n < length(ids)) { - - # Get list of accession ids to retrieve - accessions <- ids[(n + 1):length(ids)] - - # Create URL request - x <- get.entry.url(class = .self$.db, accession = accessions, content.type = BIODB.XML, max.length = URL.MAX.LENGTH) - - # Debug - .self$.print.debug.msg(paste0("Send URL request for ", x$n," id(s)...")) - - # Send request - xmlstr <- .self$.get.url(x$url) - - # Increase number of entries retrieved - n <- n + x$n - - # TODO When one of the id is wrong, no content is returned. Only a single error is returned, with the first faulty ID: -# <Fault xmlns="http://pubchem.ncbi.nlm.nih.gov/pug_rest" xmlns:xs="http://www.w3.org/2001/XMLSchema-instance" xs:schemaLocation="http://pubchem.ncbi.nlm.nih.gov/pug_rest https://pubchem.ncbi.nlm.nih.gov/pug_rest/pug_rest.xsd"> -# <Code>PUGREST.NotFound</Code> -# <Message>Record not found</Message> -# <Details>No record data for CID 1246452553</Details> -# </Fault> - - # Parse XML and get included XML - if ( ! is.na(xmlstr)) { - xml <- xmlInternalTreeParse(xmlstr, asText = TRUE) - ns <- c(pcns = "http://www.ncbi.nlm.nih.gov") - returned.ids <- xpathSApply(xml, paste0("//pcns:", if (.self$.db == BIODB.PUBCHEMCOMP) 'PC-CompoundType_id_cid' else 'PC-ID_id'), xmlValue, namespaces = ns) - content[match(returned.ids, ids)] <- vapply(getNodeSet(xml, paste0("//pcns:", if (.self$.db == BIODB.PUBCHEMCOMP) "PC-Compound" else 'PC-Substance'), namespaces = ns), saveXML, FUN.VALUE = '') - } - - # Debug - .self$.print.debug.msg(paste0("Now ", length(ids) - n," id(s) left to be retrieved...")) - } - - return(content) -}) - -################ -# CREATE ENTRY # -################ - -PubchemConn$methods( createEntry = function(content, drop = TRUE) { - return(if (.self$.db == BIODB.PUBCHEMCOMP) createPubchemEntryFromXml(content, drop = drop) else createPubchemSubstanceFromXml(content, drop = drop)) -}) - -######################### -# GET PUBCHEM IMAGE URL # -######################### - -get.pubchem.image.url <- function(id, db = BIODB.PUBCHEMCOMP) { - - url <- paste0('http://pubchem.ncbi.nlm.nih.gov/image/imgsrv.fcgi?', (if (db == BIODB.PUBCHEMCOMP) 'cid' else 'sid'), '=', id, '&t=l') - - return(url) -}
--- a/PubchemEntry.R Tue Mar 14 12:40:22 2017 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,108 +0,0 @@ -##################### -# CLASS DECLARATION # -##################### - -PubchemEntry <- methods::setRefClass("PubchemEntry", contains = "BiodbEntry") -PubchemSubstance <- methods::setRefClass("PubchemSubstance", contains = "BiodbEntry") - -##################### -# SUBSTANCE FACTORY # -##################### - -createPubchemSubstanceFromXml <- function(contents, drop = TRUE) { - - entries <- list() - - # Define xpath expressions - xpath.expr <- character() - xpath.expr[[BIODB.ACCESSION]] <- "//PC-ID_id" - #xpath.expr[[BIODB.PUBCHEMCOMP.ID]] <- "//PC-CompoundType_id_cid" --> Apparently that can be more than one CID for a substance. - - for (content in contents) { - - # Create instance - entry <- PubchemEntry$new() - - if ( ! is.null(content) && ! is.na(content)) { - - # Parse XML - xml <- XML::xmlInternalTreeParse(content, asText = TRUE) - - # Unknown entry - fault <- XML::xpathSApply(xml, "/Fault", XML::xmlValue) - if (length(fault) == 0) { - - # Test generic xpath expressions - for (field in names(xpath.expr)) { - v <- XML::xpathSApply(xml, xpath.expr[[field]], XML::xmlValue) - if (length(v) > 0) - entry$setField(field, v) - } - } - } - - entries <- c(entries, entry) - } - - # Replace elements with no accession id by NULL - entries <- lapply(entries, function(x) if (is.na(x$getField(BIODB.ACCESSION))) NULL else x) - - # If the input was a single element, then output a single object - if (drop && length(contents) == 1) - entries <- entries[[1]] - - return(entries) -} - -#################### -# COMPOUND FACTORY # -#################### - -createPubchemEntryFromXml <- function(contents, drop = TRUE) { - - entries <- list() - - # Define xpath expressions - xpath.expr <- character() - xpath.expr[[BIODB.ACCESSION]] <- "//PC-CompoundType_id_cid" - xpath.expr[[BIODB.INCHI]] <- "//PC-Urn_label[text()='InChI']/../../..//PC-InfoData_value_sval" - xpath.expr[[BIODB.INCHIKEY]] <- "//PC-Urn_label[text()='InChIKey']/../../..//PC-InfoData_value_sval" - xpath.expr[[BIODB.FORMULA]] <- "//PC-Urn_label[text()='Molecular Formula']/../../..//PC-InfoData_value_sval" - xpath.expr[[BIODB.MASS]] <- "//PC-Urn_label[text()='Mass']/../../..//PC-InfoData_value_fval" - xpath.expr[[BIODB.COMP.IUPAC.NAME.SYST]] <- "//PC-Urn_label[text()='IUPAC Name']/../PC-Urn_name[text()='Systematic']/../../..//PC-InfoData_value_sval" - - for (content in contents) { - - # Create instance - entry <- PubchemEntry$new() - - if ( ! is.null(content) && ! is.na(content)) { - - # Parse XML - xml <- XML::xmlInternalTreeParse(content, asText = TRUE) - - # Unknown entry - fault <- XML::xpathSApply(xml, "/Fault", XML::xmlValue) - if (length(fault) == 0) { - - # Test generic xpath expressions - for (field in names(xpath.expr)) { - v <- XML::xpathSApply(xml, xpath.expr[[field]], XML::xmlValue) - if (length(v) > 0) - entry$setField(field, v) - } - } - } - - entries <- c(entries, entry) - } - - # Replace elements with no accession id by NULL - entries <- lapply(entries, function(x) if (is.na(x$getField(BIODB.ACCESSION))) NULL else x) - - # If the input was a single element, then output a single object - if (drop && length(contents) == 1) - entries <- entries[[1]] - - return(entries) -}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/README.md Wed Apr 19 10:00:05 2017 -0400 @@ -0,0 +1,43 @@ +LC/MS matching +============== + +[![Build Status](https://travis-ci.org/workflow4metabolomics/lcmsmatching.svg?branch=master)](https://travis-ci.org/workflow4metabolomics/lcmsmatching) + +An LC/MS matching tool for [Galaxy](https://galaxyproject.org/), part of the [Workflow4Metabolomics](http://workflow4metabolomics.org/) project, and developed during the [MetaboHUB](http://www.metabohub.fr/en) project. + +The two matching algorithms used in this tool have been imported from developments made at [CEA](http://www.cea.fr/english) Saclay, inside the *DSV/IBITEC-S/SPI*. They have been translated from C# to R. + +For more information, see the galaxy tool page, help section, available inside `galaxy/lcmsmatching.xml`. + +## search-mz + +This is the script, included in this repository, that allows run on command line an MZ matching on one of the available database types. + +Please run `search-mz -h` for a help page listing all options and presenting some examples. + +## Dependencies + + * `libssl-dev`. + * `libcurl4-openssl-dev`. + * `libxml2-dev`. + * `R` version `3.2.2`. + * `R` packages: + - `getopt` >= `1.20.0`. + - `stringr` >= `1.0.0`. + - `plyr` >= `1.8.3`. + - `XML` >= `3.98`. + - `bitops` >= `1.0_6`. + - `RCurl` >= `1.95`. + - `jsonlite` >= `1.1`. + +## Updates + +### 3.3.1 + + * Correct a bug while trying to connect to Peakforest for getting the list of chromatographic columns. + +### 3.3.0 + + * The file database (in-house) field names are now presented in individual choice lists instead of a single text box where you had to insert a very long keys/values string. + * The tool now tries to guess the names of the file database fields, the values of the MS mode column, and the names of the input file columns. + * Allows to select the unit (minutes or seconds) of retention time values inside the input file, but also inside the file database (in-house).
--- a/RemotedbConn.R Tue Mar 14 12:40:22 2017 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,48 +0,0 @@ -if ( ! exists('RemotedbConn')) { - - ##################### - # CLASS DECLARATION # - ##################### - - RemotedbConn <- methods::setRefClass("RemotedbConn", contains = "BiodbConn", fields = list(.scheduler = "UrlRequestScheduler", .token = "character")) - - ############### - # CONSTRUCTOR # - ############### - - RemotedbConn$methods( initialize = function(useragent = NA_character_, scheduler = NULL, token = NA_character_, ...) { - - # Check useragent - ( ! is.null(useragent) && ! is.na(useragent)) || stop("You must specify a valid useragent string (e.g.: \"myapp ; my.email@address\").") - - # Set token - .token <<- token - - # Set scheduler - if (is.null(scheduler)) - scheduler <- UrlRequestScheduler$new(n = 3) - inherits(scheduler, "UrlRequestScheduler") || stop("The scheduler instance must inherit from UrlRequestScheduler class.") - scheduler$setUserAgent(useragent) # set agent - .scheduler <<- scheduler - - callSuper(...) # calls super-class initializer with remaining parameters - }) - - ########### - # GET URL # - ########### - - RemotedbConn$methods( .get.url = function(url) { - .self$.print.debug.msg(paste0("Sending URL request '", url, "'...")) - return(.self$.scheduler$getUrl(url)) - }) - - ########### - # GET URL # - ########### - - RemotedbConn$methods( .set.useragent = function(useragent) { - .scheduler$setUserAgent(useragent) # set agent - }) - -}
--- a/UniprotConn.R Tue Mar 14 12:40:22 2017 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,36 +0,0 @@ -##################### -# CLASS DECLARATION # -##################### - -UniprotConn <- methods::setRefClass("UniprotConn", contains = "RemotedbConn") - -########################## -# GET ENTRY CONTENT TYPE # -########################## - -UniprotConn$methods( getEntryContentType = function() { - return(BIODB.XML) -}) - -##################### -# GET ENTRY CONTENT # -##################### - -UniprotConn$methods( getEntryContent = function(ids) { - - # Initialize return values - content <- rep(NA_character_, length(ids)) - - # Request - content <- vapply(ids, function(x) .self$.get.url(get.entry.url(BIODB.UNIPROT, x, content.type = BIODB.XML)), FUN.VALUE = '') - - return(content) -}) - -################ -# CREATE ENTRY # -################ - -UniprotConn$methods( createEntry = function(content, drop = TRUE) { - return(createUniprotEntryFromXml(content, drop = drop)) -})
--- a/UniprotEntry.R Tue Mar 14 12:40:22 2017 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,74 +0,0 @@ -##################### -# CLASS DECLARATION # -##################### - -UniprotEntry <- methods::setRefClass("UniprotEntry", contains = "BiodbEntry") - -########### -# FACTORY # -########### - -createUniprotEntryFromXml <- function(contents, drop = FALSE) { - - # Set XML namespace - ns <- c(uniprot = "http://uniprot.org/uniprot") - - entries <- list() - - # Define xpath expressions - xpath.values <- character() - xpath.values[[BIODB.NAME]] <- "/uniprot:uniprot/uniprot:entry/uniprot:name" - xpath.values[[BIODB.GENE.SYMBOLS]] <- "//uniprot:gene/uniprot:name" - xpath.values[[BIODB.FULLNAMES]] <- "//uniprot:protein//uniprot:fullName" - xpath.values[[BIODB.SEQUENCE]] <- "//uniprot:entry/uniprot:sequence" - xpath.values[[BIODB.ACCESSION]] <- "//uniprot:accession[1]" - xpath.attr <- list() - xpath.attr[[BIODB.KEGG.ID]] <- list(path = "//uniprot:dbReference[@type='KEGG']", attr = 'id') - xpath.attr[[BIODB.NCBI.GENE.ID]] <- list(path = "//uniprot:dbReference[@type='GeneID']", attr = 'id') - xpath.attr[[BIODB.ENZYME.ID]] <- list(path = "//uniprot:dbReference[@type='EC']", attr = 'id') - xpath.attr[[BIODB.MASS]] <- list(path = "//uniprot:entry/uniprot:sequence", attr = 'mass') - xpath.attr[[BIODB.LENGTH]] <- list(path = "//uniprot:entry/uniprot:sequence", attr = 'length') - - for (content in contents) { - - # Create instance - entry <- UniprotEntry$new() - - # If the entity doesn't exist (i.e.: no <id>.xml page), then it returns an HTML page - if ( ! grepl("^<!DOCTYPE html ", content, perl = TRUE)) { - - # Parse XML - xml <- XML::xmlInternalTreeParse(content, asText = TRUE) - - # Test value xpath - for (field in names(xpath.values)) { - v <- XML::xpathSApply(xml, xpath.values[[field]], XML::xmlValue, namespaces = ns) - if (length(v) > 0) - entry$setField(field, v) - } - - # Test attribute xpath - for (field in names(xpath.attr)) { - v <- XML::xpathSApply(xml, xpath.attr[[field]]$path, XML::xmlGetAttr, xpath.attr[[field]]$attr, namespaces = ns) - if (length(v) > 0) - entry$setField(field, v) - } - - # Remove new lines from sequence string - seq <- entry$getField(BIODB.SEQUENCE) - if ( ! is.na(seq)) - entry$setField(BIODB.SEQUENCE, gsub("\\n", "", seq)) - } - - entries <- c(entries, entry) - } - - # Replace elements with no accession id by NULL - entries <- lapply(entries, function(x) if (is.na(x$getField(BIODB.ACCESSION))) NULL else x) - - # If the input was a single element, then output a single object - if (drop && length(contents) == 1) - entries <- entries[[1]] - - return(entries) -}
--- a/UrlRequestScheduler.R Tue Mar 14 12:40:22 2017 -0400 +++ b/UrlRequestScheduler.R Wed Apr 19 10:00:05 2017 -0400 @@ -1,135 +1,126 @@ -############# -# CONSTANTS # -############# - -BIODB.GET <- 'GET' -BIODB.POST <- 'POST' - -##################### -# CLASS DECLARATION # -##################### - -UrlRequestScheduler <- methods::setRefClass("UrlRequestScheduler", fields = list(.n = "numeric", .t = "numeric", .time.of.last.request = "ANY", .useragent = "character", .ssl.verifypeer = "logical", .nb.max.tries = "integer", .verbose = "integer")) - -# n: number of connections -# t: time (in seconds) - -# The scheduler restrict the number of connections at n per t seconds. - -############### -# CONSTRUCTOR # -############### - -UrlRequestScheduler$methods( initialize = function(n = 1, t = 1, useragent = NA_character_, ssl.verifypeer = TRUE, ...) { - .n <<- n - .t <<- t - .time.of.last.request <<- -1 - .useragent <<- useragent - .nb.max.tries <<- 10L - .ssl.verifypeer <<- ssl.verifypeer - .verbose <<- 0L - callSuper(...) # calls super-class initializer with remaining parameters -}) +if ( ! exists('UrlRequestScheduler')) { # Do not load again if already loaded -################## -# SET USER AGENT # -################## - -UrlRequestScheduler$methods( setUserAgent = function(useragent) { - .useragent <<- useragent -}) - -############### -# SET VERBOSE # -############### - -UrlRequestScheduler$methods( setVerbose = function(verbose) { - .verbose <<- verbose -}) + ############# + # CONSTANTS # + ############# -################## -# WAIT AS NEEDED # -################## - -# Wait the specified between two requests. -UrlRequestScheduler$methods( .wait.as.needed = function() { - - # Compute minimum waiting time between two URL requests - waiting_time <- .self$.t / .self$.n - - # Wait, if needed, before previous URL request and this new URL request. - if (.self$.time.of.last.request > 0) { - spent_time <- Sys.time() - .self$.time.of.last.request - if (spent_time < waiting_time) - Sys.sleep(waiting_time - spent_time) - } + RLIB.GET <- 'GET' + RLIB.POST <- 'POST' - # Store current time - .time.of.last.request <<- Sys.time() -}) - -######################### -# GET CURL OPTIONS {{{1 # -######################### - -UrlRequestScheduler$methods( .get.curl.opts = function(opts = list()) { - opts <- RCurl::curlOptions(useragent = .self$.useragent, timeout.ms = 60000, verbose = FALSE, .opts = opts) - return(opts) -}) - -################### -# DO GET URL {{{1 # -################### - -UrlRequestScheduler$methods( .doGetUrl = function(url, params = list(), method = BIODB.GET, opts = .self$.get.curl.opts()) { - - content <- NA_character_ - - # Use form to send URL request - if ( method == BIODB.POST || ( ! is.null(params) && ! is.na(params) && length(params) > 0)) { - switch(method, - GET = { content <- RCurl::getForm(url, .opts = opts, .params = params) }, - POST = { content <- RCurl::postForm(url, .opts = opts, .params = params) }, - stop(paste('Unknown method "', method, '".')) - ) - } - - # Get URL normally - else { - content <- RCurl::getURL(url, .opts = opts, ssl.verifypeer = .self$.ssl.verifypeer) - } - return(content) -}) - -########################## -# SEND SOAP REQUEST {{{1 # -########################## - -UrlRequestScheduler$methods( sendSoapRequest = function(url, request) { - header <- c(Accept="text/xml", Accept="multipart/*", 'Content-Type'="text/xml; charset=utf-8") - opts <- .self$.get.curl.opts(list(httpheader = header, postfields = request)) - results <- .self$getUrl(url, method = BIODB.POST, opts = opts) - return(results) -}) - -################ -# GET URL {{{1 # -################ - -UrlRequestScheduler$methods( getUrl = function(url, params = list(), method = BIODB.GET, opts = .self$.get.curl.opts()) { - - content <- NA_character_ - - # Wait required time between two requests - .self$.wait.as.needed() - - # Run query - for (i in seq(.self$.nb.max.tries)) { - tryCatch({ content <- .self$.doGetUrl(url, params = params, method = method, opts = opts) }, - error = function(e) { if (.self$.verbose > 0) print("Retry connection to server...") } ) - if ( ! is.na(content)) - break - } - - return(content) -}) + ##################### + # CLASS DECLARATION # + ##################### + + UrlRequestScheduler <- setRefClass("UrlRequestScheduler", fields = list(.n = "numeric", .t = "numeric", .time.of.last.request = "ANY", .useragent = "character", .ssl.verifypeer = "logical", .nb.max.tries = "integer", .verbose = "integer")) + + # n: number of connections + # t: time (in seconds) + + # The scheduler restrict the number of connections at n per t seconds. + + ############### + # CONSTRUCTOR # + ############### + + UrlRequestScheduler$methods( initialize = function(n = 1, t = 1, useragent = NA_character_, ssl.verifypeer = TRUE, ...) { + .n <<- n + .t <<- t + .time.of.last.request <<- -1 + .useragent <<- useragent + .nb.max.tries <<- 10L + .ssl.verifypeer <<- ssl.verifypeer + .verbose <<- 0L + callSuper(...) # calls super-class initializer with remaining parameters + }) + + ################## + # SET USER AGENT # + ################## + + UrlRequestScheduler$methods( setUserAgent = function(useragent) { + .useragent <<- useragent + }) + + ############### + # SET VERBOSE # + ############### + + UrlRequestScheduler$methods( setVerbose = function(verbose) { + .verbose <<- verbose + }) + + ################## + # WAIT AS NEEDED # + ################## + + # Wait the specified between two requests. + UrlRequestScheduler$methods( .wait.as.needed = function() { + + # Compute minimum waiting time between two URL requests + waiting_time <- .self$.t / .self$.n + + # Wait, if needed, before previous URL request and this new URL request. + if (.self$.time.of.last.request > 0) { + spent_time <- Sys.time() - .self$.time.of.last.request + if (spent_time < waiting_time) + Sys.sleep(waiting_time - spent_time) + } + + # Store current time + .time.of.last.request <<- Sys.time() + }) + + #################### + # GET CURL OPTIONS # + #################### + + UrlRequestScheduler$methods( .get_curl_opts = function(url) { + opts <- curlOptions(useragent = .self$.useragent, timeout.ms = 60000, verbose = FALSE) + return(opts) + }) + + ########### + # GET URL # + ########### + + UrlRequestScheduler$methods( .doGetUrl = function(url, params = NULL, method = RLIB.GET) { + + content <- NA_character_ + + # Use form to send URL request + if ( ! is.null(params) && ! is.na(params)) + switch(method, + GET = { content <- getForm(url, .opts = .self$.get_curl_opts(), .params = params) }, + POST = { content <- postForm(url, .opts = .self$.get_curl_opts(), .params = params) }, + stop(paste('Unknown method "', method, '".')) + ) + + # Get URL normally + else + content <- getURL(url, .opts = .self$.get_curl_opts(), ssl.verifypeer = .self$.ssl.verifypeer) + + return(content) + }) + + UrlRequestScheduler$methods( getUrl = function(url, params = NULL, method = RLIB.GET) { + + # Load library here and not inside .doGetUrl() since it is called from inside a try/catch clause, hence if library is missing the error will be ignored. + library(bitops) + library(RCurl) + + content <- NA_character_ + + # Wait required time between two requests + .self$.wait.as.needed() + + # Run query + for (i in seq(.self$.nb.max.tries)) { + tryCatch({ content <- .self$.doGetUrl(url, params = params, method = method) }, + error = function(e) { if (.self$.verbose > 0) print("Retry connection to server...") } ) + if ( ! is.na(content)) + break + } + + return(content) + }) +}
--- a/biodb-package.R Tue Mar 14 12:40:22 2017 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,4 +0,0 @@ -#' @useDynLib biodb -#' @importFrom methods new -#' @importFrom methods getGeneric -NULL
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/build.xml Wed Apr 19 10:00:05 2017 -0400 @@ -0,0 +1,396 @@ +<project name="w4m.tool.lcmsmatching" default="all"> + + <dirname property="this.dir" file="${ant.file.w4m.tool.lcmsmatching}"/> + <property name="conda.dir" value="${user.home}/w4m-conda"/> + <property name="planemo.dir" value="${user.home}/.planemo"/> + + <!--~~~~~~~~~~~~~~~~~ + ~ PUBLIC PROPERTIES ~ + ~~~~~~~~~~~~~~~~~~--> + + <!-- These properties can be set when calling Ant: `ant -DPROP=value ...`. --> + + <property name="TIMESTAMP" value="true"/> + <property name="VERSION" value="true"/> + <property name="DIST.TEST" value="true"/> + <property name="TOOL.PREFIX" value="$__tool_directory__/"/> + <property name="PKG.PREFIX" value="w4m-tool-lcmsmatching"/> + + <!--~~~~~~~~~~~~~~~~~~~ + ~ INTERNAL PROPERTIES ~ + ~~~~~~~~~~~~~~~~~~~~--> + + <!-- Version --> + <property name="version" value="2.1.3"/> + <condition property="version.suffix" value="" else="-${version}"> + <isfalse value="${VERSION}"/> + </condition> + + <!-- Distribution directories --> + <property name="dist.dir" value="dist"/> + <property name="dist.code.dir" value="${dist.dir}/code"/> + <property name="dist.test.dir" value="${dist.dir}/test"/> + + <!-- Tool XML paths --> + <property name="tool.xml" value="lcmsmatching.xml"/> + <property name="orig.tool.xml" value="${tool.xml}"/> + <property name="dest.tool.xml" value="${dist.code.dir}/${tool.xml}"/> + + <!-- Time stamp --> + <tstamp/> + <property name="timestamp" value="${DSTAMP}-${TSTAMP}"/> + <condition property="timestamp.suffix" value="" else="-${timestamp}"> + <isfalse value="${TIMESTAMP}"/> + </condition> + + <!-- Package --> + <property name="pkg.ext" value="tar.gz"/> + <property name="pkg.name" value="${PKG.PREFIX}${version.suffix}${timestamp.suffix}"/> + <property name="pkg.path" value="${dist.dir}/${pkg.name}.${pkg.ext}"/> + + <!--~~~ + ~ ALL ~ + ~~~~~--> + + <target name="all"/> + + <!--~~~~ + ~ DIST ~ + ~~~~~--> + + <target name="dist" depends="dist.code,dist.tar,dist.test"/> + + <!--~~~~~~~~ + ~ DIST W4M ~ + ~~~~~~~~~--> + + <target name="dist.w4m" depends="w4m.code,dist.tar,dist.test"/> + + <!--~~~~~~~~~ + ~ DIST TEST ~ + ~~~~~~~~~~--> + + <target name="dist.test" if="${DIST.TEST}"> + + <!-- Make temp dir --> + <delete dir="${dist.test.dir}"/> + <mkdir dir="${dist.test.dir}"/> + + <!-- Extract package in temp dir --> + <untar src="${pkg.path}" dest="${dist.test.dir}" compression="gzip"/> + <chmod file="${dist.test.dir}/search-mz" perm="u+x"/> <!-- This file should be already executable, since it has been put executable inside the tar. It seems the untar task of Ant does not handle the file permissions. --> + + <!-- Run search-mz on sample input file --> + <exec executable="${dist.test.dir}/search-mz" failonerror="true"> + <arg value="-d"/> + <arg value="file"/> + <arg value="--url"/> + <arg value="test/filedb.tsv"/> + <arg value="-m"/> + <arg value="pos"/> + <arg value="-i"/> + <arg value="test/mzrt-input.tsv"/> + <arg value="-o"/> + <arg value="mzrt-output.tsv"/> + </exec> + + </target> + + <!--~~~~~~~~ + ~ W4M CODE ~ + ~~~~~~~~~--> + + <target name="w4m.code" depends="dist.code"> + + <!-- Copy and transform tool XML file. --> + <copy file="${orig.tool.xml}" tofile="${dest.tool.xml}"/> + + <!-- Copy python script. --> + <copy todir="${dist.code.dir}"> + <fileset dir="." includes="*.py"/> + </copy> + </target> + + <!--~~~~~~~~~ + ~ DIST CODE ~ + ~~~~~~~~~~--> + + <target name="dist.code"> + + <!-- Clean directory --> + <delete dir="${dist.code.dir}"/> + <mkdir dir="${dist.code.dir}"/> + + <!-- Copy R code --> + <copy todir="${dist.code.dir}"> + <fileset dir="." includes="search-mz,*.R"/> + </copy> + + </target> + + <!--~~~~~~~~ + ~ DIST TAR ~ + ~~~~~~~~~--> + + <target name="dist.tar"> + + <!-- Build tar file --> + <tar destfile="${pkg.path}" compression="gzip"> + + <!-- Include script with execution rights --> + <tarfileset dir="${dist.code.dir}" filemode="755"> + <include name="search-mz"/> + </tarfileset> + + <!-- Include remaining code and data files --> + <tarfileset dir="${dist.code.dir}"> + <include name="**"/> + <exclude name="search-mz"/> + </tarfileset> + </tar> + </target> + + <!--~~~~~ + ~ CLEAN ~ + ~~~~~~--> + + <target name="clean"> + <delete dir="${dist.dir}"/> + <delete dir="${conda.dir}"/> + <delete dir="${planemo.dir}"/> + </target> + + <!--~~~~~~~~~~~ + ~ GALAXY TEST ~ + ~~~~~~~~~~~~--> + + <target name="galaxy.test" depends="planemo.lint,planemo.test"/> + + <!--~~~~~~~~~~~~ + ~ PLANEMO LINT ~ + ~~~~~~~~~~~~~--> + + <target name="planemo.lint" depends="planemo.env"> + <exec executable="planemo" dir="${dist.code.dir}" failonerror="true"> + <arg value="lint"/> + <arg value="--no_xsd"/> + <arg value="${tool.xml}"/> + </exec> + </target> + + <!--~~~~~~~~~~~~ + ~ PLANEMO TEST ~ + ~~~~~~~~~~~~~--> + + <target name="planemo.test" depends="planemo.conda.install"> + <exec executable="planemo" dir="${dist.code.dir}" failonerror="true"> + <arg value="test"/> + <arg value="--conda_prefix"/> + <arg value="${conda.dir}"/> + <arg value="--galaxy_branch"/> + <arg value="release_16.01"/> + <arg value="--conda_dependency_resolution"/> + <arg value="${tool.xml}"/> + </exec> + </target> + + <!--~~~~~~~~~~~~~~~~~~~~~ + ~ PLANEMO CONDA INSTALL ~ + ~~~~~~~~~~~~~~~~~~~~~~--> + + <target name="planemo.conda.install" depends="planemo.conda.init,planemo.env"> + <exec executable="planemo" dir="${dist.code.dir}" failonerror="true"> + <arg value="conda_install"/> + <arg value="--conda_prefix"/> + <arg value="${conda.dir}"/> + <arg value="${tool.xml}"/> + </exec> + </target> + + <!--~~~~~~~~~~~~~~~~~~ + ~ PLANEMO CONDA INIT ~ + ~~~~~~~~~~~~~~~~~~~--> + + <target name="planemo.conda.init"> + <exec executable="planemo" failonerror="true"> + <arg value="conda_init"/> + <arg value="--conda_prefix"/> + <arg value="${conda.dir}"/> + </exec> + </target> + + <!--~~~~~~~~~~~~~~~~~ + ~ PLANEMO SHED LINT ~ + ~~~~~~~~~~~~~~~~~~--> + + <target name="planemo.shed.lint" depends="planemo.env"> + <exec executable="planemo" dir="${dist.code.dir}" failonerror="true"> + <arg value="shed_lint"/> + <arg value="--tools"/> + </exec> + </target> + + <!--~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + ~ PLANEMO TEST TOOLSHED CREATE ~ + ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~--> + + <target name="planemo.testtoolshed.create" depends="planemo.env"> + <exec executable="planemo" dir="${dist.code.dir}" failonerror="true"> + <arg value="shed_create"/> + <arg value="--shed_target"/> + <arg value="testtoolshed"/> + </exec> + </target> + + <!--~~~~~~~~~~~~~~~~~~~~~~~~~~ + ~ PLANEMO TEST TOOLSHED DIFF ~ + ~~~~~~~~~~~~~~~~~~~~~~~~~~~--> + + <target name="planemo.testtoolshed.diff" depends="planemo.env"> + <exec executable="planemo" dir="${dist.code.dir}" failonerror="true"> + <arg value="shed_diff"/> + <arg value="--shed_target"/> + <arg value="testtoolshed"/> + </exec> + </target> + + <!--~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + ~ PLANEMO TEST TOOLSHED UPDATE ~ + ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~--> + + <target name="planemo.testtoolshed.update" depends="planemo.env"> + <exec executable="planemo" dir="${dist.code.dir}" failonerror="true"> + <arg value="shed_update"/> + <arg value="--check_diff"/> + <arg value="--shed_target"/> + <arg value="testtoolshed"/> + </exec> + </target> + + <!--~~~~~~~~~~~~~~~~~~~~~~~~~~ + ~ PLANEMO TEST TOOLSHED TEST ~ + ~~~~~~~~~~~~~~~~~~~~~~~~~~~--> + + <target name="planemo.testtoolshed.test" depends="planemo.env"> + <exec executable="planemo" dir="${dist.code.dir}" failonerror="true"> + <arg value="shed_test"/> + <arg value="--shed_target"/> + <arg value="testtoolshed"/> + <arg value="--install_galaxy"/> + <arg value="--galaxy_branch"/> + <arg value="release_16.01"/> + </exec> + </target> + + <!--~~~~~~~~~~~~~~~~~~~~~~~ + ~ PLANEMO TOOLSHED CREATE ~ + ~~~~~~~~~~~~~~~~~~~~~~~~--> + + <target name="planemo.toolshed.create" depends="planemo.env"> + <exec executable="planemo" dir="${dist.code.dir}" failonerror="true"> + <arg value="shed_create"/> + <arg value="--shed_target"/> + <arg value="toolshed"/> + </exec> + </target> + + <!--~~~~~~~~~~~~~~~~~~~~~ + ~ PLANEMO TOOLSHED DIFF ~ + ~~~~~~~~~~~~~~~~~~~~~~--> + + <target name="planemo.toolshed.diff" depends="planemo.env"> + <exec executable="planemo" dir="${dist.code.dir}" failonerror="true"> + <arg value="shed_diff"/> + <arg value="--shed_target"/> + <arg value="toolshed"/> + </exec> + </target> + + <!--~~~~~~~~~~~~~~~~~~~~~~~ + ~ PLANEMO TOOLSHED UPDATE ~ + ~~~~~~~~~~~~~~~~~~~~~~~~--> + + <target name="planemo.toolshed.update" depends="planemo.env"> + <exec executable="planemo" dir="${dist.code.dir}" failonerror="true"> + <arg value="shed_update"/> + <arg value="--check_diff"/> + <arg value="--shed_target"/> + <arg value="toolshed"/> + </exec> + </target> + + <!--~~~~~~~~~~~ + ~ PLANEMO ENV ~ + ~~~~~~~~~~~~--> + + <target name="planemo.env" depends="w4m.code"> + <chmod file="${dist.code.dir}/search-mz" perm="u+x"/> + <ant dir="test" target="input.files"/> + <mkdir dir="${dist.code.dir}/test-data"/> + <copy todir="${dist.code.dir}/test-data"> + <fileset dir="test" includes="filedb.tsv"/> + <fileset dir="test" includes="mz-input-small.tsv"/> + <fileset dir="test/res" includes="filedb-small-mz-match-*"/> + </copy> + <copy file="shed.yml" tofile="${dist.code.dir}/.shed.yml"/> + </target> + + <!--************************************************ + ******************** DEPRECATED ******************** + *************************************************--> + + <!--~~~~~~~~~~~~~ + ~ UPDATE W4M VM ~ + ~~~~~~~~~~~~~~--> + + <!-- This task is used when developping, for updating quickly the tool inside the local W4M virtual machine. --> + <target name="update.w4m.vm" depends="clean,dist"> + + <property name="w4m.login" value="galaxy@w4m"/> + <property name="tool.path" value="galaxy-pfem/tools/metabolomics/annotation/lcmsmatching"/> + + <!-- Stop Galaxy --> + <exec executable="ssh" failonerror="true"> + <arg value="${w4m.login}"/> + <arg value="/sbin/service galaxy stop"/> + </exec> + + <!-- Remove current tool version --> + <exec executable="ssh" failonerror="true"> + <arg value="${w4m.login}"/> + <arg value="rm -rf ${tool.path}"/> + </exec> + + <!-- Remove old packages on W4M instance --> + <exec executable="ssh" failonerror="true"> + <arg value="${w4m.login}"/> + <arg value="rm -f ${PKG.PREFIX}-*.${pkg.ext}"/> + </exec> + + <!-- Copy new package on W4M instance --> + <exec executable="scp" failonerror="true"> + <arg value="${dist.dir}/${pkg.name}.${pkg.ext}"/> + <arg value="${w4m.login}:."/> + </exec> + + <!-- Make tool directory --> + <exec executable="ssh" failonerror="true"> + <arg value="${w4m.login}"/> + <arg value="mkdir -p ${tool.path}"/> + </exec> + + <!-- Install new tool version --> + <exec executable="ssh" failonerror="true"> + <arg value="${w4m.login}"/> + <arg value="tar -xzf ${pkg.name}.${pkg.ext} -C ${tool.path}"/> + </exec> + + <!-- Restart Galaxy --> + <exec executable="ssh" failonerror="true"> + <arg value="${w4m.login}"/> + <arg value="/sbin/service galaxy start"/> + </exec> + + </target> + +</project>
--- a/chem.R Tue Mar 14 12:40:22 2017 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,119 +0,0 @@ -if ( ! exists('load.sdf')) { # Do not load again if already loaded - - ############# - # CONSTANTS # - ############# - - R.LIB.CHEM.FILE.PATH <- parent.frame(2)$ofile - - CARBOXYL.GROUP <- "carboxyl" - - ################## - # LOAD JAVA CHEM # - ################## - - load.java.chem <- function() { - library(rJava) - .jinit() - .jcall('java/lang/System', 'S', 'setProperty', "rJava.debug", "1") # DEBUG/VERBOSE mode --> TODO does not work - cmd <- c("mvn", "-f", file.path(dirname(R.LIB.CHEM.FILE.PATH), '..', 'java-chem'), "org.apache.maven.plugins:maven-dependency-plugin:2.10:build-classpath") - classpath <- system(paste(cmd, collapse = " "), intern = TRUE) - classpath <- grep("^\\[INFO]", classpath, invert = TRUE, value = TRUE) - classpath <- strsplit(classpath, split = ':')[[1]] # TODO make it portable (classpath under Windows use ';' instead of ':') - .jaddClassPath(classpath) - .jaddClassPath(file.path(dirname(R.LIB.CHEM.FILE.PATH), '..', 'java-chem', 'target', 'java-chem-1.0.jar')) - } - - ############# - # GET INCHI # - ############# - - get.inchi <- function(mol) { - load.java.chem() - cdkhlp <- .jnew('org/openscience/chem/CdkHelper') - inchi <- .jcall(cdkhlp, 'S', 'getInchi', mol) - return(inchi) - } - - ######################### - # CONTAINS SUBSTRUCTURE # - ######################### - - contains.substructure <- function(inchi, group) { - - load.java.chem() - cdkhlp <- .jnew('org/openscience/chem/CdkHelper') - - # Search for substructure - contains <- .jcall(cdkhlp, '[Z', 'containFunctionalGroup', inchi, toupper(group)) - - return(contains) - } - - ############ - # LOAD SDF # - ############ - - load.sdf <- function(file, silent = FALSE) { - - library(stringr) - - # Valid file ? - if ( ! file.exists(file)) { - if ( ! silent) - warning(paste0("SDF File \"", file, "\" does not exist.")) - return(NULL) - } - - info <- data.frame() - - # Read file line by line - con <- file(file) - open(con) - imol <- 1 # Index of molecule inside the file - field.name <- NA_character_ - while (TRUE) { - - # Read one line - line <- readLines(con, n = 1) - if (length(line) == 0) - break - - # Field value - if ( ! is.na(field.name)) { - info[imol, field.name] <- line - field.name <- NA_character_ - next - } - - # Empty line - if (line == "") { - field.name <- NA_character_ - next - } - - # End of molecule - if (substring(line, 1, 4) == "$$$$") { - field.name <- NA_character_ - imol <- imol + 1 - next - } - - # Metadata field - g <- str_match(line, "^> <(.*)>$") - if ( ! is.na(g[1,2])) { - field.name <- g[1,2] - next - } - } - close(con) - - # Load molecule structures - load.java.chem() - cdkhlp <- .jnew('org/openscience/chem/CdkHelper') - struct <- .jcall(cdkhlp, '[Lorg/openscience/cdk/interfaces/IAtomContainer;', 'loadSdf', file) - - return(list(struct = struct, info = info)) - } - -} # end of load safe guard
--- a/dfhlp.R Tue Mar 14 12:40:22 2017 -0400 +++ b/dfhlp.R Wed Apr 19 10:00:05 2017 -0400 @@ -44,21 +44,6 @@ df[c(not.cols, cols)] } - ############## - # READ TABLE # - ############## - - df.read.table <- function(file, sep = "", header = TRUE, remove.na.rows = TRUE, check.names = TRUE, stringsAsFactors = TRUE, trim.header = FALSE, trim.values = FALSE, fileEncoding = "") { - - # Call built-in read.table() - df <- read.table(file, sep = sep, header = header, check.names = check.names, stringsAsFactors = stringsAsFactors, fileEncoding = fileEncoding) - - # Clean data frame - df <- df.clean(df, trim.colnames = trim.header, trim.values = trim.values, remove.na.rows = remove.na.rows) - - return(df) - } - ################# # READ CSV FILE # #################
--- a/hshhlp.R Tue Mar 14 12:40:22 2017 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,20 +0,0 @@ -# Function for testing if a key exists inside a list/hashmap -hHasKey <- function(h, k) { - return(length(which(names(h) == k)) > 0) -} - -# Function for getting a boolean value from a list/hashmap -hGetBool <- function(h, k) { - if (hHasKey(h, k)) return(h[[k]]) else return(FALSE) -} - -# keys A list of keys. -# values A list of values. -# RETURN A hash using keys as keys and values as values. -hCreate <- function(keys, values) { - h <- list() - sz <- min(length(keys), length(values)) - for(i in 1:sz) - h[ keys[[i]] ] <- values[i] - return(h) -}
--- a/lcmsmatching.xml Tue Mar 14 12:40:22 2017 -0400 +++ b/lcmsmatching.xml Wed Apr 19 10:00:05 2017 -0400 @@ -1,33 +1,43 @@ -<tool id="lcmsmatching" name="LC/MS matching" version="3.2.0" profile="16.01"> +<tool id="lcmsmatching" name="LC/MS matching" version="3.3.1" profile="16.01"> <description>Annotation of MS peaks using matching on a spectra database.</description> <requirements> + <!--<requirement type="package" version="3.3.3">r</requirement>--> + <requirement type="package" version="7.0">readline</requirement> <!-- Try readline 7.0 --> <requirement type="package" version="1.20.0">r-getopt</requirement> <requirement type="package" version="1.0.0">r-stringr</requirement> <requirement type="package" version="1.8.3">r-plyr</requirement> <requirement type="package" version="3.98">r-xml</requirement> <requirement type="package" version="1.0_6">r-bitops</requirement> <requirement type="package" version="1.95">r-rcurl</requirement> - <requirement type="package" version="1.3">r-rjsonio</requirement> + <requirement type="package" version="1.1">r-jsonlite</requirement> </requirements> <code file="list-chrom-cols.py"/> + <code file="list-file-cols.py"/> + <code file="list-ms-mode-values.py"/> - <!--~~~~~~~ - ~ COMMAND ~ - ~~~~~~~~--> + <!--======= + = COMMAND = + ========--> <command> <![CDATA[ ## @@@BEGIN_CHEETAH@@@ - $__tool_directory__/search-mz -i "$mzrtinput" + $__tool_directory__/search-mz + + ## Input file + -i "$mzrtinput" + --input-col-names "mz=$inputmzfield,rt=$inputrtfield" + --rtunit "$inputrtunit" ## Database #if $db.dbtype == "inhouse" -d file - --db-fields "$db.dbfields" - --db-ms-modes "$db.dbmsmodes" + --db-fields "mztheo=$db.dbmzreffield,chromcolrt=$db.dbchromcolrtfield,compoundid=$db.dbspectrumidfield,chromcol=$db.dbchromcolfield,msmode=$db.dbmsmodefield,peakattr=$db.dbpeakattrfield,pubchemcompid=$db.dbpubchemcompidfield,chebiid=$db.dbchebiidfield,hmdbid=$db.dbhmdbidfield,keggid=$db.dbkeggidfield" + --db-ms-modes "pos=$db.dbmsposmode,neg=$db.dbmsnegmode" + --db-rt-unit $db.dbrtunit #end if #if $db.dbtype == "peakforest" -d peakforest @@ -57,22 +67,14 @@ ## HTML output --html-output-file "$htmloutput" --no-main-table-in-html-output - ## Fields of input file - --input-col-names "$inputfields" - ## Ouput setting - #if $out.enabled == "true" - --output-col-names "$out.outputfields" - --molids-sep "$out.molidssep" - #else - --molids-sep "|" - #end if + --molids-sep "$molidssep" ## @@@END_CHEETAH@@@ ]]></command> - <!--~~~~~~ - ~ INPUTS ~ - ~~~~~~~--> + <!--====== + = INPUTS = + =======--> <inputs> @@ -90,10 +92,26 @@ <param name="dburl" label="Database file" type="data" format="tabular,tsv" refresh_on_change="true" help="Decimal: '.', missing: NA, mode: character and numerical, sep: tabular. Retention time values must be in seconds."/> <!-- File database field names --> - <param name="dbfields" label="File database column names" type="text" size="256" value="mztheo=mztheo,chromcolrt=chromcolrt,compoundid=compoundid,chromcol=chromcol,msmode=msmode,peakattr=peakattr,peakcomp=peakcomp,fullnames=fullnames,compoundmass=compoundmass,compoundcomp=compoundcomp,inchi=inchi,inchikey=inchikey,pubchemcompid=pubchemcompid,chebiid=chebiid,hmdbid=hmdbid,keggid=keggid" refresh_on_change="true" help=""/> + <param name="dbspectrumidfield" type="select" label="Database file Spectrum ID column name" dynamic_options="get_file_cols(file = db['dburl'], preferred = 'spectrumid,accession,compoundid,molid')" help="Select the Spectrum ID column of the database file."/> + <param name="dbmzreffield" type="select" label="Database file Reference MZ column name" dynamic_options="get_file_cols(file = db['dburl'], preferred = 'mztheo,mzexp,mz')" help="Select the Reference MZ column of the database file."/> + <param name="dbchromcolfield" type="select" label="Database file Chromatographic Column Name column name" dynamic_options="get_file_cols(file = db['dburl'], preferred = 'chromcol,col')" help="Select the Chromatographic Column Name column of the database file." refresh_on_change="true"/> + <param name="dbchromcolrtfield" type="select" label="Database file Chromatographic Column Retention Time column name" dynamic_options="get_file_cols(file = db['dburl'], preferred = 'chromcolrt,colrt,rt')" help="Select the Chromatographic Column Retention Time column of the database file."/> + <param name="dbmsmodefield" type="select" label="Database file MS Mode column name" dynamic_options="get_file_cols(file = db['dburl'], preferred = 'msmode,mode')" help="Select the MS Mode column of the database file." refresh_on_change="true"/> + <param name="dbpeakattrfield" type="select" label="Database file Peak Attribution column name" dynamic_options="get_file_cols(file = db['dburl'], preferred = 'peakattr,attr')" help="Select the Peak Attribution column of the database file."/> + <param name="dbpubchemcompidfield" type="select" label="Database file PubChem Compound ID column name" dynamic_options="get_file_cols(file = db['dburl'], preferred = 'pubchemcompid,pubchemid,pubchemcomp,pubchem')" help="Select the PubChem Compound ID column of the database file."/> + <param name="dbchebiidfield" type="select" label="Database file ChEBI ID column name" dynamic_options="get_file_cols(file = db['dburl'], preferred = 'chebiid,chebi')" help="Select the ChEBI ID column of the database file."/> + <param name="dbhmdbidfield" type="select" label="Database file HMDB Metabolite ID column name" dynamic_options="get_file_cols(file = db['dburl'], preferred = 'hmdbid,hmdb')" help="Select the HMDB Metabolite ID column of the database file."/> + <param name="dbkeggidfield" type="select" label="Database file KEGG Compound ID column name" dynamic_options="get_file_cols(file = db['dburl'], preferred = 'keggid,kegg')" help="Select the KEGG Compound ID column of the database file."/> <!-- File database MS modes --> - <param name="dbmsmodes" label="File database MS modes" type="text" size="32" value="pos=POS,neg=NEG" help=""/> + <param name="dbmsposmode" label="File database MS Positive mode" type="select" dynamic_options="get_ms_mode_value(file = db['dburl'], col = db['dbmsmodefield'], preferred = 'POS,pos,+')" help="Select the value used to identify the positive MS mode."/> + <param name="dbmsnegmode" label="File database MS Negative mode" type="select" dynamic_options="get_ms_mode_value(file = db['dburl'], col = db['dbmsmodefield'], preferred = 'NEG,neg,-')" help="Select the value used to identify the negitive MS mode."/> + + <!-- File database RT unit --> + <param name="dbrtunit" label="Retention time unit" type="select" display="radio" multiple="false" help=""> + <option value="sec">Seconds</option> + <option value="min">Minutes</option> + </param> <param name="dbtoken" type="text" size="32" value="" hidden="true"/> </when> @@ -103,17 +121,24 @@ <param name="dbtoken" label="Peakforest security token" type="text" size="32" value="" refresh_on_change="true" help="If you do not have yet a Peakforest token, go to Peakforest website and request one from your account."/> - <param name="dbfields" type="text" size="32" value="" hidden="true"/> + <param name="dbchromcolfield" type="text" size="32" value="" hidden="true"/> </when> </conditional> <!-- INPUT --> <!-- Input file --> - <param name="mzrtinput" label="Input file - MZ(/RT) values" type="data" format="tabular,tsv" help="Decimal: '.', missing: NA, mode: character and numerical, sep: tabular. RT values must be in seconds."/> + <param name="mzrtinput" label="Input file - MZ(/RT) values" type="data" format="tabular,tsv" refresh_on_change="true" help="Decimal: '.', missing: NA, mode: character and numerical, sep: tabular. RT values must be in seconds."/> + + <!-- Input field field names --> + <param name="inputmzfield" type="select" label="Input file MZ column name" dynamic_options="get_file_cols(file = mzrtinput, preferred = 'mzmed,mz')" help="Select the MZ column of the input file."/> + <param name="inputrtfield" type="select" label="Input file RT column name" dynamic_options="get_file_cols(file = mzrtinput, preferred = 'rtmed,rt')" help="Select the RT column of the input file."/> - <!-- Input field names --> - <param name="inputfields" label="Input file column names" type="text" size="32" value="mz=mzmed,rt=rtmed" help=""/> + <!-- Input file RT unit --> + <param name="inputrtunit" label="Retention time unit" type="select" display="radio" multiple="false" help=""> + <option value="sec">Seconds</option> + <option value="min">Minutes</option> + </param> <!-- M/Z MATCHING --> @@ -130,7 +155,7 @@ <!-- RETENTION TIME PARAMETERS --> <!-- List of chromatographic columns --> - <param name="chromcols" type="select" label="Chromatographic columns" multiple="true" dynamic_options="get_chrom_cols(dbtype = db['dbtype'], dburl = db['dburl'], dbtoken = db['dbtoken'], dbfields = db['dbfields'])" help="Select here the set of chromatographic columns against which the retention time matching will be run."/> + <param name="chromcols" type="select" label="Chromatographic columns" multiple="true" dynamic_options="get_chrom_cols(dbtype = db['dbtype'], dburl = db['dburl'], dbtoken = db['dbtoken'], col_field = db['dbchromcolfield'])" help="Select here the set of chromatographic columns against which the retention time matching will be run."/> <!-- Tolerances --> <param name="tolx" label="RTX retention time tolerance, parameter x (in seconds)" type="float" help="" value="5"/> @@ -174,38 +199,23 @@ </conditional> <!-- OUTPUT --> - <conditional name="out"> - - <param name="enabled" label="Output settings" type="select"> - <option value="false">Default</option> - <option value="true">Customized</option> - </param> - - <when value="false"></when> - <when value="true"> - - <!-- Output field names --> - <param name="outputfields" label="Output column names" type="text" size="256" value="mz=mz,rt=rt,chromcol=chromcol,chromcolrt=chromcolrt,compoundid=compoundid,peakattr=peakattr,peakcomp=peakcomp,intensity=intensity,relative.intensity=relative.intensity,mzexp=mzexp,mztheo=mztheo,fullnames=fullnames,compoundmass=compoundmass,compoundcomp=compoundcomp,inchi=inchi,inchikey=inchikey,pubchemcompid=pubchemcompid,chebiid=chebiid,hmdbid=hmdbid,keggid=keggid" help=""/> - - <!-- Molecule IDs separator character --> - <param name="molidssep" label="Molecule IDs separator character" type="text" size="3" value="|" help=""> - <sanitizer> - <valid initial="string.printable"> - <remove value='"'/> - </valid> - <mapping initial="none"> - <add source='"' target='\"'/> - </mapping> - </sanitizer> - </param> - </when> - </conditional> + <!-- Molecule IDs separator character --> + <param name="molidssep" label="Molecule IDs separator character" type="text" size="3" value="|" help=""> + <sanitizer> + <valid initial="string.printable"> + <remove value='"'/> + </valid> + <mapping initial="none"> + <add source='"' target='\"'/> + </mapping> + </sanitizer> + </param> </inputs> - <!--~~~~~~~ - ~ OUTPUTS ~ - ~~~~~~~~--> + <!--======= + = OUTPUTS = + ========--> <outputs> @@ -216,9 +226,9 @@ </outputs> - <!--~~~~~ - ~ TESTS ~ - ~~~~~~--> + <!--===== + = TESTS = + ======--> <tests> @@ -229,7 +239,8 @@ <param name="dbfields" value=""/> <param name="dbmsmodes" value=""/> <param name="mzrtinput" value="mz-input-small.tsv"/> - <param name="inputfields" value=""/> + <param name="inputmzfield" value="mzmed"/> + <param name="inputrtfield" value="rtmed"/> <param name="mzmode" value="pos"/> <output name="mainoutput" file="filedb-small-mz-match-output.tsv"/> <output name="peaksoutput" file="filedb-small-mz-match-peaks-output.tsv"/> @@ -253,9 +264,9 @@ --> </tests> - <!--~~~~ - ~ HELP ~ - ~~~~~--> + <!--==== + = HELP = + =====--> <help> <!-- @@@BEGIN_RST@@@ --> @@ -272,7 +283,7 @@ When selecting the database, you have the choice between a Peakforest database or an in-house file. -For the Peakforest database, a default REST web base address is already provided. But you can change it of you want to use a custom database. A field is also available for setting a token key in case the access to the Peakforest database you want to use is restricted. This is the case of the default database. +For the Peakforest database, a default REST web base address is already provided. But you can change it to use a custom database. A field is also available for setting a token key in case the access to the Peakforest database you want to use is restricted. This is the case of the default database URL. For the in-house file, please refer to the paragraph "Single file database" below. @@ -285,50 +296,13 @@ Single file database ==================== -The database used is provided as a single file, in tabular format, through the *Database file* field. This file contains a list of MS peaks, with retention times. -Peaks are "duplicated" as much as necessary. For instance if 3 retention times are available on a compound with 10 peaks in positive mode, then there will be 30 lines for this compounds in positive mode. - -The file must contain a header with the column names. The names are free, but must be provided through the *File database column names* field. -In this field, each column is identified with a tag, and the columns names are listed as a comma separated list of tag/name couples (separated by character `=`). The allowed tags are the following ones: +The database used is provided as a single file, in tabular format, through the *Database file* field. This file must contain a list of MS peaks, with possibly retention times. +Peaks are "duplicated" as much as necessary. For instance if 3 retention times are available on a compound with 10 peaks in positive mode, then there will be 30 lines for this compound in positive mode. -+--------------+------------+------------------------------------------------------------------------------------------------------------+ -| Column tag | Compulsory | Values | -+==============+============+============================================================================================================+ -| mztheo | Yes | The m/z values. | -+--------------+------------+------------------------------------------------------------------------------------------------------------+ -| mode | Yes | The MS mode. | -+--------------+------------+------------------------------------------------------------------------------------------------------------+ -| molid | Yes | This is the identifier of your compound. | -+--------------+------------+------------------------------------------------------------------------------------------------------------+ -| colrt | No | The retention time values in seconds. | -+--------------+------------+------------------------------------------------------------------------------------------------------------+ -| col | No | The chromatographic column associated with the retention time. Compulsory if retention times are provided. | -+--------------+------------+------------------------------------------------------------------------------------------------------------+ -| attr | No | The attribution of the peak (e.g.: ``[(M+H)-(H2O)-(NH3)]+``). | -+--------------+------------+------------------------------------------------------------------------------------------------------------+ -| comp | No | The composition of the peak (e.g.: ``C6 H10 N O``). | -+--------------+------------+------------------------------------------------------------------------------------------------------------+ -| molcomp | No | The composition of the molecule. (e.g.: ``C6H14N2O2``). | -+--------------+------------+------------------------------------------------------------------------------------------------------------+ -| molmass | No | The mass of the molecule. | -+--------------+------------+------------------------------------------------------------------------------------------------------------+ -| molnames | No | The names of the molecule, as a semicolon separated list. | -+--------------+------------+------------------------------------------------------------------------------------------------------------+ -| inchi | No | The InChI of the molecule. | -+--------------+------------+------------------------------------------------------------------------------------------------------------+ -| inchikey | No | The InChI key of the molecule. | -+--------------+------------+------------------------------------------------------------------------------------------------------------+ -| pubchem | No | The PubChem ID of the molecule. | -+--------------+------------+------------------------------------------------------------------------------------------------------------+ -| chebi | No | The ChEBI ID of the molecule. | -+--------------+------------+------------------------------------------------------------------------------------------------------------+ -| hmdb | No | The HMDB ID of the molecule. | -+--------------+------------+------------------------------------------------------------------------------------------------------------+ -| kegg | No | The KEGG ID of the molecule. | -+--------------+------------+------------------------------------------------------------------------------------------------------------+ +The file must contain a header with the column names. The names are free, but must be provided through the different fields named *Database file ... column name*. +Then you must provide the values used to identify the MS modes (positive and negative). -The field *File database MS modes* allows you to personalize the MS mode identifiers. The value of the field is a comma separated list of mode/name couples (separated by character `=`).. -For instance, if in your database file you use characters '+' and '-' to identify the modes, then you must set the field to `pos=+,neg=-`. +A last information about the single file database is the unit of the retention times, either in seconds or in minutes. Example of database file (totally fake, no meaning): @@ -361,24 +335,12 @@ MZ/RT input file ================ -The input to provide is a file, in a tabular format (or TSV: Tab Seperated Values), containing the list of MZ/RT values. - -The following columns will be used: +The input to provide is a file, in a tabular format (or TSV: Tab Seperated Values), containing the list of M/Z values, with possibly also RT values. -+--------------+------------+---------------------------------------+ -| Column tag | Compulsory | Values | -+==============+============+=======================================+ -| mz | Yes | The m/z values. | -+--------------+------------+---------------------------------------+ -| rt | No | The retention time values in seconds. | -+--------------+------------+---------------------------------------+ +The column names for the M/Z and RT values must be provided through the fields *Input file MZ column name* and *Input file RT column name*. +As a consequence, the file must contain a header line. -The file may contain a header line, in which case you have to provide the column names through the *Input file column names* field, which consists in a comma separated list of tag/name couples (separated by character `=`). If your file does not contain a header line, then you must provide the column numbers. Examples: - - * With a header line having name MASS for mz column and RET for rt column: `mz=MASS,rt=RET`. - * With no header line: `mz=1,rt=2`. - -Since the MS spectrum mode can not be known from the file, an *MS mode* radio button field is provided for setting the mode. +The unit of the retention time has to be provided with the field *Retention time unit*. Example of file input: @@ -408,15 +370,15 @@ The parameters *M/Z precision* and *M/Z shift* are used by the algorithm in the following formula in order to match an *m/z* value: - mz (1 + (- shift - precision) / 10^6) < mztheo < mz (1 + (- shift - precision) / 10^6) + mz (1 + (- shift - precision) / 10^6) < mzref < mz (1 + (- shift - precision) / 10^6) -Where *mztheo* is the theoretical mass of the database peak that is tested. If this double inequality is true, then the *m/z* value is matched with this peak. +Where *mzref* is the M/Z of reference from the database peak that is tested. If this double inequality is true, then the *m/z* value is matched with this peak. -------------------- Retention time match -------------------- -If at least one column is checked inside the *Columns* parameter section, then retention time is also matched, in addition to the *m/z* value, according to the following formula: +If at least one column is selected inside the *Chromatographic columns* parameter section, then retention time is also matched, in addition to the *m/z* value, according to the following formula: rt - x - rt^y < colrt < rt + x + rt^y @@ -452,48 +414,6 @@ Output settings --------------- -The *Output column names* parameter is used to customize the columns of the output files. As with the *File database column names* parameter, each column is identified with a tag, and the columns names are listed as a comma separated list of tag/name couples (separated by character `=`). The allowed tags are the following ones: - -+--------------+---------------------------------------------------------------------------------------------------------------------------------+ -| Column tag | Values | -+==============+=================================================================================================================================+ -| mz | The m/z values from the input file. | -+--------------+---------------------------------------------------------------------------------------------------------------------------------+ -| mztheo | The m/z values from the database. | -+--------------+---------------------------------------------------------------------------------------------------------------------------------+ -| molid | This is the identifier of your compound. | -+--------------+---------------------------------------------------------------------------------------------------------------------------------+ -| rt | The retention time values in seconds from the input file. | -+--------------+---------------------------------------------------------------------------------------------------------------------------------+ -| col | The chromatographic column associated with the retention time. | -+--------------+---------------------------------------------------------------------------------------------------------------------------------+ -| colrt | The retention time associated with the matched chromatographic column. | -+--------------+---------------------------------------------------------------------------------------------------------------------------------+ -| msmatching | The list IDs of matched molecules. IDs are separated by the character specified in the *Molecule IDs separator character* field | -+--------------+---------------------------------------------------------------------------------------------------------------------------------+ -| attr | The attribution of the peak (e.g.: ``[(M+H)-(H2O)-(NH3)]+``). | -+--------------+---------------------------------------------------------------------------------------------------------------------------------+ -| comp | The composition of the peak (e.g.: ``C6 H10 N O``). | -+--------------+---------------------------------------------------------------------------------------------------------------------------------+ -| molcomp | The composition of the molecule. (e.g.: ``C6H14N2O2``). | -+--------------+---------------------------------------------------------------------------------------------------------------------------------+ -| molmass | The mass of the molecule. | -+--------------+---------------------------------------------------------------------------------------------------------------------------------+ -| molnames | The names of the molecule, as a semicolon separated list. | -+--------------+---------------------------------------------------------------------------------------------------------------------------------+ -| inchi | The InChI of the molecule. | -+--------------+---------------------------------------------------------------------------------------------------------------------------------+ -| inchikey | The InChI key of the molecule. | -+--------------+---------------------------------------------------------------------------------------------------------------------------------+ -| pubchem | The PubChem ID of the molecule. | -+--------------+---------------------------------------------------------------------------------------------------------------------------------+ -| chebi | The ChEBI ID of the molecule. | -+--------------+---------------------------------------------------------------------------------------------------------------------------------+ -| hmdb | The HMDB ID of the molecule. | -+--------------+---------------------------------------------------------------------------------------------------------------------------------+ -| kegg | The KEGG ID of the molecule. | -+--------------+---------------------------------------------------------------------------------------------------------------------------------+ - The *Molecule IDs separator character* is used to customize the character used to separate the molecule IDs of the **molid** column inside the *main* output file. Output files @@ -540,9 +460,9 @@ <!-- @@@END_RST@@@ --> </help> - <!--~~~~~~~~~ - ~ CITATIONS ~ - ~~~~~~~~~~--> + <!--========= + = CITATIONS = + ==========--> <citations/>
--- a/list-chrom-cols.py Tue Mar 14 12:40:22 2017 -0400 +++ b/list-chrom-cols.py Wed Apr 19 10:00:05 2017 -0400 @@ -1,4 +1,5 @@ #!/usr/bin/env python +# vi: fdm=marker import argparse import subprocess @@ -7,7 +8,10 @@ import json import csv -def get_chrom_cols(dbtype, dburl, dbtoken = None, dbfields = None): +# Get chrom cols {{{1 +################################################################ + +def get_chrom_cols(dbtype, dburl, dbtoken = None, col_field = 'chromcol'): cols = [] @@ -19,19 +23,14 @@ v = json.JSONDecoder().decode(result) i = 0 for colid, coldesc in v.iteritems(): - cols.append( (coldesc['name'], colid, i == 0) ) + s = coldesc['name'] + ' - ' + coldesc['constructor'] + ' - L' + str(coldesc['length']) + ' - diam. ' + str(coldesc['diameter']) + ' - part. ' + str(coldesc['particule_size']) + ' - flow ' + str(coldesc['flow_rate']) + cols.append( (s , colid, i == 0) ) ++i elif dbtype == 'inhouse': - # Get field for chromatographic column name - col_field = 'chromcol' - if dbfields is not None: - fields = dict(u.split("=") for u in dbfields.split(",")) - if 'chromcol' in fields: - col_field = fields['chromcol'] # Get all column names from file - with open(dburl if isinstance(dburl, str) else dburl.get_file_name(), 'rb') as dbfile: + with open(dburl if isinstance(dburl, str) else dburl.get_file_name(), 'r') as dbfile: reader = csv.reader(dbfile, delimiter = "\t", quotechar='"') header = reader.next() if col_field in header: @@ -46,9 +45,8 @@ return cols -######## -# MAIN # -######## +# Main {{{1 +################################################################ if __name__ == '__main__': @@ -57,7 +55,7 @@ parser.add_argument('-d', help = 'Database type', dest = 'dbtype', required = True) parser.add_argument('-u', help = 'Database URL', dest = 'dburl', required = True) parser.add_argument('-t', help = 'Database token', dest = 'dbtoken', required = False) - parser.add_argument('-f', help = 'Database fields', dest = 'dbfields', required = False) + parser.add_argument('-f', help = 'Chromatogrphic column field name', dest = 'col_field', required = False) args = parser.parse_args() args_dict = vars(args)
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/list-file-cols.py Wed Apr 19 10:00:05 2017 -0400 @@ -0,0 +1,61 @@ +#!/usr/bin/env python +# vi: fdm=marker + +import csv +import re +import argparse + +# Get file cols {{{1 +################################################################ + +def get_file_cols(file, preferred): + + cols = [] + + with open(file if isinstance(file, str) else file.get_file_name(), 'r') as f: + + # Read file header + reader = csv.reader(f, delimiter = "\t", quotechar='"') + header = reader.next() + + preferred = preferred.split(',') + + # Determine default value + perfect_matches = [] + partial_matches = [] + for p in preferred: + for c in header: + if c == p: + perfect_matches.append(c) # Perfect match ! + elif re.match(p, c): + partial_matches.append(c) # Keep this partial match in case we find no perfect match + + ordered_cols = perfect_matches + partial_matches + for c in header: + if not c in ordered_cols: + ordered_cols.append(c) + ordered_cols.append('NA') + + default = 0 + if len(perfect_matches) + len(partial_matches) == 0: + default = len(ordered_cols) - 1 + + # Build list of cols + for i, c in enumerate(ordered_cols): + cols.append( (c, c, i == default) ) + + return cols + +# Main {{{1 +################################################################ + +if __name__ == '__main__': + + # Parse command line arguments + parser = argparse.ArgumentParser(description='Script for getting column names in a csv file.') + parser.add_argument('-f', help = 'CSV File (separator must be TAB)', dest = 'file', required = True) + parser.add_argument('-p', help = 'List (comma separated values) of preferred column names for default one.', dest = 'preferred', required = True) + args = parser.parse_args() + args_dict = vars(args) + + print(get_file_cols(**args_dict))
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/list-ms-mode-values.py Wed Apr 19 10:00:05 2017 -0400 @@ -0,0 +1,60 @@ +#!/usr/bin/env python +# vi: fdm=marker + +import csv +import re +import argparse + +# Get MS mode values {{{1 +################################################################ + +def get_ms_mode_value(file, col, preferred): + + modes = [] + cols = [] + preferred = preferred.split(',') + + with open(file if isinstance(file, str) else file.get_file_name(), 'r') as f: + + # Read file header + reader = csv.reader(f, delimiter = "\t", quotechar='"') + header = reader.next() + try: + index = header.index(col) + for row in reader: + v = row[index] + if v not in modes: + modes.append(v) + + # Find default value + default = 0 + for p in preferred: + for i, m in enumerate(modes): + if m == p: + default = i + break + if default != 0: + break + + # Build list of cols + for i, c in enumerate(modes): + cols.append( (c, c, i == default) ) + except: + pass + + return cols + +# Main {{{1 +################################################################ + +if __name__ == '__main__': + + # Parse command line arguments + parser = argparse.ArgumentParser(description='Script for getting column names in a csv file.') + parser.add_argument('-f', help = 'CSV File (separator must be TAB)', dest = 'file', required = True) + parser.add_argument('-c', help = 'MS mode column name.', dest = 'col', required = True) + parser.add_argument('-p', help = 'List (comma separated values) of preferred column names for default one.', dest = 'preferred', required = True) + args = parser.parse_args() + args_dict = vars(args) + + print(get_ms_mode_value(**args_dict))
--- a/massdb-helper.R Tue Mar 14 12:40:22 2017 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,92 +0,0 @@ -simplifySpectrum <- function(spec) { - if(length(spec) == 0){ - return(NA_real_) - } - #print(spec) - if (nrow(spec) == 0) - return(NA_real_) - if (ncol(spec) != 2) { - spec[, BIODB.PEAK.MZ] - mint <- BIODB.GROUP.INTENSITY %in% colnames(spec) - pint <- which(mint[1]) - if (length(pint) == 0) - stop( - "No intensity column founds, if there is more than 2 column, columns should be named", - paste0(BIODB.GROUP.INTENSITY, collapse = ", ") - ) - spec <- spec[, c(BIODB.PEAK.MZ, BIODB.GROUP.INTENSITY[pint[1]])] - ###Normalizing the intenities. - } - spec[, 2] <- as.numeric(spec[, 2]) * 100 / max(as.numeric(spec[, 2])) - colnames(spec) <- c(BIODB.PEAK.MZ, BIODB.PEAK.RELATIVE.INTENSITY) - spec -} - - - -calcDistance <- - function(spec1 , - spec2, - npmin = 2, - fun = c("wcosine"), - params = list()) { - #fun <- match.arg(fun) - - #SPec are always notmlized in pourcentage toa voir issues; - spec1 <- simplifySpectrum(spec1) - spec2 <- simplifySpectrum(spec2) - if(is.na(spec1)||is.na(spec2)) return(list(matched=numeric(0),similarity=0)) - params$mz1 <- as.numeric(spec1[, BIODB.PEAK.MZ]) - params$mz2 <- as.numeric(spec2[, BIODB.PEAK.MZ]) - params$int1 <- as.numeric(spec1[, BIODB.PEAK.RELATIVE.INTENSITY]) - params$int2 <- as.numeric(spec2[, BIODB.PEAK.RELATIVE.INTENSITY]) - res <- do.call(fun, args = params) - if (sum(res$matched != -1) < npmin) - return(list(matched = res$matched, similarity = 0)) - list(matched = res$matched, - similarity = res$measure) - } - - - -###The returned sim list is not ordered -compareSpectra <- - function(spec, - libspec, - npmin = 2, - fun = BIODB.MSMS.DIST.WCOSINE, - params = list(), - decreasing = TRUE) { - #fun <- match.arg(fun) - if (length(libspec) == 0) { - return(NULL) - } - if (nrow(spec) == 0) { - return(NULL) - } - - ####spec is directly normalized. - vall <- - sapply( - libspec, - calcDistance, - spec1 = spec, - params = params, - fun = fun, - simplify = FALSE - ) - ####the list is ordered with the chosen metric. - sim <- - vapply(vall, - '[[', - i = "similarity", - FUN.VALUE = ifelse(decreasing, 0, 1)) - osim <- order(sim, decreasing = decreasing) - matched <- sapply(vall, '[[', i = "matched", simplify = FALSE) - - return(list( - ord = osim, - matched = matched, - similarity = sim - )) - }
--- a/msdb-common.R Tue Mar 14 12:40:22 2017 -0400 +++ b/msdb-common.R Wed Apr 19 10:00:05 2017 -0400 @@ -9,33 +9,31 @@ ############# # Field tags - MSDB.TAG.MZ <- BIODB.PEAK.MZ - MSDB.TAG.MZEXP <- BIODB.PEAK.MZEXP - MSDB.TAG.MZTHEO <- BIODB.PEAK.MZTHEO - MSDB.TAG.RT <- BIODB.PEAK.RT - MSDB.TAG.MODE <- BIODB.MSMODE - MSDB.TAG.MOLID <- BIODB.COMPOUND.ID - MSDB.TAG.COL <- BIODB.CHROM.COL - MSDB.TAG.COLRT <- BIODB.CHROM.COL.RT - MSDB.TAG.ATTR <- BIODB.PEAK.ATTR - MSDB.TAG.INT <- BIODB.PEAK.INTENSITY - MSDB.TAG.REL <- BIODB.PEAK.RELATIVE.INTENSITY - MSDB.TAG.COMP <- BIODB.PEAK.COMP - MSDB.TAG.MOLNAMES <- BIODB.FULLNAMES - MSDB.TAG.MOLCOMP <- BIODB.COMPOUND.MASS -# MSDB.TAG.MOLATTR <- 'molattr' - MSDB.TAG.MOLMASS <- BIODB.COMPOUND.COMP - MSDB.TAG.INCHI <- BIODB.INCHI - MSDB.TAG.INCHIKEY <- BIODB.INCHIKEY - # TODO Use BIODB tags. - MSDB.TAG.PUBCHEM <- BIODB.PUBCHEMCOMP.ID - MSDB.TAG.CHEBI <- BIODB.CHEBI.ID - MSDB.TAG.HMDB <- BIODB.HMDB.ID - MSDB.TAG.KEGG <- BIODB.KEGG.ID + MSDB.TAG.MZ <- 'mz' + MSDB.TAG.MZEXP <- 'mzexp' + MSDB.TAG.MZTHEO <- 'mztheo' + MSDB.TAG.RT <- 'rt' + MSDB.TAG.MODE <- 'msmode' + MSDB.TAG.MOLID <- 'compoundid' + MSDB.TAG.COL <- 'chromcol' + MSDB.TAG.COLRT <- 'chromcolrt' + MSDB.TAG.ATTR <- 'peakattr' + MSDB.TAG.INT <- 'intensity' + MSDB.TAG.REL <- 'relative.intensity' + MSDB.TAG.COMP <- 'peakcomp' + MSDB.TAG.MOLNAMES <- 'fullnames' + MSDB.TAG.MOLCOMP <- 'compoundmass' + MSDB.TAG.MOLMASS <- 'compoundcomp' + MSDB.TAG.INCHI <- 'inchi' + MSDB.TAG.INCHIKEY <- 'inchikey' + MSDB.TAG.PUBCHEM <- 'pubchemcompid' + MSDB.TAG.CHEBI <- 'chebiid' + MSDB.TAG.HMDB <- 'hmdbid' + MSDB.TAG.KEGG <- 'keggid' # Mode tags - MSDB.TAG.POS <- BIODB.MSMODE.NEG - MSDB.TAG.NEG <- BIODB.MSMODE.POS + MSDB.TAG.POS <- 'neg' + MSDB.TAG.NEG <- 'pos' # Fields containing multiple values MSDB.MULTIVAL.FIELDS <- c(MSDB.TAG.MOLNAMES) @@ -46,11 +44,15 @@ MSDB.MZTOLUNIT.PLAIN <- 'plain' # same as mz: mass-to-charge ratio MSDB.MZTOLUNIT.VALS <- c(MSDB.MZTOLUNIT.PPM, MSDB.MZTOLUNIT.PLAIN) + # Authorized rt units + MSDB.RTUNIT.SEC <- 'sec' + MSDB.RTUNIT.MIN <- 'min' + MSDB.RTUNIT.VALS <- c(MSDB.RTUNIT.SEC ,MSDB.RTUNIT.MIN) + # Default values MSDB.DFT.PREC <- list() MSDB.DFT.PREC[[MSDB.TAG.POS]] <- c("[(M+H)]+", "[M+H]+", "[(M+Na)]+", "[M+Na]+", "[(M+K)]+", "[M+K]+") MSDB.DFT.PREC[[MSDB.TAG.NEG]] <- c("[(M-H)]-", "[M-H]-", "[(M+Cl)]-", "[M+Cl]-") - MSDB.DFT.OUTPUT.FIELDS <- list( mz = 'mz', rt = 'rt', col = 'col', colrt = 'colrt', molid = 'id', attr = 'attribution', comp = 'composition', int = 'intensity', rel = 'relative', mzexp = 'mzexp', mztheo = 'mztheo', msmatching = 'msmatching', molnames = 'molnames', molcomp = 'molcomp', molmass = 'molmass', inchi = 'inchi', inchikey = 'inchikey', pubchem = 'pubchem', chebi = 'chebi', hmdb = 'hmdb', kegg = 'kegg') MSDB.DFT.OUTPUT.MULTIVAL.FIELD.SEP <- MSDB.MULTIVAL.FIELD.SEP MSDB.DFT.MATCH.FIELDS <- list( molids = 'molid', molnames = 'molnames') MSDB.DFT.MATCH.SEP <- ',' @@ -71,20 +73,6 @@ return(dft.fields) } - ############################# - # GET DEFAULT OUTPUT FIELDS # - ############################# - - msdb.get.dft.output.fields <- function () { - - dft.fields <- list() - - for(f in c(MSDB.TAG.MZ, MSDB.TAG.RT, MSDB.TAG.COL, MSDB.TAG.COLRT, MSDB.TAG.MOLID, MSDB.TAG.ATTR, MSDB.TAG.COMP, MSDB.TAG.INT, MSDB.TAG.REL, MSDB.TAG.MZEXP, MSDB.TAG.MZTHEO, MSDB.TAG.MOLNAMES, MSDB.TAG.MOLCOMP, MSDB.TAG.MOLMASS, MSDB.TAG.INCHI, MSDB.TAG.INCHIKEY, MSDB.TAG.PUBCHEM, MSDB.TAG.CHEBI, MSDB.TAG.HMDB, MSDB.TAG.KEGG)) - dft.fields[[f]] <- f - - return(dft.fields) - } - ######################### # GET DEFAULT DB FIELDS # ######################### @@ -118,7 +106,7 @@ # MAKE INPUT DATA FRAME # ######################### - msdb.make.input.df <- function(mz, rt = NULL) { + msdb.make.input.df <- function(mz, rt = NULL, rtunit = MSDB.RTUNIT.SEC) { field <- msdb.get.dft.input.fields() @@ -134,6 +122,8 @@ # Set rt if ( ! is.null(rt)) { + if (rtunit == MSDB.RTUNIT.MIN) + rtunit <- rtunit * 60 if (length(rt) > 1) x[seq(rt), field[[MSDB.TAG.RT]]] <- rt else if (length(rt) == 1)
--- a/mysql.R Tue Mar 14 12:40:22 2017 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,207 +0,0 @@ -library(RMySQL, quietly = TRUE) - -############# -# RUN QUERY # -############# - -# conn The connection to the database. -# queries A query or a list of queries. -# close Close query with a ';' if not already done. -# RETURN The last query result. -run_query <- function(conn, queries, close = TRUE) { - - for (query in queries) { - - # Append ';' - if (close) { - n <- nchar(query) - if (substr(query, n, n) != ';') - query <- paste0(query, ';') - } - - # Send query - result <- dbSendQuery(conn, query) - - # Test that everything went right -# if ( ! dbHasCompleted(result)) -# stop("Can't run the following query : ", query) - } - - # Return result - return(invisible(result)) -} - -################ -# RUN SQL FILE # -################ - -# conn The connection to the DBMS. -# file The path to the SQL file. -run_sql_file <- function(conn, file) { - - # Split SQL into single queries and put them into a list - queries <- character() - query <- "" - for (line in readLines(file)) { - line <- sub('^(.*)\\s*--.*$', '\\1', line, perl = TRUE) # remove one line comment - if (grepl("^\\s*$", line)) next # empty line - query <- paste(query, line) - if (grepl(";\\s*$", line, perl = TRUE)) { - query <- gsub("\t", " ", query, perl = TRUE) # replace tabulation by spaces - query <- gsub("/\\*.*\\*/", "", query, perl = TRUE) # remove multiline comments - queries <- c(queries, query) - query <- "" - } - } - - # Run queries - invisible(run_query(conn, queries)) -} - -################# -# DROP DATABASE # -################# - -# conn The connection to the DBMS. -# db The name of the database to drop. -# fail_if_doesnt_exist Fails if database doesn't exist. -drop_database <- function(conn, db, fail_if_doesnt_exist = FALSE) { - invisible(run_query(conn, paste("drop database", if (fail_if_doesnt_exist) "" else "if exists", db))) -} - -################### -# CREATE DATABASE # -################### - -# conn The connection to the DBMS. -# db The name of the database to create. -# drop Drop/erase existing database. -# encoding Set the character set encoding to use as default for the database. -# use If true, switch to the newly created database. -create_database <- function(conn, db, drop = FALSE, encoding = 'utf8', use = TRUE) { - - # Drop database - if (drop) drop_database(conn, db) - - # Create database - enc <- if (is.null(encoding) || is.na(encoding)) "" else paste("character set", encoding) - run_query(conn, paste("create database", db, enc)) - - # Switch to database - invisible(run_query(conn, paste("use", db))) -} - -############################## -# CONVERT VALUE TO SQL VALUE # -############################## - -to_sql_value <- function(x) { - - # NA or NULL - if (length(x) == 0 || is.na(x) || is.null(x)) - return('null') - - # String - if (is.character(x)) - return(paste0('"', as.character(x), '"')) - - return(x) -} - -#################### -# MAKE INSERT LINE # -#################### - -make_insert_line <- function(values) { - values <- lapply(values, to_sql_value) - return(paste0("(", paste(values, collapse=','), ")")) -} - -########## -# INSERT # -########## - -# Run a insert query on a MySQL database. -# conn Connection to a database. -# table Table name. -# fields List of field names. -# values List of list of values. NA values will be translated as NULL. -insert <- function(conn, table, fields, values) { - - # Do nothing if no values - if (length(values) == 0 ) return - - # Build header - h <- paste("insert into", table) - h <- paste0(h, "(", paste(fields, collapse = ','), ")") - h <- paste(h, "values") - - qr <- paste(h, paste0(lapply(values, make_insert_line), collapse=','), ';') - - # Send query - run_query(conn, qr) -} - -######## -# JOIN # -######## - -Join <- setRefClass("Join", fields = list(table = "character", left_field = "character", right_field = "character", outer = "character")) - -Join$methods( initialize = function(table, left_field, right_field, outer = NA_character_) { - table <<- table - left_field <<- left_field - right_field <<- right_field - outer <<- outer -}) - -Join$methods( getStatement = function() { - type <- 'INNER JOIN' - if ( ! is.na(outer)) - switch(tolower(outer), - left = type <- 'LEFT OUTER JOIN', - right = type <- 'RIGHT OUTER JOIN', - stop('Error in join outer type. "', outer ,'" is unknown. You must choose between "LEFT" and "RIGHT".') - ) - - return(paste(type, .self$table, 'ON', .self$left_field, '=', .self$right_field)) -}) - -########## -# SELECT # -########## - -# Run a select query on a MySQL database. Returns the dataframe of results. -# conn Connection to a database. -select <- function(conn, fields, from, joins = NULL , where = NULL, orderby = NULL) { - - # Select/from - rq <- paste("SELECT ", paste(fields, collapse = ', '), 'FROM', from) - - # Joins - if ( ! is.null(joins) && length(joins) > 0) - rq <- paste(rq, paste(lapply(joins, function (x) x$getStatement() ), collapse = ' ')) - - # Where - if ( ! is.null(where)) rq <- paste(rq, 'WHERE', where) - - # Order by - if ( ! is.null(orderby)) rq <- paste(rq, 'ORDER BY', orderby) - - # End request, send it and get results - rq <- paste0(rq, ';') - res <- try(dbSendQuery(conn, rq)) - data <- fetch(res, n=-1) - - return(data) -} - -####################### -# SELECT SINGLE FIELD # -####################### - -select_single_field <- function(conn, field, from, where = NULL) { - values <- select(conn, fields = field, from = from, where = where) - val <- if (field %in% colnames(values) && length(values[field][[1]]) > 0) values[field][[1]] else NA_character_ - return(val) -}
--- a/search-mz Tue Mar 14 12:40:22 2017 -0400 +++ b/search-mz Wed Apr 19 10:00:05 2017 -0400 @@ -1,13 +1,9 @@ #!/usr/bin/env Rscript -# vi: ft=R +# vi: ft=R fdm=marker args <- commandArgs(trailingOnly = F) script.path <- sub("--file=","",args[grep("--file=",args)]) library(getopt) source(file.path(dirname(script.path), 'msdb-common.R'), chdir = TRUE) -source(file.path(dirname(script.path), 'MsFileDb.R'), chdir = TRUE) -source(file.path(dirname(script.path), 'MsPeakForestDb.R'), chdir = TRUE) -source(file.path(dirname(script.path), 'MsXlsDb.R'), chdir = TRUE) -source(file.path(dirname(script.path), 'Ms4TabSqlDb.R'), chdir = TRUE) source(file.path(dirname(script.path), 'MsDbLogger.R'), chdir = TRUE) source(file.path(dirname(script.path), 'MsDbInputDataFrameStream.R'), chdir = TRUE) source(file.path(dirname(script.path), 'MsDbOutputDataFrameStream.R'), chdir = TRUE) @@ -21,9 +17,8 @@ if (as.integer(R.Version()$major) == 2 && as.numeric(R.Version()$minor) < 15) paste0 <- function(...) paste(..., sep = '') -############# -# CONSTANTS # -############# +# Constants {{{1 +################################################################ PROG <- sub('^.*/([^/]+)$', '\\1', commandArgs()[4], perl = TRUE) USERAGENT <- 'search-mz ; pierrick.roger@gmail.com' @@ -34,6 +29,11 @@ MSDB.FILE <- 'file' MSDB.PEAKFOREST <- 'peakforest' MSDB.VALS <- c(MSDB.XLS, MSDB.4TABSQL, MSDB.FILE, MSDB.PEAKFOREST) +DB.SRC.FILE <- list () +DB.SRC.FILE[[MSDB.FILE]] <- 'MsFileDb.R' +DB.SRC.FILE[[MSDB.PEAKFOREST]] <- 'MsPeakForestDb.R' +DB.SRC.FILE[[MSDB.XLS]] <- 'MsXlsDb.R' +DB.SRC.FILE[[MSDB.4TABSQL]] <- 'Ms4TabSqlDb.R' # Authorized mode values POS_MODE <- 'pos' @@ -51,22 +51,64 @@ MSDB.DFT[['db-ms-modes']] <- concat.kv.list(MSDB.DFT.MODES) MSDB.DFT[['pos-prec']] <- paste(MSDB.DFT.PREC[[MSDB.TAG.POS]], collapse = ',') MSDB.DFT[['neg-prec']] <- paste(MSDB.DFT.PREC[[MSDB.TAG.NEG]], collapse = ',') +MSDB.DFT[['db-rt-unit']] <- MSDB.RTUNIT.SEC +MSDB.DFT[['rtunit']] <- MSDB.RTUNIT.SEC DEFAULT.ARG.VALUES <- MSDB.DFT DEFAULT.ARG.VALUES[['input-col-names']] <- concat.kv.list(msdb.get.dft.input.fields()) -DEFAULT.ARG.VALUES[['output-col-names']] <- concat.kv.list(msdb.get.dft.output.fields()) + +# Print help {{{1 +################################################################ + +print.help <- function() { + + cat("USAGE:\n") + prog.mz.match <- paste(PROG, ' -d (', paste(MSDB.VALS, collapse = '|'), ') --url (file|dir|database URL) -i <file> -m (', paste(MSDB.MODE.VALS, collapse = '|'), ") -p <mz precision> -s <mz shift> -u (", paste(MSDB.MZTOLUNIT.VALS, collapse = '|'), ") -o <file>", sep = '') + cat("\t(1) ", prog.mz.match, " ...\n", sep = '') + cat("\n") + cat("\t(2) ", prog.mz.match, "(--all-cols|-c <cols>) -x <X RT tolerance> -y <Y RT tolerance>", " ...\n", sep = '') + cat("\n") + cat("\t(3) ", PROG, ' -d (', paste(MSDB.VALS, collapse = '|'), ") --url (file|dir|database URL) --list-cols\n", sep = '') + + cat("\nDETAILS:\n") + cat("Form (1) is for running an MZ match on a database.\n") + cat("Form (2) is for running an MZ/RT match on a database.\n") + cat("Form (3) is for getting a list of available chromatographic columns in a database.\n") -############## -# PRINT HELP # -############## + cat("\nOPTIONS:\n") + spec <- matrix(make.getopt.spec(), byrow = TRUE, ncol = 5) + max.length.opt.cols <- max(nchar(spec[,1])) + 1 + sections <- list(database = "Database setting", input = "Input file", output = "Output files", mz = "M/Z matching", rt = "RT matching", precursor = "Precursor matching", misc = "Miscellaneous") + for (section in names(sections)) { + cat("\n\t", sections[[section]], ":\n", sep = '') + spec <- matrix(make.getopt.spec(section), byrow = TRUE, ncol = 5) + for (i in seq(nrow(spec))) { + opt <- '' + if ( ! is.na(spec[i,2])) + opt <- paste('-', spec[i,2], '|', sep = '') + opt <- paste(opt, '--', spec[i, 1], sep = '') + nb.space.padding <- max.length.opt.cols - nchar(opt) + 6 + padding <- paste(rep(' ', nb.space.padding), sep = '') + cat("\t\t", opt, padding, "\t", spec[i, 5], "\n", sep = '') + } + } -print.help <- function(spec, status = 0) { - cat(getopt(spec, usage = TRUE, command = PROG)) - q(status = status) + cat("\nEXAMPLES:\n") + + cat("\nSimple M/Z matching with a file database:\n") + cat("\t./", PROG, " -d file --url mydbfile.tsv -i input.tsv -m pos -o output.tsv\n", sep = '') + + cat("\nFile database with M/Z tolerance:\n") + cat("\t./", PROG, " -d file --url mydbfile.tsv -i input.tsv -m pos -o output.tsv -p 0.5 -s 0\n", sep = '') + + cat("\nFile database with M/Z tolerance unit:\n") + cat("\t./", PROG, " -d file --url mydbfile.tsv -i input.tsv -m pos -o output.tsv -p 1 -s 0.5 -u plain\n", sep = '') + + cat("\nPeakforest database:\n") + cat("\t./", PROG, " -d peakforest --url https://rest.peakforest.org/ --db-token <your Peakforest token> -i input.tsv -m pos -o output.tsv\n", sep = '') } -############################### -# SET DEFAULT ARGUMENT VALUES # -############################### +# Set default argument values {{{1 +################################################################ set.dft.arg.val <-function(opt) { @@ -84,16 +126,17 @@ return(opt) } -######################### -# PARSE ARGUMENT VALUES # -######################### +# Parse argument values {{{1 +################################################################ parse.arg.val <- function(opt) { # Parse input column names if ( ! is.null(opt[['db-fields']])) { cust <- split.kv.list(opt[['db-fields']]) + cust <- cust[cust != 'NA'] opt[['db-fields']] <- split.kv.list(MSDB.DFT[['db-fields']]) + cust <- cust[names(cust) %in% names(opt[['db-fields']])] opt[['db-fields']][names(cust)] <- cust } @@ -114,24 +157,11 @@ } else { custcols <- split.kv.list(opt[['input-col-names']]) + custcols <- custcols[custcols != 'NA'] dftcols <- msdb.get.dft.input.fields() opt[['input-col-names']] <- c(custcols, dftcols[ ! names(dftcols) %in% names(custcols)]) } - # Parse output column names - if (is.null(opt[['output-col-names']])) { - # By default keep input col names for output - opt[['output-col-names']] <- msdb.get.dft.output.fields() - input.cols <- names(opt[['input-col-names']]) - output.cols <- names(opt[['output-col-names']]) - opt[['output-col-names']] <- c(opt[['input-col-names']][input.cols %in% output.cols], opt[['output-col-names']][ ! output.cols %in% input.cols]) - } - else { - custcols <- split.kv.list(opt[['output-col-names']]) - dftcols <- msdb.get.dft.output.fields() - opt[['output-col-names']] <- c(custcols, dftcols[ ! names(dftcols) %in% names(custcols)]) - } - # Parse lists of precursors if ( ! is.null(opt[['pos-prec']])) opt[['pos-prec']] <- split.str(opt[['pos-prec']], unlist = TRUE) @@ -141,263 +171,209 @@ return(opt) } -################################# -# PRINT DEFAULT ARGUMENT VALUES # -################################# +# Make getopt specifications {{{1 +################################################################ + +make.getopt.spec <- function(sections = NULL) { + + spec <- character(0) + + if (is.null(sections) || 'input' %in% sections) + spec <- c(spec, + 'input-file', 'i', 1, 'character', 'Set input file.', + 'input-col-names', 'j', 1, 'character', paste0('Set the input column names. Default is "', DEFAULT.ARG.VALUES[['input-col-names']], '".') + ) -print.dft.arg.val <- function(opt) { + if (is.null(sections) || 'mz' %in% sections) + spec <- c(spec, + 'mode', 'm', 1, 'character', paste0('MS mode. Possible values are:', paste(MSDB.MODE.VALS, collapse = ", "), '.'), + 'mzshift', 's', 1, 'numeric', paste0('Shift on m/z. Default is ', MSDB.DFT$mzshift,'.'), + 'mzprec', 'p', 1, 'numeric', paste0('Tolerance on m/z. Default is ', MSDB.DFT$mzprec,'.'), + 'mztolunit', 'u', 1, 'character', paste0('Unit used for tolerance values (options -s and -p) on M/Z. Default is ', MSDB.DFT$mztolunit,'.') + ) + + if (is.null(sections) || 'rt' %in% sections) + spec <- c(spec, + 'all-cols', 'A', 0, 'logical', 'Use all available chromatographic columns to match retention times.', + 'rtcol', 'c', 1, 'character', paste0('Chromatographic column to use. Unset by default. If set, use the corresponding column to filter on retention times, if retention times are provided.'), + 'check-cols', 'k', 0, 'logical', 'Check that the chromatographic column names specified with option -c really exist.', + 'list-cols', 'l', 0, 'logical', 'List all chromatographic columns present in the database. Write list inside the file specified by -o option.', + 'rttol', 'r', 1, 'numeric', paste0('Tolerance on retention times. Unset by default.'), + 'rttolx', 'x', 1, 'numeric', paste0('Tolerance on retention times. Unset by default.'), + 'rttoly', 'y', 1, 'numeric', paste0('Tolerance on retention times. Unset by default.'), + 'rtunit', 'v', 1, 'character', paste0('Retention time unit for the input file. Default is ', MSDB.DFT$rtunit, '. Allowed values are:', paste(MSDB.RTUNIT.VALS, collapse = ", "), '.') + ) - print.flags <- DEFAULT.ARG.VALUES - names(print.flags) <- vapply(names(print.flags), function(x) paste0('print-', x), FUN.VALUE = '') - for (f in names(print.flags)) - if ( ! is.null(opt[[f]])) { - cat(print.flags[[f]]) - q(status = 0) - } -} + if (is.null(sections) || 'precursor' %in% sections) + spec <- c(spec, + 'precursor-match', 'Q', 0, 'logical', 'Remove peaks whose molecule precursor peak has not been matched. Unset by default.', + 'precursor-rt-tol', 'R', 1, 'numeric', paste0('Precursor retention time tolerance. Only used when precursor-match is enabled. Default is ', MSDB.DFT[['precursor-rt-tol']], '.'), + 'pos-prec', 'Y', 1, 'character', paste0('Set the list of precursors to use in positive mode. Default is "', MSDB.DFT[['pos-prec']], '".'), + 'neg-prec', 'Z', 1, 'character', paste0('Set the list of precursors to use in negative mode. Default is "', MSDB.DFT[['neg-prec']], '".') + ) -make.getopt.spec.print.dft <- function() { + if (is.null(sections) || 'output' %in% sections) + spec <- c(spec, + 'output-file', 'o', 1, 'character', 'Set file to use for the main output.', + 'peak-output-file', 'O', 1, 'character', 'If set and if --same-rows is set, then output all matches inside the specified file, with one mz match per line. The output columns are: mz, rt, id, col, colrt, composition, attribution. This means that if an mz value is matched several times, then it will repeated on several lines, with one match description per line.', + 'html-output-file', 'H', 1, 'character', 'Set file to use for the HTML output.', + 'no-main-table-in-html-output', 't', 0, 'logical', 'Do not display main table in HTML output.', + 'same-rows', 'a', 0, 'logical', 'If set, output exactly the same number of rows as the input. This means that in case of multiple matches for one mz, then only one line is output (i.e.: the mz value is not duplicated on several lines). In the main output file, an "ms.matching" column is output with inside, for each mz, a comma separated list of matched component/molecule IDs. If unset, then only the main output file is used, and one single is written to it with one line per peak match, and eventual mz line duplicated if there are multiple matches for this mz.', + 'same-cols', 'b', 0, 'logical', 'If set, output the same columns as inside the input. All input columns are copied to the output.', + 'molids-sep', 'S', 1, 'character', paste0('Set character separator used to when concatenating molecule IDs in output. Default is "', MSDB.DFT[['molids-sep']] , '".'), + 'first-val', '1', 0, 'logical', 'Keep only the first value in multi-value fields. Unset by default.', + 'excel2011comp', 'X', 0, 'logical', 'Excel 2011 compatiblity mode. Output ASCII text files instead of UTF-8 files, where greek letters are replaced with their latin names, plusminus sign is replaced with +- and apostrophe is replaced with \"prime\". All other non-ASCII characters are repladed with underscore.' + ) - spec <- character() + if (is.null(sections) || 'database' %in% sections) + spec <- c(spec, + 'database', 'd', 1, 'character', paste0('Set database to use: "xls" for an Excel database, "file" for a single file database, "4tabsql" for a 4Tab SQL database, and "peakforest" for a connection to PeakForest database.'), + 'url', 'W', 1, 'character', 'URL of database. For "peakforest" database it is the HTTP URL, for the "xls" database it is the path to the directory containing the Excel files, for the "file" database it is the path to the file database and for the "4tabsql" database it is the IP address of the server.', + 'cache-dir', 'C', 1, 'character', 'Path to directory where to store cache files. Only used when database flag is set to "xls".', + 'db-name', 'N', 1, 'character', 'Name of the database. Used by the "4tabsql" database.', + 'db-user', 'U', 1, 'character', 'User of the database. Used by the "4tabsql" database.', + 'db-password', 'P', 1, 'character', 'Password of the database user. Used by the "4tabsql" database.', + 'db-ms-modes', 'M', 1, 'character', paste0('Comma separated key/value list giving the MS modes to be used in the single file database. Default is "', MSDB.DFT[['db-ms-modes']], '".'), + 'db-rt-unit', 'V', 1, 'character', paste0('Retention time unit for the database, used in the single file database. Default is "', MSDB.DFT[['db-rt-unit']], '". Allowed values are:', paste(MSDB.RTUNIT.VALS, collapse = ", "), '.'), + 'db-token', 'T', 1, 'character', 'Database token. Used by Peakforest database.', + 'db-fields', 'F', 1, 'character', paste0('Comma separated key/value list giving the field names to be used in the single file database. Default is "', MSDB.DFT[['db-fields']], '".') + ) - for (f in names(DEFAULT.ARG.VALUES)) - spec <- c(spec, paste0('print-', f), NA_character_, 0, 'logical', paste0('Print default value of --', f)) + if (is.null(sections) || 'misc' %in% sections) + spec <- c(spec, + 'help', 'h', 0, 'logical', 'Print this help.', + 'debug', 'g', 0, 'logical', 'Set debug mode.' + ) return(spec) } -############################## -# MAKE GETOPT SPECIFICATIONS # -############################## - -make.getopt.spec <- function() { - spec = c( - 'help', 'h', 0, 'logical', 'Print this help.', - 'mode', 'm', 1, 'character', paste0('MS mode. Possible values are:', paste(MSDB.MODE.VALS, collapse = ", "), '.'), - 'mzshift', 's', 1, 'numeric', paste0('Shift on m/z, in ppm. Default is ', MSDB.DFT$mzshift,'.'), - 'mzprec', 'p', 1, 'numeric', paste0('Tolerance on m/z, in ppm. Default is ', MSDB.DFT$mzprec,'.'), - 'mztolunit', NA_character_, 1, 'character', paste0('Tolerance on m/z, in ppm. Default is ', MSDB.DFT$mztolunit,'.'), - 'rttol', 'r', 1, 'numeric', paste0('Tolerance on retention times. Unset by default.'), - 'rttolx', 'x', 1, 'numeric', paste0('Tolerance on retention times. Unset by default.'), - 'rttoly', 'y', 1, 'numeric', paste0('Tolerance on retention times. Unset by default.'), - 'rtcol', 'c', 1, 'character', paste0('Chromatographic column to use. Unset by default. If set, use the corresponding column to filter on retention times, if retention times are provided.'), - 'all-cols', NA_character_, 0, 'logical', 'Use all available chromatographic columns to match retention times.', - 'check-cols', NA_character_, 0, 'logical', 'Check that the chromatographic column names specified with option -c really exist.', - 'list-cols', NA_character_, 0, 'logical', 'List all chromatographic columns present in the database. Write list inside the file specified by -o option.', - 'same-rows', 'a', 0, 'logical', 'If set, output exactly the same number of rows as the input. This means that in case of multiple matches for one mz, then only one line is output (i.e.: the mz value is not duplicated on several lines). In the main output file, an "ms.matching" column is output with inside, for each mz, a comma separated list of matched component/molecule IDs. If unset, then only the main output file is used, and one single is written to it with one line per peak match, and eventual mz line duplicated if there are multiple matches for this mz.', - 'same-cols', 'b', 0, 'logical', 'If set, output the same columns as inside the input. All input columns are copied to the output.', - 'input-file', 'i', 1, 'character', 'Set input file.', - 'output-file', 'o', 1, 'character', 'Set file to use for the main output.', - 'peak-output-file', NA_character_, 1, 'character', 'If set and if --same-rows is set, then output all matches inside the specified file, with one mz match per line. The output columns are: mz, rt, id, col, colrt, composition, attribution. This means that if an mz value is matched several times, then it will repeated on several lines, with one match description per line.', - 'html-output-file', NA_character_, 1, 'character', 'Set file to use for the HTML output.', - 'no-main-table-in-html-output', NA_character_, 0, 'logical', 'Do not display main table in HTML output.', - 'precursor-match', NA_character_, 0, 'logical', 'Remove peaks whose molecule precursor peak has not been matched. Unset by default.', - 'precursor-rt-tol', NA_character_, 1, 'numeric', paste0('Precursor retention time tolerance. Only used when precursor-match is enabled. Default is ', MSDB.DFT[['precursor-rt-tol']], '.'), - 'pos-prec', NA_character_, 1, 'character', paste0('Set the list of precursors to use in positive mode. Default is "', MSDB.DFT[['pos-prec']], '".'), - 'neg-prec', NA_character_, 1, 'character', paste0('Set the list of precursors to use in negative mode. Default is "', MSDB.DFT[['neg-prec']], '".'), - 'input-col-names', NA_character_, 1, 'character', paste0('Set the input column names. Default is "', DEFAULT.ARG.VALUES[['input-col-names']], '".'), - 'output-col-names', NA_character_, 1, 'character', paste0('Set the output column names. Default is "', DEFAULT.ARG.VALUES[['output-col-names']], '".'), - 'molids-sep', NA_character_, 1, 'character', paste0('Set character separator used to when concatenating molecule IDs in output. Default is "', MSDB.DFT[['molids-sep']] , '".'), - 'first-val', NA_character_, 0, 'logical', 'Keep only the first value in multi-value fields. Unset by default.', - 'excel2011comp', NA_character_, 0, 'logical', 'Excel 2011 compatiblity mode. Output ASCII text files instead of UTF-8 files, where greek letters are replaced with their latin names, plusminus sign is replaced with +- and apostrophe is replaced with \"prime\". All other non-ASCII characters are repladed with underscore.', - 'database', 'd', 1, 'character', paste0('Set database to use: "xls" for an Excel database, "file" for a single file database, "4tabsql" for a 4Tab SQL database, and "peakforest" for a connection to PeakForest database.'), - 'url', NA_character_, 1, 'character', 'URL of database. For "peakforest" database it is the HTTP URL, for the "xls" database it is the path to the directory containing the Excel files, for the "file" database it is the path to the file database and for the "4tabsql" database it is the IP address of the server.', - 'cache-dir', NA_character_, 1, 'character', 'Path to directory where to store cache files. Only used when database flag is set to "xls".', - 'db-name', NA_character_, 1, 'character', 'Name of the database. Used by the "4tabsql" database.', - 'db-user', NA_character_, 1, 'character', 'User of the database. Used by the "4tabsql" database.', - 'db-password', NA_character_, 1, 'character', 'Password of the database user. Used by the "4tabsql" database.', - 'db-fields', NA_character_, 1, 'character', paste0('Comma separated key/value list giving the field names to be used in the single file database (option --db-file). Default is "', MSDB.DFT[['db-fields']], '".'), - 'db-ms-modes', NA_character_, 1, 'character', paste0('Comma separated key/value list giving the MS modes to be used in the single file database (option --db-file). Default is "', MSDB.DFT[['db-ms-modes']], '".'), - 'db-token', NA_character_, 1, 'character', 'Database token. Used by Peakforest database.', - 'debug', NA_character_, 0, 'logical', 'Set debug mode.' - ) - - spec <- c(spec, make.getopt.spec.print.dft()) - - if ( ! is.null(spec)) - spec <- matrix(spec, byrow = TRUE, ncol = 5) - - return(spec) -} - -############# -# READ ARGS # -############# +# Read args {{{1 +################################################################ read_args <- function() { - # options - spec <- make.getopt.spec() - opt <- getopt(spec) + # Get options + opt <- getopt(matrix(make.getopt.spec(), byrow = TRUE, ncol = 5)) # help - if ( ! is.null(opt$help)) - print.help(spec) + if ( ! is.null(opt$help)) { + print.help() + quit() + } - print.dft.arg.val(opt) # Print default values opt <- set.dft.arg.val(opt) # Set default values opt <- parse.arg.val(opt) # Parse list values # Check values - error <- .check.db.conn.opts(opt) - if (is.null(opt[['output-file']]) && is.null(opt[['list-cols']])) { - warning("You must set a path for the output file.") - error <- TRUE - } - if (is.null(opt[['list-cols']])) { - if (is.null(opt[['input-file']])) { - warning("You must provide an input file.") - error <- TRUE - } - if (is.null(opt$mode) || ( ! opt$mode %in% MSDB.MODE.VALS)) { - warning("You must specify a mode through the --mode option.") - error <- TRUE - } - if (is.null(opt$mzprec)) { - warning("You must set a precision in MZ with the --mzprec option.") - error <- TRUE - } - if ( ( ! is.null(opt$rtcol) || ! is.null(opt[['all-cols']])) && (is.null(opt$rttolx) || is.null(opt$rttoly))) { - warning("When chromatographic columns are set, you must provide values for --rttolx and -rttoly.") - error <- TRUE - } - if (is.null(opt$mztolunit) || ( ! opt$mztolunit %in% MSDB.MZTOLUNIT.VALS)) { - warning("You must specify an M/Z tolerance unit through the --mztolunit option.") - error <- TRUE - } - } - - # help - if (error) - print.help(spec, status = 1) + error <- check.args(opt) return(opt) } - ##################################### - # CHECK DATABASE CONNECTION OPTIONS # - ##################################### - - .check.db.conn.opts <- function(opt) { +# Check args {{{1 +################################################################ + +check.args <- function(opt) { + + # Check database type + if (is.null(opt$database)) + stop("You must provide a database type through --database option.") + if ( ! opt$database %in% MSDB.VALS) + stop(paste0("Invalid value \"", opt$database, "\" for --database option.")) + + # Check filedb database + if (opt$database == MSDB.FILE) { + if (is.null(opt$url)) + stop("When using single file database, you must specify the location of the database file with option --url.") + if ( ! file.exists(opt$url)) + stop(paste0("The file path \"", opt$url,"\" specified with --db-file option is not valid.")) + } - # Print default values - if ( ! is.null(opt[['print-db-fields']])) { - cat(MSDB.DFT[['db-fields']]) - q(status = 0) - } - if ( ! is.null(opt[['print-db-ms-modes']])) { - cat(MSDB.DFT[['db-ms-modes']]) - q(status = 0) - } - - # Check values - error <- FALSE - if (is.null(opt$database)) { - warning("You must provide a database type through --database option.") - error <- TRUE - } - if ( ! opt$database %in% MSDB.VALS) { - warning(paste0("Invalid value \"", opt$database, "\" for --database option.")) - error <- TRUE - } - if (opt$database == MSDB.FILE) { - if (is.null(opt$url)) { - warning("When using single file database, you must specify the location of the database file with option --url.") - error <- TRUE - } - if ( ! file.exists(opt$url)) { - warning(paste0("The file path \"", opt$url,"\" specified with --db-file option is not valid.")) - error <- TRUE - } - } - if (opt$database == MSDB.XLS) { - if (is.null(opt$url)) { - warning("When using Excel database, you must specify the location of the Excel files directory with option --url.") - error <- TRUE - } - if ( ! file.exists(opt$url)) { - warning(paste0("The directory path \"", opt$url,"\" specified with --xls-dir option is not valid.")) - error <- TRUE - } - } - if (opt$database == MSDB.4TABSQL) { - if (is.null(opt$url)) { - warning("When using 4Tab SQL database, you must specify the URL of the SQL server with option --url.") - error <- TRUE - } - if (is.null(opt[['db-name']])) { - warning("When using 4Tab SQL database, you must specify the database name through the --db-name option.") - error <- TRUE - } - if (is.null(opt[['db-user']])) { - warning("When using 4Tab SQL database, you must specify the database user through the --db-user option.") - error <- TRUE - } - if (is.null(opt[['db-password']])) { - warning("When using 4Tab SQL database, you must specify the database user password through the --db-password option.") - error <- TRUE - } - } - if (opt$database == MSDB.PEAKFOREST) { - if (is.null(opt$url)) { - warning("When using PeakForest database, you must specify the URL of the PeakForest server with option --url.") - error <- TRUE - } - } - - return(error) + # Check Excel database + if (opt$database == MSDB.XLS) { + if (is.null(opt$url)) + stop("When using Excel database, you must specify the location of the Excel files directory with option --url.") + if ( ! file.exists(opt$url)) + stop(paste0("The directory path \"", opt$url,"\" specified with --xls-dir option is not valid.")) } - - ############################# - # DISPLAY COMMAND LINE HELP # - ############################# - - .disp.cmd.line.help <- function(optspec, opt, prog, error = FALSE) { - - if ( ! is.null(opt$help) || error ) { - cat(getopt(optspec, usage = TRUE, command = prog)) - q(status = 1) - } - } - - ################# - # LOAD DATABASE # - ################# - - .load.db <- function(opt) { - if (is.null(opt[['pos-prec']]) && is.null(opt[['neg-prec']])) { - precursors <- NULL - } else { - precursors <- list() - precursors[[MSDB.TAG.POS]] <- opt[['pos-prec']] - precursors[[MSDB.TAG.NEG]] <- opt[['neg-prec']] - } + # Check 4 tab database + if (opt$database == MSDB.4TABSQL) { + if (is.null(opt$url)) + stop("When using 4Tab SQL database, you must specify the URL of the SQL server with option --url.") + if (is.null(opt[['db-name']])) + stop("When using 4Tab SQL database, you must specify the database name through the --db-name option.") + if (is.null(opt[['db-user']])) + stop("When using 4Tab SQL database, you must specify the database user through the --db-user option.") + if (is.null(opt[['db-password']])) + stop("When using 4Tab SQL database, you must specify the database user password through the --db-password option.") + } - db <- switch(opt$database, - peakforest = MsPeakForestDb$new(url = opt$url, useragent = USERAGENT, token = opt[['db-token']]), - xls = MsXlsDb$new(db_dir = opt$url, cache_dir = opt[['cache-dir']]), - '4tabsql' = Ms4TabSqlDb$new(host = extract.address(opt$url), port = extract.port(opt$url), dbname = opt[['db-name']], user = opt[['db-user']], password = opt[['db-password']]), - file = MsFileDb$new(file = opt$url), - NULL) - db$setPrecursors(precursors) - if (db$areDbFieldsSettable()) - db$setDbFields(opt[['db-fields']]) - if (db$areDbMsModesSettable()) - db$setDbMsModes(opt[['db-ms-modes']]) - db$addObservers(MsDbLogger$new()) - - return(db) + # Check Peakforest database + if (opt$database == MSDB.PEAKFOREST) { + if (is.null(opt$url)) + stop("When using PeakForest database, you must specify the URL of the PeakForest server with option --url.") } -############### -# OUTPUT HTML # -############### + if (is.null(opt[['list-cols']])) { + + if (is.null(opt[['output-file']])) + stop("You must set a path for the output file.") + + if (is.null(opt[['input-file']])) + stop("You must provide an input file.") + + if (is.null(opt$mode) || ( ! opt$mode %in% MSDB.MODE.VALS)) + stop("You must specify a mode through the --mode option.") + + if (is.null(opt$mzprec)) + stop("You must set a precision in MZ with the --mzprec option.") + + if ( ( ! is.null(opt$rtcol) || ! is.null(opt[['all-cols']])) && (is.null(opt$rttolx) || is.null(opt$rttoly))) + stop("When chromatographic columns are set, you must provide values for --rttolx and -rttoly.") + + if (is.null(opt$mztolunit) || ( ! opt$mztolunit %in% MSDB.MZTOLUNIT.VALS)) + stop("You must specify an M/Z tolerance unit through the --mztolunit option.") + } +} + +# Load database {{{1 +################################################################ + +.load.db <- function(opt) { -output.html <- function(db, main, peaks, file, opt, output.fields) { + if (is.null(opt[['pos-prec']]) && is.null(opt[['neg-prec']])) { + precursors <- NULL + } else { + precursors <- list() + precursors[[MSDB.TAG.POS]] <- opt[['pos-prec']] + precursors[[MSDB.TAG.NEG]] <- opt[['neg-prec']] + } + + db <- switch(opt$database, + peakforest = MsPeakForestDb$new(url = opt$url, useragent = USERAGENT, token = opt[['db-token']]), + xls = MsXlsDb$new(db_dir = opt$url, cache_dir = opt[['cache-dir']]), + '4tabsql' = Ms4TabSqlDb$new(host = extract.address(opt$url), port = extract.port(opt$url), dbname = opt[['db-name']], user = opt[['db-user']], password = opt[['db-password']]), + file = MsFileDb$new(file = opt$url), + NULL) + db$setPrecursors(precursors) + if (db$areDbFieldsSettable()) + db$setDbFields(opt[['db-fields']]) + if (db$areDbMsModesSettable()) + db$setDbMsModes(opt[['db-ms-modes']]) + db$addObservers(MsDbLogger$new()) + + return(db) +} + +# Output HTML {{{1 +################################################################ + +output.html <- function(db, peaks, file) { # Replace public database IDs by URLs - if ( ! is.null(peaks) || ! is.null(main)) { + if ( ! is.null(peaks)) { # Conversion from extdb id field to extdb name extdb2classdb = list() extdb2classdb[MSDB.TAG.KEGG] = BIODB.KEGG @@ -407,11 +383,8 @@ # Loop on all dbs for (extdb in c(MSDB.TAG.KEGG, MSDB.TAG.HMDB, MSDB.TAG.CHEBI, MSDB.TAG.PUBCHEM)) { - field <- output.fields[[extdb]] - if ( ! is.null(peaks) && field %in% colnames(peaks)) - peaks[[field]] <- vapply(peaks[[field]], function(id) if (is.na(id)) '' else paste0('<a href="', get.entry.url(class = extdb2classdb[[extdb]], accession = id, content.type = BIODB.HTML), '">', id, '</a>'), FUN.VALUE = '') - if ( ! is.null(main) && field %in% colnames(main)) - main[[field]] <- vapply(main[[field]], function(ids) if (is.na(ids) || nchar(ids) == 0) '' else paste(vapply(strsplit(ids, opt[['molids-sep']])[[1]], function(id) paste0('<a href="', get.entry.url(class = extdb2classdb[[extdb]], accession = id, content.type = BIODB.HTML), '">', id, '</a>'), FUN.VALUE = ''), collapse = opt[['molids-sep']]), FUN.VALUE = '') + if ( ! is.null(peaks) && extdb %in% colnames(peaks)) + peaks[[extdb]] <- vapply(peaks[[extdb]], function(id) if (is.na(id)) '' else paste0('<a href="', get.entry.url(class = extdb2classdb[[extdb]], accession = id, content.type = BIODB.HTML), '">', id, '</a>'), FUN.VALUE = '') } } @@ -432,36 +405,9 @@ html$writeEndTag('style') html$writeEndTag('header') html$writeBegTag('body') - html$writeTag('h1', text = "LC/MS matching") - - # Write parameters - html$writeTag('h2', text = "Parameters") - html$writeBegTag('ul') - html$writeTag('li', text = paste0("Mode = ", opt$mode, ".")) - html$writeTag('li', text = paste0("M/Z precision = ", opt$mzprec, ".")) - html$writeTag('li', text = paste0("M/Z shift = ", opt$mzshift, ".")) - html$writeTag('li', text = paste0("Precursor match = ", (if (is.null(opt[['precursor-match']])) "no" else "yes"), ".")) - if ( ! is.null(opt[['precursor-match']])) { - html$writeTag('li', text = paste0("Positive precursors = ", paste0(opt[['pos-prec']], collapse = ', '), ".")) - html$writeTag('li', text = paste0("Negative precursors = ", paste0(opt[['neg-prec']], collapse = ', '), ".")) - } - if ( ! is.null(opt$rtcol)) { - html$writeTag('li', text = paste0("Columns = ", paste(opt$rtcol, collapse = ", "), ".")) - html$writeTag('li', text = paste0("RTX = ", opt$rttolx, ".")) - html$writeTag('li', text = paste0("RTY = ", opt$rttoly, ".")) - if ( ! is.null(opt[['precursor-match']])) - html$writeTag('li', text = paste0("RTZ = ", opt[['precursor-rt-tol']], ".")) - } - html$writeEndTag('ul') # Write results - html$writeTag('h2', text = "Results") results <- FALSE - if ( ! is.null(main) && nrow(main) > 0 && is.null(opt[['no-main-table-in-html-output']])) { - html$writeTag('h3', text = "Main output") - html$writeTable(main) - results <- TRUE - } if ( ! is.null(peaks) && nrow(peaks) > 0) { html$writeTag('h3', text = "Matched peaks") html$writeTable(peaks) @@ -474,11 +420,8 @@ html$writeEndTag('html') } -######## -# MAIN # -######## - -options(error = function() { traceback(2) ; quit(status = 1) }, warn = 2 ) +# MAIN {{{1 +################################################################ # Read command line arguments opt <- read_args() @@ -488,6 +431,7 @@ } # Load database +source(file.path(dirname(script.path), DB.SRC.FILE[[opt$database]]), chdir = TRUE) db <- .load.db(opt) # Print columns @@ -503,7 +447,7 @@ if (file.info(opt[['input-file']])$size > 0) { # Load file into data frame - input <- read.table(file = opt[['input-file']], header = TRUE, sep = "\t", stringsAsFactor = FALSE) + input <- read.table(file = opt[['input-file']], header = TRUE, sep = "\t", stringsAsFactor = FALSE, check.names = FALSE, comment.char = '') # Convert each column that is identified by a number into a name for (field in names(opt[['input-col-names']])) { @@ -526,7 +470,7 @@ # Set columns 'all-cols' specified if ( ! is.null(opt[['all-cols']])) - opt$rtcol <- db$getChromCol() + opt$rtcol <- db$getChromCol()[['id']] # Check chrom columns if ( ! is.null(opt[['check-cols']]) && ! is.null(opt$rtcol)) { @@ -541,15 +485,20 @@ if ( ! is.null(opt$rtcol) && ! opt[['input-col-names']][['rt']] %in% colnames(input)) stop(paste0("You are running an MZ/RT match run on your input data, but no retention time column named '", opt[['input-col-names']][['rt']],"' can be found inside your input file.")) +# Set output col names +output.col.names <- opt[['input-col-names']] + # Set streams -input.stream <- MsDbInputDataFrameStream$new(df = input, input.fields = opt[['input-col-names']]) -main.output <- MsDbOutputDataFrameStream$new(keep.unused = ! is.null(opt[['same-cols']]), output.fields = opt[['output-col-names']], one.line = ! is.null(opt[['same-rows']]), match.sep = opt[['molids-sep']], first.val = ! is.null(opt[['first-val']]), ascii = ! is.null(opt[['excel2011comp']]), nogreek = ! is.null(opt[['excel2011comp']]), noapostrophe = ! is.null(opt[['excel2011comp']]), noplusminus = ! is.null(opt[['excel2011comp']])) -peaks.output <- MsDbOutputDataFrameStream$new(keep.unused = ! is.null(opt[['same-cols']]), output.fields = opt[['output-col-names']], first.val = ! is.null(opt[['first-val']]), ascii = ! is.null(opt[['excel2011comp']]), nogreek = ! is.null(opt[['excel2011comp']]), noapostrophe = ! is.null(opt[['excel2011comp']]), noplusminus = ! is.null(opt[['excel2011comp']])) +input.stream <- MsDbInputDataFrameStream$new(df = input, input.fields = opt[['input-col-names']], rtunit = opt[['rtunit']]) +main.output <- MsDbOutputDataFrameStream$new(keep.unused = ! is.null(opt[['same-cols']]), output.fields = output.col.names, one.line = ! is.null(opt[['same-rows']]), match.sep = opt[['molids-sep']], first.val = ! is.null(opt[['first-val']]), ascii = ! is.null(opt[['excel2011comp']]), nogreek = ! is.null(opt[['excel2011comp']]), noapostrophe = ! is.null(opt[['excel2011comp']]), noplusminus = ! is.null(opt[['excel2011comp']]), rtunit = opt[['rtunit']]) +peaks.output <- MsDbOutputDataFrameStream$new(keep.unused = ! is.null(opt[['same-cols']]), output.fields = output.col.names, first.val = ! is.null(opt[['first-val']]), ascii = ! is.null(opt[['excel2011comp']]), nogreek = ! is.null(opt[['excel2011comp']]), noapostrophe = ! is.null(opt[['excel2011comp']]), noplusminus = ! is.null(opt[['excel2011comp']]), rtunit = opt[['rtunit']]) invisible(db$setInputStream(input.stream)) db$addOutputStreams(c(main.output, peaks.output)) -# Set M/Z tolerance unit +# Set database units db$setMzTolUnit(opt$mztolunit) +if ( ! is.null(opt[['db-rt-unit']]) && opt$database == 'file') + db$setRtUnit(opt[['db-rt-unit']]) # Search database mode <- if (opt$mode == POS_MODE) MSDB.TAG.POS else MSDB.TAG.NEG @@ -565,4 +514,4 @@ df.write.tsv(peaks.output$getDataFrame(), file = opt[['peak-output-file']], row.names = FALSE) if ( ! is.null(opt[['html-output-file']])) # TODO Create a class MsDbOutputHtmlFileStream - output.html(db = db, main = main.output$getDataFrame(), peaks = peaks.output$getDataFrame(), file = opt[['html-output-file']], opt = opt, output.fields = opt[['output-col-names']]) + output.html(db = db, peaks = peaks.output$getDataFrame(), file = opt[['html-output-file']])
--- a/spec-dist.R Tue Mar 14 12:40:22 2017 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,196 +0,0 @@ -#dyn.load('src/closeMatchPpm.dll') -# commented out for refactoring as package -#dyn.load('src/closeMatchPpm.so') - -matchPpm <- function(x, y, ppm = 3, mzmin = 0) { - if (any(is.na(y))) - stop("NA's are not allowed in y !\n") - ok <- !(is.na(x)) - ans <- order(x) - keep <- seq_along(ok)[ok] - xidx <- ans[ans %in% keep] - xs <- x[xidx] - yidx <- order(y) - ys <- y[yidx] - if (!is.double(xs)) - xs <- as.double(xs) - if (!is.double(ys)) - ys <- as.double(ys) - if (!is.integer(xidx)) - xidx <- as.integer(xidx) - if (!is.integer(yidx)) - yidx <- as.integer(yidx) - - fm <- - .Call( - "closeMatchPpm", - xs, - ys, - xidx, - yidx, - as.integer(length(x)), - as.double(ppm), - as.double(mzmin) - ) - fm -} - - -simpList <- function(v) { - vapply(v, function(x) { - if (is.null(x)) { - -1 - } else{ - x - } - }, FUN.VALUE = -1) -} - - -##Stein and scott values : mzexp 3 intexp 0.6 -##Massbank values : mzexp 2 intexp 0.5 - - -cosine <- - function(mz1, - mz2, - int1, - int2, - mzexp = 2, - intexp = 0.5, - ppm, - dmz = 0.005) { - matchList <- matchPpm(mz1, mz2, ppm, dmz) - ###Weigthed intensity - pfound <- which(!sapply(matchList, is.null, simplify = TRUE)) - - ###If no peak is found. - if (length(pfound) == 0) - return(list(measure = 0, matched = rep(-1, length(mz1)))) - w1 <- int1 ^ intexp * mz1 ^ mzexp - w2 <- int2 ^ intexp * mz2 ^ mzexp - cat(w1[pfound], w2[unlist(matchList[pfound])],'\n') - cos_value <- - sum((w1[pfound] * w2[unlist(matchList[pfound])]) ^ 2) / (sum(w1[pfound] ^ - 2) * sum(w2[unlist(matchList[pfound])] ^ 2)) - - ####Adding the penality if needed. - list(measure = cos_value, matched = simpList(matchList)) - } - - -###penalized cosine - -wcosine <- - function(mz1, - mz2, - int1, - int2, - mzexp = 2, - intexp = 0.5, - ppm, - dmz = 0.005, - penality = c("rweigth")) { - penality <- match.arg(penality) - matchList <- matchPpm(mz1, mz2, ppm, dmz) - ###Weigthed intensity - pfound <- which(!sapply(matchList, is.null, simplify = TRUE)) - ###If no peak is found. - if (length(pfound) == 0) - return(list(measure = 0, matched = rep(-1, length(mz1)))) - w1 <- int1 ^ intexp * mz1 ^ mzexp - w2 <- int2 ^ intexp * mz2 ^ mzexp - - cos_value <- - sum((w1[pfound] * w2[unlist(matchList[pfound])]) ^ 2) / (sum(w1[pfound] ^ - 2) * sum(w2[unlist(matchList[pfound])] ^ 2)) - - if(is.nan(cos_value)) cos_value <- 0 - ####Adding the penality if needed. - div = 1 - if (penality == "rweigth") { - p <- - (sum(w1[pfound]) / sum(w1) + sum(w2[unlist(matchList[pfound])]) / sum(w2)) / - 2 - div = 2 - } else{ - p <- 0 - } - - measure <- (cos_value + p) / div - if(is.nan(measure)) measure <- (cos_value) / div - list(measure = measure, - matched = simpList(matchList)) - } - -##A gaussian of the two spectra seen as a mixture of gaussian, derived form Heinonen et al 2012 -pkernel <- - function(mz1, - mz2, - int1, - int2, - mzexp = 2, - intexp = 0.5, - ppm, - dmz = 0.005, - sigint = 0.5, - penality = c("rweigth")) { - ###We first match the peak - matchList <- matchPpm(mz1, mz2, ppm, dmz) - # ###Weigthed intensity - pfound <- which(!sapply(matchList, is.null, simplify = TRUE)) - # - ###If no peak is found. - if (length(pfound) == 0) - return(list(measure = 0, matched = rep(-1, length(mz1)))) - w1 <- int1 ^ intexp * mz1 ^ mzexp - w2 <- int2 ^ intexp * mz2 ^ mzexp - w1 <- w1 * 1 / sum(w1) - w2 <- w2 * 1 / sum(w2) - l1 <- length(w1) - l2 <- length(w2) - ###The mz dev - vsig1 = mz1 * ppm * 3 * 10 ^ -6 - vsig1 = sapply(vsig1, function(x, y) { - return(max(x, y)) - }, y = dmz) - - vsig2 = mz2 * ppm * 3 * 10 ^ -6 - vsig2 = sapply(vsig2, function(x, y) { - return(max(x, y)) - }, y = dmz) - accu = 0 - ###TO DO rcopder en C - for (i in 1:l1) { - for (j in 1:l2) { - divisor = max(stats::dnorm( - mz1[i], - mean = mz1[i], - sd = sqrt(vsig1[i] ^ 2 + vsig1[i] ^ 2) - ), - stats::dnorm( - mz2[j], - mean = mz2[j], - sd = sqrt(vsig2[j] ^ 2 + vsig2[j] ^ 2) - )) - if (divisor == 0) - next - scalet = stats::dnorm(mz1[i], - mean = mz2[j], - sd = sqrt(vsig1[i] ^ 2 + vsig2[j] ^ 2)) - accu = accu + scalet / divisor - } - } - div = 1 - if (penality == "rweigth") { - p <- - (sum(w1[pfound]) / sum(w1) + sum(w2[unlist(matchList[pfound])]) / sum(w2)) / - 2 - div = 2 - } else{ - p <- 0 - } - accu = accu / (l2 * l1) - list(measure = (accu + p) / div, - matched = simpList(matchList)) - }
--- a/test-data/filedb-small-mz-match-html-output.html Tue Mar 14 12:40:22 2017 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,370 +0,0 @@ -<html> - <header> - <meta charset="UTF-8"/> - <title>LC/MS matching results</title> - <style> - table, th, td { border-collapse: collapse; } - table, th { border: 1px solid black; } - td { border-left: 1px solid black; border-right: 1px solid black; } - th, td { padding: 5px; } - th { background-color: LightBlue; } - tr:nth-child(even) { background-color: LemonChiffon; } - tr:nth-child(odd) { background-color: LightGreen; } - </style> - </header> - <body> - <h1>LC/MS matching</h1> - <h2>Parameters</h2> - <ul> - <li>Mode = pos.</li> - <li>M/Z precision = 5.</li> - <li>M/Z shift = 0.</li> - <li>Precursor match = no.</li> - </ul> - <h2>Results</h2> - <h3>Matched peaks</h3> - <table> - <tr> - <th>mz</th> - <th>compoundid</th> - <th>msmode</th> - <th>mztheo</th> - <th>peakcomp</th> - <th>peakattr</th> - <th>compoundcomp</th> - <th>compoundmass</th> - <th>fullnames</th> - </tr> - <tr> - <td>80.04959</td> - <td>U761</td> - <td>POS</td> - <td>80.04948</td> - <td>P9Z5W46 O0</td> - <td>[(M+H)-(NHCO)]+</td> - <td>J16L6M62O</td> - <td>122.048</td> - <td>Coquelicol;Paquerettol</td> - </tr> - <tr> - <td>82.04819</td> - <td/> - <td/> - <td/> - <td/> - <td/> - <td/> - <td/> - <td/> - </tr> - <tr> - <td>83.01344</td> - <td/> - <td/> - <td/> - <td/> - <td/> - <td/> - <td/> - <td/> - </tr> - <tr> - <td>84.05585</td> - <td/> - <td/> - <td/> - <td/> - <td/> - <td/> - <td/> - <td/> - </tr> - <tr> - <td>87.05536</td> - <td/> - <td/> - <td/> - <td/> - <td/> - <td/> - <td/> - <td/> - </tr> - <tr> - <td>89.50682</td> - <td/> - <td/> - <td/> - <td/> - <td/> - <td/> - <td/> - <td/> - </tr> - <tr> - <td>90.97681</td> - <td/> - <td/> - <td/> - <td/> - <td/> - <td/> - <td/> - <td/> - </tr> - <tr> - <td>92.98093</td> - <td/> - <td/> - <td/> - <td/> - <td/> - <td/> - <td/> - <td/> - </tr> - <tr> - <td>94.57331</td> - <td>A10</td> - <td>POS</td> - <td>94.57331</td> - <td>P93Z8W419 O2</td> - <td>[(M+2H)+(CH3CN)]++</td> - <td>J114L6M62O2</td> - <td>146.1055</td> - <td>Blablaine</td> - </tr> - <tr> - <td>97.07603</td> - <td/> - <td/> - <td/> - <td/> - <td/> - <td/> - <td/> - <td/> - </tr> - <tr> - <td>99.54296</td> - <td/> - <td/> - <td/> - <td/> - <td/> - <td/> - <td/> - <td/> - </tr> - <tr> - <td>101.0709</td> - <td/> - <td/> - <td/> - <td/> - <td/> - <td/> - <td/> - <td/> - </tr> - <tr> - <td>102.0663</td> - <td/> - <td/> - <td/> - <td/> - <td/> - <td/> - <td/> - <td/> - </tr> - <tr> - <td>102.2845</td> - <td/> - <td/> - <td/> - <td/> - <td/> - <td/> - <td/> - <td/> - </tr> - <tr> - <td>104.0034</td> - <td/> - <td/> - <td/> - <td/> - <td/> - <td/> - <td/> - <td/> - </tr> - <tr> - <td>104.5318</td> - <td/> - <td/> - <td/> - <td/> - <td/> - <td/> - <td/> - <td/> - </tr> - <tr> - <td>105.4461</td> - <td/> - <td/> - <td/> - <td/> - <td/> - <td/> - <td/> - <td/> - </tr> - <tr> - <td>105.7271</td> - <td/> - <td/> - <td/> - <td/> - <td/> - <td/> - <td/> - <td/> - </tr> - <tr> - <td>106.0231</td> - <td/> - <td/> - <td/> - <td/> - <td/> - <td/> - <td/> - <td/> - </tr> - <tr> - <td>106.24</td> - <td/> - <td/> - <td/> - <td/> - <td/> - <td/> - <td/> - <td/> - </tr> - <tr> - <td>106.5116</td> - <td/> - <td/> - <td/> - <td/> - <td/> - <td/> - <td/> - <td/> - </tr> - <tr> - <td>106.763</td> - <td/> - <td/> - <td/> - <td/> - <td/> - <td/> - <td/> - <td/> - </tr> - <tr> - <td>106.9815</td> - <td/> - <td/> - <td/> - <td/> - <td/> - <td/> - <td/> - <td/> - </tr> - <tr> - <td>107.2424</td> - <td/> - <td/> - <td/> - <td/> - <td/> - <td/> - <td/> - <td/> - </tr> - <tr> - <td>107.4569</td> - <td/> - <td/> - <td/> - <td/> - <td/> - <td/> - <td/> - <td/> - </tr> - <tr> - <td>107.6885</td> - <td/> - <td/> - <td/> - <td/> - <td/> - <td/> - <td/> - <td/> - </tr> - <tr> - <td>107.9273</td> - <td/> - <td/> - <td/> - <td/> - <td/> - <td/> - <td/> - <td/> - </tr> - <tr> - <td>108.1576</td> - <td/> - <td/> - <td/> - <td/> - <td/> - <td/> - <td/> - <td/> - </tr> - <tr> - <td>109.0777</td> - <td/> - <td/> - <td/> - <td/> - <td/> - <td/> - <td/> - <td/> - </tr> - <tr> - <td>110.0599</td> - <td/> - <td/> - <td/> - <td/> - <td/> - <td/> - <td/> - <td/> - </tr> - </table> - </body> -</html>
--- a/test-data/filedb-small-mz-match-output.tsv Tue Mar 14 12:40:22 2017 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,31 +0,0 @@ -"mz" "compoundid" "msmode" "mztheo" "peakcomp" "peakattr" "compoundcomp" "compoundmass" "fullnames" -80.04959021 "U761" "POS" "80.049475" "P9Z5W46 O0" "[(M+H)-(NHCO)]+" "J16L6M62O" "122.04801" "Coquelicol;Paquerettol" -82.04819461 NA NA NA NA NA NA NA NA -83.01343941 NA NA NA NA NA NA NA NA -84.05585475 NA NA NA NA NA NA NA NA -87.05536392 NA NA NA NA NA NA NA NA -89.50682004 NA NA NA NA NA NA NA NA -90.97680734 NA NA NA NA NA NA NA NA -92.98092987 NA NA NA NA NA NA NA NA -94.57331384 "A10" "POS" "94.5733145" "P93Z8W419 O2" "[(M+2H)+(CH3CN)]++" "J114L6M62O2" "146.10553" "Blablaine" -97.07602789 NA NA NA NA NA NA NA NA -99.5429594 NA NA NA NA NA NA NA NA -101.0708987 NA NA NA NA NA NA NA NA -102.066292 NA NA NA NA NA NA NA NA -102.2845376 NA NA NA NA NA NA NA NA -104.0034256 NA NA NA NA NA NA NA NA -104.5317528 NA NA NA NA NA NA NA NA -105.4460999 NA NA NA NA NA NA NA NA -105.7271343 NA NA NA NA NA NA NA NA -106.0231437 NA NA NA NA NA NA NA NA -106.2399954 NA NA NA NA NA NA NA NA -106.5116177 NA NA NA NA NA NA NA NA -106.7629705 NA NA NA NA NA NA NA NA -106.9814579 NA NA NA NA NA NA NA NA -107.2424051 NA NA NA NA NA NA NA NA -107.4569385 NA NA NA NA NA NA NA NA -107.6884734 NA NA NA NA NA NA NA NA -107.9272908 NA NA NA NA NA NA NA NA -108.1575604 NA NA NA NA NA NA NA NA -109.0777249 NA NA NA NA NA NA NA NA -110.0599023 NA NA NA NA NA NA NA NA
--- a/test-data/filedb-small-mz-match-peaks-output.tsv Tue Mar 14 12:40:22 2017 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,31 +0,0 @@ -"mz" "compoundid" "msmode" "mztheo" "peakcomp" "peakattr" "compoundcomp" "compoundmass" "fullnames" -80.04959021 "U761" "POS" 80.049475 "P9Z5W46 O0" "[(M+H)-(NHCO)]+" "J16L6M62O" 122.04801 "Coquelicol;Paquerettol" -82.04819461 NA NA NA NA NA NA NA NA -83.01343941 NA NA NA NA NA NA NA NA -84.05585475 NA NA NA NA NA NA NA NA -87.05536392 NA NA NA NA NA NA NA NA -89.50682004 NA NA NA NA NA NA NA NA -90.97680734 NA NA NA NA NA NA NA NA -92.98092987 NA NA NA NA NA NA NA NA -94.57331384 "A10" "POS" 94.5733145 "P93Z8W419 O2" "[(M+2H)+(CH3CN)]++" "J114L6M62O2" 146.10553 "Blablaine" -97.07602789 NA NA NA NA NA NA NA NA -99.5429594 NA NA NA NA NA NA NA NA -101.0708987 NA NA NA NA NA NA NA NA -102.066292 NA NA NA NA NA NA NA NA -102.2845376 NA NA NA NA NA NA NA NA -104.0034256 NA NA NA NA NA NA NA NA -104.5317528 NA NA NA NA NA NA NA NA -105.4460999 NA NA NA NA NA NA NA NA -105.7271343 NA NA NA NA NA NA NA NA -106.0231437 NA NA NA NA NA NA NA NA -106.2399954 NA NA NA NA NA NA NA NA -106.5116177 NA NA NA NA NA NA NA NA -106.7629705 NA NA NA NA NA NA NA NA -106.9814579 NA NA NA NA NA NA NA NA -107.2424051 NA NA NA NA NA NA NA NA -107.4569385 NA NA NA NA NA NA NA NA -107.6884734 NA NA NA NA NA NA NA NA -107.9272908 NA NA NA NA NA NA NA NA -108.1575604 NA NA NA NA NA NA NA NA -109.0777249 NA NA NA NA NA NA NA NA -110.0599023 NA NA NA NA NA NA NA NA
--- a/test-data/filedb.tsv Tue Mar 14 12:40:22 2017 -0400 +++ b/test-data/filedb.tsv Wed Apr 19 10:00:05 2017 -0400 @@ -1,7 +1,9 @@ "compoundid" "msmode" "mztheo" "peakcomp" "peakattr" "chromcol" "chromcolrt" "compoundcomp" "compoundmass" "fullnames" A10 "POS" 112.07569 "P9Z6W410 O" "[(M+H)-(H2O)-(NH3)]+" "colzz" 5.69 "J114L6M62O2" 146.10553 Blablaine' +A10 "POS" 112.07569 "P9Z6W410 O" "[(M+H)-(H2O)-(NH3)]+" "colAA" 1.58 "J114L6M62O2" 146.10553 Blablaine' A10 "POS" 112.07569 "P9Z6W410 O" "[(M+H)-(H2O)-(NH3)]+" "col12" 0.8 "J114L6M62O2" 146.10553 "Blablaine" A10 "POS" 112.07569 "P9Z6W410 O" "[(M+H)-(H2O)-(NH3)]+" "somecol" 8.97 "J114L6M62O2" 146.10553 "Blablaine" +A10 "POS" 112.07569 "P9Z6W410 O" "[(M+H)-(H2O)-(NH3)]+" "colzz2" 4.08 "J114L6M62O2" 146.10553 Blablaine' A10 "POS" 191.076694 "P92Z6W413 Na2 O2" "[(M-H+2Na)]+" "colAA" 1.58 "J114L6M62O2" 146.10553 "Blablaine" A10 "POS" 191.076694 "P92Z6W413 Na2 O2" "[(M-H+2Na)]+" "colzz2" 4.08 "J114L6M62O2" 146.10553 "Blablaine" A10 "POS" 294.221687 "U1113P94ZW429 O4" "[(2M+H)]+ (13C)" "somecol" 8.97 "J114L6M62O2" 146.10553 "Blablaine"
--- a/todf.R Tue Mar 14 12:40:22 2017 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,34 +0,0 @@ -source('tolst.R') - -# Convert a list of key/value lists or a list of objects into a data frame. Each key becomes a column. -# x The object to convert to data frame. Either a list of key/value lists, or a list of objects. -# rm_na_col If true, remove all columns that contain only NA values. -todf <- function(x, rm_na_col = FALSE) { - - df <- data.frame() - - # x not null ? - if ( ! is.null(x) && length(x) > 0) { - - # fill data frame - for (i in 1:length(x)) { - lst <- if (typeof(x[[i]]) == 'S4') tolst(x[[i]]) else x[[i]] - for (k in names(lst)) { - v <- x[[i]][[k]] - df[i , k] <- if (length(v) > 1) paste0(v, collapse = ';') else v - } - } - - # remove NA columns - if (rm_na_col) { - drop <- character() - for (col in names(df)) - if (all(is.na(df[[col]]))) - drop <- c(drop, col) - if (length(drop) > 0) - df <- df[, !(names(df) %in% drop)] - } - } - - return(df) -}
--- a/tolst.R Tue Mar 14 12:40:22 2017 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,30 +0,0 @@ -################## -# OBJECT TO LIST # -################## - -.object_to_list <- function(obj) { - - if(is.null(obj)) - return(NULL) - - field_names <- names(obj$getRefClass()$fields()) - l <- c() - lapply( field_names, function(x) { l<<-c(l,list(obj$field(x))) } ) - names(l) <- field_names - return(l) -} - -########### -# TO LIST # -########### - -tolst <- function(v) { - - switch(typeof(v), - S4 = lst <- .object_to_list(v), - list = lst <- v, - stop("Unknown type '", typeof(v), "'.") - ) - - return(lst) -}
--- a/tostr.R Tue Mar 14 12:40:22 2017 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,189 +0,0 @@ -source("tolst.R") - -# Convert lists and objects to string representation. Supported outputs are: -# _ Text. -# _ PHP code. -# _ R code (to be done). - -########################## -# SET STRING TO VARIABLE # -########################## - -# str The value converted to a string. -# mode The mode to use. -# var Variable name. -.set_str_to_variable <- function(str, mode = 'txt', var = NA_character_) { - - # Add variable - switch(mode, - txt = { str <- paste(var, '=', str) }, - php = { str <- paste0('$', var, ' = ', str, ';') }, - stop("Unknown mode '", mode, "'.") - ) - - return(str) -} - -################ -# QUOTE VALUES # -################ - -# values A vector of values. -# mode The mode to use. -# keys If the vector contains keys of a dictionary structured (depending on the mode, they will be quoted or not). -.quote_values <- function(values, mode = 'txt', keys = FALSE) { - - if (mode == 'txt' && keys) - return(values) - - # Quote string values - # TODO escape quote characters - if (is.character(values)) - return(unlist(lapply(values, function(x) { paste0('"', x, '"') } ))) - - return(values) -} - -############ -# SET KEYS # -############ - -# values Vector or list of values. -# mode The mode to use. -.set_keys <- function(values, mode = 'txt') { - - if ( ! is.null(names(values))) { - keys <- names(values) - indices <- 1:length(values) - switch(mode, - txt = { values <- lapply(indices, function(x) paste( if (nchar(keys[[x]]) == 0) x else keys[[x]], '=>', values[[x]])) }, - php = { values <- lapply(names(values), function(x) paste0('"', if (nchar(keys[[x]]) == 0) x else keys[[x]], '"', ' => ', values[[x]])) }, - stop("Unknown mode '", mode, "'.") - ) - } - - return(values) -} - -############### -# JOIN VALUES # -############### - -# values Vector or list of values to join. -# mode The mode to use. -.join_values <- function(values, mode = 'txt') { - - switch(mode, - txt = { str <- paste0('(', paste(values, collapse = ', '), ')') }, - php = { str <- paste0('[', paste(values, collapse = ', '), ']') }, - stop("Unknown mode '", mode, "'.") - ) - - return(str) -} - -############### -# NULL TO STR # -############### - -# value The NULL or NA value, or the vector of length 0. -# mode The mode to use. -# var Variable name. -.null_to_str <- function(value, mode = 'txt', var = NA_character_) { - - # Set to 'null' string - switch(mode, - txt = { str <- if (length(value) > 0 && is.na(value)) 'NA' else 'null' }, - php = { str <- 'NULL' }, - stop("Unknown mode '", mode, "'.") - ) - - if ( ! is.null(var) && ! is.na(var)) - str <- .set_str_to_variable(str, mode, var) - - return(str) -} - -################ -# VALUE TO STR # -################ - -# TODO hide this function ? value_to_str -> .value_to_str - -# value The value to convert. -# mode The mode to use. -# var Variable name. -# lst If true, print the output as a list or array, even if it contains only one value. -.value_to_str <- function(value, mode = 'txt', var = NA_character_, lst = FALSE) { - - if (is.null(value) || (length(value) == 0 && ! lst) || (length(value) > 0 && is.na(value))) - return(.null_to_str(value, mode = mode, var = var)) - - # Transform value to a string - value <- .quote_values(value, mode = mode) - str <- if (length(value) == 1 && ! lst && is.null(names(value))) as.character(value) else .join_values(.set_keys(value, mode = mode), mode = mode) - - # Set to variable - if ( ! is.null(var) && ! is.na(var)) - str <- .set_str_to_variable(str, mode, var) - - return(str) -} - -############### -# LIST TO STR # -############### - -# vlist The list to convert. -# mode The mode to use. -# var Variable name. -# lst If true, print the output as a list or array, even if it contains only one value. -.list_to_str <- function(vlist, mode = 'txt', var = NA_character_, lst = FALSE) { - - if (is.null(vlist) || (length(vlist) == 0 && ! lst) || (length(vlist) > 0 && is.na(vlist))) - return(.null_to_str(vlist, mode = mode, var = var)) - - # - vstr <- character() - if (length(vlist) > 0) { - keys <- unlist(lapply(names(vlist), function(x) if (nchar(x) == 0) x else .quote_values(x, mode = mode, keys = TRUE))) - values <- lapply(vlist, function(x) tostr(x, mode = mode)) - sep <- switch(mode, - txt = '=>', - php = '=>', - stop("Unknown mode '", mode, "'.") - ) - vstr <- unlist(lapply(1:length(vlist), function(i) if (is.null(keys) || nchar(keys[i]) == 0) values[[i]] else paste(keys[i], sep, values[[i]]))) - } - - # Join string values - if (length(vstr) > 1 || lst || ! is.null(keys)) - str <- .join_values(vstr, mode = mode) - else - str <- vstr - - # Set to variable - if ( ! is.null(var) && ! is.na(var)) - str <- .set_str_to_variable(str, mode, var) - - return(str) -} - -########## -# TO STR # -########## - -# obj The object to convert. -# mode The mode to use. -# var Variable name. -# lst If true, print the output as a list or array, even if it contains only one value. -tostr <- function(obj, mode = 'txt', var = NA_character_, lst = FALSE) { - - switch(typeof(obj), - S4 = str <- tostr(tolst(obj), mode = mode, var = var, lst = lst), - list = str <- .list_to_str(obj, mode = mode, var = var, lst = lst), - str <- .value_to_str(obj, mode = mode, var = var, lst = lst) - ) - - return(str) -}