Mercurial > repos > prog > lcmsmatching
comparison MassbankEntry.R @ 2:20d69a062da3 draft
planemo upload for repository https://github.com/workflow4metabolomics/lcmsmatching.git commit d4048accde6bdfd5b3e14f5394902d38991854f8
| author | prog |
|---|---|
| date | Thu, 02 Mar 2017 08:55:00 -0500 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| 1:253d531a0193 | 2:20d69a062da3 |
|---|---|
| 1 ########################### | |
| 2 # MASSBANK SPECTRUM CLASS # | |
| 3 ########################### | |
| 4 | |
# Reference class representing one Massbank entry. It extends BiodbEntry and
# declares no additional fields or methods in this call.
MassbankEntry <- methods::setRefClass(
  "MassbankEntry",
  contains = "BiodbEntry"
)
| 6 | |
| 7 ########### | |
| 8 # FACTORY # | |
| 9 ########### | |
| 10 | |
# Build MassbankEntry objects from Massbank record texts.
#
# contents: a vector or list of record texts, one record per element. Each
#           text is split on "\n" and parsed line by line. NULL and NA
#           elements are not parsed.
# drop:     if TRUE and `contents` has exactly one element, the single result
#           is returned directly instead of inside a list.
#
# Returns a list with one element per element of `contents`: the parsed
# MassbankEntry, or NULL when no accession field could be set for that
# element. With `drop = TRUE` and a single input, returns that one element
# (an entry or NULL).
createMassbankEntryFromTxt <- function(contents, drop = TRUE) {

  # Single-capture regexes, keyed by the entry field they fill.
  regex <- character()
  regex[[BIODB.ACCESSION]] <- "^ACCESSION: (.+)$"
  regex[[BIODB.MSDEV]] <- "^AC\\$INSTRUMENT: (.+)$"
  regex[[BIODB.MSDEVTYPE]] <- "^AC\\$INSTRUMENT_TYPE: (.+)$"
  regex[[BIODB.MSTYPE]] <- "^AC\\$MASS_SPECTROMETRY: MS_TYPE (.+)$"
  regex[[BIODB.MSPRECMZ]] <- "^MS\\$FOCUSED_ION: PRECURSOR_M/Z (.+)$"
  regex[[BIODB.NB.PEAKS]] <- "^PK\\$NUM_PEAK: ([0-9]+)$"
  regex[[BIODB.MSPRECANNOT]] <- "^MS\\$FOCUSED_ION: PRECURSOR_TYPE (.+)$"
  regex[[BIODB.CHEBI.ID]] <- "^CH\\$LINK: CHEBI\\s+(.+)$"
  regex[[BIODB.KEGG.ID]] <- "^CH\\$LINK: KEGG\\s+(.+)$"
  regex[[BIODB.INCHI]] <- "^CH\\$IUPAC:\\s+(.+)$"
  regex[[BIODB.INCHIKEY]] <- "^CH\\$LINK: INCHIKEY\\s+(.+)$"
  regex[[BIODB.CHEMSPIDER.ID]] <- "^CH\\$LINK: CHEMSPIDER\\s+(.+)$"
  regex[[BIODB.CAS.ID]] <- "^CH\\$LINK: CAS\\s+(.+)$"
  regex[[BIODB.FORMULA]] <- "^CH\\$FORMULA:\\s+(.+)$"
  regex[[BIODB.SMILES]] <- "^CH\\$SMILES:\\s+(.+)$"
  regex[[BIODB.MASS]] <- "^CH\\$EXACT_MASS:\\s+(.+)$"
  regex[[BIODB.PUBCHEMCOMP.ID]] <- "^CH\\$LINK: PUBCHEM\\s+.*CID:([0-9]+)"
  regex[[BIODB.PUBCHEMSUB.ID]] <- "^CH\\$LINK: PUBCHEM\\s+.*SID:([0-9]+)"

  # Preallocate the result instead of growing it with c() on each iteration.
  entries <- vector("list", length(contents))
  i <- 0L

  for (text in contents) {

    i <- i + 1L

    # Create instance
    entry <- MassbankEntry$new()

    if ( ! is.null(text) && ! is.na(text)) {

      # Read text, one record line at a time
      for (s in strsplit(text, "\n")[[1]]) {

        # Test generic regexes. Every regex is tried, with no early exit:
        # a single "CH$LINK: PUBCHEM" line may carry both a CID and a SID,
        # and both fields must then be set from that one line.
        parsed <- FALSE
        for (field in names(regex)) {
          g <- stringr::str_match(s, regex[[field]])
          if ( ! is.na(g[1, 1])) {
            entry$setField(field, g[1, 2])
            parsed <- TRUE
          }
        }
        if (parsed)
          next

        # Name: only the first CH$NAME line of a record is kept.
        if (is.na(entry$getField(BIODB.NAME))) {
          g <- stringr::str_match(s, "^CH\\$NAME:\\s+(.+)$")
          if ( ! is.na(g[1, 1])) {
            entry$setField(BIODB.NAME, g[1, 2])
            next
          }
        }

        # PubChem: a bare numeric identifier (no CID:/SID: prefix) is stored
        # as the substance ID.
        g <- stringr::str_match(s, "^CH\\$LINK: PUBCHEM\\s+([0-9]+)$")
        if ( ! is.na(g[1, 1])) {
          entry$setField(BIODB.PUBCHEMSUB.ID, g[1, 2])
          next
        }

        # MS mode: anything other than 'POSITIVE' is treated as negative.
        g <- stringr::str_match(s, "^AC\\$MASS_SPECTROMETRY: ION_MODE (.+)$")
        if ( ! is.na(g[1, 1])) {
          mode <- if (g[1, 2] == "POSITIVE") BIODB.MSMODE.POS else BIODB.MSMODE.NEG
          entry$setField(BIODB.MSMODE, mode)
          next
        }

        # Peaks: the helper appends to the entry's peak table when the line
        # is a peak or annotation row.
        .parse.peak.line(entry, s)
      }
    }

    entries[[i]] <- entry
  }

  # Replace elements with no accession id by NULL
  entries <- lapply(entries, function(x) if (is.na(x$getField(BIODB.ACCESSION))) NULL else x)

  # If the input was a single element, then output a single object
  if (drop && length(contents) == 1)
    entries <- entries[[1]]

  return(entries)
}
| 97 | |
| 98 ################### | |
| 99 # PARSE PEAK LINE # | |
| 100 ################### | |
| 101 | |
# Try to parse one record line as a peak-annotation row or a peak row and,
# on success, append it to the entry's peak table.
#
# entry: the entry object whose BIODB.PEAKS field is read and updated.
# line:  a single text line of the record.
#
# Returns TRUE when a row was appended to the entry's peaks, FALSE otherwise.
.parse.peak.line <- function(entry, line) {

  # Start from the peak table template.
  # NOTE(review): this assumes BIODB.PEAK.DF.EXAMPLE is a zero-row data frame,
  # so that nrow(peaks) > 0 below means "a pattern matched" -- confirm.
  peaks <- BIODB.PEAK.DF.EXAMPLE

  # Annotation row: m/z, formula, formula count, mass, error (ppm).
  # The formula class accepts lower-case letters so that two-letter element
  # symbols (Cl, Na, Br, ...) match, and the error may be negative.
  g <- stringr::str_match(line, "^\\s+([0-9][0-9.]*) ([A-Za-z0-9+-]+) ([0-9]+) ([0-9][0-9.]*) (-?[0-9][0-9.]*)$")
  if ( ! is.na(g[1, 1]))
    peaks[1, c(BIODB.PEAK.MZ, BIODB.PEAK.FORMULA, BIODB.PEAK.FORMULA.COUNT, BIODB.PEAK.MASS, BIODB.PEAK.ERROR.PPM)] <- list(as.double(g[1, 2]), g[1, 3], as.integer(g[1, 4]), as.double(g[1, 5]), as.double(g[1, 6]))

  # Peak row: m/z, intensity, relative intensity.
  g <- stringr::str_match(line, "^\\s+([0-9][0-9.]*) ([0-9][0-9.]*) ([0-9]+)$")
  if ( ! is.na(g[1, 1]))
    peaks[1, c(BIODB.PEAK.MZ, BIODB.PEAK.INTENSITY, BIODB.PEAK.RELATIVE.INTENSITY)] <- list(as.double(g[1, 2]), as.double(g[1, 3]), as.integer(g[1, 4]))

  if (nrow(peaks) > 0) {

    # Get current peaks and merge with new peaks
    current.peaks <- entry$getField(BIODB.PEAKS)
    if ( ! is.null(current.peaks))
      peaks <- rbind(current.peaks, peaks)

    entry$setField(BIODB.PEAKS, peaks)

    return(TRUE)
  }

  return(FALSE)
}
