# HG changeset patch # User computational-metabolomics # Date 1580917308 18000 # Node ID 9e6bf72782576674ecbb60eebbb179cd7cb51eac "planemo upload for repository https://github.com/computational-metabolomics/sirius_csifingerid_galaxy commit 1d1b37a070f895c94069819237199c768da27258" diff -r 000000000000 -r 9e6bf7278257 sirius_csifingerid.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/sirius_csifingerid.py Wed Feb 05 10:41:48 2020 -0500 @@ -0,0 +1,328 @@ +from __future__ import absolute_import, print_function + +import argparse +import csv +import glob +import multiprocessing +import os +import re +import sys +import tempfile +import uuid +from collections import defaultdict + +import six + +parser = argparse.ArgumentParser() +parser.add_argument('--input_pth') +parser.add_argument('--result_pth') +parser.add_argument('--database') +parser.add_argument('--profile') +parser.add_argument('--candidates') +parser.add_argument('--ppm_max') +parser.add_argument('--polarity') +parser.add_argument('--results_name') +parser.add_argument('--out_dir') +parser.add_argument('--tool_directory') +parser.add_argument('--temp_dir') + +parser.add_argument('--meta_select_col', default='all') +parser.add_argument('--cores_top_level', default=1) +parser.add_argument('--chunks', default=1) +parser.add_argument('--minMSMSpeaks', default=1) +parser.add_argument('--schema', default='msp') +args = parser.parse_args() +print(args) +if os.stat(args.input_pth).st_size == 0: + print('Input file empty') + exit() + +if args.temp_dir: + wd = os.path.join(args.temp_dir, 'temp') + os.mkdir(wd) + + if not os.path.exists(wd): + os.mkdir(wd) +else: + td = tempfile.mkdtemp() + wd = os.path.join(td, str(uuid.uuid4())) + os.mkdir(wd) + +###################################################################### +# Setup regular expressions for MSP parsing dictionary +###################################################################### +regex_msp = {} +regex_msp['name'] = [r'^Name(?:=|:)(.*)$'] 
regex_msp['polarity'] = [r'^ion.*mode(?:=|:)(.*)$',
                         r'^ionization.*mode(?:=|:)(.*)$',
                         r'^polarity(?:=|:)(.*)$']
regex_msp['precursor_mz'] = [r'^precursor.*m/z(?:=|:)\s*(\d*[.,]?\d*)$',
                             r'^precursor.*mz(?:=|:)\s*(\d*[.,]?\d*)$']
regex_msp['precursor_type'] = [r'^precursor.*type(?:=|:)(.*)$',
                               r'^adduct(?:=|:)(.*)$',
                               r'^ADDUCTIONNAME(?:=|:)(.*)$']
regex_msp['num_peaks'] = [r'^Num.*Peaks(?:=|:)\s*(\d*)$']
regex_msp['msp'] = [r'^Name(?:=|:)(.*)$']  # Flag for standard MSP format

regex_massbank = {}
regex_massbank['name'] = [r'^RECORD_TITLE:(.*)$']
regex_massbank['polarity'] = \
    [r'^AC\$MASS_SPECTROMETRY:\s+ION_MODE\s+(.*)$']
regex_massbank['precursor_mz'] = \
    [r'^MS\$FOCUSED_ION:\s+PRECURSOR_M/Z\s+(\d*[.,]?\d*)$']
regex_massbank['precursor_type'] = \
    [r'^MS\$FOCUSED_ION:\s+PRECURSOR_TYPE\s+(.*)$']
regex_massbank['num_peaks'] = [r'^PK\$NUM_PEAK:\s+(\d*)']
regex_massbank['cols'] = [r'^PK\$PEAK:\s+(.*)']
regex_massbank['massbank'] = [r'^RECORD_TITLE:(.*)$']  # Flag for massbank

if args.schema == 'msp':
    meta_regex = regex_msp
elif args.schema == 'massbank':
    meta_regex = regex_massbank
elif args.schema == 'auto':
    # If auto we just check for all the available parameter names
    # and then determine if Massbank or MSP based on
    # the name parameter.
    # Each pattern list is copied first so that building the combined table
    # does not extend the regex_massbank lists in place.
    meta_regex = {k: list(v) for k, v in six.iteritems(regex_massbank)}
    for key in ('name', 'polarity', 'precursor_mz', 'precursor_type',
                'num_peaks'):
        meta_regex[key].extend(regex_msp[key])
    meta_regex['msp'] = regex_msp['msp']

    print(meta_regex)
else:
    # Previously an unknown schema left `meta_regex` undefined and only
    # failed later with a NameError; fail early with a clear message.
    sys.exit("Unsupported --schema '{}': expected 'msp', 'massbank' or "
             "'auto'".format(args.schema))

# this dictionary will store the meta data results from the MSP file
meta_info = {}


# function to extract the meta data using the regular expressions
def parse_meta(meta_regex, meta_info=None):
    """Match the current input line against every metadata pattern.

    NOTE: the text that is searched is the module-level variable ``line``
    (set by the file-parsing loop), not an argument of this function.

    :param meta_regex: dict mapping a metadata key to a list of regex
        pattern strings.
    :param meta_info: dict to update in place; a new dict is created when
        ``None``.
    :return: ``meta_info``.  For every pattern that matches
        (case-insensitively), the key is set to the captured groups joined
        with '-' and stripped, so a later matching pattern for the same key
        overwrites an earlier one.
    """
    if meta_info is None:
        meta_info = {}
    for k, regexes in six.iteritems(meta_regex):
        for reg in regexes:
            m = re.search(reg, line, re.IGNORECASE)
            if m:
                meta_info[k] = '-'.join(m.groups()).strip()
    return meta_info


######################################################################
# Setup parameter dictionary
######################################################################
def init_paramd(args):
    """Create the parameter dictionary for one spectrum.

    :param args: parsed command line namespace; ``database``, ``profile``,
        ``candidates``, ``ppm_max`` and ``polarity`` are read.
    :return: dict with a ``"cli"`` sub-dict (SIRIUS option -> value) and a
        ``"default_ion"`` entry: "[M+H]+" for positive polarity, "[M-H]-"
        for negative polarity and '' for anything else.
    """
    # A plain dict: the former defaultdict() had no default factory and
    # therefore behaved exactly like one.
    paramd = {
        "cli": {
            "--database": args.database,
            "--profile": args.profile,
            "--candidates": args.candidates,
            "--ppm-max": args.ppm_max,
        }
    }
    if args.polarity == 'positive':
        paramd["default_ion"] = "[M+H]+"
    elif args.polarity == 'negative':
        paramd["default_ion"] = "[M-H]-"
    else:
        paramd["default_ion"] = ''

    return paramd


######################################################################
# Function to run sirius when all meta and spectra is obtained
######################################################################
def run_sirius(meta_info, peaklist, args, wd, spectrac):
    """Write the peak list of one spectrum and build its SIRIUS command.

    The command is executed immediately only when ``args.cores_top_level``
    is 1 and the spectrum has at least ``args.minMSMSpeaks`` peaks;
    otherwise it is only returned so the caller can run it later.

    :param meta_info: dict of metadata parsed for this spectrum.
    :param peaklist: sequence of peaks; items 0 and 1 of each peak are
        written as strings (m/z and intensity).
    :param args: parsed command line namespace.
    :param wd: working directory for the peak file and the SIRIUS output.
    :param spectrac: index of the spectrum, used to name files and results.
    :return: tuple ``(paramd, cmd)`` - the parameter dictionary and the
        shell command string.  Registering ``paramd`` in the module-level
        ``paramds`` table is left to the caller.
    """
    # Get sample details (if possible to extract) e.g. if created as part of
    # the msPurity pipeline) choose between getting additional details to
    # add as columns as either all meta data from msp, just details from the
    # record name (i.e. when using msPurity and we have the columns
    # coded into the name) or just the spectra index (spectrac)
    paramd = init_paramd(args)

    if args.meta_select_col == 'name':
        # have additional column of just the name
        paramd['additional_details'] = {'name': meta_info['name']}
    elif args.meta_select_col == 'name_split':
        # have additional columns split by "|" and
        # then on ":" e.g. MZ:100.2 | RT:20 | xcms_grp_id:1
        # Split on the first ":" only so values that contain ":" are kept
        # whole, and skip segments without ":" instead of raising
        # IndexError.
        details = {}
        for segment in meta_info['name'].split("|"):
            key, sep, value = segment.partition(":")
            if sep:
                details[key.strip()] = value.strip()
        paramd['additional_details'] = details
    elif args.meta_select_col == 'all':
        # have additional columns based on all
        # the meta information extracted from the MSP
        paramd['additional_details'] = meta_info
    else:
        # Just have an index of the spectra in the MSP file
        paramd['additional_details'] = {'spectra_idx': spectrac}

    sample_name = "{}_sirius_result".format(spectrac)
    paramd["SampleName"] = sample_name
    paramd["cli"]["--output"] = os.path.join(wd, sample_name)

    # =============== Output peaks to txt file ==============================
    paramd["cli"]["--ms2"] = os.path.join(wd,
                                          "{}_tmpspec.txt".format(spectrac))

    # write spec file
    with open(paramd["cli"]["--ms2"], 'w') as outfile:
        for p in peaklist:
            outfile.write("{}\t{}\n".format(p[0], p[1]))

    # =============== Update param based on MSP metadata ======================
    # Replace param details with details from MSP if required
    if meta_info.get('precursor_type'):
        paramd["cli"]["--ion"] = meta_info['precursor_type']
    elif paramd["default_ion"]:
        paramd["cli"]["--ion"] = paramd["default_ion"]
    else:
        paramd["cli"]["--auto-charge"] = ''

    if meta_info.get('precursor_mz'):
        paramd["cli"]["--precursor"] = meta_info['precursor_mz']

    # ============== Create CLI cmd for sirius ================================
    # The command is executed through a shell (os.system) and several values
    # come straight from the input file, so every value is shell-quoted.
    # An empty value (e.g. --auto-charge) is emitted as a bare flag, exactly
    # as the shell parsed the previous unquoted string.
    # NOTE(review): quoting follows POSIX shell rules.
    cmd_parts = ["sirius", "--fingerid"]
    for k, v in six.iteritems(paramd["cli"]):
        cmd_parts.append(str(k))
        value = str(v)
        if value:
            cmd_parts.append(six.moves.shlex_quote(value))
    cmd = " ".join(cmd_parts)

    # =============== Run sirius =============================================
    # Filter before process with a minimum number of MS/MS peaks.  The peak
    # count is taken from the peak list itself rather than from the
    # module-level line counter of the parsing loop.
    if len(peaklist) >= float(args.minMSMSpeaks):

        if int(args.cores_top_level) == 1:
            os.system(cmd)

    return paramd, cmd


def work(cmds):
    """Run each shell command in ``cmds`` in order.

    Kept as a module-level function so it can be handed to
    ``multiprocessing.Pool.map``.

    :param cmds: iterable of shell command strings.
    :return: list with the ``os.system`` return value of every command.
    """
    return [os.system(cmd) for cmd in cmds]
######################################################################
# Parse MSP file and run SIRIUS CLI
######################################################################
# keep list of commands if performing in CLI in parallel
cmds = []
# keep a dictionary of all params
paramds = {}
# keep count of spectra (for uid)
spectrac = 0
# minimum number of MS/MS peaks a spectrum needs to be processed
min_peaks = float(args.minMSMSpeaks)

# NOTE: `line` and `plinesread` are deliberately module-level names; the
# helper functions defined earlier in this script may read them.
with open(args.input_pth, "r") as infile:
    # number of lines for the peaks
    pnumlines = 0
    # number of lines read for the peaks
    plinesread = 0
    for line in infile:

        line = line.strip()

        if pnumlines == 0:

            # ============== Extract metadata from MSP ========================
            meta_info = parse_meta(meta_regex, meta_info)

            # The peak section starts once the record flag and the peak
            # header (massbank) or the peak count (msp) have been seen.
            if ('massbank' in meta_info and 'cols' in meta_info) or \
                    ('msp' in meta_info and 'num_peaks' in meta_info):
                pnumlines = int(meta_info['num_peaks'])
                peaklist = []
                plinesread = 0

        elif plinesread < pnumlines:
            # =============== Extract peaks from MSP ==========================
            # .split() will split on any empty space (i.e. tab and space)
            fields = line.split()
            # Keep only m/z and intensity, not relative intensity
            peaklist.append((fields[0], fields[1]))
            plinesread += 1

        elif plinesread and plinesread == pnumlines:
            # ======= Get sample name and additional details for output =======
            spectrac += 1
            paramd, cmd = run_sirius(meta_info, peaklist, args, wd, spectrac)

            paramds[paramd["SampleName"]] = paramd
            # Only queue spectra that pass the minimum-peak filter.
            # run_sirius() returns the command even for filtered spectra, so
            # queuing it unconditionally made the parallel mode ignore
            # --minMSMSpeaks.
            if plinesread >= min_peaks:
                cmds.append(cmd)

            meta_info = {}
            pnumlines = 0
            plinesread = 0

    # end of file. Check if there is a MSP spectra to
    # run sirius on still

    if plinesread and plinesread == pnumlines:
        paramd, cmd = run_sirius(meta_info, peaklist, args, wd, spectrac + 1)

        paramds[paramd["SampleName"]] = paramd
        if plinesread >= min_peaks:
            cmds.append(cmd)

# Perform multiprocessing on command line call level
if int(args.cores_top_level) > 1:
    chunk_size = int(args.chunks)
    cmds_chunks = [cmds[x:x + chunk_size]
                   for x in range(0, len(cmds), chunk_size)]
    pool = multiprocessing.Pool(processes=int(args.cores_top_level))
    try:
        pool.map(work, cmds_chunks)
    finally:
        # release the worker processes even when a task raised
        pool.close()
        pool.join()

######################################################################
# Concatenate and filter the output
######################################################################
# Every spectrum has its own "<index>_sirius_result" directory below wd; the
# summary file sits two levels below wd.
outfiles = glob.glob(os.path.join(wd, '*', '*', 'summary_csi_fingerid.csv'))


def _spectrum_index(path):
    """Return the numeric spectrum index encoded in a summary file path.

    The third-last path component is the "<index>_sirius_result" directory,
    the same component that is used below to look up the parameters.
    """
    return int(re.match(r'\d+', path.split(os.sep)[-3]).group(0))


# sort files nicely (numerically by spectrum index)
outfiles.sort(key=_spectrum_index)
print(outfiles)

if not outfiles:
    print('No results')
    sys.exit()

# The column names are taken from the first summary file only.
with open(outfiles[0], 'r') as infile:
    headers = next(csv.reader(infile, delimiter='\t'))

# Additional detail columns (from the last processed spectrum) come first.
headers = list(paramd['additional_details'].keys()) + headers

with open(args.result_pth, 'a') as merged_outfile:
    dwriter = csv.DictWriter(merged_outfile,
                             fieldnames=headers, delimiter='\t')
    dwriter.writeheader()

    # Iterate in the numeric order established above.  Re-sorting with
    # sorted() ordered the paths lexicographically, which put spectrum 10
    # before spectrum 2.
    for fn in outfiles:
        print(fn)

        with open(fn) as infile:
            reader = csv.DictReader(infile, delimiter='\t')

            ad = paramds[fn.split(os.sep)[-3]]['additional_details']

            for row in reader:
                row.update(ad)
                # round score to 5 d.p.
                row['score'] = round(float(row['score']), 5)

                dwriter.writerow(row)

# ----------------------------------------------------------------------
# Remainder of the changeset: tool wrapper sirius_csifingerid.xml.  The XML
# markup was lost when this dump was produced; the surviving text is kept
# verbatim below.
# ----------------------------------------------------------------------
# diff -r 000000000000 -r 9e6bf7278257 sirius_csifingerid.xml
# --- /dev/null Thu Jan 01 00:00:00 1970 +0000
# +++ b/sirius_csifingerid.xml Wed Feb 05 10:41:48 2020 -0500
# @@ -0,0 +1,194 @@
#
#  is used to identify metabolites using single and
#  tandem mass spectrometry
#
#  sirius-csifingerid
#
# ----------------
# SIRIUS-FingerID
# ----------------
#
# Description
# -----------
#
# | SIRIUS is a java-based software framework for discovering a landscape of
# | de-novo identification of metabolites using single and tandem mass
# | spectrometry. SIRIUS uses isotope pattern analysis for detecting the
# | molecular formula and further analyses the fragmentation pattern of a
# | compound using fragmentation trees. Website:
# | https://bio.informatik.uni-jena.de/software/sirius/
# |
#
# Parameters
# ----------
#
# **\1. MSP file**
#
# MSP file created using *Create MSP* tool
#
# **\2. Select SIRIUS-CSI:FingerID Databases**
#
# The following databases are available:
#
# * PubChem
#
# * HMDB
#
# * KEGG
#
# * KNApSAcK
#
# * BioCyc
#
# * All (SIRIUS will consider all m/z possible molecular formulas)
#
# **\3. Mass deviation of the fragment peaks in ppm**
#
# Allowed mass deviation of the fragment peaks.
#
# **\4. The maximum number of candidates in the output**
#
# Set the top X candidates to return.
#
# **\5. Ion mode**
#
# * Positive
#
# * Negative
#
# **\6. Analysis used**
#
# * Orbitrap
#
# * qTOF
#
# * FT-ICR
#
# If you want to analyze spectra measured with Orbitrap or FT-ICR, you should
# specify the appropriate analysis profile. A profile is a set of configuration
# options and scoring functions SIRIUS 3 will use for its analysis. For example,
# the Orbitrap and FT-ICR profiles have tighter constraints for the allowed mass
# deviation but do not rely so much on the intensity of isotope peaks.
+ + +Developers and contributors +--------------------------- + +- **Jordi Capellades (j.capellades.to@gmail.com) - Universitat Rovira i Virgili (SP)** +- **Thomas N Lawson (t.n.lawson@bham.ac.uk) - University of Birmingham (UK)** +- **Simon Bray (sbray@informatik.uni-freiburg.de) - University of Freiburg (Germany)** +- **Ralf Weber (r.j.weber@bham.ac.uk) - University of Birmingham (UK)** + + + + 10.1073/pnas.1509788112 + 10.1093/bioinformatics/btu275 + + diff -r 000000000000 -r 9e6bf7278257 test-data/CCMSLIB00000578155.msp --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/CCMSLIB00000578155.msp Wed Feb 05 10:41:48 2020 -0500 @@ -0,0 +1,55 @@ +Name: D-GLUCOSE-6-PHOSPHATE +Synon: $:00in-source +DB#: CCMSLIB00000578155 +InChIKey: NBSCHQHZLSJFNQ-UHFFFAOYSA-N +Precursor_type: [M-H]- +Spectrum_type: MS2 +PrecursorMZ: 259.022 +Instrument: Q-Exactive Plus +Ion_mode: N +Formula: C6H13O9P +MW: 260 +ExactMass: 260.029718626 +Comments: "cas number=54010-71-8" "pubmed id=5958" "SMILES=C(C1C(C(C(C(O1)O)O)O)O)OP(=O)(O)O" "computed SMILES=O=P(O)(O)OCC1OC(O)C(O)C(O)C1O" "computed InChI=InChI=1S/C6H13O9P/c7-3-2(1-14-16(11,12)13)15-6(10)5(9)4(3)8/h2-10H,1H2,(H2,11,12,13)" "ion source=LC-ESI" "compound source=Commercial standard" "instrument=Q-Exactive Plus" "exact mass=260.03" "charge state=0" "source file=IROA_PLATE_neg_1_H.mzXML" "ms level=MS2" "origin=GNPS-EMBL-MCF" "author=pphapale, Alexandrov Theodore, Prasad" "ionization mode=negative" "precursor m/z=259.022" "precursor type=[M-H]-" "computed mass accuracy=1.7088355429058593" "computed mass error=-4.426259999945614E-4" "SPLASH=splash10-0002-9000000000-952cb45e58693e9f65b4" "submitter=GNPS Collaboration (University of California, San Diego)" +Num Peaks: 40 +53.041370 0.041054 +59.012539 0.064363 +71.012398 0.228006 +73.028061 0.075870 +73.063553 0.074974 +76.275803 0.045210 +78.957573 25.209277 +80.963760 0.251310 +82.959442 0.110012 +83.059830 0.080083 +96.968170 100.000000 +97.972389 0.086076 +101.023033 0.843600 
+102.224525 0.053152 +111.524658 0.047110 +122.061569 0.051876 +131.034348 0.069417 +138.978973 6.805775 +143.045074 0.077522 +150.924576 0.048922 +150.978821 0.448366 +168.989807 1.792421 +175.017166 0.273820 +177.014206 0.075262 +177.033279 0.093911 +179.028229 0.060016 +189.016006 0.069395 +199.000717 2.367082 +204.990021 0.056081 +214.751755 0.053999 +214.913773 0.079768 +223.000900 0.202970 +250.817719 0.051827 +258.921234 0.240552 +258.956421 0.115132 +258.981964 0.095161 +259.022217 1.307559 +264.273438 0.053000 +276.932190 0.888159 +277.930237 0.064464 + diff -r 000000000000 -r 9e6bf7278257 test-data/CCMSLIB00000578155_result.tsv --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/CCMSLIB00000578155_result.tsv Wed Feb 05 10:41:48 2020 -0500 @@ -0,0 +1,9 @@ +name inchikey2D inchi molecularFormula rank score name smiles xlogp pubchemids links +D-GLUCOSE-6-PHOSPHATE NBSCHQHZLSJFNQ InChI=1S/C6H13O9P/c7-3-2(1-14-16(11,12)13)15-6(10)5(9)4(3)8/h2-10H,1H2,(H2,11,12,13) C6H13O9P 1 -2956.17597 D-GLUCOSE-6-PHOSPHATE C(C1C(C(C(C(O)O1)O)O)O)OP(=O)(O)O 208;5958;65127;439198;439284;439404;439427;440100;447096;449526;4178491;4459709;9817215;9859975;10038266;10332946;10422797;10422798;10848963;11499884;11536233;11651816;11651817;11701643;12314997;12598269;16219407;21604864;21604865;23421197;23421199;23421200;24802166;25200774;25244236;42609823;44589902;44629605;46936284;51351673;51351674;59660207;59660208;66804219;70828590;71048769;72200063;89530481;89533633;90087729;90657928;92043642;92144442;92331699;92450038;100983220;101251820;102072969;124302956;124303605 HMDB:(3498);KNApSAcK:(7307);Natural Products:(UNPD119019 UNPD208877);CHEBI:(47944 136602 41076 4141 17665 134068 91004 61567 61667 4170 61548 58735 17719 58225 60332 58247 48066);KEGG:(C02962 C03735 C00275 C02965 C01172 C00092 C00668 C01113);Plantcyc:(MANNOSE-6P CPD-15711 CPD-15712 D-HEXOSE-6-PHOSPHATE GLC-6-P ALPHA-GLC-6-P CPD-1241);YMDB:(2311);Biocyc:(CPD-15712 CPD-1241) +D-GLUCOSE-6-PHOSPHATE 
HXXFSFRBOHSIMQ InChI=1S/C6H13O9P/c7-1-2-3(8)4(9)5(10)6(14-2)15-16(11,12)13/h2-10H,1H2,(H2,11,12,13) C6H13O9P 2 -2968.893 D-GLUCOSE-6-PHOSPHATE C(C1C(C(C(C(O1)OP(=O)(O)O)O)O)O)O 466;65533;122250;123912;439165;439279;439426;439995;644175;1549075;1549076;3034296;3246168;3551220;5702593;6560208;6560209;7091266;7098639;10084035;11108064;11299931;11536234;11557960;11586967;11637475;11701642;12773693;12773694;15720053;20706002;21120286;22298591;23421196;23421198;23724605;23724607;24802153;24802168;25134172;25244208;25245607;26470622;26470623;26470920;26470921;26470922;40467866;40467867;40467868;40473131;40473132;42609824;44224049;45109780;46173227;46173228;46878478;51397481;57349329;57466719;57616986;57616987;58434201;59383287;59973641;59973642;59985133;60023647;67062884;67062905;67062913;67062918;67794900;68298161;68937634;70124502;70837719;71122101;71728461;88462985;90472756;91265893;91658980;101503810;101747832;101747833;121494054;122545953;125293590;125293595;125293596;125293598 HMDB:(62705);KNApSAcK:(7389);Natural Products:(UNPD85752 UNPD57928 UNPD186485);CHEBI:(16077 17973 75522 24588 53072 58601 57684 60389 58336 60465 53025 16326 80181 58908 18205 16218 58409 57629);KEGG:(C15924 C15926 C01171 C03384 C00636 C00446 C01002 C00103 C00663);Plantcyc:(CPD-9828 GALACTOSE-1P GLC-1-P MANNOSE-1P CPDQT-4 CPD-448 CPD4FS-5);YMDB:(970);Biocyc:(CPD4FS-5) +D-GLUCOSE-6-PHOSPHATE BGWGXPAPYGQALX InChI=1S/C6H13O9P/c7-2-6(10)5(9)4(8)3(15-6)1-14-16(11,12)13/h3-5,7-10H,1-2H2,(H2,11,12,13) C6H13O9P 3 -2996.82333 D-GLUCOSE-6-PHOSPHATE C(C1C(C(C(CO)(O)O1)O)O)OP(=O)(O)O 719;124155;439160;439396;440641;440970;444848;5083448;9543488;15648788;16760431;20843252;21604862;21604863;23421195;24802142;25201714;25245410;42609822;46174048;46878483;52916945;86308139;91746169;92024282;102322321;122174030;124300900;124350439;124524514;124579643 HMDB:(6873);KNApSAcK:(7305);Natural Products:(UNPD153056);CHEBI:(57634 4251 81499 61553 58695 16084 6307 45804 47946 58926 61527);KEGG:(C06312 C18096 C01097 C05345 
C00085);Plantcyc:(TAGATOSE-6-PHOSPHATE FRUCTOSE-6P);Biocyc:(L-TAGATOSE-6-PHOSPHATE) +D-GLUCOSE-6-PHOSPHATE PMTUDJVZIGZBIX InChI=1S/C6H13O9P/c7-1-3-4(9)5(10)6(2-8,14-3)15-16(11,12)13/h3-5,7-10H,1-2H2,(H2,11,12,13) C6H13O9P 4 -2999.57091 D-GLUCOSE-6-PHOSPHATE C(C1C(C(C(CO)(O1)OP(=O)(O)O)O)O)O 193537;5176477;6398638;15703397;16069990;21126112;21126113;57357663;99639213;124202606 HMDB:(6800);CHEBI:(27884 57267 12350);KEGG:(C03267);YMDB:(878);Biocyc:(FRUCTOSE-2-PHOSPHATE) +D-GLUCOSE-6-PHOSPHATE RHKKZBWRNHGJEZ InChI=1S/C6H13O9P/c7-1-3-4(8)5(9)6(10,15-3)2-14-16(11,12)13/h3-5,7-10H,1-2H2,(H2,11,12,13) C6H13O9P 5 -3000.17545 D-GLUCOSE-6-PHOSPHATE C(C1C(C(C(COP(=O)(O)O)(O)O1)O)O)O 717;439394;10400369;21627880;23421194;25244216;51397484;52916944;90658050;90658051;90659357;90659358;92209483;97041850 HMDB:(1076);KNApSAcK:(19676);CHEBI:(37515 58674);KEGG:(C01094);Plantcyc:(FRU1P);Biocyc:(CPD-16154 CPD-16158 CPD-16159) +D-GLUCOSE-6-PHOSPHATE INAPMGSXUVUWAF InChI=1S/C6H13O9P/c7-1-2(8)4(10)6(5(11)3(1)9)15-16(12,13)14/h1-11H,(H2,12,13,14) C6H13O9P 6 -3061.86763 D-GLUCOSE-6-PHOSPHATE C1(C(C(C(C(C1O)O)OP(=O)(O)O)O)O)O 9;107737;160886;161368;440043;440194;4449629;5288642;5288700;6323385;7098643;10659045;13072112;18654477;25200523;25200860;25203035;35027167;53924828;59824613;59824614;59824615;59824616;101661021;121400595;121403401 HMDB:(6814);KNApSAcK:(7483);Natural Products:(UNPD107543 UNPD92136 UNPD189294);CHEBI:(58469 18169 62383 37493 18384 58433 18297 64841 58401 64838 84142 84141);KEGG:(C03546 C06155 C01177 C04006);Plantcyc:(1-L-MYO-INOSITOL-1-P D-MYO-INOSITOL-1-MONOPHOSPHATE CPD-6701 CPD-6702 CPD-6746 CPD-9887 D-MYO-INOSITOL-4-PHOSPHATE);YMDB:(2322);Biocyc:(D-MYO-INOSITOL-4-PHOSPHATE CPD-6701 CPD-6702 CPD-6746) +D-GLUCOSE-6-PHOSPHATE GSXOAOHZAIYLCY InChI=1S/C6H13O9P/c7-1-3(8)5(10)6(11)4(9)2-15-16(12,13)14/h4-7,9-11H,1-2H2,(H2,12,13,14) C6H13O9P 7 -3108.21629 D-GLUCOSE-6-PHOSPHATE C(C(=O)C(C(C(COP(=O)(O)O)O)O)O)O 
603;69507;151197;5459902;5459952;6602428;20111689;20111690;21114947;21872891;23615358;40467872;40467873;46943428;50909805;87615581 HMDB:(124);KNApSAcK:(19683);Natural Products:(UNPD94448);CHEBI:(57579 61519 134284 15946 15845 61559 47947 134283);Plantcyc:(D-ALLULOSE-6-PHOSPHATE);YMDB:(78);Biocyc:(CPD-15828 CPD-15826 D-ALLULOSE-6-PHOSPHATE) +D-GLUCOSE-6-PHOSPHATE ZKLLSNQJRLJIGT InChI=1S/C6H13O9P/c7-1-3(8)5(10)6(11)4(9)2-15-16(12,13)14/h3,5-8,10-11H,1-2H2,(H2,12,13,14) C6H13O9P 8 -3116.86489 D-GLUCOSE-6-PHOSPHATE C(C(C(C(C(=O)COP(=O)(O)O)O)O)O)O 481;65246;151033;439837;440076;6101730;11129032;11737049;14844438;20111955;21145035;23615304;54551858;54551860;54551861;54551863;91010818 HMDB:(60467);KNApSAcK:(19630);CHEBI:(38342 218 18105);KEGG:(C03654 C02888);YMDB:(655);Biocyc:(CPD-15970 CPD0-1116 CPD-531) diff -r 000000000000 -r 9e6bf7278257 test-data/ML006801.tsv --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/ML006801.tsv Wed Feb 05 10:41:48 2020 -0500 @@ -0,0 +1,2 @@ +name inchikey2D inchi molecularFormula rank score name smiles xlogp pubchemids links +L-thialysine; LC-ESI-ITFT; MS2; CE: 50%; R=7500; [M+H]+ GHSJKUNUIHUPDF InChI=1S/C5H12N2O2S/c6-1-2-10-3-4(7)5(8)9/h4H,1-3,6-7H2,(H,8,9) C5H12N2O2S 1 -7.08314 L-thialysine; LC-ESI-ITFT; MS2; CE: 50%; R=7500; [M+H]+ C(CSCC(C(=O)O)N)N 20049;99558;6995002;12898158;25246097;54754416;57517225 HMDB:(33518);Natural Products:(UNPD166389);CHEBI:(497734);Plantcyc:(S-2-AMINOETHYL-L-CYSTEINE);Biocyc:(S-2-AMINOETHYL-L-CYSTEINE THIALYSINE) diff -r 000000000000 -r 9e6bf7278257 test-data/ML006801.txt --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/ML006801.txt Wed Feb 05 10:41:48 2020 -0500 @@ -0,0 +1,63 @@ +ACCESSION: ML004801 +RECORD_TITLE: L-thialysine; LC-ESI-ITFT; MS2; CE: 50%; R=7500; [M+H]+ +DATE: 2014.11.12 +AUTHORS: Mark Earll, Stephan Beisken, EMBL-EBI +LICENSE: CC BY-SA +COPYRIGHT: Copyright (C) 2014, European Molecular Biology Laboratory - European Bioinformatics Institute (EMBL-EBI), Hinxton, UK. 
+PUBLICATION: Beisken S et al (2014) Scientific Data, 1:140029, DOI:10.1038/sdata.2014.29. http://www.ebi.ac.uk/metabolights/MTBLS38 +COMMENT: CONFIDENCE standard compound +COMMENT: ML_ID 48 +CH$NAME: L-thialysine +CH$NAME: (2R)-2-amino-3-(2-aminoethylsulfanyl)propanoic acid +CH$COMPOUND_CLASS: N/A; Environmental Standard +CH$FORMULA: C5H12N2O2S +CH$EXACT_MASS: 164.0619 +CH$SMILES: NCCSC[C@H](N)C(=O)O +CH$IUPAC: InChI=1S/C5H12N2O2S/c6-1-2-10-3-4(7)5(8)9/h4H,1-3,6-7H2,(H,8,9)/t4-/m0/s1 +CH$LINK: CHEBI 497734 +CH$LINK: PUBCHEM CID:99558 +CH$LINK: INCHIKEY GHSJKUNUIHUPDF-BYPYZUCNSA-N +CH$LINK: CHEMSPIDER 89945 +AC$INSTRUMENT: LTQ Orbitrap Velos Thermo Scientific +AC$INSTRUMENT_TYPE: LC-ESI-ITFT +AC$MASS_SPECTROMETRY: MS_TYPE MS2 +AC$MASS_SPECTROMETRY: ION_MODE POSITIVE +AC$MASS_SPECTROMETRY: IONIZATION ESI +AC$MASS_SPECTROMETRY: FRAGMENTATION_MODE HCD +AC$MASS_SPECTROMETRY: COLLISION_ENERGY 50 % (nominal) +AC$MASS_SPECTROMETRY: RESOLUTION 7500 +AC$CHROMATOGRAPHY: COLUMN_NAME HSS T3 1.7 um, 2x150 mm, Waters +AC$CHROMATOGRAPHY: FLOW_GRADIENT 100/0 at 0 min, 90/10 at 7.5 min, 0/100 at 10 min, 0/100 at 12 min, 100/0 at 18 min, 100/0 at 25 min +AC$CHROMATOGRAPHY: FLOW_RATE 250 uL/min at 0 min, 400 uL/min at 7.5 min +AC$CHROMATOGRAPHY: RETENTION_TIME 1.2 min +AC$CHROMATOGRAPHY: SOLVENT A 0.2% Formic Acid +AC$CHROMATOGRAPHY: SOLVENT B 98/2/0.2 Acetonitrile/Water/Formic Acid +MS$FOCUSED_ION: BASE_PEAK 165.069 +MS$FOCUSED_ION: PRECURSOR_M/Z 165.0692 +MS$FOCUSED_ION: PRECURSOR_TYPE [M+H]+ +MS$DATA_PROCESSING: RECALIBRATE loess on assigned fragments and MS1 +MS$DATA_PROCESSING: REANALYZE Peaks with additional N2/O included +MS$DATA_PROCESSING: WHOLE RMassBank 1.7.0 +PK$SPLASH: splash10-00di-0900000000-99e0ec9e6034dff32dc8 +PK$ANNOTATION: m/z tentative_formula formula_count mass error(ppm) + 76.0215 C2H6NS+ 1 76.0215 -1.27 + 88.0392 C3H6NO2+ 1 88.0393 -1.19 + 92.0162 C2H6NOS+ 1 92.0165 -2.73 + 102.037 C4H8NS+ 1 102.0372 -1.93 + 109.0271 C4H3N3O+ 1 109.0271 0.61 + 120.0112 
C3H6NO2S+ 1 120.0114 -1.8 + 148.0424 C5H10NO2S+ 1 148.0427 -1.8 + 165.0699 C5H13N2O2S+ 1 165.0692 4.09 + 174.0753 C5H10N4O3+ 1 174.0747 3.5 +PK$NUM_PEAK: 9 +PK$PEAK: m/z int. rel.int. + 76.0215 18351.9 16 + 88.0392 41980.6 36 + 92.0162 9969.8 8 + 102.037 24583.1 21 + 109.0271 1331.3 1 + 120.0112 1140642.2 999 + 148.0424 40689.7 35 + 165.0699 12929.9 11 + 174.0753 1548.6 1 +// \ No newline at end of file diff -r 000000000000 -r 9e6bf7278257 test-data/RP022611.txt --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/RP022611.txt Wed Feb 05 10:41:48 2020 -0500 @@ -0,0 +1,48 @@ +ACCESSION: RP022611 +RECORD_TITLE: D-Glucose; LC-ESI-QTOF; MS2; CE: 10; R=; [M-H]- +DATE: 2017.11.29 +AUTHORS: BGC, Helmholtz Zentrum Muenchen +LICENSE: CC BY +COPYRIGHT: Copyright (C) 2017 +COMMENT: CONFIDENCE standard compound +COMMENT: INTERNAL_ID 226 +CH$NAME: D-Glucose +CH$NAME: (3R,4S,5S,6R)-6-(hydroxymethyl)oxane-2,3,4,5-tetrol +CH$COMPOUND_CLASS: N/A; Metabolomics Standard +CH$FORMULA: C6H12O6 +CH$EXACT_MASS: 180.0634 +CH$SMILES: OC[C@H]1OC(O)[C@H](O)[C@@H](O)[C@@H]1O +CH$IUPAC: InChI=1S/C6H12O6/c7-1-2-3(8)4(9)5(10)6(11)12-2/h2-11H,1H2/t2-,3-,4+,5-,6?/m1/s1 +CH$LINK: CAS 50-99-7 +CH$LINK: CHEBI 4167 +CH$LINK: KEGG C00031 +CH$LINK: PUBCHEM CID:5793 +CH$LINK: INCHIKEY WQZGKKKJIJFFOK-GASJEMHNSA-N +CH$LINK: CHEMSPIDER 5589 +AC$INSTRUMENT: maXis plus UHR-ToF-MS, Bruker Daltonics +AC$INSTRUMENT_TYPE: LC-ESI-QTOF +AC$MASS_SPECTROMETRY: MS_TYPE MS2 +AC$MASS_SPECTROMETRY: ION_MODE NEGATIVE +AC$MASS_SPECTROMETRY: IONIZATION ESI +AC$MASS_SPECTROMETRY: FRAGMENTATION_MODE CID +AC$MASS_SPECTROMETRY: COLLISION_ENERGY 10 +AC$CHROMATOGRAPHY: COLUMN_NAME BEH C18 1.7um, 2.1x100mm, Waters +AC$CHROMATOGRAPHY: FLOW_GRADIENT 95/5 at 0 min, 95/5 at 1.12 min, 0.5/99.5 at 6.41 min, 0.5/99.5 at 10.01 min +AC$CHROMATOGRAPHY: FLOW_RATE 400 uL/min +AC$CHROMATOGRAPHY: RETENTION_TIME 0.604 min +AC$CHROMATOGRAPHY: SOLVENT A Water with 0.1% formic acid +AC$CHROMATOGRAPHY: SOLVENT B ACN with 0.1% formic acid 
+MS$FOCUSED_ION: BASE_PEAK 179.0572 +MS$FOCUSED_ION: PRECURSOR_M/Z 179.0561 +MS$FOCUSED_ION: PRECURSOR_TYPE [M-H]- +MS$DATA_PROCESSING: REANALYZE Peaks with additional N2/O included +MS$DATA_PROCESSING: WHOLE RMassBank 2.4.0 +PK$SPLASH: splash10-059i-9000000000-fd62712fc14434a3aa53 +PK$NUM_PEAK: 5 +PK$PEAK: m/z int. rel.int. + 59.0138 278 715 + 71.014 264 679 + 72.9928 30 77 + 89.0251 388 999 + 101.0234 40 102 +// \ No newline at end of file diff -r 000000000000 -r 9e6bf7278257 test-data/RP022611_result.tsv --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/RP022611_result.tsv Wed Feb 05 10:41:48 2020 -0500 @@ -0,0 +1,9 @@ +name inchikey2D inchi molecularFormula rank score name smiles xlogp pubchemids links +D-Glucose; LC-ESI-QTOF; MS2; CE: 10; R=; [M-H]- WQZGKKKJIJFFOK InChI=1S/C6H12O6/c7-1-2-3(8)4(9)5(10)6(11)12-2/h2-11H,1H2 C6H12O6 1 -2990.5565 D-Glucose; LC-ESI-QTOF; MS2; CE: 10; R=; [M-H]- C(C1C(C(C(C(O)O1)O)O)O)O 206;5793;6036;18950;64689;79025;81696;185698;439353;439357;439507;439583;439680;441035;441032;441033;441034;444314;448388;448702;451187;451188;451189;452245;455147;657055;1549080;2724488;3000450;3034742;5104362;5319264;6102790;6321330;6323336;6400264;6560213;6971003;6971007;6971016;6971096;6971097;6971098;6992021;6992084;7018164;7043897;7044038;7098663;7098664;7157007;9794056;9815418;9834129;9899007;10035228;10081060;10103794;10130220;10197954;10219674;10219763;10313382;10329946;10899282;10954241;11019447;11030410;11344362;11367383;11412863;11480819;11492034;11571906;11571917;11600783;11651921;11672764;11959770;11970126;12003287;12193653;12285853;12285856;12285861;12285862;12285863;12285866;12285868;12285869;12285870;12285871;12285873;12285877;12285878;12285879;12285885;12285886;12285889;12285890;12285891;12285892;12285893;12285894;16211884;16211941;16211984;16211986;16212959;16212960;16212966;16213546;16213640;16213872;16217112;16219580;21355827;22825318;22836365;22836366;23424086;24802149;24802163;24802281;24892722;42626680;44328781;443287
85;46188479;46780441;46897877;50939543;51340651;54445181;54445182;56845432;56845995;57197748;57288387;57483528;57691826;57973135;58070804;58265153;58265160;58265166;58265178;58265190;58265196;58300638;58595959;58594768;58618581;58969552;59034276;59036328;59040622;59083882;59105109;59125088;59146659;59383280;59445439;59503407;59503411;59886072;59965103;60052896;60078648;66629908;67518639;67615000;67615455;67641738;67938791;67944215;67944290;67950444;68167579;68324677;68334110;69528681;70443535;70543261;71309028;71309128;71309129;71309140;71309397;71309503;71309513;71309514;71309671;71309852;71309905;71309908;71309927;71317094;71317095;71317096;71317097;71317182;71777654;75357255;76973265;86278404;87297824;87929779;87931119;88255060;88547603;88974141;89000581;89200515;89332529;89374440;89424182;89742272;89855666;90057933;90159939;90346255;90470917;90472751;90472752;90472753;90472761;90472762;90472770;90473076;90781811;90895196;91057721;92043367;92043446;101015849;101033892;101254308;101254309;101254310;101254311;101254312;101254313;101254314;101254315;101469918;101513786;101718250;101718251;101796201;102089288;102447462;102447463;102601142;102601177;102601371;102601743;102601816;117064633;117064644;117065485;117633116;117768413;117938207;118797420;118797610;118797621;118797622;118855887;118855889;118855904;118855910;118855920;118855925;118924468;121494058;121494046;122360911;122522140;125280077;125280078;125280079;125280080;125309563;125309564;125353406;125363512;125363513;125363514;125363515;126704391 HMDB:(62202);KNApSAcK:(1126);Natural Products:(UNPD148053 UNPD72621 UNPD116684 UNPD119270 UNPD130932 UNPD158921 UNPD83717 UNPD175249 UNPD175204 UNPD191130 UNPD20367 UNPD175399);CHEBI:(37692 37671 37693 63421 37630 27667 72452 4191 4093 37619 15903 80962 37631 17925 37677 15444 37679 27380 4208 18246 16362 28729 37680 18269 4167 37740 59573 59552 27517 28100 37706 83029 28563 28102 37620 37686 37741 86059 4139 37744 68462 37627 28061 37704);KEGG:(C21032 C00221 C00031 
C21050 C02209 C01825 C15923 C00936 C00738 C00737 C06465 C06466 C06464 C00984 C00962 C00159 C06467 C01487 C00267 C00124);Plantcyc:(ALPHA-GLUCOSE L-GALACTOSE ALPHA-D-GALACTOSE CPD-12601 CPD-13559 GALACTOSE CPD-15761 CPD-3607 CPD-15762 510-methenyl-thf GLC);YMDB:(894);Biocyc:(CPD-11613 CPD-13428 CPD-11611 CPD-13559 CPD-12844 CPD-15758 CPD-15627 CPD-15759 Alpha-D-Talose CPD-15628 L-SORBOSONE CPD-18461 CPD-15622 CPD-15624 CPD-15625 CPD-15757 CPD-15761 CPD-3607 CPD-15762 CPD-15621) +D-Glucose; LC-ESI-QTOF; MS2; CE: 10; R=; [M-H]- RFSUNEUAIZKAJO InChI=1S/C6H12O6/c7-1-3-4(9)5(10)6(11,2-8)12-3/h3-5,7-11H,1-2H2 C6H12O6 2 -2992.81068 D-Glucose; LC-ESI-QTOF; MS2; CE: 10; R=; [M-H]- C(C1C(C(C(CO)(O)O1)O)O)O 716;439163;439553;439709;11008518;11105942;11378852;11769129;12306006;12306007;12306010;12306011;12306012;12306013;12306014;12306016;15942891;21581131;24755524;24755531;50990586;58798223;59105060;59642118;59748470;60078501;60109622;66809988;68009591;68015592;69261724;69261935;69261937;71310006;71310036;71529761;89810242;89810768;90159920;90346952;90347094;102193695;117935612 HMDB:(660);KNApSAcK:(1117);Natural Products:(UNPD19574 UNPD185250 UNPD163774 UNPD109385);CHEBI:(48648 48647 48646 28645 37727 49090 48673 37720 37721 48672 49089 49088 48670 37725 29084);KEGG:(C02336 C00095 C01719);Plantcyc:(BETA-D-FRUCTOSE);Biocyc:(CPD-10723 CPD-10729 CPD-15988 CPD-15989 CPD-10730) +D-Glucose; LC-ESI-QTOF; MS2; CE: 10; R=; [M-H]- LKDRXBCSQODPBY InChI=1S/C6H12O6/c7-2-6(11)5(10)4(9)3(8)1-12-6/h3-5,7-11H,1-2H2 C6H12O6 3 -3006.71254 D-Glucose; LC-ESI-QTOF; MS2; CE: 10; R=; [M-H]- C1C(C(C(C(CO)(O)O1)O)O)O 
3426;24310;439192;439304;439312;440545;441036;441484;2723872;2724552;5317407;6432703;6915737;6971020;6971021;6971099;6992107;10130221;10130228;10154314;10176372;11355843;14408225;15559359;16212688;16213406;16213544;16213545;22814148;24802515;45039313;51340644;51340682;52916942;52916943;57745769;59575442;59875236;71308848;71309127;71309810;71309883;71751872;71752285;89015893;89174364;89333506;89345843;89360325;89360759;89363316;89810855;90472720;90472721;90472746;91329420;91654167;101763542;101763543;102602138;118797422;118797598;118855901;118855902;118855927;119077570;121494037;121494038;121494039;121494041;121494042;125300503;125300504;125300505;125300506;125322958;125322959;125322960;125356688;129275707 HMDB:(1266);Natural Products:(UNPD1409 UNPD51200 UNPD43618 UNPD196486 UNPD14140 UNPD69968);CHEBI:(48645 48678 48677 10295 37728 37729 49092 37719 49091 48674 4249 37714 37715);HSDB:(7660-25-5);KEGG:(C08356 C05003 C00764 C00247 C06468 C00795);Plantcyc:(CPD-10726);YMDB:(204);Biocyc:(CPD-15986 CPD-10728 CPD-15987 CPD-10726 CPD-10727 CPD-10725) +D-Glucose; LC-ESI-QTOF; MS2; CE: 10; R=; [M-H]- BJHIKXHVCXFQLS InChI=1S/C6H12O6/c7-1-3(9)5(11)6(12)4(10)2-8/h3,5-9,11-12H,1-2H2 C6H12O6 4 -3015.06916 D-Glucose; LC-ESI-QTOF; MS2; CE: 10; R=; [M-H]- C(C(C(C(C(=O)CO)O)O)O)O 1101;5984;6904;90008;92092;107428;5460024;10965117;11458041;11961810;15559364;15559365;15559366;15559367;15975980;56846514;71310259;87203108;87883498;88364517;89357936;90194848;90471261;100938761;100938762;100938763;100938764;100938765;101274261;102026061;102525471;126737088 HMDB:(62538);KNApSAcK:(33848);Natural Products:(UNPD157348 UNPD11673 UNPD28362 UNPD42482);CHEBI:(13172 13022 134275 47693 27605 37724 27922);HSDB:(87-79-6);KEGG:(C21523 C10906 C01452);Plantcyc:(CPD-9570 CPD-15616 PSICOSE CPD-15382 TAGATOSE);YMDB:(657);Biocyc:(CPD-15825 PSICOSE) +D-Glucose; LC-ESI-QTOF; MS2; CE: 10; R=; [M-H]- GZCGUPFRVQAUEE InChI=1S/C6H12O6/c7-1-3(9)5(11)6(12)4(10)2-8/h1,3-6,8-12H,2H2 C6H12O6 5 -3017.53465 D-Glucose; 
LC-ESI-QTOF; MS2; CE: 10; R=; [M-H]- C(=O)C(C(C(C(CO)O)O)O)O 24749;64731;80127;82308;84996;90173;94780;99459;102190;102288;107526;111112;111123;111317;134512;161658;165139;165171;165863;166991;167792;168037;169509;187891;3037556;3086538;5460248;6451569;10910141;10954115;11229130;11355844;11745248;11805319;12305796;12305797;12305799;12305800;12305811;15977259;16057040;21183545;45109778;53462839;53462852;53462878;56846079;56846519;56846584;57449163;57557846;58654615;58654624;60078498;60101813;66509130;71309394;71309492;71309493;71310055;71310073;71434190;71777455;76973373;87109007;87228435;87228929;87229000;87355288;88034483;88353328;89242343;89317890;89327884;89327885;89472723;89623639;90132269;90273086;90472355;90472363;92023398;92043770;92044000;100917967;101117002;101117003;101129024;101129025;101129026;101129027;101248541;101261456;101265967;101446815;101719777;101728293;101728294;102505103;102601198;102601265;102601267;102601589;102601778;102602086;119077284;119078796;126664755 HMDB:(62473);Natural Products:(UNPD142849 UNPD3363 UNPD7578 UNPD95755 UNPD35192 UNPD45514);CHEBI:(37681 37695 28385 37617 28014 37675 33917 86058 37701 37746 17118 37626 68461);HSDB:(50-99-7);KEGG:(C01582);Plantcyc:(CPD-15373 CPD-15374 CPD-15590);YMDB:(789);Biocyc:(CPD-7409 CPD-15626 CPD-7408 CPD-9728 UDP-GLACTOSE CPD1G-120 CPD-15629 CPD1G-2 CPD-9327 4-AMINO-BUTYRALDEHYDE CPD-7420 CPD-15590 CPD1F-130 CHOLESTEROL_ESTER CPD-15756 4-TOLUENESULFONATE CPD1F-98 DEMETHYLMENAQUINONE UBIQUINONE-9 CPD-7419 ACETONE CPD1F-129 PARATHION CPD-15383 IRON-CHELATE CPD-15760 CPD-15620 VITAMIN_K_2) +D-Glucose; LC-ESI-QTOF; MS2; CE: 10; R=; [M-H]- CDAISMWEOUEBRE InChI=1S/C6H12O6/c7-1-2(8)4(10)6(12)5(11)3(1)9/h1-12H C6H12O6 6 -3046.13844 D-Glucose; LC-ESI-QTOF; MS2; CE: 10; R=; [M-H]- C1(C(C(C(C(C1O)O)O)O)O)O 892;11973225;12302985;53714837;68591801;90768658;91019724;100996307;100996308 HMDB:(34220);KNApSAcK:(1164);Natural Products:(UNPD40912 UNPD103126 UNPD106247 UNPD16776 UNPD54610 UNPD50920 UNPD136396 
UNPD185125 UNPD191761);CHEBI:(27374 17268 27372 24848 25492 23927 27987 23311 10642 22357);KEGG:(C19891 C06153 C00137 C06151 C06152);Plantcyc:(MYO-INOSITOL CPD-8052 CPD-8059 CPD-8050);YMDB:(173);Biocyc:(CPD-8055 CPD-8054 CPD-8053 CPD-8059 CPD-8061 CPD-8060) +D-Glucose; LC-ESI-QTOF; MS2; CE: 10; R=; [M-H]- KEQUNHIAUQQPAC InChI=1S/C6H12O6/c7-1-5(9)3-12-6(10,2-8)4-11-5/h7-10H,1-4H2 C6H12O6 7 -3051.24166 D-Glucose; LC-ESI-QTOF; MS2; CE: 10; R=; [M-H]- C(C1(COC(CO)(CO1)O)O)O 2723627;4180364;13560352;40503129;124202832 HMDB:(32222) +D-Glucose; LC-ESI-QTOF; MS2; CE: 10; R=; [M-H]- YGMNHEPVTNXLLS InChI=1S/C6H12O6/c7-2-5(10)3(8)1-4(9)6(11)12/h3-5,7-10H,1-2H2,(H,11,12) C6H12O6 8 -3052.67808 D-Glucose; LC-ESI-QTOF; MS2; CE: 10; R=; [M-H]- C(C(C(CO)O)O)C(C(=O)O)O 10350;152990;5289313;14122626;15560246;21596764;21596765;21596766;21596767;21596768;21596769;21596770;58966097;88049798;89007240;89391706;101963537;101963539 HMDB:(346) diff -r 000000000000 -r 9e6bf7278257 test-data/demo_db.csv --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/demo_db.csv Wed Feb 05 10:41:48 2020 -0500 @@ -0,0 +1,8 @@ +"Identifier","MonoisotopicMass","MolecularFormula","SMILES","InChI","InChIKey1","InChIKey2","InChIKey3","Name","InChIKey" +"HMDB0000123",75.03202841,"C2H5NO2","NCC(O)=O","InChI=1S/C2H5NO2/c3-1-2(4)5/h1,3H2,(H,4,5)","DHMQDGOQFOQNFH","UHFFFAOYSA","N","Glycine","DHMQDGOQFOQNFH-UHFFFAOYSA-N" +"HMDB0002151",78.0139355,"C2H6OS","CS(C)=O","InChI=1S/C2H6OS/c1-4(2)3/h1-2H3","IAZDPXIOMUYVGZ","UHFFFAOYSA","N","Dimethyl sulfoxide","IAZDPXIOMUYVGZ-UHFFFAOYSA-N" +"HMDB0031239",75.03202841,"C2H5NO2","CCON=O","InChI=1S/C2H5NO2/c1-2-5-3-4/h2H2,1H3","QQZWEECEMNQSTG","UHFFFAOYSA","N","Ethyl nitrite","QQZWEECEMNQSTG-UHFFFAOYSA-N" +"HMDB0014691",75.03202841,"C2H5NO2","CC(=O)NO","InChI=1S/C2H5NO2/c1-2(4)3-5/h5H,1H3,(H,3,4)","RRUDCFGSUDOHDG","UHFFFAOYSA","N","Acetohydroxamic Acid","RRUDCFGSUDOHDG-UHFFFAOYSA-N" 
+"HMDB0002039",85.05276385,"C4H7NO","O=C1CCCN1","InChI=1S/C4H7NO/c6-4-2-1-3-5-4/h1-3H2,(H,5,6)","HNJBEVLQSNELDL","UHFFFAOYSA","N","2-Pyrrolidinone","HNJBEVLQSNELDL-UHFFFAOYSA-N" +"HMDB0060427",85.05276385,"C4H7NO","CC(C)(O)C#N","InChI=1S/C4H7NO/c1-4(2,6)3-5/h6H,1-2H3","MWFMGBPGAXYFAR","UHFFFAOYSA","N","Acetone cyanohydrin","MWFMGBPGAXYFAR-UHFFFAOYSA-N" + diff -r 000000000000 -r 9e6bf7278257 test-data/generic.msp --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/generic.msp Wed Feb 05 10:41:48 2020 -0500 @@ -0,0 +1,123 @@ +NAME: MZ:70.0659 | RT:1483 | XCMS_group:1 | file:1 | scan:NA | pid:NA +PRECURSORMZ: 70.0658950805664 +Comment: +Num Peaks: 8 +50.4781379699707 3487.4296875 4.61 +51.0193099975586 3390.96948242188 4.49 +53.0031509399414 10011.958984375 13.25 +53.5898513793945 4252.7880859375 5.63 +54.3787727355957 3541.5107421875 4.69 +69.0455169677734 9650.0107421875 12.77 +70.0660934448242 37168.609375 49.18 +82.9910659790039 4077.36694335938 5.39 + +NAME: MZ:72.0815 | RT:1823 | XCMS_group:2 | file:1 | scan:NA +PRECURSORMZ: 72.0815277099609 +COMMENT: +Num Peaks: 6 +51.773567199707 818.313903808594 10.98 +54.0346794128418 1247.91137695312 16.75 +54.6847991943359 967.616882324219 12.98 +56.050350189209 1780.01037597656 23.90 +58.4994125366211 975.196228027344 13.09 +72.0814056396484 1660.50390625 22.29 + +NAME: MZ:72.0815 | RT:1857 | scan:NA +PRECURSORMZ: 72.08154296875 +COMMENT: +Num Peaks: 4 +56.0504341125488 1838.78173828125 46.54 +59.9103507995605 701.556762695312 17.75 +63.7723731994629 650.224975585938 16.46 +72.0814590454102 760.228637695312 19.25 + +NAME: MZ:76.0400 | RT:1606 | XCMS_group:5 | file:1 | scan:NA +PRECURSORMZ: 76.0400390625 +COMMENT: +Num Peaks: 4 +53.2376174926758 3224.35571289062 25.41 +60.3291244506836 3193.19482421875 25.17 +73.7529830932617 3305.61401367188 26.05 +82.5309600830078 2965.41772460938 23.37 + +NAME: MZ:79.0219 | RT:177 | XCMS_group:9 | file:1 | scan:NA +PRECURSORMZ: 79.0218658447266 +COMMENT: +Num Peaks: 7 
+53.6282501220703 15316.7431640625 1.07 +59.967342376709 251727.734375 17.51 +61.0115814208984 80113.8046875 5.57 +62.9908714294434 93065.1015625 6.47 +63.9986305236816 950876.9375 66.13 +79.0219345092773 33032.984375 2.30 +95.4936447143555 13826.033203125 0.96 + +NAME: MZ:79.0219 | RT:184 | XCMS_group:9 | file:1 | scan:NA +PRECURSORMZ: 79.0218811035156 +COMMENT: +Num Peaks: 5 +59.1125831604004 67799.1953125 3.10 +59.9673652648926 345613.1875 15.83 +62.9906845092773 117693.296875 5.39 +63.9986686706543 1585970.25 72.62 +80.5974655151367 66719.4609375 3.06 + +NAME: MZ:79.0219 | RT:212 | XCMS_group:9 | file:1 | scan:NA +PRECURSORMZ: 79.0218887329102 +COMMENT: +Num Peaks: 12 +53.1700401306152 2441.47143554688 2.54 +55.1893730163574 2006.07958984375 2.08 +58.9013671875 2539.39086914062 2.64 +59.9673500061035 13423.1376953125 13.94 +61.0115776062012 4831.0986328125 5.02 +62.9908828735352 3668.52905273438 3.81 +63.9986190795898 54386.6640625 56.50 +73.8388671875 2330.30126953125 2.42 +78.5768051147461 2563.25 2.66 +79.0221328735352 2581.44604492188 2.68 +96.8009872436523 2530.70141601562 2.63 +99.6652908325195 2961.3095703125 3.08 + +NAME: MZ:79.9904 | RT:1284 | XCMS_group:11 | file:1 | scan:NA +PRECURSORMZ: 79.9903564453125 +COMMENT: +Num Peaks: 3 +51.6917915344238 584.212829589844 31.93 +53.0398750305176 649.807922363281 35.48 +97.3154754638672 596.341003417969 32.59 + +NAME: MZ:86.0607 | RT:1497 | XCMS_group:19 | file:1 | scan:NA +PRECURSORMZ: 86.060661315918 +COMMENT: +Num Peaks: 4 +53.0031318664551 9658.7939453125 60.81 +53.1939277648926 1998.81518554688 12.58 +80.3447494506836 2044.23645019531 12.87 +101.307479858398 2181.85522460938 13.73 + +NAME: MZ:86.0606 | RT:1498 | XCMS_group:19 | file:1 | scan:NA +PRECURSORMZ: 86.0606307983398 +COMMENT: +Num Peaks: 11 +52.6782836914062 1061.12646484375 3.59 +53.0032196044922 15176.8583984375 51.38 +53.1121788024902 1193.6044921875 4.039 +53.9984169006348 2790.28930664062 9.45 +54.0287094116211 999.250427246094 3.38 
+56.7024726867676 1171.42797851562 3.96 +69.0346069335938 1878.03894042969 3.36 +72.9083633422852 1256.455078125 4.25 +74.0740356445312 1324.07055664062 4.48 +80.5324630737305 1329.61022949219 4.50 +91.0167770385742 1362.0029296875 4.61 + +NAME: MZ:86.0607 | RT:1500 | XCMS_group:19 | file:1 | scan:NA +PRECURSORMZ: 86.0606536865234 +COMMENT: +Num Peaks: 5 +53.0031509399414 29580.330078125 61.35 +55.3490409851074 4989.64990234375 10.35 +61.990592956543 4089.9619140625 8.48 +63.2290992736816 4168.97412109375 8.64 +67.6647109985352 5392.48779296875 11.18 diff -r 000000000000 -r 9e6bf7278257 test-data/generic.tsv --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/generic.tsv Wed Feb 05 10:41:48 2020 -0500 @@ -0,0 +1,7 @@ +name inchikey2D inchi molecularFormula rank score name smiles xlogp pubchemids links +MZ:86.0606 | RT:1498 | XCMS_group:19 | file:1 | scan:NA HNJBEVLQSNELDL InChI=1S/C4H7NO/c6-4-2-1-3-5-4/h1-3H2,(H,5,6) C4H7NO 1 -149.0988 MZ:86.0606 | RT:1498 | XCMS_group:19 | file:1 | scan:NA C1CC(=NC1)O 12025;3956071;10419134;12197590;12197592;18999930;20030003;20589568;58329813;90472990;91343693;101225382;101796586;123509162 HMDB:(2039);KNApSAcK:(38233);Natural Products:(UNPD211738);CHEBI:(36592);HSDB:(616-45-5);Plantcyc:(CPD-19607) +MZ:86.0606 | RT:1498 | XCMS_group:19 | file:1 | scan:NA MWFMGBPGAXYFAR InChI=1S/C4H7NO/c1-4(2,6)3-5/h6H,1-2H3 C4H7NO 2 -169.83339 MZ:86.0606 | RT:1498 | XCMS_group:19 | file:1 | scan:NA CC(C)(C#N)O 6406;10486800;91131204 HMDB:(60427);Natural Products:(UNPD47968);CHEBI:(15348);HSDB:(75-86-5);KEGG:(C02659);Plantcyc:(2-HYDROXY-2-METHYLPROPANENITRILE) +MZ:72.0815 | RT:1823 | XCMS_group:2 | file:1 | scan:NA RWRDLPDLKQPQOW InChI=1S/C4H9N/c1-2-4-5-3-1/h5H,1-4H2 C4H9N 1 -136.14546 MZ:72.0815 | RT:1823 | XCMS_group:2 | file:1 | scan:NA C1CCNC1 31268;3613359;11062297;12196044;12196046;12196049;12196050;18440991;20463768;53660610;57608708;57608709;57608710;57750053;60135501;90927493;91312985 HMDB:(31641);Natural 
Products:(UNPD154562);CHEBI:(33135 52145);HSDB:(123-75-1);Plantcyc:(PYRROLIDINE);Biocyc:(PYRROLIDINE) +MZ:79.0219 | RT:177 | XCMS_group:9 | file:1 | scan:NA IAZDPXIOMUYVGZ InChI=1S/C2H6OS/c1-4(2)3/h1-2H3 C2H6OS 1 -75.82312 MZ:79.0219 | RT:177 | XCMS_group:9 | file:1 | scan:NA CS(=O)C 679;75151;10103116;12206145;12264368;18594457;20151975;21022526;22345572;57247813;71309204;76973052;90811807;90817578 HMDB:(2151);Natural Products:(UNPD148866);CHEBI:(28262);Maconda:(CON00016);HSDB:(67-68-5);KEGG:(C11143);Plantcyc:(DMSO);Biocyc:(DOH-ISO-VAL DMSO) +MZ:79.0219 | RT:184 | XCMS_group:9 | file:1 | scan:NA IAZDPXIOMUYVGZ InChI=1S/C2H6OS/c1-4(2)3/h1-2H3 C2H6OS 1 -86.79175 MZ:79.0219 | RT:184 | XCMS_group:9 | file:1 | scan:NA CS(=O)C 679;75151;10103116;12206145;12264368;18594457;20151975;21022526;22345572;57247813;71309204;76973052;90811807;90817578 HMDB:(2151);Natural Products:(UNPD148866);CHEBI:(28262);Maconda:(CON00016);HSDB:(67-68-5);KEGG:(C11143);Plantcyc:(DMSO);Biocyc:(DOH-ISO-VAL DMSO) +MZ:79.0219 | RT:212 | XCMS_group:9 | file:1 | scan:NA IAZDPXIOMUYVGZ InChI=1S/C2H6OS/c1-4(2)3/h1-2H3 C2H6OS 1 -75.67854 MZ:79.0219 | RT:212 | XCMS_group:9 | file:1 | scan:NA CS(=O)C 679;75151;10103116;12206145;12264368;18594457;20151975;21022526;22345572;57247813;71309204;76973052;90811807;90817578 HMDB:(2151);Natural Products:(UNPD148866);CHEBI:(28262);Maconda:(CON00016);HSDB:(67-68-5);KEGG:(C11143);Plantcyc:(DMSO);Biocyc:(DOH-ISO-VAL DMSO) diff -r 000000000000 -r 9e6bf7278257 test-data/historic.tsv --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/historic.tsv Wed Feb 05 10:41:48 2020 -0500 @@ -0,0 +1,3 @@ +"name" "source" "experimentName" "confidence" "inchikey2D" "inchi" "molecularFormula" "rank" "score" "name" "smiles" "xlogp" "pubchemids" "links" +"19" "2_tmpspec" "" "0.0" "RWRDLPDLKQPQOW" "InChI=1S/C4H9N/c1-2-4-5-3-1/h5H,1-4H2" "C4H9N" "1" "-136.14546214244544" "19" "C1CCNC1" "" 
"31268;3613359;11062297;12196044;12196046;12196049;12196050;18440991;20463768;53660610;57608708;57608709;57608710;57750053;60135501;90927493;91312985" "HMDB:(31641);Natural Products:(UNPD154562);CHEBI:(33135 52145);HSDB:(123-75-1);Plantcyc:(PYRROLIDINE);Biocyc:(PYRROLIDINE)" +"19" "4_tmpspec" "" "0.0" "IAZDPXIOMUYVGZ" "InChI=1S/C2H6OS/c1-4(2)3/h1-2H3" "C2H6OS" "1" "-86.79174845072117" "19" "CS(=O)C" "" "679;75151;10103116;12206145;12264368;18594457;20151975;21022526;22345572;57247813;71309204;76973052;90811807;90817578" "HMDB:(2151);Natural Products:(UNPD148866);CHEBI:(28262);Maconda:(CON00016);HSDB:(67-68-5);KEGG:(C11143);Plantcyc:(DMSO);Biocyc:(DOH-ISO-VAL DMSO)" diff -r 000000000000 -r 9e6bf7278257 test-data/historic_input.msp --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/historic_input.msp Wed Feb 05 10:41:48 2020 -0500 @@ -0,0 +1,61 @@ +NAME: 1 +PRECURSORMZ: 70.0658950805664 +Comment: +Num Peaks: 8 +50.4781379699707 3487.4296875 +51.0193099975586 3390.96948242188 +53.0031509399414 10011.958984375 +53.5898513793945 4252.7880859375 +54.3787727355957 3541.5107421875 +69.0455169677734 9650.0107421875 +70.0660934448242 37168.609375 +82.9910659790039 4077.36694335938 + +NAME: 2 +PRECURSORMZ: 72.0815277099609 +Comment: +Num Peaks: 6 +51.773567199707 818.313903808594 +54.0346794128418 1247.91137695312 +54.6847991943359 967.616882324219 +56.050350189209 1780.01037597656 +58.4994125366211 975.196228027344 +72.0814056396484 1660.50390625 + +NAME: 5 +PRECURSORMZ: 76.0400390625 +Comment: +Num Peaks: 4 +53.2376174926758 3224.35571289062 +60.3291244506836 3193.19482421875 +73.7529830932617 3305.61401367188 +82.5309600830078 2965.41772460938 + +NAME: 9 +PRECURSORMZ: 79.0218811035156 +Comment: +Num Peaks: 5 +59.1125831604004 67799.1953125 +59.9673652648926 345613.1875 +62.9906845092773 117693.296875 +63.9986686706543 1585970.25 +80.5974655151367 66719.4609375 + +NAME: 11 +PRECURSORMZ: 79.9903564453125 +Comment: +Num Peaks: 3 +51.6917915344238 584.212829589844 
+53.0398750305176 649.807922363281 +97.3154754638672 596.341003417969 + +NAME: 19 +PRECURSORMZ: 86.0606536865234 +Comment: +Num Peaks: 5 +53.0031509399414 29580.330078125 +55.3490409851074 4989.64990234375 +61.990592956543 4089.9619140625 +63.2290992736816 4168.97412109375 +67.6647109985352 5392.48779296875 + diff -r 000000000000 -r 9e6bf7278257 test-data/invalid_adduct.msp --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/invalid_adduct.msp Wed Feb 05 10:41:48 2020 -0500 @@ -0,0 +1,13 @@ +NAME: MZ:70.0659 | RT:1483 | XCMS_group:1 | file:1 | scan:NA | pid:NA +PRECURSORMZ: 70.0658950805664 +ADDUCT: [M+INVALID_ADDUCT]+ +Comment: +Num Peaks: 8 +50.4781379699707 3487.4296875 4.61 +51.0193099975586 3390.96948242188 4.49 +53.0031509399414 10011.958984375 13.25 +53.5898513793945 4252.7880859375 5.63 +54.3787727355957 3541.5107421875 4.69 +69.0455169677734 9650.0107421875 12.77 +70.0660934448242 37168.609375 49.18 +82.9910659790039 4077.36694335938 5.39 diff -r 000000000000 -r 9e6bf7278257 test-data/invalid_adduct_result.tsv diff -r 000000000000 -r 9e6bf7278257 test-data/sirus_csifingerid_test1.tsv --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/sirus_csifingerid_test1.tsv Wed Feb 05 10:41:48 2020 -0500 @@ -0,0 +1,4 @@ +UID InChIkey2D InChI molecularFormula Rank Score Name smiles xlogp pubchemids links +2 RWRDLPDLKQPQOW InChI=1S/C4H9N/c1-2-4-5-3-1/h5H,1-4H2 C4H9N 1 -136.14546214244544 Azolidine C1CCNC1 31268;3613359;11062297;12196044;12196046;12196049;12196050;18440991;20463768;53660610;57608708;57608709;57608710;57750053;60135501;90927493;91312985 HMDB:(31641);Natural Products:(UNPD154562);CHEBI:(33135 52145);HSDB:(123-75-1);Plantcyc:(PYRROLIDINE);Biocyc:(PYRROLIDINE) +UID InChIkey2D InChI molecularFormula Rank Score Name smiles xlogp pubchemids links +9 IAZDPXIOMUYVGZ InChI=1S/C2H6OS/c1-4(2)3/h1-2H3 C2H6OS 1 -86.79174845072117 Demasorb CS(=O)C 
679;75151;10103116;12206145;12264368;18594457;20151975;21022526;22345572;57247813;71309204;76973052;90811807;90817578 HMDB:(2151);Natural Products:(UNPD148866);CHEBI:(28262);Maconda:(CON00016);HSDB:(67-68-5);KEGG:(C11143);Plantcyc:(DMSO);Biocyc:(DOH-ISO-VAL DMSO)