diff sirius_csifingerid.py @ 0:9e6bf7278257 draft

"planemo upload for repository https://github.com/computational-metabolomics/sirius_csifingerid_galaxy commit 1d1b37a070f895c94069819237199c768da27258"
author computational-metabolomics
date Wed, 05 Feb 2020 10:41:48 -0500
parents
children 856b3761277d
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/sirius_csifingerid.py	Wed Feb 05 10:41:48 2020 -0500
@@ -0,0 +1,328 @@
+from __future__ import absolute_import, print_function
+
+import argparse
+import csv
+import glob
+import multiprocessing
+import os
+import re
+import sys
+import tempfile
+import uuid
+from collections import defaultdict
+
+import six
+
+parser = argparse.ArgumentParser()
+parser.add_argument('--input_pth')
+parser.add_argument('--result_pth')
+parser.add_argument('--database')
+parser.add_argument('--profile')
+parser.add_argument('--candidates')
+parser.add_argument('--ppm_max')
+parser.add_argument('--polarity')
+parser.add_argument('--results_name')
+parser.add_argument('--out_dir')
+parser.add_argument('--tool_directory')
+parser.add_argument('--temp_dir')
+
+parser.add_argument('--meta_select_col', default='all')
+parser.add_argument('--cores_top_level', default=1)
+parser.add_argument('--chunks', default=1)
+parser.add_argument('--minMSMSpeaks', default=1)
+parser.add_argument('--schema', default='msp')
+args = parser.parse_args()
+print(args)
+if os.stat(args.input_pth).st_size == 0:
+    print('Input file empty')
+    exit()
+
+if args.temp_dir:
+    wd = os.path.join(args.temp_dir, 'temp')
+    os.mkdir(wd)
+
+    if not os.path.exists(wd):
+        os.mkdir(wd)
+else:
+    td = tempfile.mkdtemp()
+    wd = os.path.join(td, str(uuid.uuid4()))
+    os.mkdir(wd)
+
+######################################################################
+# Setup regular expressions for MSP parsing dictionary
+######################################################################
+regex_msp = {}
+regex_msp['name'] = [r'^Name(?:=|:)(.*)$']
+regex_msp['polarity'] = [r'^ion.*mode(?:=|:)(.*)$',
+                         r'^ionization.*mode(?:=|:)(.*)$',
+                         r'^polarity(?:=|:)(.*)$']
+regex_msp['precursor_mz'] = [r'^precursor.*m/z(?:=|:)\s*(\d*[.,]?\d*)$',
+                             r'^precursor.*mz(?:=|:)\s*(\d*[.,]?\d*)$']
+regex_msp['precursor_type'] = [r'^precursor.*type(?:=|:)(.*)$',
+                               r'^adduct(?:=|:)(.*)$',
+                               r'^ADDUCTIONNAME(?:=|:)(.*)$']
+regex_msp['num_peaks'] = [r'^Num.*Peaks(?:=|:)\s*(\d*)$']
+regex_msp['msp'] = [r'^Name(?:=|:)(.*)$']  # Flag for standard MSP format
+
+regex_massbank = {}
+regex_massbank['name'] = [r'^RECORD_TITLE:(.*)$']
+regex_massbank['polarity'] = \
+    [r'^AC\$MASS_SPECTROMETRY:\s+ION_MODE\s+(.*)$']
+regex_massbank['precursor_mz'] = \
+    [r'^MS\$FOCUSED_ION:\s+PRECURSOR_M/Z\s+(\d*[.,]?\d*)$']
+regex_massbank['precursor_type'] = \
+    [r'^MS\$FOCUSED_ION:\s+PRECURSOR_TYPE\s+(.*)$']
+regex_massbank['num_peaks'] = [r'^PK\$NUM_PEAK:\s+(\d*)']
+regex_massbank['cols'] = [r'^PK\$PEAK:\s+(.*)']
+regex_massbank['massbank'] = [r'^RECORD_TITLE:(.*)$']  # Flag for massbank
+
+if args.schema == 'msp':
+    meta_regex = regex_msp
+elif args.schema == 'massbank':
+    meta_regex = regex_massbank
+elif args.schema == 'auto':
+    # If auto we just check for all the available paramter names
+    # and then determine if Massbank or MSP based on
+    # the name parameter
+    meta_regex = {}
+    meta_regex.update(regex_massbank)
+    meta_regex['name'].extend(regex_msp['name'])
+    meta_regex['polarity'].extend(regex_msp['polarity'])
+    meta_regex['precursor_mz'].extend(regex_msp['precursor_mz'])
+    meta_regex['precursor_type'].extend(regex_msp['precursor_type'])
+    meta_regex['num_peaks'].extend(regex_msp['num_peaks'])
+    meta_regex['msp'] = regex_msp['msp']
+
+    print(meta_regex)
+
+# this dictionary will store the meta data results form the MSp file
+meta_info = {}
+
+
+# function to extract the meta data using the regular expressions
+def parse_meta(meta_regex, meta_info=None):
+    if meta_info is None:
+        meta_info = {}
+    for k, regexes in six.iteritems(meta_regex):
+        for reg in regexes:
+            m = re.search(reg, line, re.IGNORECASE)
+            if m:
+                meta_info[k] = '-'.join(m.groups()).strip()
+    return meta_info
+
+
+######################################################################
+# Setup parameter dictionary
+######################################################################
+def init_paramd(args):
+    paramd = defaultdict()
+    paramd["cli"] = {}
+    paramd["cli"]["--database"] = args.database
+    paramd["cli"]["--profile"] = args.profile
+    paramd["cli"]["--candidates"] = args.candidates
+    paramd["cli"]["--ppm-max"] = args.ppm_max
+    if args.polarity == 'positive':
+        paramd["default_ion"] = "[M+H]+"
+    elif args.polarity == 'negative':
+        paramd["default_ion"] = "[M-H]-"
+    else:
+        paramd["default_ion"] = ''
+
+    return paramd
+
+
+######################################################################
+# Function to run sirius when all meta and spectra is obtained
+######################################################################
+def run_sirius(meta_info, peaklist, args, wd, spectrac):
+    # Get sample details (if possible to extract) e.g. if created as part of
+    # the msPurity pipeline) choose between getting additional details to
+    # add as columns as either all meta data from msp, just details from the
+    # record name (i.e. when using msPurity and we have the columns
+    # coded into the name) or just the spectra index (spectrac)
+    paramd = init_paramd(args)
+
+    if args.meta_select_col == 'name':
+        # have additional column of just the name
+        paramd['additional_details'] = {'name': meta_info['name']}
+    elif args.meta_select_col == 'name_split':
+        # have additional columns split by "|" and
+        # then on ":" e.g. MZ:100.2 | RT:20 | xcms_grp_id:1
+        paramd['additional_details'] = {
+            sm.split(":")[0].strip(): sm.split(":")[1].strip() for sm in
+            meta_info['name'].split("|")}
+    elif args.meta_select_col == 'all':
+        # have additional columns based on all
+        # the meta information extracted from the MSP
+        paramd['additional_details'] = meta_info
+    else:
+        # Just have and index of the spectra in the MSP file
+        paramd['additional_details'] = {'spectra_idx': spectrac}
+
+    paramd["SampleName"] = "{}_sirius_result".format(spectrac)
+
+    paramd["cli"]["--output"] = \
+        os.path.join(wd, "{}_sirius_result".format(spectrac))
+
+    # =============== Output peaks to txt file  ==============================
+    paramd["cli"]["--ms2"] = os.path.join(wd,
+                                          "{}_tmpspec.txt".format(spectrac))
+
+    # write spec file
+    with open(paramd["cli"]["--ms2"], 'w') as outfile:
+        for p in peaklist:
+            outfile.write(p[0] + "\t" + p[1] + "\n")
+
+    # =============== Update param based on MSP metadata ======================
+    # Replace param details with details from MSP if required
+    if 'precursor_type' in meta_info and meta_info['precursor_type']:
+        paramd["cli"]["--ion"] = meta_info['precursor_type']
+    else:
+        if paramd["default_ion"]:
+            paramd["cli"]["--ion"] = paramd["default_ion"]
+        else:
+            paramd["cli"]["--auto-charge"] = ''
+
+    if 'precursor_mz' in meta_info and meta_info['precursor_mz']:
+        paramd["cli"]["--precursor"] = meta_info['precursor_mz']
+
+    # ============== Create CLI cmd for metfrag ===============================
+    cmd = "sirius --fingerid"
+    for k, v in six.iteritems(paramd["cli"]):
+        cmd += " {} {}".format(str(k), str(v))
+    paramds[paramd["SampleName"]] = paramd
+
+    # =============== Run srius ==============================================
+    # Filter before process with a minimum number of MS/MS peaks
+    if plinesread >= float(args.minMSMSpeaks):
+
+        if int(args.cores_top_level) == 1:
+            os.system(cmd)
+
+    return paramd, cmd
+
+
+def work(cmds):
+    return [os.system(cmd) for cmd in cmds]
+
+
+######################################################################
+# Parse MSP file and run SIRIUS CLI
+######################################################################
+# keep list of commands if performing in CLI in parallel
+cmds = []
+# keep a dictionary of all params
+paramds = {}
+# keep count of spectra (for uid)
+spectrac = 0
+
+with open(args.input_pth, "r") as infile:
+    # number of lines for the peaks
+    pnumlines = 0
+    # number of lines read for the peaks
+    plinesread = 0
+    for line in infile:
+
+        line = line.strip()
+
+        if pnumlines == 0:
+
+            # ============== Extract metadata from MSP ========================
+            meta_info = parse_meta(meta_regex, meta_info)
+
+            if ('massbank' in meta_info and 'cols' in meta_info) or \
+                    ('msp' in meta_info and 'num_peaks' in meta_info):
+                pnumlines = int(meta_info['num_peaks'])
+                peaklist = []
+                plinesread = 0
+
+        elif plinesread < pnumlines:
+            # =============== Extract peaks from MSP ==========================
+            # .split() will split on any empty space (i.e. tab and space)
+            line = tuple(line.split())
+            # Keep only m/z and intensity, not relative intensity
+            save_line = tuple(line[0].split() + line[1].split())
+            plinesread += 1
+
+            peaklist.append(save_line)
+
+        elif plinesread and plinesread == pnumlines:
+            # ======= Get sample name and additional details for output =======
+            spectrac += 1
+            paramd, cmd = run_sirius(meta_info, peaklist, args, wd, spectrac)
+
+            paramds[paramd["SampleName"]] = paramd
+            cmds.append(cmd)
+
+            meta_info = {}
+            pnumlines = 0
+            plinesread = 0
+
+            # end of file. Check if there is a MSP spectra to
+            # run metfrag on still
+
+    if plinesread and plinesread == pnumlines:
+        paramd, cmd = run_sirius(meta_info, peaklist, args, wd, spectrac + 1)
+
+        paramds[paramd["SampleName"]] = paramd
+        cmds.append(cmd)
+
+# Perform multiprocessing on command line call level
+if int(args.cores_top_level) > 1:
+    cmds_chunks = [cmds[x:x + int(args.chunks)]
+                   for x in list(range(0, len(cmds), int(args.chunks)))]
+    pool = multiprocessing.Pool(processes=int(args.cores_top_level))
+    pool.map(work, cmds_chunks)
+    pool.close()
+    pool.join()
+
+######################################################################
+# Concatenate and filter the output
+######################################################################
+# outputs might have different headers. Need to get a list of all the headers
+# before we start merging the files outfiles = [os.path.join(wd, f) for f in
+# glob.glob(os.path.join(wd, "*_metfrag_result.csv"))]
+outfiles = glob.glob(os.path.join(wd, '*', '*', 'summary_csi_fingerid.csv'))
+
+# sort files nicely
+outfiles.sort(key=lambda s: int(re.match(r'^.*/('
+                                         r'\d+).*/.*/summary_csi_fingerid.csv',
+                                         s).group(1)))
+print(outfiles)
+
+if len(outfiles) == 0:
+    print('No results')
+    sys.exit()
+
+headers = []
+c = 0
+for fn in outfiles:
+    with open(fn, 'r') as infile:
+        reader = csv.reader(infile, delimiter='\t')
+        if sys.version_info >= (3, 0):
+            headers.extend(next(reader))
+        else:
+            headers.extend(reader.next())
+        break
+
+headers = list(paramd['additional_details'].keys()) + headers
+
+with open(args.result_pth, 'a') as merged_outfile:
+    dwriter = csv.DictWriter(merged_outfile,
+                             fieldnames=headers, delimiter='\t')
+    dwriter.writeheader()
+
+    for fn in sorted(outfiles):
+        print(fn)
+
+        with open(fn) as infile:
+            reader = csv.DictReader(infile, delimiter='\t')
+
+            ad = paramds[fn.split(os.sep)[-3]]['additional_details']
+
+            for line in reader:
+                line.update(ad)
+                # round score to 5 d.p.
+                line['score'] = round(float(line['score']), 5)
+
+                dwriter.writerow(line)