Mercurial > repos > computational-metabolomics > metfrag

diff metfrag.py @ 2:f151ee133612 draft default tip
"planemo upload for repository https://github.com/computational-metabolomics/metfrag-galaxy commit b337c6296968848e3214f4b51df3d86776f84b6a"
author: computational-metabolomics
date: Tue, 14 Jul 2020 07:41:53 -0400
parents: 9ee2e2ceb2c9
--- a/metfrag.py	Thu Mar 19 06:04:18 2020 -0400
+++ b/metfrag.py	Tue Jul 14 07:41:53 2020 -0400
@@ -1,6 +1,11 @@
 from __future__ import absolute_import, print_function
 
-import ConfigParser
+try:
+    from configparser import ConfigParser
+except ImportError as e:
+    print(e)
+    from ConfigParser import ConfigParser
+
 import argparse
 import csv
 import glob
@@ -14,137 +19,6 @@
 
 import six
 
-print(sys.version)
-
-parser = argparse.ArgumentParser()
-parser.add_argument('--input_pth')
-parser.add_argument('--result_pth', default='metfrag_result.csv')
-
-parser.add_argument('--temp_dir')
-parser.add_argument('--polarity', default='pos')
-parser.add_argument('--minMSMSpeaks', default=1)
-
-parser.add_argument('--MetFragDatabaseType', default='PubChem')
-parser.add_argument('--LocalDatabasePath', default='')
-parser.add_argument('--LocalMetChemDatabaseServerIp', default='')
-
-parser.add_argument('--DatabaseSearchRelativeMassDeviation', default=5)
-parser.add_argument('--FragmentPeakMatchRelativeMassDeviation', default=10)
-parser.add_argument('--FragmentPeakMatchAbsoluteMassDeviation', default=0.001)
-parser.add_argument('--NumberThreads', default=1)
-parser.add_argument('--UnconnectedCompoundFilter', action='store_true')
-parser.add_argument('--IsotopeFilter', action='store_true')
-
-parser.add_argument('--FilterMinimumElements', default='')
-parser.add_argument('--FilterMaximumElements', default='')
-parser.add_argument('--FilterSmartsInclusionList', default='')
-parser.add_argument('--FilterSmartsExclusionList', default='')
-parser.add_argument('--FilterIncludedElements', default='')
-parser.add_argument('--FilterExcludedElements', default='')
-parser.add_argument('--FilterIncludedExclusiveElements', default='')
-
-parser.add_argument('--score_thrshld', default=0)
-parser.add_argument('--pctexplpeak_thrshld', default=0)
-parser.add_argument('--schema')
-parser.add_argument('--cores_top_level', default=1)
-parser.add_argument('--chunks', default=1)
-parser.add_argument('--meta_select_col', default='name')
-parser.add_argument('--skip_invalid_adducts', action='store_true')
-
-parser.add_argument('--ScoreSuspectLists', default='')
-parser.add_argument('--MetFragScoreTypes',
-                    default="FragmenterScore,OfflineMetFusionScore")
-parser.add_argument('--MetFragScoreWeights', default="1.0,1.0")
-
-args = parser.parse_args()
-print(args)
-
-config = ConfigParser.ConfigParser()
-config.read(
-    os.path.join(os.path.dirname(os.path.abspath(__file__)), 'config.ini'))
-
-if os.stat(args.input_pth).st_size == 0:
-    print('Input file empty')
-    exit()
-
-# Create temporary working directory
-if args.temp_dir:
-    wd = args.temp_dir
-else:
-    wd = tempfile.mkdtemp()
-
-if os.path.exists(wd):
-    shutil.rmtree(wd)
-    os.makedirs(wd)
-else:
-    os.makedirs(wd)
-
-######################################################################
-# Setup regular expressions for MSP parsing dictionary
-######################################################################
-regex_msp = {}
-regex_msp['name'] = [r'^Name(?:=|:)(.*)$']
-regex_msp['polarity'] = [r'^ion.*mode(?:=|:)(.*)$',
-                         r'^ionization.*mode(?:=|:)(.*)$',
-                         r'^polarity(?:=|:)(.*)$']
-regex_msp['precursor_mz'] = [r'^precursor.*m/z(?:=|:)\s*(\d*[.,]?\d*)$',
-                             r'^precursor.*mz(?:=|:)\s*(\d*[.,]?\d*)$']
-regex_msp['precursor_type'] = [r'^precursor.*type(?:=|:)(.*)$',
-                               r'^adduct(?:=|:)(.*)$',
-                               r'^ADDUCTIONNAME(?:=|:)(.*)$']
-regex_msp['num_peaks'] = [r'^Num.*Peaks(?:=|:)\s*(\d*)$']
-regex_msp['msp'] = [r'^Name(?:=|:)(.*)$']  # Flag for standard MSP format
-
-regex_massbank = {}
-regex_massbank['name'] = [r'^RECORD_TITLE:(.*)$']
-regex_massbank['polarity'] = [r'^AC\$MASS_SPECTROMETRY:\s+ION_MODE\s+(.*)$']
-regex_massbank['precursor_mz'] = [
-    r'^MS\$FOCUSED_ION:\s+PRECURSOR_M/Z\s+(\d*[.,]?\d*)$']
-regex_massbank['precursor_type'] = [
-    r'^MS\$FOCUSED_ION:\s+PRECURSOR_TYPE\s+(.*)$']
-regex_massbank['num_peaks'] = [r'^PK\$NUM_PEAK:\s+(\d*)']
-regex_massbank['cols'] = [r'^PK\$PEAK:\s+(.*)']
-regex_massbank['massbank'] = [
-    r'^RECORD_TITLE:(.*)$']  # Flag for massbank format
-
-if args.schema == 'msp':
-    meta_regex = regex_msp
-elif args.schema == 'massbank':
-    meta_regex = regex_massbank
-elif args.schema == 'auto':
-    # If auto we just check for all the available paramter names and then
-    # determine if Massbank or MSP based on the name parameter
-    meta_regex = {}
-    meta_regex.update(regex_massbank)
-    meta_regex['name'].extend(regex_msp['name'])
-    meta_regex['polarity'].extend(regex_msp['polarity'])
-    meta_regex['precursor_mz'].extend(regex_msp['precursor_mz'])
-    meta_regex['precursor_type'].extend(regex_msp['precursor_type'])
-    meta_regex['num_peaks'].extend(regex_msp['num_peaks'])
-    meta_regex['msp'] = regex_msp['msp']
-else:
-    sys.exit("No schema selected")
-
-adduct_types = {
-    '[M+H]+': 1.007276,
-    '[M+NH4]+': 18.034374,
-    '[M+Na]+': 22.989218,
-    '[M+K]+': 38.963158,
-    '[M+CH3OH+H]+': 33.033489,
-    '[M+ACN+H]+': 42.033823,
-    '[M+ACN+Na]+': 64.015765,
-    '[M+2ACN+H]+': 83.06037,
-    '[M-H]-': -1.007276,
-    '[M+Cl]-': 34.969402,
-    '[M+HCOO]-': 44.99819,
-    '[M-H+HCOOH]-': 44.99819,
-    # same as above but different style of writing adduct
-    '[M+CH3COO]-': 59.01385,
-    '[M-H+CH3COOH]-': 59.01385
-    # same as above but different style of writing adduct
-}
-inv_adduct_types = {int(round(v, 0)): k for k, v in adduct_types.iteritems()}
-
 
 # function to extract the meta data using the regular expressions
 def parse_meta(meta_regex, meta_info=None):
@@ -158,6 +32,70 @@
     return meta_info
 
 
+def get_meta_regex(schema):
+    ######################################################################
+    # Setup regular expressions for MSP parsing dictionary
+    ######################################################################
+    regex_msp = {}
+    regex_msp['name'] = [r'^Name(?:=|:)(.*)$']
+    regex_msp['polarity'] = [r'^ion.*mode(?:=|:)(.*)$',
+                             r'^ionization.*mode(?:=|:)(.*)$',
+                             r'^polarity(?:=|:)(.*)$']
+    regex_msp['precursor_mz'] = [r'^precursor.*m/z(?:=|:)\s*(\d*[.,]?\d*)$',
+                                 r'^precursor.*mz(?:=|:)\s*(\d*[.,]?\d*)$']
+    regex_msp['precursor_type'] = [r'^precursor.*type(?:=|:)(.*)$',
+                                   r'^adduct(?:=|:)(.*)$',
+                                   r'^ADDUCTIONNAME(?:=|:)(.*)$']
+
+    regex_msp['retention_time'] = [r'^RETENTION.*TIME(?:=|:)\s*(.*)$',
+                                   r'^rt(?:=|:)\s*(.*)$',
+                                   r'^time(?:=|:)\s*(.*)$']
+    # From example winter_pos.mspy from kristian
+    regex_msp['AlignmentID'] = [r'^AlignmentID(?:=|:)\s*(.*)$']
+
+    regex_msp['num_peaks'] = [r'^Num.*Peaks(?:=|:)\s*(\d*)$']
+    regex_msp['msp'] = [r'^Name(?:=|:)(.*)$']  # Flag for standard MSP format
+
+    regex_massbank = {}
+    regex_massbank['name'] = [r'^RECORD_TITLE:(.*)$']
+    regex_massbank['polarity'] = [
+        r'^AC\$MASS_SPECTROMETRY:\s+ION_MODE\s+(.*)$']
+    regex_massbank['precursor_mz'] = [
+        r'^MS\$FOCUSED_ION:\s+PRECURSOR_M/Z\s+(\d*[.,]?\d*)$']
+    regex_massbank['precursor_type'] = [
+        r'^MS\$FOCUSED_ION:\s+PRECURSOR_TYPE\s+(.*)$']
+    regex_massbank['retention_time'] = [
+        r'^AC\$CHROMATOGRAPHY:\s+RETENTION_TIME\s*(\d*\.?\d+).*']
+
+    regex_massbank['num_peaks'] = [r'^PK\$NUM_PEAK:\s+(\d*)']
+    regex_massbank['cols'] = [r'^PK\$PEAK:\s+(.*)']
+    regex_massbank['massbank'] = [
+        r'^RECORD_TITLE:(.*)$']  # Flag for massbank format
+
+    if schema == 'msp':
+        meta_regex = regex_msp
+    elif schema == 'massbank':
+        meta_regex = regex_massbank
+    elif schema == 'auto':
+        # If auto we just check for all the available paramter names and then
+        # determine if Massbank or MSP based on the name parameter
+        meta_regex = {}
+        meta_regex.update(regex_massbank)
+        meta_regex['name'].extend(regex_msp['name'])
+        meta_regex['polarity'].extend(regex_msp['polarity'])
+        meta_regex['precursor_mz'].extend(regex_msp['precursor_mz'])
+        meta_regex['precursor_type'].extend(regex_msp['precursor_type'])
+        meta_regex['num_peaks'].extend(regex_msp['num_peaks'])
+        meta_regex['retention_time'].extend(regex_msp['retention_time'])
+        meta_regex['AlignmentID'] = regex_msp['AlignmentID']
+        meta_regex['msp'] = regex_msp['msp']
+
+    else:
+        sys.exit("No schema selected")
+
+    return meta_regex
+
+
 ######################################################################
 # Setup parameter dictionary
 ######################################################################
@@ -271,6 +209,8 @@
     # record name (i.e. when using msPurity and we have the columns coded into
     # the name) or just the spectra index (spectrac)].
     # Returns the parameters used and the command line call
+    meta_info = {k: v for k, v in meta_info.items() if k
+                 not in ['msp', 'massbank', 'cols']}
 
     paramd = init_paramd(args)
     if args.meta_select_col == 'name':
@@ -287,7 +227,7 @@
         # extracted from the MSP
         paramd['additional_details'] = meta_info
     else:
-        # Just have and index of the spectra in the MSP file
+        # Just have an index of the spectra in the MSP file
         paramd['additional_details'] = {'spectra_idx': spectrac}
 
     paramd["SampleName"] = "{}_metfrag_result".format(spectrac)
@@ -297,9 +237,15 @@
                                           "{}_tmpspec.txt".format(spectrac))
 
     # write spec file
+
     with open(paramd["PeakListPath"], 'w') as outfile:
+        pls = ''
         for p in peaklist:
             outfile.write(p[0] + "\t" + p[1] + "\n")
+            pls = pls + '{}_{};'.format(p[0], p[1])
+
+        if args.output_cl:
+            peaklist_str = pls[:-1]
 
     # =============== Update param based on MSP metadata ======================
     # Replace param details with details from MSP if required
@@ -311,6 +257,8 @@
         paramd["PrecursorIonMode"] = \
             int(round(adduct_types[meta_info['precursor_type']], 0))
     elif not args.skip_invalid_adducts:
+        inv_adduct_types = {int(round(v, 0)): k for k, v in
+                            six.iteritems(adduct_types)}
         adduct = inv_adduct_types[int(paramd['PrecursorIonModeDefault'])]
         paramd["PrecursorIonMode"] = paramd['PrecursorIonModeDefault']
         nm = float(meta_info['precursor_mz']) - paramd['nm_mass_diff_default']
@@ -318,7 +266,10 @@
         print('Skipping {}'.format(paramd["SampleName"]))
         return '', ''
 
-    paramd['additional_details']['adduct'] = adduct
+    if not ('precursor_type' in paramd['additional_details'] or 'adduct'
+            in paramd['additional_details']):
+        paramd['additional_details']['adduct'] = adduct
+
     paramd["NeutralPrecursorMass"] = nm
 
     # ============== Create CLI cmd for metfrag ===============================
@@ -328,6 +279,10 @@
                      'additional_details']:
             cmd += " {}={}".format(str(k), str(v))
 
+    if args.output_cl:
+        cli_str = '{} PeakListString={}'.format(cmd, peaklist_str)
+        paramd['additional_details']['MetFragCLIString'] = cli_str
+
     # ============== Run metfrag ==============================================
     # print(cmd)
     # Filter before process with a minimum number of MS/MS peaks
@@ -343,178 +298,300 @@
     return [os.system(cmd) for cmd in cmds]
 
 
-######################################################################
-# Parse MSP file and run metfrag CLI
-######################################################################
-# keep list of commands if performing in CLI in parallel
-cmds = []
-# keep a dictionary of all params
-paramds = {}
-# keep count of spectra (for uid)
-spectrac = 0
-# this dictionary will store the meta data results form the MSp file
-meta_info = {}
+if __name__ == "__main__":
+    print(sys.version)
+
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--input_pth')
+    parser.add_argument('--result_pth', default='metfrag_result.csv')
+
+    parser.add_argument('--temp_dir')
+    parser.add_argument('--polarity', default='pos')
+    parser.add_argument('--minMSMSpeaks', default=1)
+
+    parser.add_argument('--MetFragDatabaseType', default='PubChem')
+    parser.add_argument('--LocalDatabasePath', default='')
+    parser.add_argument('--LocalMetChemDatabaseServerIp', default='')
+
+    parser.add_argument('--DatabaseSearchRelativeMassDeviation', default=5)
+    parser.add_argument('--FragmentPeakMatchRelativeMassDeviation', default=10)
+    parser.add_argument('--FragmentPeakMatchAbsoluteMassDeviation',
+                        default=0.001)
+    parser.add_argument('--NumberThreads', default=1)
+    parser.add_argument('--UnconnectedCompoundFilter', action='store_true')
+    parser.add_argument('--IsotopeFilter', action='store_true')
+
+    parser.add_argument('--FilterMinimumElements', default='')
+    parser.add_argument('--FilterMaximumElements', default='')
+    parser.add_argument('--FilterSmartsInclusionList', default='')
+    parser.add_argument('--FilterSmartsExclusionList', default='')
+    parser.add_argument('--FilterIncludedElements', default='')
+    parser.add_argument('--FilterExcludedElements', default='')
+    parser.add_argument('--FilterIncludedExclusiveElements', default='')
 
-with open(args.input_pth, "r") as infile:
-    # number of lines for the peaks
-    pnumlines = 0
-    # number of lines read for the peaks
-    plinesread = 0
-    for line in infile:
-        line = line.strip()
+    parser.add_argument('--score_thrshld', default=0)
+    parser.add_argument('--pctexplpeak_thrshld', default=0)
+    parser.add_argument('--schema')
+    parser.add_argument('--cores_top_level', default=1)
+    parser.add_argument('--chunks', default=1)
+    parser.add_argument('--meta_select_col', default='name')
+    parser.add_argument('--skip_invalid_adducts', action='store_true')
+    parser.add_argument('--output_cl', action='store_true')
 
-        if pnumlines == 0:
-            # ============== Extract metadata from MSP ========================
-            meta_info = parse_meta(meta_regex, meta_info)
+    parser.add_argument('--ScoreSuspectLists', default='')
+    parser.add_argument('--MetFragScoreTypes',
+                        default="FragmenterScore,OfflineMetFusionScore")
+    parser.add_argument('--MetFragScoreWeights', default="1.0,1.0")
+    parser.add_argument('-a', '--adducts', action='append', nargs=1,
+                        required=False, default=[], help='Adducts used')
 
-            if ('massbank' in meta_info and 'cols' in meta_info) or (
-                    'msp' in meta_info and 'num_peaks' in meta_info):
-                pnumlines = int(meta_info['num_peaks'])
-                plinesread = 0
-                peaklist = []
+    args = parser.parse_args()
+    print(args)
+    config = ConfigParser()
+    config.read(
+        os.path.join(os.path.dirname(os.path.abspath(__file__)), 'config.ini'))
 
-        elif plinesread < pnumlines:
-            # ============== Extract peaks from MSP ==========================
-            # .split() will split on any empty space (i.e. tab and space)
-            line = tuple(line.split())
-            # Keep only m/z and intensity, not relative intensity
-            save_line = tuple(line[0].split() + line[1].split())
-            plinesread += 1
-            peaklist.append(save_line)
+    if os.stat(args.input_pth).st_size == 0:
+        print('Input file empty')
+        exit()
+
+    # Create temporary working directory
+    if args.temp_dir:
+        wd = args.temp_dir
+    else:
+        wd = tempfile.mkdtemp()
 
-        elif plinesread and plinesread == pnumlines:
-            # ======= Get sample name and additional details for output =======
-            spectrac += 1
-            paramd, cmd = run_metfrag(meta_info, peaklist, args, wd, spectrac,
-                                      adduct_types)
+    if os.path.exists(wd):
+        shutil.rmtree(wd)
+        os.makedirs(wd)
+    else:
+        os.makedirs(wd)
 
-            if paramd:
-                paramds[paramd["SampleName"]] = paramd
-                cmds.append(cmd)
+    meta_regex = get_meta_regex(args.schema)
 
-            meta_info = {}
-            pnumlines = 0
-            plinesread = 0
-
-    # end of file. Check if there is a MSP spectra to run metfrag on still
-    if plinesread and plinesread == pnumlines:
-
-        paramd, cmd = run_metfrag(meta_info, peaklist, args, wd, spectrac + 1,
-                                  adduct_types)
+    adduct_types = {
+        '[M+H]+': 1.007276,
+        '[M+NH4]+': 18.034374,
+        '[M+Na]+': 22.989218,
+        '[M+K]+': 38.963158,
+        '[M+CH3OH+H]+': 33.033489,
+        '[M+ACN+H]+': 42.033823,
+        '[M+ACN+Na]+': 64.015765,
+        '[M+2ACN+H]+': 83.06037,
+        '[M-H]-': -1.007276,
+        '[M+Cl]-': 34.969402,
+        '[M+HCOO]-': 44.99819,
+        '[M-H+HCOOH]-': 44.99819,
+        # same as above but different style of writing adduct
+        '[M+CH3COO]-': 59.01385,
+        '[M-H+CH3COOH]-': 59.01385
+        # same as above but different style of writing adduct
+    }
 
-        if paramd:
-            paramds[paramd["SampleName"]] = paramd
-            cmds.append(cmd)
+    ######################################################################
+    # Parse MSP file and run metfrag CLI
+    ######################################################################
+    # keep list of commands if performing in CLI in parallel
+    cmds = []
+    # keep a dictionary of all params
+    paramds = {}
+    # keep count of spectra (for uid)
+    spectrac = 0
+    # this dictionary will store the meta data results form the MSp file
+    meta_info = {}
 
-# Perform multiprocessing on command line call level
-if int(args.cores_top_level) > 1:
-    cmds_chunks = [cmds[x:x + int(args.chunks)] for x in
-                   list(range(0, len(cmds), int(args.chunks)))]
-    pool = multiprocessing.Pool(processes=int(args.cores_top_level))
-    pool.map(work, cmds_chunks)
-    pool.close()
-    pool.join()
+    if args.adducts:
+        adducts_from_cli = [
+            a[0].replace('__ob__', '[').replace('__cb__', ']') for a in
+            args.adducts
+        ]
+    else:
+        adducts_from_cli = []
 
-######################################################################
-# Concatenate and filter the output
-######################################################################
-# outputs might have different headers. Need to get a list of all the
-# headers before we start merging the files
-# outfiles = [os.path.join(wd, f) for f in glob.glob(os.path.join(wd,
-# "*_metfrag_result.csv"))]
-outfiles = glob.glob(os.path.join(wd, "*_metfrag_result.csv"))
+    with open(args.input_pth, "r") as infile:
+        # number of lines for the peaks
+        pnumlines = 0
+        # number of lines read for the peaks
+        plinesread = 0
+        for line in infile:
+            line = line.strip()
+
+            if pnumlines == 0:
+                # ============== Extract metadata from MSP ====================
+                meta_info = parse_meta(meta_regex, meta_info)
 
-if len(outfiles) == 0:
-    print('No results')
-    sys.exit()
+                if ('massbank' in meta_info and 'cols' in meta_info) or (
+                        'msp' in meta_info and 'num_peaks' in meta_info):
+                    pnumlines = int(meta_info['num_peaks'])
+                    plinesread = 0
+                    peaklist = []
+
+            elif plinesread < pnumlines:
+                # ============== Extract peaks from MSP =======================
+                # .split() will split on any empty space (i.e. tab and space)
+                line = tuple(line.split())
+                # Keep only m/z and intensity, not relative intensity
+                save_line = tuple(line[0].split() + line[1].split())
+                plinesread += 1
+                peaklist.append(save_line)
 
-headers = []
-c = 0
-for fn in outfiles:
-    with open(fn, 'r') as infile:
-        reader = csv.reader(infile)
-        if sys.version_info >= (3, 0):
-            headers.extend(next(reader))
-        else:
-            headers.extend(reader.next())
-        # check if file has any data rows
-        for i, row in enumerate(reader):
-            c += 1
-            if i == 1:
-                break
+            elif plinesread and plinesread == pnumlines:
+                # =Get sample name and additional details for output and RUN ==
+                if adducts_from_cli:
+                    for adduct in adducts_from_cli:
+                        spectrac += 1
+                        meta_info['precursor_type'] = adduct
+                        paramd, cmd = run_metfrag(meta_info, peaklist, args,
+                                                  wd, spectrac, adduct_types)
+                        if paramd:
+                            paramds[paramd["SampleName"]] = paramd
+                            cmds.append(cmd)
+                else:
+                    spectrac += 1
+                    paramd, cmd = run_metfrag(meta_info, peaklist, args,
+                                              wd, spectrac, adduct_types)
+                    if paramd:
+                        paramds[paramd["SampleName"]] = paramd
+                        cmds.append(cmd)
 
-# if no data rows (e.g. matches) then do not save an
-# output and leave the program
-if c == 0:
-    print('No results')
-    sys.exit()
+                meta_info = {}
+                pnumlines = 0
+                plinesread = 0
+
+        # end of file. Check if there is a MSP spectra to run metfrag on
+        if plinesread and plinesread == pnumlines:
 
-additional_detail_headers = ['sample_name']
-for k, paramd in six.iteritems(paramds):
-    additional_detail_headers = list(set(
-        additional_detail_headers + list(paramd['additional_details'].keys())))
-
-# add inchikey if not already present (missing in metchem output)
-if 'InChIKey' not in headers:
-    headers.append('InChIKey')
-
-headers = additional_detail_headers + sorted(list(set(headers)))
+            if adducts_from_cli:
+                for adduct in adducts_from_cli:
+                    spectrac += 1
+                    meta_info['precursor_type'] = adduct
+                    paramd, cmd = run_metfrag(meta_info, peaklist, args,
+                                              wd, spectrac, adduct_types)
+                    if paramd:
+                        paramds[paramd["SampleName"]] = paramd
+                        cmds.append(cmd)
+            else:
+                spectrac += 1
+                paramd, cmd = run_metfrag(meta_info, peaklist, args,
+                                          wd, spectrac, adduct_types)
+                if paramd:
+                    paramds[paramd["SampleName"]] = paramd
+                    cmds.append(cmd)
 
-# Sort files nicely
-outfiles.sort(
-    key=lambda s: int(re.match(r'^.*/(\d+)_metfrag_result.csv', s).group(1)))
-
-print(outfiles)
+    # Perform multiprocessing on command line call level
+    if int(args.cores_top_level) > 1:
+        cmds_chunks = [cmds[x:x + int(args.chunks)] for x in
+                       list(range(0, len(cmds), int(args.chunks)))]
+        pool = multiprocessing.Pool(processes=int(args.cores_top_level))
+        pool.map(work, cmds_chunks)
+        pool.close()
+        pool.join()
 
-# merge outputs
-with open(args.result_pth, 'a') as merged_outfile:
-    dwriter = csv.DictWriter(merged_outfile, fieldnames=headers,
-                             delimiter='\t', quotechar='"')
-    dwriter.writeheader()
+    ######################################################################
+    # Concatenate and filter the output
+    ######################################################################
+    # outputs might have different headers. Need to get a list of all the
+    # headers before we start merging the files
+    # outfiles = [os.path.join(wd, f) for f in glob.glob(os.path.join(wd,
+    # "*_metfrag_result.csv"))]
+    outfiles = glob.glob(os.path.join(wd, "*_metfrag_result.csv"))
 
+    if len(outfiles) == 0:
+        print('No results')
+        sys.exit()
+
+    headers = []
+    c = 0
     for fn in outfiles:
+        with open(fn, 'r') as infile:
+            reader = csv.reader(infile)
+            if sys.version_info >= (3, 0):
+                headers.extend(next(reader))
+            else:
+                headers.extend(reader.next())
+            # check if file has any data rows
+            for i, row in enumerate(reader):
+                c += 1
+                if i == 1:
+                    break
 
-        with open(fn) as infile:
-            reader = csv.DictReader(infile, delimiter=',', quotechar='"')
-            for line in reader:
-                bewrite = True
-                for key, value in line.items():
-                    # Filter when no MS/MS peak matched
-                    if key == "ExplPeaks":
-                        if float(args.pctexplpeak_thrshld) > 0 \
-                                and value and "NA" in value:
-                            bewrite = False
-                    # Filter with a score threshold
-                    elif key == "Score":
-                        if value and float(value) <= float(args.score_thrshld):
+    # if no data rows (e.g. matches) then do not save an
+    # output and leave the program
+    if c == 0:
+        print('No results')
+        sys.exit()
+
+    additional_detail_headers = ['sample_name']
+    for k, paramd in six.iteritems(paramds):
+        additional_detail_headers = list(set(
+            additional_detail_headers + list(
+                paramd['additional_details'].keys())))
+
+    # add inchikey if not already present (missing in metchem output)
+    if 'InChIKey' not in headers:
+        headers.append('InChIKey')
+
+    additional_detail_headers = sorted(additional_detail_headers)
+    headers = additional_detail_headers + sorted(list(set(headers)))
+
+    # Sort files nicely
+    outfiles.sort(
+        key=lambda s: int(
+            re.match(r'^.*/(\d+)_metfrag_result.csv', s).group(1)))
+
+    print(outfiles)
+
+    # merge outputs
+    with open(args.result_pth, 'a') as merged_outfile:
+        dwriter = csv.DictWriter(merged_outfile, fieldnames=headers,
+                                 delimiter='\t', quotechar='"')
+        dwriter.writeheader()
+
+        for fn in outfiles:
+            with open(fn) as infile:
+                reader = csv.DictReader(infile, delimiter=',', quotechar='"')
+                for line in reader:
+                    bewrite = True
+                    for key, value in line.items():
+                        # Filter when no MS/MS peak matched
+                        if key == "ExplPeaks":
+                            if float(args.pctexplpeak_thrshld) > 0 \
+                                    and value and "NA" in value:
+                                bewrite = False
+                        # Filter with a score threshold
+                        elif key == "Score":
+                            if value and float(value) <= float(
+                                    args.score_thrshld):
+                                bewrite = False
+                        elif key == "NoExplPeaks":
+                            nbfindpeak = float(value)
+                        elif key == "NumberPeaksUsed":
+                            totpeaks = float(value)
+                    # Filter with a relative number of peak matched
+                    try:
+                        pctexplpeak = nbfindpeak / totpeaks * 100
+                    except ZeroDivisionError:
+                        bewrite = False
+                    else:
+                        if pctexplpeak < float(args.pctexplpeak_thrshld):
                             bewrite = False
-                    elif key == "NoExplPeaks":
-                        nbfindpeak = float(value)
-                    elif key == "NumberPeaksUsed":
-                        totpeaks = float(value)
-                # Filter with a relative number of peak matched
-                try:
-                    pctexplpeak = nbfindpeak / totpeaks * 100
-                except ZeroDivisionError:
-                    bewrite = False
-                else:
-                    if pctexplpeak < float(args.pctexplpeak_thrshld):
-                        bewrite = False
+
+                    # Write the line if it pass all filters
+                    if bewrite:
+                        bfn = os.path.basename(fn)
+                        bfn = bfn.replace(".csv", "")
+                        line['sample_name'] = paramds[bfn]['SampleName']
+                        ad = paramds[bfn]['additional_details']
 
-                # Write the line if it pass all filters
-                if bewrite:
-                    bfn = os.path.basename(fn)
-                    bfn = bfn.replace(".csv", "")
-                    line['sample_name'] = paramds[bfn]['SampleName']
-                    ad = paramds[bfn]['additional_details']
+                        if args.MetFragDatabaseType == "MetChem":
+                            # for some reason the metchem database option does
+                            # not report the full inchikey (at least in the
+                            # Bham setup. This ensures we always get
+                            # the fully inchikey
+                            line['InChIKey'] = '{}-{}-{}'.format(
+                                line['InChIKey1'],
+                                line['InChIKey2'],
+                                line['InChIKey3'])
 
-                    if args.MetFragDatabaseType == "MetChem":
-                        # for some reason the metchem database option does
-                        # not report the full inchikey (at least in the Bham
-                        # setup. This ensures we always get the fully inchikey
-                        line['InChIKey'] = '{}-{}-{}'.format(line['InChIKey1'],
-                                                             line['InChIKey2'],
-                                                             line['InChIKey3'])
-
-                    line.update(ad)
-                    dwriter.writerow(line)
+                        line.update(ad)
+                        dwriter.writerow(line)
author	computational-metabolomics
date	Tue, 14 Jul 2020 07:41:53 -0400
parents	9ee2e2ceb2c9
children