Mercurial > repos > computational-metabolomics > metfrag
diff metfrag.py @ 2:f151ee133612 draft default tip
"planemo upload for repository https://github.com/computational-metabolomics/metfrag-galaxy commit b337c6296968848e3214f4b51df3d86776f84b6a"
author | computational-metabolomics |
---|---|
date | Tue, 14 Jul 2020 07:41:53 -0400 |
parents | 9ee2e2ceb2c9 |
children |
line wrap: on
line diff
--- a/metfrag.py Thu Mar 19 06:04:18 2020 -0400 +++ b/metfrag.py Tue Jul 14 07:41:53 2020 -0400 @@ -1,6 +1,11 @@ from __future__ import absolute_import, print_function -import ConfigParser +try: + from configparser import ConfigParser +except ImportError as e: + print(e) + from ConfigParser import ConfigParser + import argparse import csv import glob @@ -14,137 +19,6 @@ import six -print(sys.version) - -parser = argparse.ArgumentParser() -parser.add_argument('--input_pth') -parser.add_argument('--result_pth', default='metfrag_result.csv') - -parser.add_argument('--temp_dir') -parser.add_argument('--polarity', default='pos') -parser.add_argument('--minMSMSpeaks', default=1) - -parser.add_argument('--MetFragDatabaseType', default='PubChem') -parser.add_argument('--LocalDatabasePath', default='') -parser.add_argument('--LocalMetChemDatabaseServerIp', default='') - -parser.add_argument('--DatabaseSearchRelativeMassDeviation', default=5) -parser.add_argument('--FragmentPeakMatchRelativeMassDeviation', default=10) -parser.add_argument('--FragmentPeakMatchAbsoluteMassDeviation', default=0.001) -parser.add_argument('--NumberThreads', default=1) -parser.add_argument('--UnconnectedCompoundFilter', action='store_true') -parser.add_argument('--IsotopeFilter', action='store_true') - -parser.add_argument('--FilterMinimumElements', default='') -parser.add_argument('--FilterMaximumElements', default='') -parser.add_argument('--FilterSmartsInclusionList', default='') -parser.add_argument('--FilterSmartsExclusionList', default='') -parser.add_argument('--FilterIncludedElements', default='') -parser.add_argument('--FilterExcludedElements', default='') -parser.add_argument('--FilterIncludedExclusiveElements', default='') - -parser.add_argument('--score_thrshld', default=0) -parser.add_argument('--pctexplpeak_thrshld', default=0) -parser.add_argument('--schema') -parser.add_argument('--cores_top_level', default=1) -parser.add_argument('--chunks', default=1) -parser.add_argument('--meta_select_col', default='name') -parser.add_argument('--skip_invalid_adducts', action='store_true') - -parser.add_argument('--ScoreSuspectLists', default='') -parser.add_argument('--MetFragScoreTypes', - default="FragmenterScore,OfflineMetFusionScore") -parser.add_argument('--MetFragScoreWeights', default="1.0,1.0") - -args = parser.parse_args() -print(args) - -config = ConfigParser.ConfigParser() -config.read( - os.path.join(os.path.dirname(os.path.abspath(__file__)), 'config.ini')) - -if os.stat(args.input_pth).st_size == 0: - print('Input file empty') - exit() - -# Create temporary working directory -if args.temp_dir: - wd = args.temp_dir -else: - wd = tempfile.mkdtemp() - -if os.path.exists(wd): - shutil.rmtree(wd) - os.makedirs(wd) -else: - os.makedirs(wd) - -###################################################################### -# Setup regular expressions for MSP parsing dictionary -###################################################################### -regex_msp = {} -regex_msp['name'] = [r'^Name(?:=|:)(.*)$'] -regex_msp['polarity'] = [r'^ion.*mode(?:=|:)(.*)$', - r'^ionization.*mode(?:=|:)(.*)$', - r'^polarity(?:=|:)(.*)$'] -regex_msp['precursor_mz'] = [r'^precursor.*m/z(?:=|:)\s*(\d*[.,]?\d*)$', - r'^precursor.*mz(?:=|:)\s*(\d*[.,]?\d*)$'] -regex_msp['precursor_type'] = [r'^precursor.*type(?:=|:)(.*)$', - r'^adduct(?:=|:)(.*)$', - r'^ADDUCTIONNAME(?:=|:)(.*)$'] -regex_msp['num_peaks'] = [r'^Num.*Peaks(?:=|:)\s*(\d*)$'] -regex_msp['msp'] = [r'^Name(?:=|:)(.*)$'] # Flag for standard MSP format - -regex_massbank = {} -regex_massbank['name'] = [r'^RECORD_TITLE:(.*)$'] -regex_massbank['polarity'] = [r'^AC\$MASS_SPECTROMETRY:\s+ION_MODE\s+(.*)$'] -regex_massbank['precursor_mz'] = [ - r'^MS\$FOCUSED_ION:\s+PRECURSOR_M/Z\s+(\d*[.,]?\d*)$'] -regex_massbank['precursor_type'] = [ - r'^MS\$FOCUSED_ION:\s+PRECURSOR_TYPE\s+(.*)$'] -regex_massbank['num_peaks'] = [r'^PK\$NUM_PEAK:\s+(\d*)'] -regex_massbank['cols'] = [r'^PK\$PEAK:\s+(.*)'] -regex_massbank['massbank'] = [ - r'^RECORD_TITLE:(.*)$'] # Flag for massbank format - -if args.schema == 'msp': - meta_regex = regex_msp -elif args.schema == 'massbank': - meta_regex = regex_massbank -elif args.schema == 'auto': - # If auto we just check for all the available paramter names and then - # determine if Massbank or MSP based on the name parameter - meta_regex = {} - meta_regex.update(regex_massbank) - meta_regex['name'].extend(regex_msp['name']) - meta_regex['polarity'].extend(regex_msp['polarity']) - meta_regex['precursor_mz'].extend(regex_msp['precursor_mz']) - meta_regex['precursor_type'].extend(regex_msp['precursor_type']) - meta_regex['num_peaks'].extend(regex_msp['num_peaks']) - meta_regex['msp'] = regex_msp['msp'] -else: - sys.exit("No schema selected") - -adduct_types = { - '[M+H]+': 1.007276, - '[M+NH4]+': 18.034374, - '[M+Na]+': 22.989218, - '[M+K]+': 38.963158, - '[M+CH3OH+H]+': 33.033489, - '[M+ACN+H]+': 42.033823, - '[M+ACN+Na]+': 64.015765, - '[M+2ACN+H]+': 83.06037, - '[M-H]-': -1.007276, - '[M+Cl]-': 34.969402, - '[M+HCOO]-': 44.99819, - '[M-H+HCOOH]-': 44.99819, - # same as above but different style of writing adduct - '[M+CH3COO]-': 59.01385, - '[M-H+CH3COOH]-': 59.01385 - # same as above but different style of writing adduct -} -inv_adduct_types = {int(round(v, 0)): k for k, v in adduct_types.iteritems()} - # function to extract the meta data using the regular expressions def parse_meta(meta_regex, meta_info=None): @@ -158,6 +32,70 @@ return meta_info +def get_meta_regex(schema): + ###################################################################### + # Setup regular expressions for MSP parsing dictionary + ###################################################################### + regex_msp = {} + regex_msp['name'] = [r'^Name(?:=|:)(.*)$'] + regex_msp['polarity'] = [r'^ion.*mode(?:=|:)(.*)$', + r'^ionization.*mode(?:=|:)(.*)$', + r'^polarity(?:=|:)(.*)$'] + regex_msp['precursor_mz'] = [r'^precursor.*m/z(?:=|:)\s*(\d*[.,]?\d*)$', + r'^precursor.*mz(?:=|:)\s*(\d*[.,]?\d*)$'] + regex_msp['precursor_type'] = [r'^precursor.*type(?:=|:)(.*)$', + r'^adduct(?:=|:)(.*)$', + r'^ADDUCTIONNAME(?:=|:)(.*)$'] + + regex_msp['retention_time'] = [r'^RETENTION.*TIME(?:=|:)\s*(.*)$', + r'^rt(?:=|:)\s*(.*)$', + r'^time(?:=|:)\s*(.*)$'] + # From example winter_pos.mspy from kristian + regex_msp['AlignmentID'] = [r'^AlignmentID(?:=|:)\s*(.*)$'] + + regex_msp['num_peaks'] = [r'^Num.*Peaks(?:=|:)\s*(\d*)$'] + regex_msp['msp'] = [r'^Name(?:=|:)(.*)$'] # Flag for standard MSP format + + regex_massbank = {} + regex_massbank['name'] = [r'^RECORD_TITLE:(.*)$'] + regex_massbank['polarity'] = [ + r'^AC\$MASS_SPECTROMETRY:\s+ION_MODE\s+(.*)$'] + regex_massbank['precursor_mz'] = [ + r'^MS\$FOCUSED_ION:\s+PRECURSOR_M/Z\s+(\d*[.,]?\d*)$'] + regex_massbank['precursor_type'] = [ + r'^MS\$FOCUSED_ION:\s+PRECURSOR_TYPE\s+(.*)$'] + regex_massbank['retention_time'] = [ + r'^AC\$CHROMATOGRAPHY:\s+RETENTION_TIME\s*(\d*\.?\d+).*'] + + regex_massbank['num_peaks'] = [r'^PK\$NUM_PEAK:\s+(\d*)'] + regex_massbank['cols'] = [r'^PK\$PEAK:\s+(.*)'] + regex_massbank['massbank'] = [ + r'^RECORD_TITLE:(.*)$'] # Flag for massbank format + + if schema == 'msp': + meta_regex = regex_msp + elif schema == 'massbank': + meta_regex = regex_massbank + elif schema == 'auto': + # If auto we just check for all the available paramter names and then + # determine if Massbank or MSP based on the name parameter + meta_regex = {} + meta_regex.update(regex_massbank) + meta_regex['name'].extend(regex_msp['name']) + meta_regex['polarity'].extend(regex_msp['polarity']) + meta_regex['precursor_mz'].extend(regex_msp['precursor_mz']) + meta_regex['precursor_type'].extend(regex_msp['precursor_type']) + meta_regex['num_peaks'].extend(regex_msp['num_peaks']) + meta_regex['retention_time'].extend(regex_msp['retention_time']) + meta_regex['AlignmentID'] = regex_msp['AlignmentID'] + meta_regex['msp'] = regex_msp['msp'] + + else: + sys.exit("No schema selected") + + return meta_regex + + ###################################################################### # Setup parameter dictionary ###################################################################### @@ -271,6 +209,8 @@ # record name (i.e. when using msPurity and we have the columns coded into # the name) or just the spectra index (spectrac)]. # Returns the parameters used and the command line call + meta_info = {k: v for k, v in meta_info.items() if k + not in ['msp', 'massbank', 'cols']} paramd = init_paramd(args) if args.meta_select_col == 'name': @@ -287,7 +227,7 @@ # extracted from the MSP paramd['additional_details'] = meta_info else: - # Just have and index of the spectra in the MSP file + # Just have an index of the spectra in the MSP file paramd['additional_details'] = {'spectra_idx': spectrac} paramd["SampleName"] = "{}_metfrag_result".format(spectrac) @@ -297,9 +237,15 @@ "{}_tmpspec.txt".format(spectrac)) # write spec file + with open(paramd["PeakListPath"], 'w') as outfile: + pls = '' for p in peaklist: outfile.write(p[0] + "\t" + p[1] + "\n") + pls = pls + '{}_{};'.format(p[0], p[1]) + + if args.output_cl: + peaklist_str = pls[:-1] # =============== Update param based on MSP metadata ====================== # Replace param details with details from MSP if required @@ -311,6 +257,8 @@ paramd["PrecursorIonMode"] = \ int(round(adduct_types[meta_info['precursor_type']], 0)) elif not args.skip_invalid_adducts: + inv_adduct_types = {int(round(v, 0)): k for k, v in + six.iteritems(adduct_types)} adduct = inv_adduct_types[int(paramd['PrecursorIonModeDefault'])] paramd["PrecursorIonMode"] = paramd['PrecursorIonModeDefault'] nm = float(meta_info['precursor_mz']) - paramd['nm_mass_diff_default'] @@ -318,7 +266,10 @@ print('Skipping {}'.format(paramd["SampleName"])) return '', '' - paramd['additional_details']['adduct'] = adduct + if not ('precursor_type' in paramd['additional_details'] or 'adduct' + in paramd['additional_details']): + paramd['additional_details']['adduct'] = adduct + paramd["NeutralPrecursorMass"] = nm # ============== Create CLI cmd for metfrag =============================== @@ -328,6 +279,10 @@ 'additional_details']: cmd += " {}={}".format(str(k), str(v)) + if args.output_cl: + cli_str = '{} PeakListString={}'.format(cmd, peaklist_str) + paramd['additional_details']['MetFragCLIString'] = cli_str + # ============== Run metfrag ============================================== # print(cmd) # Filter before process with a minimum number of MS/MS peaks @@ -343,178 +298,300 @@ return [os.system(cmd) for cmd in cmds] -###################################################################### -# Parse MSP file and run metfrag CLI -###################################################################### -# keep list of commands if performing in CLI in parallel -cmds = [] -# keep a dictionary of all params -paramds = {} -# keep count of spectra (for uid) -spectrac = 0 -# this dictionary will store the meta data results form the MSp file -meta_info = {} +if __name__ == "__main__": + print(sys.version) + + parser = argparse.ArgumentParser() + parser.add_argument('--input_pth') + parser.add_argument('--result_pth', default='metfrag_result.csv') + + parser.add_argument('--temp_dir') + parser.add_argument('--polarity', default='pos') + parser.add_argument('--minMSMSpeaks', default=1) + + parser.add_argument('--MetFragDatabaseType', default='PubChem') + parser.add_argument('--LocalDatabasePath', default='') + parser.add_argument('--LocalMetChemDatabaseServerIp', default='') + + parser.add_argument('--DatabaseSearchRelativeMassDeviation', default=5) + parser.add_argument('--FragmentPeakMatchRelativeMassDeviation', default=10) + parser.add_argument('--FragmentPeakMatchAbsoluteMassDeviation', + default=0.001) + parser.add_argument('--NumberThreads', default=1) + parser.add_argument('--UnconnectedCompoundFilter', action='store_true') + parser.add_argument('--IsotopeFilter', action='store_true') + + parser.add_argument('--FilterMinimumElements', default='') + parser.add_argument('--FilterMaximumElements', default='') + parser.add_argument('--FilterSmartsInclusionList', default='') + parser.add_argument('--FilterSmartsExclusionList', default='') + parser.add_argument('--FilterIncludedElements', default='') + parser.add_argument('--FilterExcludedElements', default='') + parser.add_argument('--FilterIncludedExclusiveElements', default='') -with open(args.input_pth, "r") as infile: - # number of lines for the peaks - pnumlines = 0 - # number of lines read for the peaks - plinesread = 0 - for line in infile: - line = line.strip() + parser.add_argument('--score_thrshld', default=0) + parser.add_argument('--pctexplpeak_thrshld', default=0) + parser.add_argument('--schema') + parser.add_argument('--cores_top_level', default=1) + parser.add_argument('--chunks', default=1) + parser.add_argument('--meta_select_col', default='name') + parser.add_argument('--skip_invalid_adducts', action='store_true') + parser.add_argument('--output_cl', action='store_true') - if pnumlines == 0: - # ============== Extract metadata from MSP ======================== - meta_info = parse_meta(meta_regex, meta_info) + parser.add_argument('--ScoreSuspectLists', default='') + parser.add_argument('--MetFragScoreTypes', + default="FragmenterScore,OfflineMetFusionScore") + parser.add_argument('--MetFragScoreWeights', default="1.0,1.0") + parser.add_argument('-a', '--adducts', action='append', nargs=1, + required=False, default=[], help='Adducts used') - if ('massbank' in meta_info and 'cols' in meta_info) or ( - 'msp' in meta_info and 'num_peaks' in meta_info): - pnumlines = int(meta_info['num_peaks']) - plinesread = 0 - peaklist = [] + args = parser.parse_args() + print(args) + config = ConfigParser() + config.read( + os.path.join(os.path.dirname(os.path.abspath(__file__)), 'config.ini')) - elif plinesread < pnumlines: - # ============== Extract peaks from MSP ========================== - # .split() will split on any empty space (i.e. tab and space) - line = tuple(line.split()) - # Keep only m/z and intensity, not relative intensity - save_line = tuple(line[0].split() + line[1].split()) - plinesread += 1 - peaklist.append(save_line) + if os.stat(args.input_pth).st_size == 0: + print('Input file empty') + exit() + + # Create temporary working directory + if args.temp_dir: + wd = args.temp_dir + else: + wd = tempfile.mkdtemp() - elif plinesread and plinesread == pnumlines: - # ======= Get sample name and additional details for output ======= - spectrac += 1 - paramd, cmd = run_metfrag(meta_info, peaklist, args, wd, spectrac, - adduct_types) + if os.path.exists(wd): + shutil.rmtree(wd) + os.makedirs(wd) + else: + os.makedirs(wd) - if paramd: - paramds[paramd["SampleName"]] = paramd - cmds.append(cmd) + meta_regex = get_meta_regex(args.schema) - meta_info = {} - pnumlines = 0 - plinesread = 0 - - # end of file. Check if there is a MSP spectra to run metfrag on still - if plinesread and plinesread == pnumlines: - - paramd, cmd = run_metfrag(meta_info, peaklist, args, wd, spectrac + 1, - adduct_types) + adduct_types = { + '[M+H]+': 1.007276, + '[M+NH4]+': 18.034374, + '[M+Na]+': 22.989218, + '[M+K]+': 38.963158, + '[M+CH3OH+H]+': 33.033489, + '[M+ACN+H]+': 42.033823, + '[M+ACN+Na]+': 64.015765, + '[M+2ACN+H]+': 83.06037, + '[M-H]-': -1.007276, + '[M+Cl]-': 34.969402, + '[M+HCOO]-': 44.99819, + '[M-H+HCOOH]-': 44.99819, + # same as above but different style of writing adduct + '[M+CH3COO]-': 59.01385, + '[M-H+CH3COOH]-': 59.01385 + # same as above but different style of writing adduct + } - if paramd: - paramds[paramd["SampleName"]] = paramd - cmds.append(cmd) + ###################################################################### + # Parse MSP file and run metfrag CLI + ###################################################################### + # keep list of commands if performing in CLI in parallel + cmds = [] + # keep a dictionary of all params + paramds = {} + # keep count of spectra (for uid) + spectrac = 0 + # this dictionary will store the meta data results form the MSp file + meta_info = {} -# Perform multiprocessing on command line call level -if int(args.cores_top_level) > 1: - cmds_chunks = [cmds[x:x + int(args.chunks)] for x in - list(range(0, len(cmds), int(args.chunks)))] - pool = multiprocessing.Pool(processes=int(args.cores_top_level)) - pool.map(work, cmds_chunks) - pool.close() - pool.join() + if args.adducts: + adducts_from_cli = [ + a[0].replace('__ob__', '[').replace('__cb__', ']') for a in + args.adducts + ] + else: + adducts_from_cli = [] -###################################################################### -# Concatenate and filter the output -###################################################################### -# outputs might have different headers. Need to get a list of all the -# headers before we start merging the files -# outfiles = [os.path.join(wd, f) for f in glob.glob(os.path.join(wd, -# "*_metfrag_result.csv"))] -outfiles = glob.glob(os.path.join(wd, "*_metfrag_result.csv")) + with open(args.input_pth, "r") as infile: + # number of lines for the peaks + pnumlines = 0 + # number of lines read for the peaks + plinesread = 0 + for line in infile: + line = line.strip() + + if pnumlines == 0: + # ============== Extract metadata from MSP ==================== + meta_info = parse_meta(meta_regex, meta_info) -if len(outfiles) == 0: - print('No results') - sys.exit() + if ('massbank' in meta_info and 'cols' in meta_info) or ( + 'msp' in meta_info and 'num_peaks' in meta_info): + pnumlines = int(meta_info['num_peaks']) + plinesread = 0 + peaklist = [] + + elif plinesread < pnumlines: + # ============== Extract peaks from MSP ======================= + # .split() will split on any empty space (i.e. tab and space) + line = tuple(line.split()) + # Keep only m/z and intensity, not relative intensity + save_line = tuple(line[0].split() + line[1].split()) + plinesread += 1 + peaklist.append(save_line) -headers = [] -c = 0 -for fn in outfiles: - with open(fn, 'r') as infile: - reader = csv.reader(infile) - if sys.version_info >= (3, 0): - headers.extend(next(reader)) - else: - headers.extend(reader.next()) - # check if file has any data rows - for i, row in enumerate(reader): - c += 1 - if i == 1: - break + elif plinesread and plinesread == pnumlines: + # =Get sample name and additional details for output and RUN == + if adducts_from_cli: + for adduct in adducts_from_cli: + spectrac += 1 + meta_info['precursor_type'] = adduct + paramd, cmd = run_metfrag(meta_info, peaklist, args, + wd, spectrac, adduct_types) + if paramd: + paramds[paramd["SampleName"]] = paramd + cmds.append(cmd) + else: + spectrac += 1 + paramd, cmd = run_metfrag(meta_info, peaklist, args, + wd, spectrac, adduct_types) + if paramd: + paramds[paramd["SampleName"]] = paramd + cmds.append(cmd) -# if no data rows (e.g. matches) then do not save an -# output and leave the program -if c == 0: - print('No results') - sys.exit() + meta_info = {} + pnumlines = 0 + plinesread = 0 + + # end of file. Check if there is a MSP spectra to run metfrag on + if plinesread and plinesread == pnumlines: -additional_detail_headers = ['sample_name'] -for k, paramd in six.iteritems(paramds): - additional_detail_headers = list(set( - additional_detail_headers + list(paramd['additional_details'].keys()))) - -# add inchikey if not already present (missing in metchem output) -if 'InChIKey' not in headers: - headers.append('InChIKey') - -headers = additional_detail_headers + sorted(list(set(headers))) + if adducts_from_cli: + for adduct in adducts_from_cli: + spectrac += 1 + meta_info['precursor_type'] = adduct + paramd, cmd = run_metfrag(meta_info, peaklist, args, + wd, spectrac, adduct_types) + if paramd: + paramds[paramd["SampleName"]] = paramd + cmds.append(cmd) + else: + spectrac += 1 + paramd, cmd = run_metfrag(meta_info, peaklist, args, + wd, spectrac, adduct_types) + if paramd: + paramds[paramd["SampleName"]] = paramd + cmds.append(cmd) -# Sort files nicely -outfiles.sort( - key=lambda s: int(re.match(r'^.*/(\d+)_metfrag_result.csv', s).group(1))) - -print(outfiles) + # Perform multiprocessing on command line call level + if int(args.cores_top_level) > 1: + cmds_chunks = [cmds[x:x + int(args.chunks)] for x in + list(range(0, len(cmds), int(args.chunks)))] + pool = multiprocessing.Pool(processes=int(args.cores_top_level)) + pool.map(work, cmds_chunks) + pool.close() + pool.join() -# merge outputs -with open(args.result_pth, 'a') as merged_outfile: - dwriter = csv.DictWriter(merged_outfile, fieldnames=headers, - delimiter='\t', quotechar='"') - dwriter.writeheader() + ###################################################################### + # Concatenate and filter the output + ###################################################################### + # outputs might have different headers. Need to get a list of all the + # headers before we start merging the files + # outfiles = [os.path.join(wd, f) for f in glob.glob(os.path.join(wd, + # "*_metfrag_result.csv"))] + outfiles = glob.glob(os.path.join(wd, "*_metfrag_result.csv")) + if len(outfiles) == 0: + print('No results') + sys.exit() + + headers = [] + c = 0 for fn in outfiles: + with open(fn, 'r') as infile: + reader = csv.reader(infile) + if sys.version_info >= (3, 0): + headers.extend(next(reader)) + else: + headers.extend(reader.next()) + # check if file has any data rows + for i, row in enumerate(reader): + c += 1 + if i == 1: + break - with open(fn) as infile: - reader = csv.DictReader(infile, delimiter=',', quotechar='"') - for line in reader: - bewrite = True - for key, value in line.items(): - # Filter when no MS/MS peak matched - if key == "ExplPeaks": - if float(args.pctexplpeak_thrshld) > 0 \ - and value and "NA" in value: - bewrite = False - # Filter with a score threshold - elif key == "Score": - if value and float(value) <= float(args.score_thrshld): + # if no data rows (e.g. matches) then do not save an + # output and leave the program + if c == 0: + print('No results') + sys.exit() + + additional_detail_headers = ['sample_name'] + for k, paramd in six.iteritems(paramds): + additional_detail_headers = list(set( + additional_detail_headers + list( + paramd['additional_details'].keys()))) + + # add inchikey if not already present (missing in metchem output) + if 'InChIKey' not in headers: + headers.append('InChIKey') + + additional_detail_headers = sorted(additional_detail_headers) + headers = additional_detail_headers + sorted(list(set(headers))) + + # Sort files nicely + outfiles.sort( + key=lambda s: int( + re.match(r'^.*/(\d+)_metfrag_result.csv', s).group(1))) + + print(outfiles) + + # merge outputs + with open(args.result_pth, 'a') as merged_outfile: + dwriter = csv.DictWriter(merged_outfile, fieldnames=headers, + delimiter='\t', quotechar='"') + dwriter.writeheader() + + for fn in outfiles: + with open(fn) as infile: + reader = csv.DictReader(infile, delimiter=',', quotechar='"') + for line in reader: + bewrite = True + for key, value in line.items(): + # Filter when no MS/MS peak matched + if key == "ExplPeaks": + if float(args.pctexplpeak_thrshld) > 0 \ + and value and "NA" in value: + bewrite = False + # Filter with a score threshold + elif key == "Score": + if value and float(value) <= float( + args.score_thrshld): + bewrite = False + elif key == "NoExplPeaks": + nbfindpeak = float(value) + elif key == "NumberPeaksUsed": + totpeaks = float(value) + # Filter with a relative number of peak matched + try: + pctexplpeak = nbfindpeak / totpeaks * 100 + except ZeroDivisionError: + bewrite = False + else: + if pctexplpeak < float(args.pctexplpeak_thrshld): bewrite = False - elif key == "NoExplPeaks": - nbfindpeak = float(value) - elif key == "NumberPeaksUsed": - totpeaks = float(value) - # Filter with a relative number of peak matched - try: - pctexplpeak = nbfindpeak / totpeaks * 100 - except ZeroDivisionError: - bewrite = False - else: - if pctexplpeak < float(args.pctexplpeak_thrshld): - bewrite = False + + # Write the line if it pass all filters + if bewrite: + bfn = os.path.basename(fn) + bfn = bfn.replace(".csv", "") + line['sample_name'] = paramds[bfn]['SampleName'] + ad = paramds[bfn]['additional_details'] - # Write the line if it pass all filters - if bewrite: - bfn = os.path.basename(fn) - bfn = bfn.replace(".csv", "") - line['sample_name'] = paramds[bfn]['SampleName'] - ad = paramds[bfn]['additional_details'] + if args.MetFragDatabaseType == "MetChem": + # for some reason the metchem database option does + # not report the full inchikey (at least in the + # Bham setup. This ensures we always get + # the fully inchikey + line['InChIKey'] = '{}-{}-{}'.format( + line['InChIKey1'], + line['InChIKey2'], + line['InChIKey3']) - if args.MetFragDatabaseType == "MetChem": - # for some reason the metchem database option does - # not report the full inchikey (at least in the Bham - # setup. This ensures we always get the fully inchikey - line['InChIKey'] = '{}-{}-{}'.format(line['InChIKey1'], - line['InChIKey2'], - line['InChIKey3']) - - line.update(ad) - dwriter.writerow(line) + line.update(ad) + dwriter.writerow(line)