comparison sirius_csifingerid.py @ 0:9e6bf7278257 draft

"planemo upload for repository https://github.com/computational-metabolomics/sirius_csifingerid_galaxy commit 1d1b37a070f895c94069819237199c768da27258"
author computational-metabolomics
date Wed, 05 Feb 2020 10:41:48 -0500
parents
children 856b3761277d
1 from __future__ import absolute_import, print_function
2
3 import argparse
4 import csv
5 import glob
6 import multiprocessing
7 import os
8 import re
9 import sys
10 import tempfile
11 import uuid
12 from collections import defaultdict
13
14 import six
15
16 parser = argparse.ArgumentParser()
17 parser.add_argument('--input_pth')
18 parser.add_argument('--result_pth')
19 parser.add_argument('--database')
20 parser.add_argument('--profile')
21 parser.add_argument('--candidates')
22 parser.add_argument('--ppm_max')
23 parser.add_argument('--polarity')
24 parser.add_argument('--results_name')
25 parser.add_argument('--out_dir')
26 parser.add_argument('--tool_directory')
27 parser.add_argument('--temp_dir')
28
29 parser.add_argument('--meta_select_col', default='all')
30 parser.add_argument('--cores_top_level', default=1)
31 parser.add_argument('--chunks', default=1)
32 parser.add_argument('--minMSMSpeaks', default=1)
33 parser.add_argument('--schema', default='msp')
34 args = parser.parse_args()
35 print(args)
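# Illustrative invocation of this wrapper (hypothetical paths and values; the
# flag names come from the argparse definitions above):
#   python sirius_csifingerid.py --input_pth input.msp --result_pth results.tsv
#       --database pubchem --profile orbitrap --candidates 10 --ppm_max 5
#       --polarity positive --out_dir out_dir --temp_dir tmp --schema msp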
36 if os.stat(args.input_pth).st_size == 0:
37 print('Input file empty')
38 sys.exit()
39
40 if args.temp_dir:
41 wd = os.path.join(args.temp_dir, 'temp')
42
43 # only create the working directory if it does not already exist
44 if not os.path.exists(wd):
45 os.mkdir(wd)
46 else:
47 td = tempfile.mkdtemp()
48 wd = os.path.join(td, str(uuid.uuid4()))
49 os.mkdir(wd)
50
51 ######################################################################
52 # Setup regular expressions for MSP parsing dictionary
53 ######################################################################
54 regex_msp = {}
55 regex_msp['name'] = [r'^Name(?:=|:)(.*)$']
56 regex_msp['polarity'] = [r'^ion.*mode(?:=|:)(.*)$',
57 r'^ionization.*mode(?:=|:)(.*)$',
58 r'^polarity(?:=|:)(.*)$']
59 regex_msp['precursor_mz'] = [r'^precursor.*m/z(?:=|:)\s*(\d*[.,]?\d*)$',
60 r'^precursor.*mz(?:=|:)\s*(\d*[.,]?\d*)$']
61 regex_msp['precursor_type'] = [r'^precursor.*type(?:=|:)(.*)$',
62 r'^adduct(?:=|:)(.*)$',
63 r'^ADDUCTIONNAME(?:=|:)(.*)$']
64 regex_msp['num_peaks'] = [r'^Num.*Peaks(?:=|:)\s*(\d*)$']
65 regex_msp['msp'] = [r'^Name(?:=|:)(.*)$'] # Flag for standard MSP format
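# An illustrative MSP record that the patterns above are intended to match
# (hypothetical example values, not taken from a real file):
#   Name: example_compound_1
#   Ion_mode: Positive
#   PrecursorMZ: 303.0499
#   Precursor_type: [M+H]+
#   Num Peaks: 2
#   153.0182 57.4
#   303.0499 100.0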
66
67 regex_massbank = {}
68 regex_massbank['name'] = [r'^RECORD_TITLE:(.*)$']
69 regex_massbank['polarity'] = \
70 [r'^AC\$MASS_SPECTROMETRY:\s+ION_MODE\s+(.*)$']
71 regex_massbank['precursor_mz'] = \
72 [r'^MS\$FOCUSED_ION:\s+PRECURSOR_M/Z\s+(\d*[.,]?\d*)$']
73 regex_massbank['precursor_type'] = \
74 [r'^MS\$FOCUSED_ION:\s+PRECURSOR_TYPE\s+(.*)$']
75 regex_massbank['num_peaks'] = [r'^PK\$NUM_PEAK:\s+(\d*)']
76 regex_massbank['cols'] = [r'^PK\$PEAK:\s+(.*)']
77 regex_massbank['massbank'] = [r'^RECORD_TITLE:(.*)$'] # Flag for massbank
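# Illustrative MassBank record lines (hypothetical example values) matched by
# the patterns above:
#   RECORD_TITLE: Example Compound; LC-ESI-QTOF; MS2
#   AC$MASS_SPECTROMETRY: ION_MODE POSITIVE
#   MS$FOCUSED_ION: PRECURSOR_M/Z 303.0499
#   MS$FOCUSED_ION: PRECURSOR_TYPE [M+H]+
#   PK$NUM_PEAK: 2
#   PK$PEAK: m/z int. rel.int.
#     153.0182 574 57
#     303.0499 999 999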
78
79 if args.schema == 'msp':
80 meta_regex = regex_msp
81 elif args.schema == 'massbank':
82 meta_regex = regex_massbank
83 elif args.schema == 'auto':
84 # If 'auto', check for all the available parameter names and then
85 # determine whether the record is MassBank or MSP based on
86 # the name parameter
87 meta_regex = {}
88 meta_regex.update(regex_massbank)
89 meta_regex['name'].extend(regex_msp['name'])
90 meta_regex['polarity'].extend(regex_msp['polarity'])
91 meta_regex['precursor_mz'].extend(regex_msp['precursor_mz'])
92 meta_regex['precursor_type'].extend(regex_msp['precursor_type'])
93 meta_regex['num_peaks'].extend(regex_msp['num_peaks'])
94 meta_regex['msp'] = regex_msp['msp']
95
96 print(meta_regex)
97
98 # this dictionary will store the metadata results from the MSP file
99 meta_info = {}
100
101
102 # function to extract the meta data using the regular expressions
103 def parse_meta(line, meta_regex, meta_info=None):
104 if meta_info is None:
105 meta_info = {}
106 for k, regexes in six.iteritems(meta_regex):
107 for reg in regexes:
108 m = re.search(reg, line, re.IGNORECASE)
109 if m:
110 meta_info[k] = '-'.join(m.groups()).strip()
111 return meta_info
112
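# Example (hypothetical values): for the line 'PrecursorMZ: 303.0499' the MSP
# patterns above would set meta_info['precursor_mz'] to '303.0499'; lines that
# match no pattern leave meta_info unchanged.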
113
114 ######################################################################
115 # Setup parameter dictionary
116 ######################################################################
117 def init_paramd(args):
118 paramd = defaultdict()
119 paramd["cli"] = {}
120 paramd["cli"]["--database"] = args.database
121 paramd["cli"]["--profile"] = args.profile
122 paramd["cli"]["--candidates"] = args.candidates
123 paramd["cli"]["--ppm-max"] = args.ppm_max
124 if args.polarity == 'positive':
125 paramd["default_ion"] = "[M+H]+"
126 elif args.polarity == 'negative':
127 paramd["default_ion"] = "[M-H]-"
128 else:
129 paramd["default_ion"] = ''
130
131 return paramd
132
133
134 ######################################################################
135 # Function to run sirius once all the metadata and spectra have been obtained
136 ######################################################################
137 def run_sirius(meta_info, peaklist, args, wd, spectrac):
138 # Get the sample details (if possible to extract, e.g. if the MSP was
139 # created as part of the msPurity pipeline). Choose the additional details
140 # to add as columns: either all the metadata from the MSP, just the details
141 # from the record name (i.e. when using msPurity and the columns are coded
142 # into the name), or just the spectra index (spectrac)
143 paramd = init_paramd(args)
144
145 if args.meta_select_col == 'name':
146 # have additional column of just the name
147 paramd['additional_details'] = {'name': meta_info['name']}
148 elif args.meta_select_col == 'name_split':
149 # have additional columns split by "|" and
150 # then on ":" e.g. MZ:100.2 | RT:20 | xcms_grp_id:1
151 paramd['additional_details'] = {
152 sm.split(":")[0].strip(): sm.split(":")[1].strip() for sm in
153 meta_info['name'].split("|")}
154 elif args.meta_select_col == 'all':
155 # have additional columns based on all
156 # the meta information extracted from the MSP
157 paramd['additional_details'] = meta_info
158 else:
159 # Just have an index of the spectra in the MSP file
160 paramd['additional_details'] = {'spectra_idx': spectrac}
161
162 paramd["SampleName"] = "{}_sirius_result".format(spectrac)
163
164 paramd["cli"]["--output"] = \
165 os.path.join(wd, "{}_sirius_result".format(spectrac))
166
167 # =============== Output peaks to txt file ==============================
168 paramd["cli"]["--ms2"] = os.path.join(wd,
169 "{}_tmpspec.txt".format(spectrac))
170
171 # write spec file
172 with open(paramd["cli"]["--ms2"], 'w') as outfile:
173 for p in peaklist:
174 outfile.write(p[0] + "\t" + p[1] + "\n")
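# The temporary spectrum file then holds one tab-separated "m/z intensity"
# pair per line, e.g. (illustrative values) "153.0182\t57.4".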
175
176 # =============== Update param based on MSP metadata ======================
177 # Replace param details with details from MSP if required
178 if 'precursor_type' in meta_info and meta_info['precursor_type']:
179 paramd["cli"]["--ion"] = meta_info['precursor_type']
180 else:
181 if paramd["default_ion"]:
182 paramd["cli"]["--ion"] = paramd["default_ion"]
183 else:
184 paramd["cli"]["--auto-charge"] = ''
185
186 if 'precursor_mz' in meta_info and meta_info['precursor_mz']:
187 paramd["cli"]["--precursor"] = meta_info['precursor_mz']
188
189 # ============== Create CLI cmd for sirius ================================
190 cmd = "sirius --fingerid"
191 for k, v in six.iteritems(paramd["cli"]):
192 cmd += " {} {}".format(str(k), str(v))
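# With illustrative argument values (PubChem database, orbitrap profile,
# 10 candidates, 5 ppm) the assembled command looks roughly like:
#   sirius --fingerid --database pubchem --profile orbitrap --candidates 10
#       --ppm-max 5 --output <wd>/1_sirius_result --ms2 <wd>/1_tmpspec.txt
#       --ion [M+H]+ --precursor 303.0499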
193 paramds[paramd["SampleName"]] = paramd
194
195 # =============== Run sirius =============================================
196 # Only process spectra with at least the minimum number of MS/MS peaks
197 if len(peaklist) >= float(args.minMSMSpeaks):
198
199 if int(args.cores_top_level) == 1:
200 os.system(cmd)
201
202 return paramd, cmd
203
204
205 def work(cmds):
206 return [os.system(cmd) for cmd in cmds]
207
208
209 ######################################################################
210 # Parse MSP file and run SIRIUS CLI
211 ######################################################################
212 # keep list of commands if performing in CLI in parallel
213 cmds = []
214 # keep a dictionary of all params
215 paramds = {}
216 # keep count of spectra (for uid)
217 spectrac = 0
218
219 with open(args.input_pth, "r") as infile:
220 # number of lines for the peaks
221 pnumlines = 0
222 # number of lines read for the peaks
223 plinesread = 0
224 for line in infile:
225
226 line = line.strip()
227
228 if pnumlines == 0:
229
230 # ============== Extract metadata from MSP ========================
231 meta_info = parse_meta(line, meta_regex, meta_info)
232
233 if ('massbank' in meta_info and 'cols' in meta_info) or \
234 ('msp' in meta_info and 'num_peaks' in meta_info):
235 pnumlines = int(meta_info['num_peaks'])
236 peaklist = []
237 plinesread = 0
238
239 elif plinesread < pnumlines:
240 # =============== Extract peaks from MSP ==========================
241 # .split() will split on any whitespace (i.e. tabs and spaces)
242 line = tuple(line.split())
243 # Keep only the m/z and intensity columns, drop relative intensity
244 save_line = line[:2]
245 plinesread += 1
246
247 peaklist.append(save_line)
248
249 elif plinesread and plinesread == pnumlines:
250 # ======= Get sample name and additional details for output =======
251 spectrac += 1
252 paramd, cmd = run_sirius(meta_info, peaklist, args, wd, spectrac)
253
254 paramds[paramd["SampleName"]] = paramd
255 cmds.append(cmd)
256
257 meta_info = {}
258 pnumlines = 0
259 plinesread = 0
260
261 # End of file. Check if there is still an MSP spectrum left
262 # to run sirius on
263
264 if plinesread and plinesread == pnumlines:
265 paramd, cmd = run_sirius(meta_info, peaklist, args, wd, spectrac + 1)
266
267 paramds[paramd["SampleName"]] = paramd
268 cmds.append(cmd)
269
270 # Perform multiprocessing on command line call level
271 if int(args.cores_top_level) > 1:
272 cmds_chunks = [cmds[x:x + int(args.chunks)]
273 for x in list(range(0, len(cmds), int(args.chunks)))]
274 pool = multiprocessing.Pool(processes=int(args.cores_top_level))
275 pool.map(work, cmds_chunks)
276 pool.close()
277 pool.join()
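# e.g. (hypothetical values) with 6 commands, --chunks 2 and
# --cores_top_level 3, the commands are split into three chunks of two and
# each chunk is executed by one of the three worker processes.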
278
279 ######################################################################
280 # Concatenate and filter the output
281 ######################################################################
282 # The output files might have different headers; collect the header row
283 # (taken from the first results file below) before merging the files into
284 # a single results table
285 outfiles = glob.glob(os.path.join(wd, '*', '*', 'summary_csi_fingerid.csv'))
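# The expected layout is <wd>/<idx>_sirius_result/<sirius project folder>/
# summary_csi_fingerid.csv, where <idx> is the spectra count used above and
# the intermediate folder is created by sirius itself.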
286
287 # sort files nicely
288 outfiles.sort(
289 key=lambda s: int(re.match(
290 r'^.*/(\d+).*/.*/summary_csi_fingerid.csv', s).group(1)))
291 print(outfiles)
292
293 if len(outfiles) == 0:
294 print('No results')
295 sys.exit()
296
297 headers = []
298 c = 0
299 for fn in outfiles:
300 with open(fn, 'r') as infile:
301 reader = csv.reader(infile, delimiter='\t')
302 if sys.version_info >= (3, 0):
303 headers.extend(next(reader))
304 else:
305 headers.extend(reader.next())
306 break
307
308 headers = list(paramd['additional_details'].keys()) + headers
309
310 with open(args.result_pth, 'a') as merged_outfile:
311 dwriter = csv.DictWriter(merged_outfile,
312 fieldnames=headers, delimiter='\t')
313 dwriter.writeheader()
314
315 for fn in sorted(outfiles):
316 print(fn)
317
318 with open(fn) as infile:
319 reader = csv.DictReader(infile, delimiter='\t')
320
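# The third-from-last path component is the "<idx>_sirius_result" folder,
# i.e. the SampleName used when the parameters were stored in paramds above.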
321 ad = paramds[fn.split(os.sep)[-3]]['additional_details']
322
323 for line in reader:
324 line.update(ad)
325 # round score to 5 d.p.
326 line['score'] = round(float(line['score']), 5)
327
328 dwriter.writerow(line)