Mercurial > repos > computational-metabolomics > sirius_csifingerid
comparison sirius_csifingerid.py @ 0:9e6bf7278257 draft
"planemo upload for repository https://github.com/computational-metabolomics/sirius_csifingerid_galaxy commit 1d1b37a070f895c94069819237199c768da27258"
author | computational-metabolomics |
---|---|
date | Wed, 05 Feb 2020 10:41:48 -0500 |
parents | |
children | 856b3761277d |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:9e6bf7278257 |
---|---|
1 from __future__ import absolute_import, print_function | |
2 | |
3 import argparse | |
4 import csv | |
5 import glob | |
6 import multiprocessing | |
7 import os | |
8 import re | |
9 import sys | |
10 import tempfile | |
11 import uuid | |
12 from collections import defaultdict | |
13 | |
14 import six | |
15 | |
parser = argparse.ArgumentParser()
parser.add_argument('--input_pth')
parser.add_argument('--result_pth')
parser.add_argument('--database')
parser.add_argument('--profile')
parser.add_argument('--candidates')
parser.add_argument('--ppm_max')
parser.add_argument('--polarity')
parser.add_argument('--results_name')
parser.add_argument('--out_dir')
parser.add_argument('--tool_directory')
parser.add_argument('--temp_dir')

parser.add_argument('--meta_select_col', default='all')
parser.add_argument('--cores_top_level', default=1)
parser.add_argument('--chunks', default=1)
parser.add_argument('--minMSMSpeaks', default=1)
parser.add_argument('--schema', default='msp')
args = parser.parse_args()
print(args)

# Nothing to do for an empty input file.
if os.stat(args.input_pth).st_size == 0:
    print('Input file empty')
    # sys.exit() instead of the bare exit() helper (which is only provided
    # by the site module and not guaranteed to exist).
    sys.exit()

# Working directory for the per-spectrum SIRIUS input/output files.
if args.temp_dir:
    wd = os.path.join(args.temp_dir, 'temp')
    # Only create the directory when it is missing. The original called
    # os.mkdir() unconditionally (crashing with FileExistsError on a re-run)
    # and then re-checked existence on a branch that could never be taken.
    if not os.path.isdir(wd):
        os.mkdir(wd)
else:
    # No temp dir supplied: make a unique scratch directory.
    td = tempfile.mkdtemp()
    wd = os.path.join(td, str(uuid.uuid4()))
    os.mkdir(wd)
50 | |
######################################################################
# Setup regular expressions for MSP parsing dictionary
######################################################################
# Each key maps to a list of alternative patterns; the first capture
# group(s) of a matching pattern supply the metadata value.
regex_msp = {}
regex_msp['name'] = [r'^Name(?:=|:)(.*)$']
regex_msp['polarity'] = [r'^ion.*mode(?:=|:)(.*)$',
                         r'^ionization.*mode(?:=|:)(.*)$',
                         r'^polarity(?:=|:)(.*)$']
regex_msp['precursor_mz'] = [r'^precursor.*m/z(?:=|:)\s*(\d*[.,]?\d*)$',
                             r'^precursor.*mz(?:=|:)\s*(\d*[.,]?\d*)$']
regex_msp['precursor_type'] = [r'^precursor.*type(?:=|:)(.*)$',
                               r'^adduct(?:=|:)(.*)$',
                               r'^ADDUCTIONNAME(?:=|:)(.*)$']
regex_msp['num_peaks'] = [r'^Num.*Peaks(?:=|:)\s*(\d*)$']
regex_msp['msp'] = [r'^Name(?:=|:)(.*)$']  # Flag for standard MSP format

regex_massbank = {}
regex_massbank['name'] = [r'^RECORD_TITLE:(.*)$']
regex_massbank['polarity'] = \
    [r'^AC\$MASS_SPECTROMETRY:\s+ION_MODE\s+(.*)$']
regex_massbank['precursor_mz'] = \
    [r'^MS\$FOCUSED_ION:\s+PRECURSOR_M/Z\s+(\d*[.,]?\d*)$']
regex_massbank['precursor_type'] = \
    [r'^MS\$FOCUSED_ION:\s+PRECURSOR_TYPE\s+(.*)$']
regex_massbank['num_peaks'] = [r'^PK\$NUM_PEAK:\s+(\d*)']
regex_massbank['cols'] = [r'^PK\$PEAK:\s+(.*)']
regex_massbank['massbank'] = [r'^RECORD_TITLE:(.*)$']  # Flag for massbank

if args.schema == 'msp':
    meta_regex = regex_msp
elif args.schema == 'massbank':
    meta_regex = regex_massbank
elif args.schema == 'auto':
    # If auto we just check for all the available parameter names
    # and then determine if Massbank or MSP based on the name parameter.
    # Copy each pattern list so the extend() calls below do not mutate the
    # lists inside regex_massbank itself (dict.update() on the original
    # shared the list objects between both dictionaries).
    meta_regex = {k: list(v) for k, v in regex_massbank.items()}
    meta_regex['name'].extend(regex_msp['name'])
    meta_regex['polarity'].extend(regex_msp['polarity'])
    meta_regex['precursor_mz'].extend(regex_msp['precursor_mz'])
    meta_regex['precursor_type'].extend(regex_msp['precursor_type'])
    meta_regex['num_peaks'].extend(regex_msp['num_peaks'])
    meta_regex['msp'] = regex_msp['msp']

print(meta_regex)

# This dictionary will store the metadata results from the MSP file.
meta_info = {}
100 | |
101 | |
# function to extract the meta data using the regular expressions
def parse_meta(meta_regex, meta_info=None, line=None):
    """Scan one text line against the regex dictionary and collect metadata.

    Args:
        meta_regex (dict): maps a metadata key (e.g. 'name', 'precursor_mz')
            to a list of alternative regex patterns; group(s) of the first
            matching pattern provide the value.
        meta_info (dict, optional): accumulating result dict, updated in
            place and returned. A fresh dict is created when None.
        line (str, optional): the text line to scan. When None, falls back
            to the module-level ``line`` bound by the parsing loop below,
            keeping the original (implicitly global) call sites working.

    Returns:
        dict: ``meta_info`` with any newly matched key/value pairs added.
    """
    if meta_info is None:
        meta_info = {}
    if line is None:
        # Backward-compatible fallback: the original implementation read the
        # global ``line`` set by the file-reading loop.
        line = globals()['line']
    for k, regexes in meta_regex.items():
        for reg in regexes:
            m = re.search(reg, line, re.IGNORECASE)
            if m:
                # Join multiple capture groups with '-' and trim whitespace.
                meta_info[k] = '-'.join(m.groups()).strip()
    return meta_info
112 | |
113 | |
######################################################################
# Setup parameter dictionary
######################################################################
def init_paramd(args):
    """Create the base parameter dictionary for a single SIRIUS CLI call.

    Args:
        args: parsed command-line namespace providing ``database``,
            ``profile``, ``candidates``, ``ppm_max`` and ``polarity``.

    Returns:
        dict: with a "cli" sub-dict of SIRIUS command-line flag values and a
        "default_ion" adduct string ('' when polarity is neither positive
        nor negative; the caller then switches to --auto-charge).
    """
    # Plain dict: the original used defaultdict() with no default_factory,
    # which behaves exactly like dict (still raises KeyError) and misleads.
    paramd = {}
    paramd["cli"] = {}
    paramd["cli"]["--database"] = args.database
    paramd["cli"]["--profile"] = args.profile
    paramd["cli"]["--candidates"] = args.candidates
    paramd["cli"]["--ppm-max"] = args.ppm_max
    if args.polarity == 'positive':
        paramd["default_ion"] = "[M+H]+"
    elif args.polarity == 'negative':
        paramd["default_ion"] = "[M-H]-"
    else:
        # Unknown polarity: leave empty so run_sirius falls back to
        # SIRIUS's --auto-charge option.
        paramd["default_ion"] = ''

    return paramd
132 | |
133 | |
######################################################################
# Function to run sirius when all meta and spectra is obtained
######################################################################
def run_sirius(meta_info, peaklist, args, wd, spectrac):
    """Write the spectrum file, assemble and (optionally) run one SIRIUS call.

    Args:
        meta_info (dict): metadata extracted from the current record.
        peaklist (list): (m/z, intensity) string tuples for the spectrum.
        args: parsed command-line namespace.
        wd (str): working directory for per-spectrum input/output files.
        spectrac (int): 1-based spectrum index, used as a unique id.

    Returns:
        tuple: (paramd, cmd) - the parameter dict and the CLI command string.

    NOTE(review): this function also reads the module-level globals
    ``paramds`` (which it updates; the caller updates it again, redundantly)
    and ``plinesread`` (the MS/MS peak-count filter), so it is only safe to
    call from the parsing loop below - confirm before reusing elsewhere.
    """
    # Get sample details (if possible to extract) e.g. if created as part of
    # the msPurity pipeline) choose between getting additional details to
    # add as columns as either all meta data from msp, just details from the
    # record name (i.e. when using msPurity and we have the columns
    # coded into the name) or just the spectra index (spectrac)
    paramd = init_paramd(args)

    if args.meta_select_col == 'name':
        # have additional column of just the name
        paramd['additional_details'] = {'name': meta_info['name']}
    elif args.meta_select_col == 'name_split':
        # have additional columns split by "|" and
        # then on ":" e.g. MZ:100.2 | RT:20 | xcms_grp_id:1
        paramd['additional_details'] = {
            sm.split(":")[0].strip(): sm.split(":")[1].strip() for sm in
            meta_info['name'].split("|")}
    elif args.meta_select_col == 'all':
        # have additional columns based on all
        # the meta information extracted from the MSP
        paramd['additional_details'] = meta_info
    else:
        # Just have and index of the spectra in the MSP file
        paramd['additional_details'] = {'spectra_idx': spectrac}

    paramd["SampleName"] = "{}_sirius_result".format(spectrac)

    paramd["cli"]["--output"] = \
        os.path.join(wd, "{}_sirius_result".format(spectrac))

    # =============== Output peaks to txt file ==============================
    paramd["cli"]["--ms2"] = os.path.join(wd,
                                          "{}_tmpspec.txt".format(spectrac))

    # write spec file (one tab-separated "mz<TAB>intensity" line per peak)
    with open(paramd["cli"]["--ms2"], 'w') as outfile:
        for p in peaklist:
            outfile.write(p[0] + "\t" + p[1] + "\n")

    # =============== Update param based on MSP metadata ======================
    # Replace param details with details from MSP if required
    if 'precursor_type' in meta_info and meta_info['precursor_type']:
        paramd["cli"]["--ion"] = meta_info['precursor_type']
    else:
        if paramd["default_ion"]:
            paramd["cli"]["--ion"] = paramd["default_ion"]
        else:
            # No adduct known at all: let SIRIUS guess the charge.
            paramd["cli"]["--auto-charge"] = ''

    if 'precursor_mz' in meta_info and meta_info['precursor_mz']:
        paramd["cli"]["--precursor"] = meta_info['precursor_mz']

    # ============== Create CLI cmd for metfrag ===============================
    # NOTE(review): comment says "metfrag" but this builds the sirius command
    # (copy-paste from a sibling tool, presumably).
    cmd = "sirius --fingerid"
    for k, v in six.iteritems(paramd["cli"]):
        cmd += " {} {}".format(str(k), str(v))
    paramds[paramd["SampleName"]] = paramd

    # =============== Run srius ==============================================
    # Filter before process with a minimum number of MS/MS peaks
    # (``plinesread`` is the module-level count of peak lines just read).
    if plinesread >= float(args.minMSMSpeaks):

        if int(args.cores_top_level) == 1:
            # Single-core mode runs immediately; otherwise the caller
            # collects ``cmd`` for the multiprocessing pool at the bottom.
            os.system(cmd)

    return paramd, cmd
203 | |
204 | |
def work(cmds):
    # Multiprocessing worker: execute one chunk of shell commands
    # sequentially and hand back their os.system() exit statuses.
    return list(map(os.system, cmds))
207 | |
208 | |
######################################################################
# Parse MSP file and run SIRIUS CLI
######################################################################
# keep list of commands if performing in CLI in parallel
cmds = []
# keep a dictionary of all params
paramds = {}
# keep count of spectra (for uid)
spectrac = 0

# State machine over the MSP/MassBank text: metadata lines are scanned until
# a record's peak count is known, then exactly num_peaks peak lines are
# collected, then SIRIUS is invoked for the completed spectrum.
with open(args.input_pth, "r") as infile:
    # number of lines for the peaks
    pnumlines = 0
    # number of lines read for the peaks
    plinesread = 0
    for line in infile:

        line = line.strip()

        if pnumlines == 0:

            # ============== Extract metadata from MSP ========================
            # parse_meta reads the loop variable ``line`` via module scope.
            meta_info = parse_meta(meta_regex, meta_info)

            # A record is complete once we have seen its format flag
            # (MassBank peak-column header, or the MSP name) and a peak count.
            if ('massbank' in meta_info and 'cols' in meta_info) or \
                    ('msp' in meta_info and 'num_peaks' in meta_info):
                pnumlines = int(meta_info['num_peaks'])
                peaklist = []
                plinesread = 0

        elif plinesread < pnumlines:
            # =============== Extract peaks from MSP ==========================
            # .split() will split on any empty space (i.e. tab and space)
            line = tuple(line.split())
            # Keep only m/z and intensity, not relative intensity
            # NOTE(review): line[0]/line[1] are already single tokens, so the
            # inner .split() calls are no-ops; a peak line with fewer than
            # two columns would raise IndexError here - confirm inputs.
            save_line = tuple(line[0].split() + line[1].split())
            plinesread += 1

            peaklist.append(save_line)

        elif plinesread and plinesread == pnumlines:
            # ======= Get sample name and additional details for output =======
            spectrac += 1
            paramd, cmd = run_sirius(meta_info, peaklist, args, wd, spectrac)

            paramds[paramd["SampleName"]] = paramd
            cmds.append(cmd)

            # Reset state for the next record in the file.
            meta_info = {}
            pnumlines = 0
            plinesread = 0

# end of file. Check if there is a MSP spectra to
# run metfrag on still

if plinesread and plinesread == pnumlines:
    paramd, cmd = run_sirius(meta_info, peaklist, args, wd, spectrac + 1)

    paramds[paramd["SampleName"]] = paramd
    cmds.append(cmd)
269 | |
# Perform multiprocessing on command line call level
if int(args.cores_top_level) > 1:
    # Partition the collected commands into chunks; each pool worker runs
    # one chunk of commands serially via work().
    chunk_size = int(args.chunks)
    cmds_chunks = [cmds[i:i + chunk_size]
                   for i in range(0, len(cmds), chunk_size)]
    pool = multiprocessing.Pool(processes=int(args.cores_top_level))
    pool.map(work, cmds_chunks)
    pool.close()
    pool.join()
278 | |
######################################################################
# Concatenate and filter the output
######################################################################
# outputs might have different headers. Need to get a list of all the headers
# before we start merging the files outfiles = [os.path.join(wd, f) for f in
# glob.glob(os.path.join(wd, "*_metfrag_result.csv"))]
outfiles = glob.glob(os.path.join(wd, '*', '*', 'summary_csi_fingerid.csv'))

# sort files nicely (numerically by the spectrum index that prefixes the
# "<idx>_sirius_result" directory name)
outfiles.sort(key=lambda s: int(re.match(r'^.*/('
                                         r'\d+).*/.*/summary_csi_fingerid.csv',
                                         s).group(1)))
print(outfiles)

if len(outfiles) == 0:
    print('No results')
    sys.exit()

# Collect the header row of the result files.
# NOTE(review): the break stops after the FIRST file, despite the comment
# above about differing headers - confirm all files share one header.
headers = []
c = 0
for fn in outfiles:
    with open(fn, 'r') as infile:
        reader = csv.reader(infile, delimiter='\t')
        if sys.version_info >= (3, 0):
            headers.extend(next(reader))
        else:
            headers.extend(reader.next())
    break

# Prepend the additional-detail column names to the output header.
# NOTE(review): ``paramd`` here is whatever run_sirius returned for the LAST
# spectrum; assumes every spectrum has the same additional_details keys.
headers = list(paramd['additional_details'].keys()) + headers

# Merge all per-spectrum summaries into the single result table.
# NOTE(review): opened in append mode ('a'), so a re-run adds rows (and a
# second header line) to an existing result file - confirm this is intended.
with open(args.result_pth, 'a') as merged_outfile:
    dwriter = csv.DictWriter(merged_outfile,
                             fieldnames=headers, delimiter='\t')
    dwriter.writeheader()

    for fn in sorted(outfiles):
        print(fn)

        with open(fn) as infile:
            reader = csv.DictReader(infile, delimiter='\t')

            # The "<idx>_sirius_result" path component (3rd from the end)
            # keys back into paramds to recover the per-spectrum details.
            ad = paramds[fn.split(os.sep)[-3]]['additional_details']

            for line in reader:
                line.update(ad)
                # round score to 5 d.p.
                # (assumes a 'score' column exists in summary_csi_fingerid.csv)
                line['score'] = round(float(line['score']), 5)

                dwriter.writerow(line)