customizemetadata.py @ 0:c0473c69ac9f (draft), repository thanhlv/customize_metaphlan_database
planemo upload for repository https://github.com/quadram-institute-bioscience/galaxy-tools/tree/master/tools/metaphlan/

author: thanhlv
date:   Mon, 13 Feb 2023 11:36:16 +0000

#!/usr/bin/env python
# -*- coding: utf-8 -*-

import argparse
import bz2
import json
import pickle
import re
from pathlib import Path


def load_from_json(json_fp):
    '''
    Read JSON file with marker metadata

    :param json_fp: Path to JSON file
    '''
    with open(json_fp, 'r') as json_f:
        data = json.load(json_f)

    for m in data['markers']:
        data['markers'][m]['ext'] = set(data['markers'][m]['ext'])

    for t in data['taxonomy']:
        if isinstance(data['taxonomy'][t], list):
            data['taxonomy'][t] = tuple(data['taxonomy'][t])
    return data


def dump_to_json(data, json_fp):
    '''
    Dump marker metadata to JSON file

    :param data: dictionary with marker metadata
    :param json_fp: Path to JSON file
    '''
    for m in data['markers']:
        data['markers'][m]['ext'] = list(data['markers'][m]['ext'])

    with open(json_fp, 'w') as json_f:
        json.dump(data, json_f)


def transform_pkl_to_json(pkl_fp, json_fp):
    '''
    Read Pickle file and dump it to a JSON file

    :param pkl_fp: Path to input Pickle file
    :param json_fp: Path to output JSON file
    '''
    # load metadata from Pickle file
    with bz2.BZ2File(pkl_fp, 'r') as pkl_f:
        in_metadata = pickle.load(pkl_f)

    out_metadata = {
        'markers': in_metadata['markers'],
        'taxonomy': in_metadata['taxonomy'],
        'merged_taxon': {}
    }

    # transform merged_taxon tuple keys to strings
    for k in in_metadata['merged_taxon']:
        n = ' , '.join(k)
        out_metadata['merged_taxon'][n] = in_metadata['merged_taxon'][k]

    # dump metadata to JSON file
    dump_to_json(out_metadata, json_fp)


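# A minimal sketch of the JSON written by transform_pkl_to_json / dump_to_json and read back by
# load_from_json. All names and values below are placeholders, except the taxid string and genome
# length, which reuse the v3 example quoted in transform_json_to_pkl. Note that 'ext' is stored as
# a list in JSON and converted to a set in memory, and taxonomy values become tuples when loaded:
#
#   {
#     "markers": {
#       "marker_1": {"clade": "s__Species_a",
#                    "ext": ["GCA_000000001"],
#                    "len": 1021,
#                    "taxon": "k__Bacteria|...|s__Species_a"}
#     },
#     "taxonomy": {
#       "k__Bacteria|...|s__Species_a|t__Strain_a": ["2|1224|1236|91347|543|547|354276", 4404432]
#     },
#     "merged_taxon": {}
#   }
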
def transform_json_to_pkl(json_fp, pkl_fp):
    '''
    Read JSON file and dump it to a Pickle file

    :param json_fp: Path to input JSON file
    :param pkl_fp: Path to output Pickle file
    '''
    # load metadata from JSON file
    in_metadata = load_from_json(json_fp)

    out_metadata = {
        'markers': in_metadata['markers'],
        'taxonomy': in_metadata['taxonomy'],
        'merged_taxon': {}
    }
    # transform merged_taxon string keys back to tuples
    for k in in_metadata['merged_taxon']:
        n = tuple(k.split(' , '))
        out_metadata['merged_taxon'][n] = in_metadata['merged_taxon'][k]

    # Ensure that there are 8 taxonomy levels (for compatibility between Metaphlan v3 and v4)
    # v3 DB release encodes the taxids as: ('2|1224|1236|91347|543|547|354276', 4404432)
    # v4 DB release encodes the taxids as: ('2|1224|1236|91347|543|547|354276|', 4404432)
    for k in out_metadata['taxonomy']:
        if out_metadata['taxonomy'][k][0].count('|') == 6:
            out_metadata['taxonomy'][k] = (out_metadata['taxonomy'][k][0] + '|', out_metadata['taxonomy'][k][1])

    # dump metadata to Pickle file
    with bz2.BZ2File(pkl_fp, 'w') as pkl_f:
        pickle.dump(out_metadata, pkl_f)


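# Note on merged_taxon: in the Pickle metadata its keys are tuples of strings. transform_pkl_to_json
# joins each tuple with ' , ' into a single string key for JSON, and transform_json_to_pkl splits
# that string back into a tuple. A sketch with placeholder values:
#
#   ('k__Bacteria|...|s__Species_a', 'k__Bacteria|...|s__Species_b')
#   <-> 'k__Bacteria|...|s__Species_a , k__Bacteria|...|s__Species_b'
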
def add_marker(in_json_fp, out_json_fp, name, m_length, g_length, gca, k_name, k_id, p_name, p_id, c_name, c_id, o_name, o_id, f_name, f_id, g_name, g_id, s_name, s_id, t_name):
    '''
    Add marker to JSON file

    :param in_json_fp: Path to input JSON file
    :param out_json_fp: Path to output JSON file
    :param name: Name of new marker
    :param m_length: Length of new marker
    :param g_length: List with lengths of genomes from which the new marker has been extracted
    :param gca: List with GCA of genomes from which the new marker has been extracted
    :param k_name: List with Name of Kingdom for genomes from which the new marker has been extracted
    :param k_id: List with NCBI id of Kingdom for genomes from which the new marker has been extracted
    :param p_name: List with Name of Phylum for genomes from which the new marker has been extracted
    :param p_id: List with NCBI id of Phylum for genomes from which the new marker has been extracted
    :param c_name: List with Name of Class for genomes from which the new marker has been extracted
    :param c_id: List with NCBI id of Class for genomes from which the new marker has been extracted
    :param o_name: List with Name of Order for genomes from which the new marker has been extracted
    :param o_id: List with NCBI id of Order for genomes from which the new marker has been extracted
    :param f_name: List with Name of Family for genomes from which the new marker has been extracted
    :param f_id: List with NCBI id of Family for genomes from which the new marker has been extracted
    :param g_name: List with Name of Genus for genomes from which the new marker has been extracted
    :param g_id: List with NCBI id of Genus for genomes from which the new marker has been extracted
    :param s_name: List with Name of Species for genomes from which the new marker has been extracted
    :param s_id: List with NCBI id of Species for genomes from which the new marker has been extracted
    :param t_name: List with Name of Strain for genomes from which the new marker has been extracted
    '''
    metadata = load_from_json(in_json_fp)

    # check that all lists have same size
    genome_n = len(g_length)
    if len(gca) != genome_n:
        raise ValueError("Missing/Extra values in GCA list")
    if len(k_name) != genome_n:
        raise ValueError("Missing/Extra values in Kingdom name list")
    if len(k_id) != genome_n:
        raise ValueError("Missing/Extra values in Kingdom ID list")
    if len(p_name) != genome_n:
        raise ValueError("Missing/Extra values in Phylum name list")
    if len(p_id) != genome_n:
        raise ValueError("Missing/Extra values in Phylum ID list")
    if len(c_name) != genome_n:
        raise ValueError("Missing/Extra values in Class name list")
    if len(c_id) != genome_n:
        raise ValueError("Missing/Extra values in Class ID list")
    if len(o_name) != genome_n:
        raise ValueError("Missing/Extra values in Order name list")
    if len(o_id) != genome_n:
        raise ValueError("Missing/Extra values in Order ID list")
    if len(f_name) != genome_n:
        raise ValueError("Missing/Extra values in Family name list")
    if len(f_id) != genome_n:
        raise ValueError("Missing/Extra values in Family ID list")
    if len(g_name) != genome_n:
        raise ValueError("Missing/Extra values in Genus name list")
    if len(g_id) != genome_n:
        raise ValueError("Missing/Extra values in Genus ID list")
    if len(s_name) != genome_n:
        raise ValueError("Missing/Extra values in Species name list")
    if len(s_id) != genome_n:
        raise ValueError("Missing/Extra values in Species ID list")
    if len(t_name) != genome_n:
        raise ValueError("Missing/Extra values in Strain name list")

    # create dictionary to aggregate genome taxonomies and identify marker taxonomy
    taxonomy = {
        'k': set(),
        'p': set(),
        'c': set(),
        'o': set(),
        'f': set(),
        'g': set(),
        's': set(),
        't': set(),
    }

    # parse genomes
    for i in range(genome_n):
        # add taxonomy of new genome
        g_taxo_names = "k__%s|p__%s|c__%s|o__%s|f__%s|g__%s|s__%s|t__%s" % (
            k_name[i],
            p_name[i],
            c_name[i],
            o_name[i],
            f_name[i],
            g_name[i],
            s_name[i],
            t_name[i]
        )
        g_taxo_ids = "%s|%s|%s|%s|%s|%s|%s" % (
            k_id[i],
            p_id[i],
            c_id[i],
            o_id[i],
            f_id[i],
            g_id[i],
            s_id[i]
        )
        metadata['taxonomy'][g_taxo_names] = (g_taxo_ids, g_length[i])
        # aggregate taxon levels using sets
        taxonomy['k'].add(k_name[i])
        taxonomy['p'].add(p_name[i])
        taxonomy['c'].add(c_name[i])
        taxonomy['o'].add(o_name[i])
        taxonomy['f'].add(f_name[i])
        taxonomy['g'].add(g_name[i])
        taxonomy['s'].add(s_name[i])
        taxonomy['t'].add(t_name[i])

    # extract clade and taxon of marker
    clade = ''  # last level before the taxonomies of the genomes diverge
    taxon = ''  # combination of levels before divergence
    for level in ['k', 'p', 'c', 'o', 'f', 'g', 's', 't']:
        taxo = list(taxonomy[level])
        if len(taxo) == 1:
            clade = taxo[0]
            taxon = "%s|%s__%s" % (taxon, level, taxo[0])

    # add information about the new marker
    metadata['markers'][name] = {
        'clade': clade,
        'ext': set(gca),
        'len': m_length,
        'taxon': taxon
    }

    dump_to_json(metadata, out_json_fp)


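# Example invocation of the add_marker subcommand for a marker found in one genome. This is a
# sketch: every value below is hypothetical. The per-genome options use action="append", so
# repeat each of them once per source genome:
#
#   python customizemetadata.py add_marker \
#       --in_json metadata.json --out_json metadata_new.json \
#       --name marker_1 --m_length 1021 \
#       --g_length 4641652 --gca GCA_000005845 \
#       --k_name Bacteria --k_id 2 \
#       --p_name Proteobacteria --p_id 1224 \
#       --c_name Gammaproteobacteria --c_id 1236 \
#       --o_name Enterobacterales --o_id 91347 \
#       --f_name Enterobacteriaceae --f_id 543 \
#       --g_name Escherichia --g_id 561 \
#       --s_name Escherichia_coli --s_id 562 \
#       --t_name Escherichia_coli_K_12
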
def format_markers(marker_l):
    '''
    Format marker names: strip trailing whitespace and keep only the part before the first space

    :param marker_l: list of markers
    '''
    markers = []
    for m in marker_l:
        m = m.rstrip()
        if ' ' in m:
            markers.append(m.split(' ')[0])
        else:
            markers.append(m)
    return markers


def get_markers(marker_fp):
    '''
    Get markers from a file

    :param marker_fp: Path to file with markers (1 per line)
    '''
    # load markers
    with open(marker_fp, 'r') as marker_f:
        markers = marker_f.readlines()

    # format markers
    markers = format_markers(markers)

    return markers


def check_not_found_markers(found_markers, original_markers):
    '''
    Print the markers in original_markers that are missing from found_markers

    :param found_markers: list of found markers
    :param original_markers: list of original markers
    '''
    if len(found_markers) != len(original_markers):
        print('markers not found:')
        for m in original_markers:
            if m not in found_markers:
                print('- "%s"' % m)


def prune_taxonomy(in_taxonomy, taxon_s, gca_s):
    '''
    Prune taxonomy to keep only the entries matching a taxon in taxon_s or a GCA id in gca_s

    :param in_taxonomy: dictionary with the taxonomy to prune
    :param taxon_s: set of taxons to keep
    :param gca_s: set of GCA ids to keep
    '''
    out_taxonomy = {}
    kept_taxonomy = set()
    kept_taxons = set()
    kept_gca = set()
    for t, v in in_taxonomy.items():
        # check if t matches an element of taxon_s
        kept_taxon = False
        for t_k in taxon_s:
            if t_k in t:
                kept_taxon = True
                out_taxonomy[t] = v
                kept_taxonomy.add(t)
                kept_taxons.add(t_k)
                break
        # check if there is a GCA accession in the taxon id
        s = re.search(r'GCA_\d+$', t)
        if s:
            gca = s[0]
            # check if the GCA in the taxon id is in the list of GCA to keep
            if gca in gca_s:
                kept_gca.add(gca)
                if not kept_taxon:
                    out_taxonomy[t] = v
                    kept_taxonomy.add(t)

    print('%s kept taxonomy' % len(kept_taxonomy))
    print('%s / %s taxons not found' % (len(taxon_s) - len(kept_taxons), len(taxon_s)))
    print('%s / %s GCA taxons not found' % (len(gca_s) - len(kept_gca), len(gca_s)))
    return out_taxonomy


def remove_markers(in_json_fp, marker_fp, out_json_fp, kept_marker_fp):
    '''
    Remove markers from JSON file

    :param in_json_fp: Path to input JSON file
    :param marker_fp: Path to file with markers to remove (1 per line)
    :param out_json_fp: Path to output JSON file
    :param kept_marker_fp: Path to file with kept markers
    '''
    in_metadata = load_from_json(in_json_fp)

    # load markers
    markers_to_remove = set(get_markers(marker_fp))
    print('%s markers to remove' % len(markers_to_remove))

    # keep merged_taxon
    out_metadata = {
        'markers': {},
        'taxonomy': {},
        'merged_taxon': in_metadata['merged_taxon']
    }

    # parse markers to keep
    removed_markers = []
    kept_markers = []
    taxons_to_keep = set()
    gca_to_keep = set()
    for m, v in in_metadata['markers'].items():
        if m not in markers_to_remove:
            out_metadata['markers'][m] = v
            kept_markers.append(m)
            taxons_to_keep.add(v['taxon'])
            gca_to_keep.update(v['ext'])
        else:
            removed_markers.append(m)
    print('%s removed markers' % len(removed_markers))

    # check markers that are not found
    check_not_found_markers(removed_markers, markers_to_remove)

    # keep only taxonomy in taxons_to_keep or with GCA in gca_to_keep
    out_metadata['taxonomy'] = prune_taxonomy(in_metadata['taxonomy'], taxons_to_keep, gca_to_keep)

    # save to JSON
    dump_to_json(out_metadata, out_json_fp)

    # write list of kept markers
    with open(kept_marker_fp, 'w') as kept_marker_f:
        for m in kept_markers:
            kept_marker_f.write("%s\n" % m)


def keep_markers(in_json_fp, marker_fp, out_json_fp):
    '''
    Keep markers from JSON file, others will be removed

    :param in_json_fp: Path to input JSON file
    :param marker_fp: Path to file with markers to keep (1 per line)
    :param out_json_fp: Path to output JSON file
    '''
    in_metadata = load_from_json(in_json_fp)

    # load markers
    markers_to_keep = set(get_markers(marker_fp))
    print('%s markers to keep' % len(markers_to_keep))

    # keep merged_taxon
    out_metadata = {
        'markers': {},
        'taxonomy': {},
        'merged_taxon': in_metadata['merged_taxon']
    }

    # parse markers to keep
    kept_markers = []
    taxons_to_keep = set()
    gca_to_keep = set()
    for m, v in in_metadata['markers'].items():
        if m in markers_to_keep:
            out_metadata['markers'][m] = v
            kept_markers.append(m)
            taxons_to_keep.add(v['taxon'])
            gca_to_keep.update(v['ext'])
    print('%s kept markers' % len(kept_markers))

    # check markers that are not found
    check_not_found_markers(kept_markers, markers_to_keep)

    # keep only taxonomy in taxons_to_keep or with GCA in gca_to_keep
    out_metadata['taxonomy'] = prune_taxonomy(in_metadata['taxonomy'], taxons_to_keep, gca_to_keep)

    # save to JSON
    dump_to_json(out_metadata, out_json_fp)


if __name__ == '__main__':
    # Read command line
    parser = argparse.ArgumentParser(description='Customize MetaPhlan database')
    subparsers = parser.add_subparsers(dest='function')
    # transform_pkl_to_json subcommand
    pkl_to_json_parser = subparsers.add_parser('transform_pkl_to_json', help='Transform Pickle to JSON to get marker metadata')
    pkl_to_json_parser.add_argument('--pkl', help="Path to input Pickle file")
    pkl_to_json_parser.add_argument('--json', help="Path to output JSON file")
    # transform_json_to_pkl subcommand
    json_to_pkl_parser = subparsers.add_parser('transform_json_to_pkl', help='Transform JSON to Pickle to push marker metadata')
    json_to_pkl_parser.add_argument('--json', help="Path to input JSON file")
    json_to_pkl_parser.add_argument('--pkl', help="Path to output Pickle file")
    # add_marker subcommand
    add_marker_parser = subparsers.add_parser('add_marker', help='Add new marker to JSON file')
    add_marker_parser.add_argument('--in_json', help="Path to input JSON file")
    add_marker_parser.add_argument('--out_json', help="Path to output JSON file")
    add_marker_parser.add_argument('--name', help="Name of new marker")
    add_marker_parser.add_argument('--m_length', help="Length of new marker")
    add_marker_parser.add_argument('--g_length', help="Length of genome from which the new marker has been extracted", action="append")
    add_marker_parser.add_argument('--gca', help="GCA of genome from which the new marker has been extracted", action="append")
    add_marker_parser.add_argument('--k_name', help="Name of Kingdom for genome from which the new marker has been extracted", action="append")
    add_marker_parser.add_argument('--k_id', help="NCBI id of Kingdom for genome from which the new marker has been extracted", action="append")
    add_marker_parser.add_argument('--p_name', help="Name of Phylum for genome from which the new marker has been extracted", action="append")
    add_marker_parser.add_argument('--p_id', help="NCBI id of Phylum for genome from which the new marker has been extracted", action="append")
    add_marker_parser.add_argument('--c_name', help="Name of Class for genome from which the new marker has been extracted", action="append")
    add_marker_parser.add_argument('--c_id', help="NCBI id of Class for genome from which the new marker has been extracted", action="append")
    add_marker_parser.add_argument('--o_name', help="Name of Order for genome from which the new marker has been extracted", action="append")
    add_marker_parser.add_argument('--o_id', help="NCBI id of Order for genome from which the new marker has been extracted", action="append")
    add_marker_parser.add_argument('--f_name', help="Name of Family for genome from which the new marker has been extracted", action="append")
    add_marker_parser.add_argument('--f_id', help="NCBI id of Family for genome from which the new marker has been extracted", action="append")
    add_marker_parser.add_argument('--g_name', help="Name of Genus for genome from which the new marker has been extracted", action="append")
    add_marker_parser.add_argument('--g_id', help="NCBI id of Genus for genome from which the new marker has been extracted", action="append")
    add_marker_parser.add_argument('--s_name', help="Name of Species for genome from which the new marker has been extracted", action="append")
    add_marker_parser.add_argument('--s_id', help="NCBI id of Species for genome from which the new marker has been extracted", action="append")
    add_marker_parser.add_argument('--t_name', help="Name of Strain for genome from which the new marker has been extracted", action="append")
    # remove_markers subcommand
    remove_markers_parser = subparsers.add_parser('remove_markers', help='Remove markers from JSON file')
    remove_markers_parser.add_argument('--in_json', help="Path to input JSON file")
    remove_markers_parser.add_argument('--markers', help="Path to file with markers to remove (1 per line)")
    remove_markers_parser.add_argument('--out_json', help="Path to output JSON file")
    remove_markers_parser.add_argument('--kept_markers', help="Path to file with kept markers")
    # keep_markers subcommand
    keep_markers_parser = subparsers.add_parser('keep_markers', help='Keep markers from JSON file, others will be removed')
    keep_markers_parser.add_argument('--in_json', help="Path to input JSON file")
    keep_markers_parser.add_argument('--markers', help="Path to file with markers to keep (1 per line)")
    keep_markers_parser.add_argument('--out_json', help="Path to output JSON file")

    args = parser.parse_args()

    if args.function == 'transform_pkl_to_json':
        transform_pkl_to_json(Path(args.pkl), Path(args.json))
    elif args.function == 'transform_json_to_pkl':
        transform_json_to_pkl(Path(args.json), Path(args.pkl))
    elif args.function == 'add_marker':
        add_marker(
            args.in_json,
            args.out_json,
            args.name,
            args.m_length,
            args.g_length,
            args.gca,
            args.k_name,
            args.k_id,
            args.p_name,
            args.p_id,
            args.c_name,
            args.c_id,
            args.o_name,
            args.o_id,
            args.f_name,
            args.f_id,
            args.g_name,
            args.g_id,
            args.s_name,
            args.s_id,
            args.t_name)
    elif args.function == 'remove_markers':
        remove_markers(args.in_json, args.markers, args.out_json, args.kept_markers)
    elif args.function == 'keep_markers':
        keep_markers(args.in_json, args.markers, args.out_json)
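
# Typical customization workflow built from the subcommands above (a sketch; all file names
# below are placeholders):
#
#   python customizemetadata.py transform_pkl_to_json --pkl mpa_database.pkl --json metadata.json
#   python customizemetadata.py remove_markers --in_json metadata.json \
#       --markers markers_to_remove.txt --out_json metadata_pruned.json --kept_markers kept_markers.txt
#   python customizemetadata.py transform_json_to_pkl --json metadata_pruned.json --pkl mpa_database_custom.pkl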