comparison customizemetadata.py @ 0:f5df500fcc3c draft

"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/metaphlan/ commit 1e543a44ceffd8e4c5537b9015606ab3b90a114c"
author iuc
date Mon, 19 Apr 2021 20:56:20 +0000
parents
children 6dee4abadccb
comparison
equal deleted inserted replaced
-1:000000000000 0:f5df500fcc3c
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 import argparse
5 import bz2
6 import json
7 import pickle
8 import re
9 from pathlib import Path
10
11
12 def load_from_json(json_fp):
13 '''
14 Read JSON file with marker metadata
15
16 :param json_fp: Path to JSON file
17 '''
18 with open(json_fp, 'r') as json_f:
19 data = json.load(json_f)
20
21 for m in data['markers']:
22 data['markers'][m]['ext'] = set(data['markers'][m]['ext'])
23
24 for t in data['taxonomy']:
25 if isinstance(data['taxonomy'][t], list):
26 data['taxonomy'][t] = tuple(data['taxonomy'][t])
27 return data
28
29
30 def dump_to_json(data, json_fp):
31 '''
32 Dump marker metadata to JSON file
33
34 :param json_fp: Path to JSON file
35 '''
36 for m in data['markers']:
37 data['markers'][m]['ext'] = list(data['markers'][m]['ext'])
38
39 with open(json_fp, 'w') as json_f:
40 json.dump(data, json_f)
41
42
43 def transform_pkl_to_json(pkl_fp, json_fp):
44 '''
45 Read Pickle file and drop it to a JSON file
46
47 :param pkl_fp: Path to input Pickle file
48 :param json_fp: Path to output JSON file
49 '''
50 # load metadata from Pickle file
51 with bz2.BZ2File(pkl_fp, 'r') as pkl_f:
52 in_metadata = pickle.load(pkl_f)
53
54 out_metadata = {
55 'markers': in_metadata['markers'],
56 'taxonomy': in_metadata['taxonomy'],
57 'merged_taxon': {}
58 }
59 # transform merged_taxons tuple keys to string
60 for k in in_metadata['merged_taxon']:
61 n = ' , '.join(k)
62 out_metadata[n] = in_metadata['merged_taxon'][k]
63
64 # dump metadata to JSON file
65 dump_to_json(out_metadata, json_fp)
66
67
68 def transform_json_to_pkl(json_fp, pkl_fp):
69 '''
70 Read JSON file and drop it to a Pickle file
71
72 :param json_fp: Path to input JSON file
73 :param pkl_fp: Path to output Pickle file
74 '''
75 # load metadata from JSON file
76 in_metadata = load_from_json(json_fp)
77
78 out_metadata = {
79 'markers': in_metadata['markers'],
80 'taxonomy': in_metadata['taxonomy'],
81 'merged_taxon': {}
82 }
83 # transform merged_taxons keys to tuple
84 for k in in_metadata['merged_taxon']:
85 n = ' , '.split(k)
86 out_metadata[n] = in_metadata['merged_taxon'][k]
87
88 # dump metadata to Pickle file
89 with bz2.BZ2File(pkl_fp, 'w') as pkl_f:
90 pickle.dump(out_metadata, pkl_f)
91
92
93 def add_marker(in_json_fp, out_json_fp, name, m_length, g_length, gca, k_name, k_id, p_name, p_id, c_name, c_id, o_name, o_id, f_name, f_id, g_name, g_id, s_name, s_id, t_name):
94 '''
95 Add marker to JSON file
96
97 :param in_json_fp: Path to input JSON file
98 :param out_json_fp: Path to output JSON file
99 :param name: Name of new marker
100 :param m_length: Length of new marker
101 :param g_length: List with lengths of genomes from which the new marker has been extracted
102 :param gca: List with GCA of genomes from which the new marker has been extracted
103 :param k_name: List with Name of Kingdom for genomes from which the new marker has been extracted
104 :param k_id: List with NCBI id of Kingdom for genomes from which the new marker has been extracted
105 :param p_name: List with Name of Phylum for genomes from which the new marker has been extracted
106 :param p_id: List with NCBI id of Phylum for genomes from which the new marker has been extracted
107 :param c_name: List with Name of Class for genomes from which the new marker has been extracted
108 :param c_id: List with NCBI id of Class for genomes from which the new marker has been extracted
109 :param o_name: List with Name of Order for genomes from which the new marker has been extracted
110 :param o_id: List with NCBI id of Order for genomes from which the new marker has been extracted
111 :param f_name: List with Name of Family for genomes from which the new marker has been extracted
112 :param f_id: List with NCBI id of Family for genomes from which the new marker has been extracted
113 :param g_name: List with Name of Genus for genomes from which the new marker has been extracted
114 :param g_id: List with NCBI id of Genus for genomes from which the new marker has been extracted
115 :param s_name: List with Name of Species for genomes from which the new marker has been extracted
116 :param s_id: List with NCBI id of Species for genomes from which the new marker has been extracted
117 :param t_name: List with Name of Strain for genomes from which the new marker has been extracted
118 '''
119 metadata = load_from_json(in_json_fp)
120
121 # check that all lists have same size
122 genome_n = len(g_length)
123 if len(gca) != genome_n:
124 raise ValueError("Missing/Extra values in GCA list")
125 if len(k_name) != genome_n:
126 raise ValueError("Missing/Extra values in Kingdom name list")
127 if len(k_id) != genome_n:
128 raise ValueError("Missing/Extra values in Kingdom ID list")
129 if len(p_name) != genome_n:
130 raise ValueError("Missing/Extra values in Phylum name list")
131 if len(p_id) != genome_n:
132 raise ValueError("Missing/Extra values in Phylum ID list")
133 if len(c_name) != genome_n:
134 raise ValueError("Missing/Extra values in Class name list")
135 if len(c_id) != genome_n:
136 raise ValueError("Missing/Extra values in Class ID list")
137 if len(o_name) != genome_n:
138 raise ValueError("Missing/Extra values in Order name list")
139 if len(o_id) != genome_n:
140 raise ValueError("Missing/Extra values in Order ID list")
141 if len(f_name) != genome_n:
142 raise ValueError("Missing/Extra values in Family name list")
143 if len(f_id) != genome_n:
144 raise ValueError("Missing/Extra values in Family ID list")
145 if len(g_name) != genome_n:
146 raise ValueError("Missing/Extra values in Genus name list")
147 if len(g_id) != genome_n:
148 raise ValueError("Missing/Extra values in Genus ID list")
149 if len(s_name) != genome_n:
150 raise ValueError("Missing/Extra values in Species name list")
151 if len(s_id) != genome_n:
152 raise ValueError("Missing/Extra values in Species ID list")
153 if len(t_name) != genome_n:
154 raise ValueError("Missing/Extra values in Strain name list")
155
156 # create dictionary to aggregate genome taxonomies and identify marker taxonomy
157 taxonomy = {
158 'k': set(),
159 'p': set(),
160 'c': set(),
161 'o': set(),
162 'f': set(),
163 'g': set(),
164 's': set(),
165 't': set(),
166 }
167
168 # parse genomes
169 for i in range(genome_n):
170 # add taxonomy of new genome
171 g_taxo_names = "k__%s|p__%s|c__%s|o__%s|f__%s|g__%s|s__%s|t__%s" % (
172 k_name[i],
173 p_name[i],
174 c_name[i],
175 o_name[i],
176 f_name[i],
177 g_name[i],
178 s_name[i],
179 t_name[i]
180 )
181 g_taxo_ids = "%s|%s|%s|%s|%s|%s|%s" % (
182 k_id[i],
183 p_id[i],
184 c_id[i],
185 o_id[i],
186 f_id[i],
187 g_id[i],
188 s_id[i]
189 )
190 metadata['taxonomy'][g_taxo_names] = (g_taxo_ids, g_length[i])
191 # aggregate taxon levels using sets
192 taxonomy['k'].add(k_name[i])
193 taxonomy['p'].add(p_name[i])
194 taxonomy['c'].add(c_name[i])
195 taxonomy['o'].add(o_name[i])
196 taxonomy['f'].add(f_name[i])
197 taxonomy['g'].add(g_name[i])
198 taxonomy['s'].add(s_name[i])
199 taxonomy['t'].add(t_name[i])
200
201 # extract clade and taxon of marker
202 clade = '' # last level before taxomy of genomes diverge
203 taxon = '' # combination of levels before divergence
204 for level in ['k', 'p', 'c', 'o', 'f', 'g', 's', 't']:
205 taxo = list(taxonomy[level])
206 if len(taxo) == 1:
207 clade = taxo[0]
208 taxon = "%s|%s__%s" % (taxon, level, taxo)
209
210 # add information about the new marker
211 metadata['markers'][name] = {
212 'clade': clade,
213 'ext': set(gca),
214 'len': m_length,
215 'taxon': taxon
216 }
217
218 dump_to_json(metadata, out_json_fp)
219
220
221 def format_markers(marker_l):
222 '''
223 Format markers
224
225 :param marker_l: list of markers
226 '''
227 markers = []
228 for m in marker_l:
229 m = m.rstrip()
230 if ' ' in m:
231 markers.append(m.split(' ')[0])
232 else:
233 markers.append(m)
234 return markers
235
236
237 def get_markers(marker_fp):
238 '''
239 Get markers from a file
240
241 :param marker_fp: Path to file with markers (1 per line)
242 '''
243 # load markers
244 with open(marker_fp, 'r') as marker_f:
245 markers = marker_f.readlines()
246
247 # format markers
248 markers = format_markers(markers)
249
250 return markers
251
252
253 def check_not_found_markers(found_markers, original_markers):
254 '''
255 Check list of markers
256
257 :param found_markers: list of found markers
258 :param original_markers: list of original markers
259 '''
260 if len(found_markers) != len(original_markers):
261 print('markers not found:')
262 for m in original_markers:
263 if m not in found_markers:
264 print('- "%s"' % m)
265
266
267 def prune_taxonomy(in_taxonomy, taxon_s, gca_s):
268 '''
269 Prune taxonomy to keep only listed taxonomy
270
271 :param in_taxonomy: dictionary with list of taxonomy
272 :param taxon_s: set of taxons to keep
273 :param gca_s: set of GCA ids to keep
274 '''
275 out_taxonomy = {}
276 kept_taxonomy = set()
277 kept_taxons = set()
278 kept_gca = set()
279 for t, v in in_taxonomy.items():
280 # check if t match element in list of taxon_s
281 kept_taxon = False
282 for t_k in taxon_s:
283 if t_k in t:
284 kept_taxon = True
285 out_taxonomy[t] = v
286 kept_taxonomy.add(t)
287 kept_taxons.add(t_k)
288 break
289 # check if GCA in the taxon id
290 s = re.search(r'GCA_\d+$', t)
291 if s:
292 gca = s[0]
293 # check if GCA in taxon id is in the list GCA to keep
294 if gca in gca_s:
295 kept_gca.add(gca)
296 if not kept_taxon:
297 out_taxonomy[t] = v
298 kept_taxonomy.add(t)
299
300 print('%s kept taxonomy' % len(kept_taxonomy))
301 print('%s / %s taxons not found' % (len(taxon_s) - len(kept_taxons), len(taxon_s)))
302 print('%s / %s GCA taxons not found' % (len(gca_s) - len(kept_gca), len(gca_s)))
303 return out_taxonomy
304
305
306 def remove_markers(in_json_fp, marker_fp, out_json_fp, kept_marker_fp):
307 '''
308 Remove markers from JSON file
309
310 :param in_json_fp: Path to input JSON file
311 :param marker_fp: Path to file with markers to remove (1 per line)
312 :param out_json_fp: Path to output JSON file
313 :param kept_marker_fp: Path to file with kept markers
314 '''
315 in_metadata = load_from_json(in_json_fp)
316
317 # load markers
318 markers_to_remove = set(get_markers(marker_fp))
319 print('%s markers to remove' % len(markers_to_remove))
320
321 # keep merged_taxon
322 out_metadata = {
323 'markers': {},
324 'taxonomy': {},
325 'merged_taxon': in_metadata['merged_taxon']
326 }
327
328 # parse markers to keep
329 removed_markers = []
330 kept_markers = []
331 taxons_to_keep = set()
332 gca_to_keep = set()
333 for m, v in in_metadata['markers'].items():
334 if m not in markers_to_remove:
335 out_metadata['markers'][m] = v
336 kept_markers.append(m)
337 taxons_to_keep.add(v['taxon'])
338 gca_to_keep.update(v['ext'])
339 else:
340 removed_markers.append(m)
341 print('%s removed markers' % len(removed_markers))
342
343 # check markers that are not found
344 check_not_found_markers(removed_markers, markers_to_remove)
345
346 # keep only taxonomy in taxons_to_keep or with GCA in gca_to_keep
347 out_metadata['taxonomy'] = prune_taxonomy(in_metadata['taxonomy'], taxons_to_keep, gca_to_keep)
348
349 # save to JSON
350 dump_to_json(out_metadata, out_json_fp)
351
352 # write list of kept markers
353 with open(kept_marker_fp, 'w') as kept_marker_f:
354 for m in kept_markers:
355 kept_marker_f.write("%s\n" % m)
356
357
358 def keep_markers(in_json_fp, marker_fp, out_json_fp):
359 '''
360 Keep markers from JSON file, others will be removed
361
362 :param in_json_fp: Path to input JSON file
363 :param marker_fp: Path to file with markers to keep (1 per line)
364 :param out_json_fp: Path to output JSON file
365 '''
366 in_metadata = load_from_json(in_json_fp)
367
368 # load markers
369 markers_to_keep = set(get_markers(marker_fp))
370 print('%s markers to keep' % len(markers_to_keep))
371
372 # keep merged_taxon
373 out_metadata = {
374 'markers': {},
375 'taxonomy': {},
376 'merged_taxon': in_metadata['merged_taxon']
377 }
378
379 # parse markers to keep
380 kept_markers = []
381 taxons_to_keep = set()
382 gca_to_keep = set()
383 for m, v in in_metadata['markers'].items():
384 if m in markers_to_keep:
385 out_metadata['markers'][m] = v
386 kept_markers.append(m)
387 taxons_to_keep.add(v['taxon'])
388 gca_to_keep.update(v['ext'])
389 print('%s kept markers' % len(kept_markers))
390
391 # check markers that are not found
392 check_not_found_markers(kept_markers, markers_to_keep)
393
394 # keep only taxonomy in taxons_to_keep or with GCA in gca_to_keep
395 out_metadata['taxonomy'] = prune_taxonomy(in_metadata['taxonomy'], taxons_to_keep, gca_to_keep)
396
397 # save to JSON
398 dump_to_json(out_metadata, out_json_fp)
399
400
401 if __name__ == '__main__':
402 # Read command line
403 parser = argparse.ArgumentParser(description='Customize MetaPhlan database')
404 subparsers = parser.add_subparsers(dest='function')
405 # transform_pkl_to_json subcommand
406 pkl_to_json_parser = subparsers.add_parser('transform_pkl_to_json', help='Transform Pickle to JSON to get marker metadata')
407 pkl_to_json_parser.add_argument('--pkl', help="Path to input Pickle file")
408 pkl_to_json_parser.add_argument('--json', help="Path to output JSON file")
409 # transform_json_to_pkl subcommand
410 json_to_pkl_parser = subparsers.add_parser('transform_json_to_pkl', help='Transform JSON to Pickle to push marker metadata')
411 json_to_pkl_parser.add_argument('--json', help="Path to input JSON file")
412 json_to_pkl_parser.add_argument('--pkl', help="Path to output Pickle file")
413 # add_marker subcommand
414 add_marker_parser = subparsers.add_parser('add_marker', help='Add new marker to JSON file')
415 add_marker_parser.add_argument('--in_json', help="Path to input JSON file")
416 add_marker_parser.add_argument('--out_json', help="Path to output JSON file")
417 add_marker_parser.add_argument('--name', help="Name of new marker")
418 add_marker_parser.add_argument('--m_length', help="Length of new marker")
419 add_marker_parser.add_argument('--g_length', help="Length of genome from which the new marker has been extracted", action="append")
420 add_marker_parser.add_argument('--gca', help="GCA of genome from which the new marker has been extracted", action="append")
421 add_marker_parser.add_argument('--k_name', help="Name of Kingdom for genome from which the new marker has been extracted", action="append")
422 add_marker_parser.add_argument('--k_id', help="NCBI id of Kingdom for genome from which the new marker has been extracted", action="append")
423 add_marker_parser.add_argument('--p_name', help="Name of Phylum for genome from which the new marker has been extracted", action="append")
424 add_marker_parser.add_argument('--p_id', help="NCBI id of Phylum for genome from which the new marker has been extracted", action="append")
425 add_marker_parser.add_argument('--c_name', help="Name of Class for genome from which the new marker has been extracted", action="append")
426 add_marker_parser.add_argument('--c_id', help="NCBI id of Class for genome from which the new marker has been extracted", action="append")
427 add_marker_parser.add_argument('--o_name', help="Name of Order for genome from which the new marker has been extracted", action="append")
428 add_marker_parser.add_argument('--o_id', help="NCBI id of Order for genome from which the new marker has been extracted", action="append")
429 add_marker_parser.add_argument('--f_name', help="Name of Family for genome from which the new marker has been extracted", action="append")
430 add_marker_parser.add_argument('--f_id', help="NCBI id of Family for genome from which the new marker has been extracted", action="append")
431 add_marker_parser.add_argument('--g_name', help="Name of Genus for genome from which the new marker has been extracted", action="append")
432 add_marker_parser.add_argument('--g_id', help="NCBI id of Genus for genome from which the new marker has been extracted", action="append")
433 add_marker_parser.add_argument('--s_name', help="Name of Species for genome from which the new marker has been extracted", action="append")
434 add_marker_parser.add_argument('--s_id', help="NCBI id of Species for genome from which the new marker has been extracted", action="append")
435 add_marker_parser.add_argument('--t_name', help="Name of Strain for genome from which the new marker has been extracted", action="append")
436 # remove_markers subcommand
437 remove_markers_parser = subparsers.add_parser('remove_markers', help='Remove markers from JSON file')
438 remove_markers_parser.add_argument('--in_json', help="Path to input JSON file")
439 remove_markers_parser.add_argument('--markers', help="Path to file with markers to remove (1 per line)")
440 remove_markers_parser.add_argument('--out_json', help="Path to output JSON file")
441 remove_markers_parser.add_argument('--kept_markers', help="Path to file with kept markers")
442 # keep_markers subcommand
443 keep_markers_parser = subparsers.add_parser('keep_markers', help='Keep markers from JSON file, others will be removed')
444 keep_markers_parser.add_argument('--in_json', help="Path to input JSON file")
445 keep_markers_parser.add_argument('--markers', help="Path to file with markers to keep (1 per line)")
446 keep_markers_parser.add_argument('--out_json', help="Path to output JSON file")
447
448 args = parser.parse_args()
449
450 if args.function == 'transform_pkl_to_json':
451 transform_pkl_to_json(Path(args.pkl), Path(args.json))
452 elif args.function == 'transform_json_to_pkl':
453 transform_json_to_pkl(Path(args.json), Path(args.pkl))
454 elif args.function == 'add_marker':
455 add_marker(
456 args.in_json,
457 args.out_json,
458 args.name,
459 args.m_length,
460 args.g_length,
461 args.gca,
462 args.k_name,
463 args.k_id,
464 args.p_name,
465 args.p_id,
466 args.c_name,
467 args.c_id,
468 args.o_name,
469 args.o_id,
470 args.f_name,
471 args.f_id,
472 args.g_name,
473 args.g_id,
474 args.s_name,
475 args.s_id,
476 args.t_name)
477 elif args.function == 'remove_markers':
478 remove_markers(args.in_json, args.markers, args.out_json, args.kept_markers)
479 elif args.function == 'keep_markers':
480 keep_markers(args.in_json, args.markers, args.out_json)