comparison resfinder/run_resfinder.py @ 0:55051a9bc58d draft default tip

Uploaded
author dcouvin
date Mon, 10 Jan 2022 20:06:07 +0000
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:55051a9bc58d
1 #!/usr/bin/env python3
2 import sys
3 import os
4 import subprocess
5 from argparse import ArgumentParser
6 import pickle
7
8 from cge.resfinder import ResFinder
9 from cge.pointfinder import PointFinder
10
11 # Modules used to create the extended ResFinder output (phenotype output)
12 from cge.phenotype2genotype.isolate import Isolate
13 from cge.phenotype2genotype.res_profile import PhenoDB
14 from cge.phenotype2genotype.res_sumtable import ResSumTable
15 from cge.phenotype2genotype.res_sumtable import PanelNameError
16 from cge.out.util.generator import Generator
17 from cge.standardize_results import ResFinderResultHandler, DatabaseHandler
18 from cge.standardize_results import PointFinderResultHandler
19
20 import json
21 # TODO list:
22 # TODO: Add input data check
23
24
25 # ########################################################################### #
26 # ######### FUNCTIONS ######### #
27 # ########################################################################### #
28
29
30 def eprint(*args, **kwargs):
31 print(*args, file=sys.stderr, **kwargs)
32
33 # TODO: Add fix species choice
34 species_transl = {"c. jejuni": "campylobacter jejuni",
35 "c.jejuni": "campylobacter jejuni",
36 "c jejuni": "campylobacter jejuni",
37 "c. coli": "campylobacter coli",
38 "c.coli": "campylobacter coli",
39 "c coli": "campylobacter coli",
40 "e. coli": "escherichia coli",
41 "e.coli": "escherichia coli",
42 "e coli": "escherichia coli",
43 "ecoli": "escherichia coli",
44 "s. enterica": "salmonella enterica",
45 "s.enterica": "salmonella enterica",
46 "s enterica": "salmonella enterica",
47 "senterica": "salmonella enterica",
48 }
49
50 ##########################################################################
51 # PARSE COMMAND LINE OPTIONS
52 ##########################################################################
53
54 parser = ArgumentParser(allow_abbrev=False)
55
56 # General options
57 parser.add_argument("-ifa", "--inputfasta",
58 help="Input fasta file.",
59 default=None)
60 parser.add_argument("-ifq", "--inputfastq",
61 help="Input fastq file(s). Assumed to be single-end fastq \
62 if only one file is provided, and assumed to be \
63 paired-end data if two files are provided.",
64 nargs="+",
65 default=None)
66
67 parser.add_argument("-o", "--outputPath",
68 dest="out_path",
69 help="Path to blast output",
70 default='')
71 parser.add_argument("-b", "--blastPath",
72 dest="blast_path",
73 help="Path to blastn",
74 default='blastn')
75 parser.add_argument("-k", "--kmaPath",
76 dest="kma_path",
77 help="Path to KMA",
78 default=None)
79 parser.add_argument("-s", "--species",
80 help="Species in the sample",
81 default=None)
82
83 # Acquired resistance options
84 parser.add_argument("-db_res", "--db_path_res",
85 help="Path to the databases for ResFinder",
86 default=None)
87 parser.add_argument("-db_res_kma", "--db_path_res_kma",
88 help="Path to the ResFinder databases indexed with KMA. \
89 Defaults to the 'kma_indexing' directory inside the \
90 given database directory.",
91 default=None)
92 parser.add_argument("-d", "--databases",
93 dest="databases",
94 help="Databases chosen to search in - if none is specified\
95 all is used",
96 default=None)
97 parser.add_argument("-acq", "--acquired",
98 action="store_true",
99 dest="acquired",
100 help="Run resfinder for acquired resistance genes",
101 default=False)
102 parser.add_argument("-ao", "--acq_overlap",
103 help="Genes are allowed to overlap this number of\
104 nucleotides. Default: 30.",
105 type=int,
106 default=30)
107 parser.add_argument("-l", "--min_cov",
108 dest="min_cov",
109 help="Minimum (breadth-of) coverage of ResFinder within the range 0-1.",
110 type=float,
111 default=0.60)
112 parser.add_argument("-t", "--threshold",
113 dest="threshold",
114 help="Threshold for identity of ResFinder within the range 0-1.",
115 type=float,
116 default=0.80)
117 parser.add_argument("-nano", "--nanopore",
118 action="store_true",
119 dest="nanopore",
120 help="If nanopore data is used",
121 default=False)
122 # Point resistance option
123 parser.add_argument("-c", "--point",
124 action="store_true",
125 dest="point",
126 help="Run pointfinder for chromosomal mutations",
127 default=False)
128 parser.add_argument("-db_point", "--db_path_point",
129 help="Path to the databases for PointFinder",
130 default=None)
131 parser.add_argument("-db_point_kma", "--db_path_point_kma",
132 help="Path to the PointFinder databases indexed with KMA. \
133 Defaults to the 'kma_indexing' directory inside the \
134 given database directory.",
135 default=None)
136 parser.add_argument("-g",
137 dest="specific_gene",
138 nargs='+',
139 help="Specify genes existing in the database to \
140 search for - if none is specified all genes are \
141 included in the search.",
142 default=None)
143 parser.add_argument("-u", "--unknown_mut",
144 dest="unknown_mutations",
145 action="store_true",
146 help="Show all mutations found even if in unknown to the\
147 resistance database",
148 default=False)
149 parser.add_argument("-l_p", "--min_cov_point",
150 dest="min_cov_point",
151 help="Minimum (breadth-of) coverage of Pointfinder within the range 0-1. \
152 If None is selected, the minimum coverage of \
153 ResFinder will be used.",
154 type=float,
155 default=None)
156 parser.add_argument("-t_p", "--threshold_point",
157 dest="threshold_point",
158 help="Threshold for identity of Pointfinder within the range 0-1. \
159 If None is selected, the minimum coverage of \
160 ResFinder will be used.",
161 type=float,
162 default=None)
163 # Temporary option only available temporary
164 parser.add_argument("--pickle",
165 action="store_true",
166 help="Create a pickle dump of the Isolate object. \
167 Currently needed in the CGE webserver. Dependency \
168 and this option is being removed.",
169 default=False)
170
171 args = parser.parse_args()
172
173 if(args.species is not None and args.species.lower() == "other"):
174 args.species = None
175
176 if(args.point and not args.species):
177 sys.exit("ERROR: Chromosomal point mutations cannot be located if no "
178 "species has been provided. Please provide species using the "
179 "--species option.")
180
181 # Check if coverage/identity parameters are valid
182 if(args.min_cov > 1.0 or args.min_cov < 0.0):
183 sys.exit("ERROR: Minimum coverage above 1 or below 0 is not allowed. Please select a minimum coverage within the range 0-1 with the flag -l.")
184
185 if(args.threshold > 1.0 or args.threshold < 0.0):
186 sys.exit("ERROR: Threshold for identity of ResFinder above 1 or below 0 is not allowed. Please select a threshold for identity within the range 0-1 with the flag -t.")
187
188
189 # Create a "sample" name
190 if(args.inputfasta):
191 args.inputfasta = os.path.abspath(args.inputfasta)
192 if(not os.path.isfile(args.inputfasta)):
193 sys.exit("ERROR: Input FASTA file not found: " + args.inputfasta)
194 sample_name = os.path.basename(args.inputfasta)
195 method = PointFinder.TYPE_BLAST
196 else:
197 sample_name = os.path.basename(args.inputfastq[0])
198 method = PointFinder.TYPE_KMA
199
200 if(args.inputfastq):
201 inputfastq_1 = args.inputfastq[0]
202 inputfastq_1 = os.path.abspath(inputfastq_1)
203 if(not os.path.isfile(inputfastq_1)):
204 sys.exit("ERROR: Input fastq file 1 not found: " + inputfastq_1)
205 if(len(args.inputfastq) == 2):
206 inputfastq_2 = args.inputfastq[1]
207 inputfastq_2 = os.path.abspath(inputfastq_2)
208 if(not os.path.isfile(inputfastq_2)):
209 sys.exit("ERROR: Input fastq file 2 not found: " + inputfastq_2)
210 else:
211 inputfastq_2 = None
212
213 blast = args.blast_path
214 if(args.inputfasta):
215 try:
216 _ = subprocess.check_output([blast, "-h"])
217 except FileNotFoundError as e:
218 sys.exit("ERROR: Unable to execute blastn from the path: {}"
219 .format(blast))
220
221 # Check KMA path cge/kma/kma
222 if(args.inputfastq):
223 if(args.kma_path is None):
224 kma = (os.path.dirname(os.path.realpath(__file__)) + "/cge/kma/kma")
225 kma = os.path.abspath(kma)
226 try:
227 _ = subprocess.check_output([kma, "-h"])
228 except FileNotFoundError as e:
229 kma = "kma"
230 else:
231 kma = args.kma_path
232 try:
233 _ = subprocess.check_output([kma, "-h"])
234 except FileNotFoundError as e:
235 sys.exit("ERROR: Unable to execute kma from the path: {}".format(kma))
236 else:
237 kma = None
238
239 db_path_point = None
240
241 if(args.species):
242 args.species = args.species.lower()
243
244 fixed_species = species_transl.get(args.species, None)
245 if(fixed_species):
246 args.species = fixed_species
247
248 tmp_list = args.species.split()
249 if(len(tmp_list) != 1 and len(tmp_list) != 2):
250 sys.exit("ERROR: Species name must contain 1 or 2 names.")
251
252 # Check Poinfinder database
253 if(args.point):
254 if(len(tmp_list) == 2):
255 point_species = "_".join(tmp_list)
256 else:
257 point_species = tmp_list[0]
258
259 if(args.db_path_point is None and args.point):
260 db_path_point = (os.path.dirname(
261 os.path.realpath(__file__)) + "/db_pointfinder")
262 elif(args.db_path_point is not None):
263 db_path_point = args.db_path_point
264
265 db_path_point = os.path.abspath(db_path_point)
266 point_dbs = PointFinder.get_db_names(db_path_point)
267
268 # Check if a database for species exists
269 if(point_species not in point_dbs and args.point):
270 # If not db for species is found check if db for genus is found
271 # and use that instead
272 if(tmp_list[0] in point_dbs):
273 point_species = tmp_list[0]
274 else:
275 sys.exit("ERROR: species '%s' (%s) does not seem to exist"
276 " as a PointFinder database."
277 % (args.species, point_species))
278
279 db_path_point_root = db_path_point
280 db_path_point = db_path_point + "/" + point_species
281
282 # Check output directory
283 args.out_path = os.path.abspath(args.out_path)
284 os.makedirs(args.out_path, exist_ok=True)
285
286 if args.acquired is False and args.point is False:
287 sys.exit("Please specify to look for acquired resistance genes, "
288 "chromosomal mutaitons or both!\n")
289
290 # Check ResFinder database
291 if(args.db_path_res is None):
292 args.db_path_res = (os.path.dirname(
293 os.path.realpath(__file__)) + "/db_resfinder")
294 args.db_path_res = os.path.abspath(args.db_path_res)
295 if(not os.path.exists(args.db_path_res)):
296 sys.exit("Could not locate ResFinder database path: %s"
297 % args.db_path_res)
298
299 if(args.db_path_res_kma is None and args.acquired and args.inputfastq):
300 args.db_path_res_kma = args.db_path_res
301 if(not os.path.exists(args.db_path_res_kma)):
302 sys.exit("Could not locate ResFinder database index path: %s"
303 % args.db_path_res_kma)
304
305 min_cov = float(args.min_cov)
306
307 # Initialise result dict
308 init_software_result = {"software_name": "ResFinder"}
309 git_path = os.path.abspath(os.path.dirname(__file__))
310 std_result = Generator.init_software_result(name="ResFinder", gitdir=git_path)
311
312 if(args.acquired):
313 DatabaseHandler.load_database_metadata("ResFinder", std_result,
314 args.db_path_res)
315 if(args.point):
316 DatabaseHandler.load_database_metadata("PointFinder", std_result,
317 db_path_point_root)
318 ##########################################################################
319 # ResFinder
320 ##########################################################################
321
322 if args.acquired is True:
323
324 databases = args.databases
325 threshold = float(args.threshold)
326
327 if(args.inputfasta):
328 out_res_blast = args.out_path + "/resfinder_blast"
329 os.makedirs(out_res_blast, exist_ok=True)
330 if(args.inputfastq):
331 out_res_kma = args.out_path + "/resfinder_kma"
332 os.makedirs(out_res_kma, exist_ok=True)
333
334 db_path_res = args.db_path_res
335
336 # Check if valid database is provided
337 if(db_path_res is None):
338 db_path_res = (os.path.dirname(os.path.realpath(__file__))
339 + "/db_resfinder")
340
341 if not os.path.exists(db_path_res):
342 sys.exit("Input Error: The specified database directory does not "
343 "exist!\nProvided path: " + str(db_path_res))
344 else:
345 # Check existence of config file
346 db_config_file = '%s/config' % (db_path_res)
347 if not os.path.exists(db_config_file):
348 sys.exit("Input Error: The database config file could not be "
349 "found!")
350
351 # Check existence of notes file
352 notes_path = "%s/notes.txt" % (db_path_res)
353 if not os.path.exists(notes_path):
354 sys.exit('Input Error: notes.txt not found! (%s)' % (notes_path))
355
356 blast_results = None
357 kma_results = None
358
359 # Actually running ResFinder (for acquired resistance)
360 acquired_finder = ResFinder(db_conf_file=db_config_file,
361 databases=args.databases, db_path=db_path_res,
362 notes=notes_path,
363 db_path_kma=args.db_path_res_kma)
364
365 if(args.inputfasta):
366 blast_results = acquired_finder.blast(inputfile=args.inputfasta,
367 out_path=out_res_blast,
368 min_cov=min_cov,
369 threshold=threshold,
370 blast=blast,
371 allowed_overlap=args.acq_overlap)
372
373 # DEPRECATED
374 # use std_result
375 new_std_res = ResFinder.old_results_to_standard_output(
376 blast_results.results, software="ResFinder", version="4.1.0",
377 run_date="fake_run_date", run_cmd="Fake run cmd",
378 id=sample_name)
379
380 # DEPRECATED
381 # use std_result
382 acquired_finder.write_results(out_path=args.out_path,
383 result=blast_results,
384 res_type=ResFinder.TYPE_BLAST)
385
386 ResFinderResultHandler.standardize_results(std_result,
387 blast_results.results,
388 "ResFinder")
389 #DEBUG
390 # print("STD RESULT:\n{}".format(json.dumps(std_result)))
391
392 else:
393 if args.nanopore:
394 kma_run = acquired_finder.kma(inputfile_1=inputfastq_1,
395 inputfile_2=inputfastq_2,
396 out_path=out_res_kma,
397 db_path_kma=args.db_path_res_kma,
398 databases=acquired_finder.databases,
399 min_cov=min_cov,
400 threshold=args.threshold,
401 kma_path=kma,
402 sample_name="",
403 kma_cge=True,
404 kma_apm="p",
405 kma_1t1=True,
406 kma_add_args='-ont -md 5')
407 else:
408 kma_run = acquired_finder.kma(inputfile_1=inputfastq_1,
409 inputfile_2=inputfastq_2,
410 out_path=out_res_kma,
411 db_path_kma=args.db_path_res_kma,
412 databases=acquired_finder.databases,
413 min_cov=min_cov,
414 threshold=args.threshold,
415 kma_path=kma,
416 sample_name="",
417 kma_cge=True,
418 kma_apm="p",
419 kma_1t1=True)
420
421 # DEPRECATED
422 # use std_result
423 new_std_res = ResFinder.old_results_to_standard_output(
424 kma_run.results, software="ResFinder", version="4.1.0",
425 run_date="fake_run_date", run_cmd="Fake run cmd",
426 id=sample_name)
427
428 # DEPRECATED
429 # use std_result
430 acquired_finder.write_results(out_path=args.out_path,
431 result=kma_run.results,
432 res_type=ResFinder.TYPE_KMA)
433
434 ResFinderResultHandler.standardize_results(std_result,
435 kma_run.results,
436 "ResFinder")
437 #DEBUG
438 # print("STD RESULT:\n{}".format(json.dumps(std_result)))
439
440 ##########################################################################
441 # PointFinder
442 ##########################################################################
443
444 if args.point is True and args.species:
445
446 if(args.inputfasta):
447 out_point = os.path.abspath(args.out_path + "/pointfinder_blast")
448 os.makedirs(out_point, exist_ok=True)
449 if(args.inputfastq):
450 out_point = os.path.abspath(args.out_path + "/pointfinder_kma")
451 os.makedirs(out_point, exist_ok=True)
452
453 if args.min_cov_point is None:
454 min_cov_point = args.min_cov
455 else:
456 min_cov_point = args.min_cov_point
457 if(args.min_cov_point > 1.0 or args.min_cov_point < 0.0):
458 sys.exit("ERROR: Minimum coverage above 1 or below 0 is not allowed. Please select a minimum coverage within the range 0-1 with the flag -l_p.")
459
460 if args.threshold_point is None:
461 threshold_point = args.threshold
462 else:
463 threshold_point = args.threshold_point
464 if(args.threshold_point > 1.0 or args.threshold_point < 0.0):
465 sys.exit("ERROR: Threshold for identity of PointFinder above 1 or below 0 is not allowed. Please select a threshold for identity within the range 0-1 with the flag -t_p.")
466
467 finder = PointFinder(db_path=db_path_point, species=point_species,
468 gene_list=args.specific_gene)
469
470 if(args.inputfasta):
471 blast_run = finder.blast(inputfile=args.inputfasta,
472 out_path=out_point,
473 # min_cov=min_cov_point,
474 min_cov=0.01,
475 threshold=threshold_point,
476 blast=blast,
477 cut_off=False)
478 results = blast_run.results
479
480 else:
481
482 method = PointFinder.TYPE_KMA
483
484 kma_run = finder.kma(inputfile_1=inputfastq_1,
485 inputfile_2=inputfastq_2,
486 out_path=out_point,
487 db_path_kma=db_path_point,
488 databases=[point_species],
489 min_cov=0.01,
490 # min_cov=min_cov_point,
491 threshold=threshold_point,
492 kma_path=kma,
493 sample_name="",
494 kma_cge=True,
495 kma_apm="p",
496 kma_1t1=True)
497
498 results = kma_run.results
499
500 if(args.specific_gene):
501 results = PointFinder.discard_unwanted_results(
502 results=results, wanted=args.specific_gene)
503
504 if(method == PointFinder.TYPE_BLAST):
505 results_pnt = finder.find_best_seqs(results, min_cov)
506 else:
507 results_pnt = results[finder.species]
508 if(results_pnt == "No hit found"):
509 results_pnt = {}
510 else:
511 results_pnt["excluded"] = results["excluded"]
512
513 # DEPRECATED
514 # use std_result
515 new_std_pnt = finder.old_results_to_standard_output(
516 result=results_pnt, software="ResFinder", version="4.1.0",
517 run_date="fake_run_date", run_cmd="Fake run cmd",
518 id=sample_name)
519
520 # DEPRECATED
521 # use std_result
522 finder.write_results(out_path=args.out_path, result=results,
523 res_type=method, unknown_flag=args.unknown_mutations,
524 min_cov=min_cov_point, perc_iden=threshold_point)
525
526 #DEBUG
527 # print("POINT RES:\n{}".format(json.dumps(results_pnt)))
528
529 PointFinderResultHandler.standardize_results(std_result,
530 results_pnt,
531 "PointFinder")
532
533 #DEBUG
534 # print("STD RESULT:\n{}".format(json.dumps(std_result)))
535 ##########################################################################
536 # Phenotype to genotype
537 ##########################################################################
538
539 # Load genotype to phenotype database
540 if(db_path_point):
541 point_file = db_path_point + "/resistens-overview.txt"
542 else:
543 point_file = None
544
545 res_pheno_db = PhenoDB(
546 abclassdef_file=(args.db_path_res + "/antibiotic_classes.txt"),
547 acquired_file=args.db_path_res + "/phenotypes.txt", point_file=point_file)
548 # Isolate object store results
549 isolate = Isolate(name=sample_name)
550 if(args.acquired):
551 isolate.load_finder_results(std_table=std_result,
552 phenodb=res_pheno_db,
553 type="genes")
554 # isolate.load_finder_results(std_table=std_result,
555 # phenodb=res_pheno_db)
556 # isolate.load_finder_results(std_table=new_std_res,
557 # phenodb=res_pheno_db)
558 if(args.point):
559 isolate.load_finder_results(std_table=std_result,
560 phenodb=res_pheno_db,
561 type="seq_variations")
562 # isolate.load_finder_results_old(std_table=new_std_pnt,
563 # phenodb=res_pheno_db)
564 # isolate.load_pointfinder_tab(args.out_path + "/PointFinder_results.txt",
565 # res_pheno_db)
566 isolate.calc_res_profile(res_pheno_db)
567 if(args.acquired):
568 ResFinderResultHandler.load_res_profile(std_result, isolate)
569 if(args.point):
570 PointFinderResultHandler.load_res_profile(std_result, isolate)
571
572
573 #TODO
574 std_result_file = "{}/std_format_under_development.json".format(args.out_path)
575 with open(std_result_file, 'w') as fh:
576 fh.write(json.dumps(std_result))
577
578 # Create and write the downloadable tab file
579 pheno_profile_str = isolate.profile_to_str_table(header=True)
580 # TODO: REMOVE THE NEED FOR THE PICKLED FILE
581 if(args.pickle):
582 isolate_pickle = open("{}/isolate.p".format(args.out_path), "wb")
583 pickle.dump(isolate, isolate_pickle, protocol=2)
584
585 pheno_table_file = args.out_path + '/pheno_table.txt'
586 with open(pheno_table_file, 'w') as fh:
587 fh.write(pheno_profile_str)
588
589 if(args.species is not None):
590 # Apply AMR panel
591 input_amr_panels = args.db_path_res + "/phenotype_panels.txt"
592 res_sum_table = ResSumTable(pheno_profile_str)
593 res_sum_table.load_amr_panels(input_amr_panels)
594
595 try:
596 panel_profile_str = res_sum_table.get_amr_panel_str(
597 panel_name_raw=args.species, header=True)
598 # If specified species does not have an associated panel, just ignore it
599 # and exit.
600 except PanelNameError:
601 eprint("Warning: No panel was detected for the species: {}"
602 .format(args.species))
603 sys.exit()
604
605 amr_panel_filename = args.species.replace(" ", "_")
606
607 panel_tabel_file = (pheno_table_file[:-4] + "_" + amr_panel_filename
608 + ".txt")
609 with open(panel_tabel_file, "w") as fh:
610 fh.write(panel_profile_str)
611
612 sys.exit()