resfinder4: resfinder/run_resfinder.py comparison

comparison resfinder/run_resfinder.py @ 0:55051a9bc58d draft default tip

Uploaded

author	dcouvin
date	Mon, 10 Jan 2022 20:06:07 +0000
parents
children

comparison

equal deleted inserted replaced

--1:000000000000
+:55051a9bc58d
+#!/usr/bin/env python3
+import sys
+import os
+import subprocess
+from argparse import ArgumentParser
+import pickle
+from cge.resfinder import ResFinder
+from cge.pointfinder import PointFinder
+#  Modules used to create the extended ResFinder output (phenotype output)
+from cge.phenotype2genotype.isolate import Isolate
+from cge.phenotype2genotype.res_profile import PhenoDB
+from cge.phenotype2genotype.res_sumtable import ResSumTable
+from cge.phenotype2genotype.res_sumtable import PanelNameError
+from cge.out.util.generator import Generator
+from cge.standardize_results import ResFinderResultHandler, DatabaseHandler
+from cge.standardize_results import PointFinderResultHandler
+import json
+# TODO list:
+# TODO: Add input data check
+# ########################################################################### #
+# #########                                 FUNCTIONS               ######### #
+# ########################################################################### #
+def eprint(*args, **kwargs):
+print(*args, file=sys.stderr, **kwargs)
+# TODO: Add fix species choice
+species_transl = {"c. jejuni": "campylobacter jejuni",
+"c.jejuni": "campylobacter jejuni",
+"c jejuni": "campylobacter jejuni",
+"c. coli": "campylobacter coli",
+"c.coli": "campylobacter coli",
+"c coli": "campylobacter coli",
+"e. coli": "escherichia coli",
+"e.coli": "escherichia coli",
+"e coli": "escherichia coli",
+"ecoli": "escherichia coli",
+"s. enterica": "salmonella enterica",
+"s.enterica": "salmonella enterica",
+"s enterica": "salmonella enterica",
+"senterica": "salmonella enterica",
+}
+##########################################################################
+# PARSE COMMAND LINE OPTIONS
+##########################################################################
+parser = ArgumentParser(allow_abbrev=False)
+# General options
+parser.add_argument("-ifa", "--inputfasta",
+help="Input fasta file.",
+default=None)
+parser.add_argument("-ifq", "--inputfastq",
+help="Input fastq file(s). Assumed to be single-end fastq \
+if only one file is provided, and assumed to be \
+paired-end data if two files are provided.",
+nargs="+",
+default=None)
+parser.add_argument("-o", "--outputPath",
+dest="out_path",
+help="Path to blast output",
+default='')
+parser.add_argument("-b", "--blastPath",
+dest="blast_path",
+help="Path to blastn",
+default='blastn')
+parser.add_argument("-k", "--kmaPath",
+dest="kma_path",
+help="Path to KMA",
+default=None)
+parser.add_argument("-s", "--species",
+help="Species in the sample",
+default=None)
+# Acquired resistance options
+parser.add_argument("-db_res", "--db_path_res",
+help="Path to the databases for ResFinder",
+default=None)
+parser.add_argument("-db_res_kma", "--db_path_res_kma",
+help="Path to the ResFinder databases indexed with KMA. \
+Defaults to the 'kma_indexing' directory inside the \
+given database directory.",
+default=None)
+parser.add_argument("-d", "--databases",
+dest="databases",
+help="Databases chosen to search in - if none is specified\
+all is used",
+default=None)
+parser.add_argument("-acq", "--acquired",
+action="store_true",
+dest="acquired",
+help="Run resfinder for acquired resistance genes",
+default=False)
+parser.add_argument("-ao", "--acq_overlap",
+help="Genes are allowed to overlap this number of\
+nucleotides. Default: 30.",
+type=int,
+default=30)
+parser.add_argument("-l", "--min_cov",
+dest="min_cov",
+help="Minimum (breadth-of) coverage of ResFinder within the range 0-1.",
+type=float,
+default=0.60)
+parser.add_argument("-t", "--threshold",
+dest="threshold",
+help="Threshold for identity of ResFinder within the range 0-1.",
+type=float,
+default=0.80)
+parser.add_argument("-nano", "--nanopore",
+action="store_true",
+dest="nanopore",
+help="If nanopore data is used",
+default=False)
+# Point resistance option
+parser.add_argument("-c", "--point",
+action="store_true",
+dest="point",
+help="Run pointfinder for chromosomal mutations",
+default=False)
+parser.add_argument("-db_point", "--db_path_point",
+help="Path to the databases for PointFinder",
+default=None)
+parser.add_argument("-db_point_kma", "--db_path_point_kma",
+help="Path to the PointFinder databases indexed with KMA. \
+Defaults to the 'kma_indexing' directory inside the \
+given database directory.",
+default=None)
+parser.add_argument("-g",
+dest="specific_gene",
+nargs='+',
+help="Specify genes existing in the database to \
+search for - if none is specified all genes are \
+included in the search.",
+default=None)
+parser.add_argument("-u", "--unknown_mut",
+dest="unknown_mutations",
+action="store_true",
+help="Show all mutations found even if in unknown to the\
+resistance database",
+default=False)
+parser.add_argument("-l_p", "--min_cov_point",
+dest="min_cov_point",
+help="Minimum (breadth-of) coverage of Pointfinder within the range 0-1. \
+If None is selected, the minimum coverage of \
+ResFinder will be used.",
+type=float,
+default=None)
+parser.add_argument("-t_p", "--threshold_point",
+dest="threshold_point",
+help="Threshold for identity of Pointfinder within the range 0-1. \
+If None is selected, the minimum coverage of \
+ResFinder will be used.",
+type=float,
+default=None)
+# Temporary option only available temporary
+parser.add_argument("--pickle",
+action="store_true",
+help="Create a pickle dump of the Isolate object. \
+Currently needed in the CGE webserver. Dependency \
+and this option is being removed.",
+default=False)
+args = parser.parse_args()
+if(args.species is not None and args.species.lower() == "other"):
+args.species = None
+if(args.point and not args.species):
+sys.exit("ERROR: Chromosomal point mutations cannot be located if no "
+"species has been provided. Please provide species using the "
+"--species option.")
+# Check if coverage/identity parameters are valid
+if(args.min_cov > 1.0 or args.min_cov < 0.0):
+sys.exit("ERROR: Minimum coverage above 1 or below 0 is not allowed. Please select a minimum coverage within the range 0-1 with the flag -l.")
+if(args.threshold > 1.0 or args.threshold < 0.0):
+sys.exit("ERROR: Threshold for identity of ResFinder above 1 or below 0 is not allowed. Please select a threshold for identity within the range 0-1 with the flag -t.")
+# Create a "sample" name
+if(args.inputfasta):
+args.inputfasta = os.path.abspath(args.inputfasta)
+if(not os.path.isfile(args.inputfasta)):
+sys.exit("ERROR: Input FASTA file not found: " + args.inputfasta)
+sample_name = os.path.basename(args.inputfasta)
+method = PointFinder.TYPE_BLAST
+else:
+sample_name = os.path.basename(args.inputfastq[0])
+method = PointFinder.TYPE_KMA
+if(args.inputfastq):
+inputfastq_1 = args.inputfastq[0]
+inputfastq_1 = os.path.abspath(inputfastq_1)
+if(not os.path.isfile(inputfastq_1)):
+sys.exit("ERROR: Input fastq file 1 not found: " + inputfastq_1)
+if(len(args.inputfastq) == 2):
+inputfastq_2 = args.inputfastq[1]
+inputfastq_2 = os.path.abspath(inputfastq_2)
+if(not os.path.isfile(inputfastq_2)):
+sys.exit("ERROR: Input fastq file 2 not found: " + inputfastq_2)
+else:
+inputfastq_2 = None
+blast = args.blast_path
+if(args.inputfasta):
+try:
+_ = subprocess.check_output([blast, "-h"])
+except FileNotFoundError as e:
+sys.exit("ERROR: Unable to execute blastn from the path: {}"
+.format(blast))
+# Check KMA path cge/kma/kma
+if(args.inputfastq):
+if(args.kma_path is None):
+kma = (os.path.dirname(os.path.realpath(__file__)) + "/cge/kma/kma")
+kma = os.path.abspath(kma)
+try:
+_ = subprocess.check_output([kma, "-h"])
+except FileNotFoundError as e:
+kma = "kma"
+else:
+kma = args.kma_path
+try:
+_ = subprocess.check_output([kma, "-h"])
+except FileNotFoundError as e:
+sys.exit("ERROR: Unable to execute kma from the path: {}".format(kma))
+else:
+kma = None
+db_path_point = None
+if(args.species):
+args.species = args.species.lower()
+fixed_species = species_transl.get(args.species, None)
+if(fixed_species):
+args.species = fixed_species
+tmp_list = args.species.split()
+if(len(tmp_list) != 1 and len(tmp_list) != 2):
+sys.exit("ERROR: Species name must contain 1 or 2 names.")
+# Check Poinfinder database
+if(args.point):
+if(len(tmp_list) == 2):
+point_species = "_".join(tmp_list)
+else:
+point_species = tmp_list[0]
+if(args.db_path_point is None and args.point):
+db_path_point = (os.path.dirname(
+os.path.realpath(__file__)) + "/db_pointfinder")
+elif(args.db_path_point is not None):
+db_path_point = args.db_path_point
+db_path_point = os.path.abspath(db_path_point)
+point_dbs = PointFinder.get_db_names(db_path_point)
+# Check if a database for species exists
+if(point_species not in point_dbs and args.point):
+# If not db for species is found check if db for genus is found
+# and use that instead
+if(tmp_list[0] in point_dbs):
+point_species = tmp_list[0]
+else:
+sys.exit("ERROR: species '%s' (%s) does not seem to exist"
+" as a PointFinder database."
+% (args.species, point_species))
+db_path_point_root = db_path_point
+db_path_point = db_path_point + "/" + point_species
+# Check output directory
+args.out_path = os.path.abspath(args.out_path)
+os.makedirs(args.out_path, exist_ok=True)
+if args.acquired is False and args.point is False:
+sys.exit("Please specify to look for acquired resistance genes, "
+"chromosomal mutaitons or both!\n")
+# Check ResFinder database
+if(args.db_path_res is None):
+args.db_path_res = (os.path.dirname(
+os.path.realpath(__file__)) + "/db_resfinder")
+args.db_path_res = os.path.abspath(args.db_path_res)
+if(not os.path.exists(args.db_path_res)):
+sys.exit("Could not locate ResFinder database path: %s"
+% args.db_path_res)
+if(args.db_path_res_kma is None and args.acquired and args.inputfastq):
+args.db_path_res_kma = args.db_path_res
+if(not os.path.exists(args.db_path_res_kma)):
+sys.exit("Could not locate ResFinder database index path: %s"
+% args.db_path_res_kma)
+min_cov = float(args.min_cov)
+# Initialise result dict
+init_software_result = {"software_name": "ResFinder"}
+git_path = os.path.abspath(os.path.dirname(__file__))
+std_result = Generator.init_software_result(name="ResFinder", gitdir=git_path)
+if(args.acquired):
+DatabaseHandler.load_database_metadata("ResFinder", std_result,
+args.db_path_res)
+if(args.point):
+DatabaseHandler.load_database_metadata("PointFinder", std_result,
+db_path_point_root)
+##########################################################################
+# ResFinder
+##########################################################################
+if args.acquired is True:
+databases = args.databases
+threshold = float(args.threshold)
+if(args.inputfasta):
+out_res_blast = args.out_path + "/resfinder_blast"
+os.makedirs(out_res_blast, exist_ok=True)
+if(args.inputfastq):
+out_res_kma = args.out_path + "/resfinder_kma"
+os.makedirs(out_res_kma, exist_ok=True)
+db_path_res = args.db_path_res
+# Check if valid database is provided
+if(db_path_res is None):
+db_path_res = (os.path.dirname(os.path.realpath(__file__))
++ "/db_resfinder")
+if not os.path.exists(db_path_res):
+sys.exit("Input Error: The specified database directory does not "
+"exist!\nProvided path: " + str(db_path_res))
+else:
+# Check existence of config file
+db_config_file = '%s/config' % (db_path_res)
+if not os.path.exists(db_config_file):
+sys.exit("Input Error: The database config file could not be "
+"found!")
+# Check existence of notes file
+notes_path = "%s/notes.txt" % (db_path_res)
+if not os.path.exists(notes_path):
+sys.exit('Input Error: notes.txt not found! (%s)' % (notes_path))
+blast_results = None
+kma_results = None
+# Actually running ResFinder (for acquired resistance)
+acquired_finder = ResFinder(db_conf_file=db_config_file,
+databases=args.databases, db_path=db_path_res,
+notes=notes_path,
+db_path_kma=args.db_path_res_kma)
+if(args.inputfasta):
+blast_results = acquired_finder.blast(inputfile=args.inputfasta,
+out_path=out_res_blast,
+min_cov=min_cov,
+threshold=threshold,
+blast=blast,
+allowed_overlap=args.acq_overlap)
+# DEPRECATED
+# use std_result
+new_std_res = ResFinder.old_results_to_standard_output(
+blast_results.results, software="ResFinder", version="4.1.0",
+run_date="fake_run_date", run_cmd="Fake run cmd",
+id=sample_name)
+# DEPRECATED
+# use std_result
+acquired_finder.write_results(out_path=args.out_path,
+result=blast_results,
+res_type=ResFinder.TYPE_BLAST)
+ResFinderResultHandler.standardize_results(std_result,
+blast_results.results,
+"ResFinder")
+#DEBUG
+#        print("STD RESULT:\n{}".format(json.dumps(std_result)))
+else:
+if args.nanopore:
+kma_run = acquired_finder.kma(inputfile_1=inputfastq_1,
+inputfile_2=inputfastq_2,
+out_path=out_res_kma,
+db_path_kma=args.db_path_res_kma,
+databases=acquired_finder.databases,
+min_cov=min_cov,
+threshold=args.threshold,
+kma_path=kma,
+sample_name="",
+kma_cge=True,
+kma_apm="p",
+kma_1t1=True,
+kma_add_args='-ont -md 5')
+else:
+kma_run = acquired_finder.kma(inputfile_1=inputfastq_1,
+inputfile_2=inputfastq_2,
+out_path=out_res_kma,
+db_path_kma=args.db_path_res_kma,
+databases=acquired_finder.databases,
+min_cov=min_cov,
+threshold=args.threshold,
+kma_path=kma,
+sample_name="",
+kma_cge=True,
+kma_apm="p",
+kma_1t1=True)
+# DEPRECATED
+# use std_result
+new_std_res = ResFinder.old_results_to_standard_output(
+kma_run.results, software="ResFinder", version="4.1.0",
+run_date="fake_run_date", run_cmd="Fake run cmd",
+id=sample_name)
+# DEPRECATED
+# use std_result
+acquired_finder.write_results(out_path=args.out_path,
+result=kma_run.results,
+res_type=ResFinder.TYPE_KMA)
+ResFinderResultHandler.standardize_results(std_result,
+kma_run.results,
+"ResFinder")
+#DEBUG
+#        print("STD RESULT:\n{}".format(json.dumps(std_result)))
+##########################################################################
+# PointFinder
+##########################################################################
+if args.point is True and args.species:
+if(args.inputfasta):
+out_point = os.path.abspath(args.out_path + "/pointfinder_blast")
+os.makedirs(out_point, exist_ok=True)
+if(args.inputfastq):
+out_point = os.path.abspath(args.out_path + "/pointfinder_kma")
+os.makedirs(out_point, exist_ok=True)
+if args.min_cov_point is None:
+min_cov_point = args.min_cov
+else:
+min_cov_point = args.min_cov_point
+if(args.min_cov_point > 1.0 or args.min_cov_point < 0.0):
+sys.exit("ERROR: Minimum coverage above 1 or below 0 is not allowed. Please select a minimum coverage within the range 0-1 with the flag -l_p.")
+if args.threshold_point is None:
+threshold_point = args.threshold
+else:
+threshold_point = args.threshold_point
+if(args.threshold_point > 1.0 or args.threshold_point < 0.0):
+sys.exit("ERROR: Threshold for identity of PointFinder above 1 or below 0 is not allowed. Please select a threshold for identity within the range 0-1 with the flag -t_p.")
+finder = PointFinder(db_path=db_path_point, species=point_species,
+gene_list=args.specific_gene)
+if(args.inputfasta):
+blast_run = finder.blast(inputfile=args.inputfasta,
+out_path=out_point,
+#         min_cov=min_cov_point,
+min_cov=0.01,
+threshold=threshold_point,
+blast=blast,
+cut_off=False)
+results = blast_run.results
+else:
+method = PointFinder.TYPE_KMA
+kma_run = finder.kma(inputfile_1=inputfastq_1,
+inputfile_2=inputfastq_2,
+out_path=out_point,
+db_path_kma=db_path_point,
+databases=[point_species],
+min_cov=0.01,
+#    min_cov=min_cov_point,
+threshold=threshold_point,
+kma_path=kma,
+sample_name="",
+kma_cge=True,
+kma_apm="p",
+kma_1t1=True)
+results = kma_run.results
+if(args.specific_gene):
+results = PointFinder.discard_unwanted_results(
+results=results, wanted=args.specific_gene)
+if(method == PointFinder.TYPE_BLAST):
+results_pnt = finder.find_best_seqs(results, min_cov)
+else:
+results_pnt = results[finder.species]
+if(results_pnt == "No hit found"):
+results_pnt = {}
+else:
+results_pnt["excluded"] = results["excluded"]
+# DEPRECATED
+# use std_result
+new_std_pnt = finder.old_results_to_standard_output(
+result=results_pnt, software="ResFinder", version="4.1.0",
+run_date="fake_run_date", run_cmd="Fake run cmd",
+id=sample_name)
+# DEPRECATED
+# use std_result
+finder.write_results(out_path=args.out_path, result=results,
+res_type=method, unknown_flag=args.unknown_mutations,
+min_cov=min_cov_point, perc_iden=threshold_point)
+#DEBUG
+#    print("POINT RES:\n{}".format(json.dumps(results_pnt)))
+PointFinderResultHandler.standardize_results(std_result,
+results_pnt,
+"PointFinder")
+#DEBUG
+#        print("STD RESULT:\n{}".format(json.dumps(std_result)))
+##########################################################################
+# Phenotype to genotype
+##########################################################################
+# Load genotype to phenotype database
+if(db_path_point):
+point_file = db_path_point + "/resistens-overview.txt"
+else:
+point_file = None
+res_pheno_db = PhenoDB(
+abclassdef_file=(args.db_path_res + "/antibiotic_classes.txt"),
+acquired_file=args.db_path_res + "/phenotypes.txt", point_file=point_file)
+# Isolate object store results
+isolate = Isolate(name=sample_name)
+if(args.acquired):
+isolate.load_finder_results(std_table=std_result,
+phenodb=res_pheno_db,
+type="genes")
+# isolate.load_finder_results(std_table=std_result,
+#                             phenodb=res_pheno_db)
+# isolate.load_finder_results(std_table=new_std_res,
+#                             phenodb=res_pheno_db)
+if(args.point):
+isolate.load_finder_results(std_table=std_result,
+phenodb=res_pheno_db,
+type="seq_variations")
+# isolate.load_finder_results_old(std_table=new_std_pnt,
+#                                 phenodb=res_pheno_db)
+# isolate.load_pointfinder_tab(args.out_path + "/PointFinder_results.txt",
+#                                      res_pheno_db)
+isolate.calc_res_profile(res_pheno_db)
+if(args.acquired):
+ResFinderResultHandler.load_res_profile(std_result, isolate)
+if(args.point):
+PointFinderResultHandler.load_res_profile(std_result, isolate)
+#TODO
+std_result_file = "{}/std_format_under_development.json".format(args.out_path)
+with open(std_result_file, 'w') as fh:
+fh.write(json.dumps(std_result))
+# Create and write the downloadable tab file
+pheno_profile_str = isolate.profile_to_str_table(header=True)
+# TODO: REMOVE THE NEED FOR THE PICKLED FILE
+if(args.pickle):
+isolate_pickle = open("{}/isolate.p".format(args.out_path), "wb")
+pickle.dump(isolate, isolate_pickle, protocol=2)
+pheno_table_file = args.out_path + '/pheno_table.txt'
+with open(pheno_table_file, 'w') as fh:
+fh.write(pheno_profile_str)
+if(args.species is not None):
+# Apply AMR panel
+input_amr_panels = args.db_path_res + "/phenotype_panels.txt"
+res_sum_table = ResSumTable(pheno_profile_str)
+res_sum_table.load_amr_panels(input_amr_panels)
+try:
+panel_profile_str = res_sum_table.get_amr_panel_str(
+panel_name_raw=args.species, header=True)
+# If specified species does not have an associated panel, just ignore it
+# and exit.
+except PanelNameError:
+eprint("Warning: No panel was detected for the species: {}"
+.format(args.species))
+sys.exit()
+amr_panel_filename = args.species.replace(" ", "_")
+panel_tabel_file = (pheno_table_file[:-4] + "_" + amr_panel_filename
++ ".txt")
+with open(panel_tabel_file, "w") as fh:
+fh.write(panel_profile_str)
+sys.exit()

Mercurial > repos > dcouvin > resfinder4

comparison resfinder/run_resfinder.py @ 0:55051a9bc58d draft default tip