Mercurial > repos > peterjc > blast_rbh
view tools/blast_rbh/blast_rbh.py @ 5:8f4500f6f2aa draft
Refactored to use more than one Python file (internal change only).
author | peterjc |
---|---|
date | Tue, 06 Dec 2022 15:53:36 +0000 |
parents | d8d9a9069586 |
children |
line wrap: on
line source
#!/usr/bin/env python """BLAST Reciprocal Best Hit (RBH) from two FASTA input files. Run "blast_rbh.py -h" to see the help text, or read the associated blast_rbh.xml and README.rst files which are available on GitHub at: https://github.com/peterjc/galaxy_blast/tree/master/tools/blast_rbh This requires Python and the NCBI BLAST+ tools to be installed and on the $PATH. You can also run this tool via Galaxy using the "blast_rbh.xml" definition file. This is available as a package on the Galaxy Tool Shed: http://toolshed.g2.bx.psu.edu/view/peterjc/blast_rbh """ # TODO - Output more columns, e.g. pident, qcovs, descriptions? # TODO - Use new -qcov_hsp_perc option in BLAST+ 2.2.30 to filter # results, rather than doing minimum HSP coverage in Python. # [Not doing this right now as would break on older BLAST+] from __future__ import print_function import os import shutil import sys import tempfile import best_hits from optparse import OptionParser def run(cmd): """Run the given command line string.""" return_code = os.system(cmd) if return_code: sys.exit("Error %i from: %s" % (return_code, cmd)) if "--version" in sys.argv[1:]: # TODO - Capture version of BLAST+ binaries too? print("BLAST RBH v0.2.0") sys.exit(0) try: threads = int(os.environ.get("GALAXY_SLOTS", "1")) except ValueError: threads = 1 assert 1 <= threads, threads # Parse Command Line usage = """Use as follows: $ python blast_rbh.py [options] A.fasta B.fasta Many of the options are required. Example with proteins and blastp: $ python blast_rbh.py -a prot -t blasp -o output.tsv protA.fasta protB.fasta There is additional guidance in the help text in the blast_rbh.xml file, which is shown to the user via the Galaxy interface to this tool. """ parser = OptionParser(usage=usage) parser.add_option( "-a", "--alphabet", dest="dbtype", default=None, help="Alphabet type (nucl or prot; required)", ) parser.add_option( "-t", "--task", dest="task", default=None, help="BLAST task (e.g. blastp, blastn, megablast; required)", ) parser.add_option( "-i", "--identity", dest="min_identity", default="70", help="Minimum percentage identity (optional, default 70)", ) parser.add_option( "-c", "--coverage", dest="min_coverage", default="50", help="Minimum HSP coverage (optional, default 50)", ) parser.add_option( "--nr", dest="nr", default=False, action="store_true", help="Preprocess FASTA files to collapse identitical " "entries (make sequences non-redundant)", ) parser.add_option( "-o", "--output", dest="output", default=None, metavar="FILE", help="Output filename (required)", ) parser.add_option( "--threads", dest="threads", default=threads, help="Number of threads when running BLAST. Defaults to the " "$GALAXY_SLOTS environment variable if set, or 1.", ) options, args = parser.parse_args() if len(args) != 2: sys.exit("Expects two input FASTA filenames") fasta_a, fasta_b = args if not os.path.isfile(fasta_a): sys.exit("Missing input file for species A: %r" % fasta_a) if not os.path.isfile(fasta_b): sys.exit("Missing input file for species B: %r" % fasta_b) if os.path.abspath(fasta_a) == os.path.abspath(fasta_b): self_comparison = True print("Doing self comparison; ignoring self matches.") else: self_comparison = False if not options.output: sys.exit("Output filename required, e.g. -o example.tab") out_file = options.output try: min_identity = float(options.min_identity) except ValueError: sys.exit( "Expected number between 0 and 100 for minimum identity, not %r" % min_identity ) if not (0 <= min_identity <= 100): sys.exit("Expected minimum identity between 0 and 100, not %0.2f" % min_identity) try: min_coverage = float(options.min_coverage) except ValueError: sys.exit( "Expected number between 0 and 100 for minimum coverage, not %r" % min_coverage ) if not (0 <= min_coverage <= 100): sys.exit("Expected minimum coverage between 0 and 100, not %0.2f" % min_coverage) if not options.task: sys.exit("Missing BLAST task, e.g. -t blastp") blast_type = options.task if not options.dbtype: sys.exit("Missing database type, -a nucl, or -a prot") dbtype = options.dbtype if dbtype == "nucl": if blast_type in ["megablast", "blastn", "blastn-short", "dc-megablast"]: blast_cmd = "blastn -task %s" % blast_type elif blast_type == "tblastx": blast_cmd = "tblastx" else: sys.exit("Invalid BLAST type for BLASTN: %r" % blast_type) elif dbtype == "prot": if blast_type not in ["blastp", "blastp-fast", "blastp-short"]: sys.exit("Invalid BLAST type for BLASTP: %r" % blast_type) blast_cmd = "blastp -task %s" % blast_type else: sys.exit("Expected 'nucl' or 'prot' for BLAST database type, not %r" % blast_type) try: threads = int(options.threads) except ValueError: sys.exit( "Expected positive integer for number of threads, not %r" % options.threads ) if threads < 1: sys.exit("Expected positive integer for number of threads, not %r" % threads) makeblastdb_exe = "makeblastdb" base_path = tempfile.mkdtemp() tmp_a = os.path.join(base_path, "SpeciesA.fasta") db_a = os.path.join(base_path, "SpeciesA") a_vs_b = os.path.join(base_path, "A_vs_B.tabular") if self_comparison: tmp_b = tmp_a db_b = db_a b_vs_a = a_vs_b else: tmp_b = os.path.join(base_path, "SpeciesB.fasta") db_b = os.path.join(base_path, "SpeciesB") b_vs_a = os.path.join(base_path, "B_vs_A.tabular") log = os.path.join(base_path, "blast.log") def check_duplicate_ids(filename): """Check for duplicate identifiers in a FASTA file.""" # Copied from tools/ncbi_blast_plus/check_no_duplicates.py # TODO - just use Biopython's FASTA parser? if not os.path.isfile(filename): sys.stderr.write("Missing FASTA file %r\n" % filename) sys.exit(2) identifiers = set() handle = open(filename) for line in handle: if line.startswith(">"): # The split will also take care of the new line character, # e.g. ">test\n" and ">test description here\n" both give "test" seq_id = line[1:].split(None, 1)[0] if seq_id in identifiers: handle.close() sys.stderr.write("Repeated identifiers, e.g. %r\n" % seq_id) sys.exit(3) identifiers.add(seq_id) handle.close() def make_nr(input_fasta, output_fasta, sep=";"): """Make the sequences in a FASTA file non-redundant.""" # TODO - seq-hash based to avoid loading everything into RAM? by_seq = dict() try: from Bio import SeqIO except ImportError: sys.exit("Missing Biopython") for record in SeqIO.parse(input_fasta, "fasta"): s = str(record.seq).upper() try: by_seq[s].append(record.id) except KeyError: by_seq[s] = [record.id] unique = 0 representatives = dict() duplicates = set() for cluster in by_seq.values(): if len(cluster) > 1: representatives[cluster[0]] = cluster duplicates.update(cluster[1:]) else: unique += 1 del by_seq if duplicates: # TODO - refactor as a generator with single SeqIO.write(...) call with open(output_fasta, "w") as handle: for record in SeqIO.parse(input_fasta, "fasta"): if record.id in representatives: cluster = representatives[record.id] record.id = sep.join(cluster) record.description = "representing %i records" % len(cluster) elif record.id in duplicates: continue SeqIO.write(record, handle, "fasta") print( "%i unique entries; removed %i duplicates leaving %i representative records" % (unique, len(duplicates), len(representatives)) ) else: os.symlink(os.path.abspath(input_fasta), output_fasta) print("No perfect duplicates in file, %i unique entries" % unique) # print("Starting...") check_duplicate_ids(fasta_a) if not self_comparison: check_duplicate_ids(fasta_b) if options.nr: make_nr(fasta_a, tmp_a) if not self_comparison: make_nr(fasta_b, tmp_b) fasta_a = tmp_a fasta_b = tmp_b # TODO - Report log in case of error? run( '%s -dbtype %s -in "%s" -out "%s" -logfile "%s"' % (makeblastdb_exe, dbtype, fasta_a, db_a, log) ) if not self_comparison: run( '%s -dbtype %s -in "%s" -out "%s" -logfile "%s"' % (makeblastdb_exe, dbtype, fasta_b, db_b, log) ) # print("BLAST databases prepared.") run( '%s -query "%s" -db "%s" -out "%s" -outfmt "6 %s" -num_threads %i' % (blast_cmd, fasta_a, db_b, a_vs_b, best_hits.cols, threads) ) # print("BLAST species A vs species B done.") if not self_comparison: run( '%s -query "%s" -db "%s" -out "%s" -outfmt "6 %s" -num_threads %i' % (blast_cmd, fasta_b, db_a, b_vs_a, best_hits.cols, threads) ) # print("BLAST species B vs species A done.") best_b_vs_a = dict( best_hits.best_hits(b_vs_a, min_identity, min_coverage, self_comparison) ) count = 0 outfile = open(out_file, "w") outfile.write( "#A_id\tB_id\tA_length\tB_length\tA_qcovhsp\tB_qcovhsp\tlength\tpident\tbitscore\n" ) for ( a, (b, a_score_float, a_score_str, a_identity_str, a_coverage_str, a_qlen, a_length), ) in best_hits.best_hits(a_vs_b, min_identity, min_coverage, self_comparison): if b not in best_b_vs_a: # Match b has no best hit continue ( a2, b_score_float, b_score_str, b_identity_str, b_coverage_str, b_qlen, b_length, ) = best_b_vs_a[b] if a != a2: # Not an RBH continue # Start with IDs, lengths, coverage values = [a, b, a_qlen, b_qlen, a_coverage_str, b_coverage_str] # Alignment length was an integer so don't care about original string values.append(min(a_length, b_length)) # Output the original string versions of the scores if float(a_identity_str) < float(b_identity_str): values.append(a_identity_str) else: values.append(b_identity_str) if a_score_float < b_score_float: values.append(a_score_str) else: values.append(b_score_str) outfile.write("%s\t%s\t%i\t%i\t%s\t%s\t%i\t%s\t%s\n" % tuple(values)) count += 1 outfile.close() print("Done, %i RBH found" % count) if best_hits.tie_warning: sys.stderr.write( "Warning: Sequences with tied best hits found, " "you may have duplicates/clusters\n" ) # Remove temp files... shutil.rmtree(base_path)