diff tools/blast_rbh/blast_rbh.py @ 5:8f4500f6f2aa draft

Refactored to use more than one Python file (internal change only).
author peterjc
date Tue, 06 Dec 2022 15:53:36 +0000
parents d8d9a9069586
children
line wrap: on
line diff
--- a/tools/blast_rbh/blast_rbh.py	Wed Apr 19 07:44:47 2017 -0400
+++ b/tools/blast_rbh/blast_rbh.py	Tue Dec 06 15:53:36 2022 +0000
@@ -24,11 +24,14 @@
 import shutil
 import sys
 import tempfile
+import best_hits
+
 
 from optparse import OptionParser
 
 
 def run(cmd):
+    """Run the given command line string."""
     return_code = os.system(cmd)
     if return_code:
         sys.exit("Error %i from: %s" % (return_code, cmd))
@@ -36,7 +39,7 @@
 
 if "--version" in sys.argv[1:]:
     # TODO - Capture version of BLAST+ binaries too?
-    print("BLAST RBH v0.1.11")
+    print("BLAST RBH v0.2.0")
     sys.exit(0)
 
 try:
@@ -54,33 +57,62 @@
 
 $ python blast_rbh.py -a prot -t blasp -o output.tsv protA.fasta protB.fasta
 
-There is additional guideance in the help text in the blast_rbh.xml file,
+There is additional guidance in the help text in the blast_rbh.xml file,
 which is shown to the user via the Galaxy interface to this tool.
 """
 
 parser = OptionParser(usage=usage)
-parser.add_option("-a", "--alphabet", dest="dbtype",
-                  default=None,
-                  help="Alphabet type (nucl or prot; required)")
-parser.add_option("-t", "--task", dest="task",
-                  default=None,
-                  help="BLAST task (e.g. blastp, blastn, megablast; required)")
-parser.add_option("-i", "--identity", dest="min_identity",
-                  default="70",
-                  help="Minimum percentage identity (optional, default 70)")
-parser.add_option("-c", "--coverage", dest="min_coverage",
-                  default="50",
-                  help="Minimum HSP coverage (optional, default 50)")
-parser.add_option("--nr", dest="nr", default=False, action="store_true",
-                  help="Preprocess FASTA files to collapse identitical "
-                  "entries (make sequences non-redundant)")
-parser.add_option("-o", "--output", dest="output",
-                  default=None, metavar="FILE",
-                  help="Output filename (required)")
-parser.add_option("--threads", dest="threads",
-                  default=threads,
-                  help="Number of threads when running BLAST. Defaults to the "
-                       "$GALAXY_SLOTS environment variable if set, or 1.")
+parser.add_option(
+    "-a",
+    "--alphabet",
+    dest="dbtype",
+    default=None,
+    help="Alphabet type (nucl or prot; required)",
+)
+parser.add_option(
+    "-t",
+    "--task",
+    dest="task",
+    default=None,
+    help="BLAST task (e.g. blastp, blastn, megablast; required)",
+)
+parser.add_option(
+    "-i",
+    "--identity",
+    dest="min_identity",
+    default="70",
+    help="Minimum percentage identity (optional, default 70)",
+)
+parser.add_option(
+    "-c",
+    "--coverage",
+    dest="min_coverage",
+    default="50",
+    help="Minimum HSP coverage (optional, default 50)",
+)
+parser.add_option(
+    "--nr",
+    dest="nr",
+    default=False,
+    action="store_true",
+    help="Preprocess FASTA files to collapse identitical "
+    "entries (make sequences non-redundant)",
+)
+parser.add_option(
+    "-o",
+    "--output",
+    dest="output",
+    default=None,
+    metavar="FILE",
+    help="Output filename (required)",
+)
+parser.add_option(
+    "--threads",
+    dest="threads",
+    default=threads,
+    help="Number of threads when running BLAST. Defaults to the "
+    "$GALAXY_SLOTS environment variable if set, or 1.",
+)
 options, args = parser.parse_args()
 
 if len(args) != 2:
@@ -103,13 +135,17 @@
 try:
     min_identity = float(options.min_identity)
 except ValueError:
-    sys.exit("Expected number between 0 and 100 for minimum identity, not %r" % min_identity)
+    sys.exit(
+        "Expected number between 0 and 100 for minimum identity, not %r" % min_identity
+    )
 if not (0 <= min_identity <= 100):
     sys.exit("Expected minimum identity between 0 and 100, not %0.2f" % min_identity)
 try:
     min_coverage = float(options.min_coverage)
 except ValueError:
-    sys.exit("Expected number between 0 and 100 for minimum coverage, not %r" % min_coverage)
+    sys.exit(
+        "Expected number between 0 and 100 for minimum coverage, not %r" % min_coverage
+    )
 if not (0 <= min_coverage <= 100):
     sys.exit("Expected minimum coverage between 0 and 100, not %0.2f" % min_coverage)
 
@@ -137,7 +173,9 @@
 try:
     threads = int(options.threads)
 except ValueError:
-    sys.exit("Expected positive integer for number of threads, not %r" % options.threads)
+    sys.exit(
+        "Expected positive integer for number of threads, not %r" % options.threads
+    )
 if threads < 1:
     sys.exit("Expected positive integer for number of threads, not %r" % threads)
 
@@ -157,95 +195,9 @@
     b_vs_a = os.path.join(base_path, "B_vs_A.tabular")
 log = os.path.join(base_path, "blast.log")
 
-cols = "qseqid sseqid bitscore pident qcovhsp qlen length"  # Or qcovs?
-c_query = 0
-c_match = 1
-c_score = 2
-c_identity = 3
-c_coverage = 4
-c_qlen = 5
-c_length = 6
-
-tie_warning = 0
-
-
-def best_hits(blast_tabular, ignore_self=False):
-    """Iterate over BLAST tabular output, returns best hits as 2-tuples.
-
-    Each return value is (query name, tuple of value for the best hit).
-
-    Tied best hits to different sequences are NOT returned.
-
-    One hit is returned for tied best hits to the same sequence
-    (e.g. repeated domains).
-    """
-    global tie_warning
-    current = None
-    best_score = None
-    best = None
-    col_count = len(cols.split())
-    with open(blast_tabular) as h:
-        for line in h:
-            if line.startswith("#"):
-                continue
-            parts = line.rstrip("\n").split("\t")
-            if len(parts) != col_count:
-                # Using NCBI BLAST+ 2.2.27 the undefined field is ignored
-                # Even NCBI BLAST+ 2.5.0 silently ignores unknown fields :(
-                sys.exit("Old version of NCBI BLAST? Expected %i columns, got %i:\n%s\n"
-                         "Note the qcovhsp field was only added in version 2.2.28\n"
-                         % (col_count, len(parts), line))
-            if float(parts[c_identity]) < min_identity or float(parts[c_coverage]) < min_coverage:
-                continue
-            a = parts[c_query]
-            b = parts[c_match]
-            if ignore_self and a == b:
-                continue
-            score = float(parts[c_score])
-            qlen = int(parts[c_qlen])
-            length = int(parts[c_length])
-            # print("Considering hit for %s to %s with score %s..." % (a, b, score))
-            if current is None:
-                # First hit
-                assert best is None
-                assert best_score is None
-                best = dict()
-                # Now append this hit...
-            elif a != current:
-                # New hit
-                if len(best) == 1:
-                    # Unambiguous (no tied matches)
-                    yield current, list(best.values())[0]
-                else:
-                    # print("%s has %i equally good hits: %s" % (a, len(best), ", ".join(best)))
-                    tie_warning += 1
-                best = dict()
-                # Now append this hit...
-            elif score < best_score:
-                # print("No improvement for %s, %s < %s" % (a, score, best_score))
-                continue
-            elif score > best_score:
-                # This is better, discard old best
-                best = dict()
-                # Now append this hit...
-            else:
-                # print("Tied best hits for %s" % a)
-                assert best_score == score
-                # Now append this hit...
-            current = a
-            best_score = score
-            # This will collapse two equally good hits to the same target (e.g. duplicated domain)
-            best[b] = (b, score, parts[c_score], parts[c_identity], parts[c_coverage], qlen, length)
-    # Best hit for final query, if unambiguous:
-    if current is not None:
-        if len(best) == 1:
-            yield current, list(best.values())[0]
-        else:
-            # print("%s has %i equally good hits: %s" % (a, len(best), ", ".join(best)))
-            tie_warning += 1
-
 
 def check_duplicate_ids(filename):
+    """Check for duplicate identifiers in a FASTA file."""
     # Copied from tools/ncbi_blast_plus/check_no_duplicates.py
     # TODO - just use Biopython's FASTA parser?
     if not os.path.isfile(filename):
@@ -267,11 +219,12 @@
 
 
 def make_nr(input_fasta, output_fasta, sep=";"):
+    """Make the sequences in a FASTA file non-redundant."""
     # TODO - seq-hash based to avoid loading everything into RAM?
     by_seq = dict()
     try:
         from Bio import SeqIO
-    except KeyError:
+    except ImportError:
         sys.exit("Missing Biopython")
     for record in SeqIO.parse(input_fasta, "fasta"):
         s = str(record.seq).upper()
@@ -300,8 +253,10 @@
                 elif record.id in duplicates:
                     continue
                 SeqIO.write(record, handle, "fasta")
-        print("%i unique entries; removed %i duplicates leaving %i representative records"
-              % (unique, len(duplicates), len(representatives)))
+        print(
+            "%i unique entries; removed %i duplicates leaving %i representative records"
+            % (unique, len(duplicates), len(representatives))
+        )
     else:
         os.symlink(os.path.abspath(input_fasta), output_fasta)
         print("No perfect duplicates in file, %i unique entries" % unique)
@@ -320,31 +275,55 @@
     fasta_b = tmp_b
 
 # TODO - Report log in case of error?
-run('%s -dbtype %s -in "%s" -out "%s" -logfile "%s"' % (makeblastdb_exe, dbtype, fasta_a, db_a, log))
+run(
+    '%s -dbtype %s -in "%s" -out "%s" -logfile "%s"'
+    % (makeblastdb_exe, dbtype, fasta_a, db_a, log)
+)
 if not self_comparison:
-    run('%s -dbtype %s -in "%s" -out "%s" -logfile "%s"' % (makeblastdb_exe, dbtype, fasta_b, db_b, log))
+    run(
+        '%s -dbtype %s -in "%s" -out "%s" -logfile "%s"'
+        % (makeblastdb_exe, dbtype, fasta_b, db_b, log)
+    )
 # print("BLAST databases prepared.")
-run('%s -query "%s" -db "%s" -out "%s" -outfmt "6 %s" -num_threads %i'
-    % (blast_cmd, fasta_a, db_b, a_vs_b, cols, threads))
+run(
+    '%s -query "%s" -db "%s" -out "%s" -outfmt "6 %s" -num_threads %i'
+    % (blast_cmd, fasta_a, db_b, a_vs_b, best_hits.cols, threads)
+)
 # print("BLAST species A vs species B done.")
 if not self_comparison:
-    run('%s -query "%s" -db "%s" -out "%s" -outfmt "6 %s" -num_threads %i'
-        % (blast_cmd, fasta_b, db_a, b_vs_a, cols, threads))
+    run(
+        '%s -query "%s" -db "%s" -out "%s" -outfmt "6 %s" -num_threads %i'
+        % (blast_cmd, fasta_b, db_a, b_vs_a, best_hits.cols, threads)
+    )
     # print("BLAST species B vs species A done.")
 
 
-best_b_vs_a = dict(best_hits(b_vs_a, self_comparison))
+best_b_vs_a = dict(
+    best_hits.best_hits(b_vs_a, min_identity, min_coverage, self_comparison)
+)
 
 
 count = 0
-outfile = open(out_file, 'w')
-outfile.write("#A_id\tB_id\tA_length\tB_length\tA_qcovhsp\tB_qcovhsp\tlength\tpident\tbitscore\n")
-for a, (b, a_score_float, a_score_str,
-        a_identity_str, a_coverage_str, a_qlen, a_length) in best_hits(a_vs_b, self_comparison):
+outfile = open(out_file, "w")
+outfile.write(
+    "#A_id\tB_id\tA_length\tB_length\tA_qcovhsp\tB_qcovhsp\tlength\tpident\tbitscore\n"
+)
+for (
+    a,
+    (b, a_score_float, a_score_str, a_identity_str, a_coverage_str, a_qlen, a_length),
+) in best_hits.best_hits(a_vs_b, min_identity, min_coverage, self_comparison):
     if b not in best_b_vs_a:
         # Match b has no best hit
         continue
-    a2, b_score_float, b_score_str, b_identity_str, b_coverage_str, b_qlen, b_length = best_b_vs_a[b]
+    (
+        a2,
+        b_score_float,
+        b_score_str,
+        b_identity_str,
+        b_coverage_str,
+        b_qlen,
+        b_length,
+    ) = best_b_vs_a[b]
     if a != a2:
         # Not an RBH
         continue
@@ -365,8 +344,11 @@
     count += 1
 outfile.close()
 print("Done, %i RBH found" % count)
-if tie_warning:
-    sys.stderr.write("Warning: Sequences with tied best hits found, you may have duplicates/clusters\n")
+if best_hits.tie_warning:
+    sys.stderr.write(
+        "Warning: Sequences with tied best hits found, "
+        "you may have duplicates/clusters\n"
+    )
 
 # Remove temp files...
 shutil.rmtree(base_path)