Mercurial > repos > peterjc > blast_rbh

diff tools/blast_rbh/blast_rbh.py @ 4:d8d9a9069586 draft
v0.1.11 using BLAST+ 2.5.0 and Biopython 1.67
author: peterjc
date: Wed, 19 Apr 2017 07:44:47 -0400
parents: 14b2e159b310
children: 8f4500f6f2aa
--- a/tools/blast_rbh/blast_rbh.py	Mon Sep 07 04:40:51 2015 -0400
+++ b/tools/blast_rbh/blast_rbh.py	Wed Apr 19 07:44:47 2017 -0400
@@ -2,7 +2,7 @@
 """BLAST Reciprocal Best Hit (RBH) from two FASTA input files.
 
 Run "blast_rbh.py -h" to see the help text, or read the associated
-README.rst file which is also available on GitHub at:
+blast_rbh.xml and README.rst files which are available on GitHub at:
 https://github.com/peterjc/galaxy_blast/tree/master/tools/blast_rbh
 
 This requires Python and the NCBI BLAST+ tools to be installed
@@ -18,60 +18,78 @@
 #        results, rather than doing minimum HSP coverage in Python.
 #        [Not doing this right now as would break on older BLAST+]
 
+from __future__ import print_function
+
 import os
+import shutil
 import sys
 import tempfile
-import shutil
+
 from optparse import OptionParser
 
-def stop_err( msg ):
-    sys.stderr.write("%s\n" % msg)
-    sys.exit(1)
 
 def run(cmd):
     return_code = os.system(cmd)
     if return_code:
-        stop_err("Error %i from: %s" % (return_code, cmd))
+        sys.exit("Error %i from: %s" % (return_code, cmd))
+
 
 if "--version" in sys.argv[1:]:
-    #TODO - Capture version of BLAST+ binaries too?
-    print "BLAST RBH v0.1.6"
+    # TODO - Capture version of BLAST+ binaries too?
+    print("BLAST RBH v0.1.11")
     sys.exit(0)
 
-#Parse Command Line
+try:
+    threads = int(os.environ.get("GALAXY_SLOTS", "1"))
+except ValueError:
+    threads = 1
+assert 1 <= threads, threads
+
+# Parse Command Line
 usage = """Use as follows:
 
 $ python blast_rbh.py [options] A.fasta B.fasta
+
+Many of the options are required. Example with proteins and blastp:
+
+$ python blast_rbh.py -a prot -t blasp -o output.tsv protA.fasta protB.fasta
+
+There is additional guideance in the help text in the blast_rbh.xml file,
+which is shown to the user via the Galaxy interface to this tool.
 """
 
 parser = OptionParser(usage=usage)
 parser.add_option("-a", "--alphabet", dest="dbtype",
                   default=None,
-                  help="Alphabet type (nucl or prot)")
+                  help="Alphabet type (nucl or prot; required)")
 parser.add_option("-t", "--task", dest="task",
                   default=None,
-                  help="BLAST task (e.g. blastp, blastn, megablast)")
-parser.add_option("-i","--identity", dest="min_identity",
+                  help="BLAST task (e.g. blastp, blastn, megablast; required)")
+parser.add_option("-i", "--identity", dest="min_identity",
                   default="70",
                   help="Minimum percentage identity (optional, default 70)")
 parser.add_option("-c", "--coverage", dest="min_coverage",
                   default="50",
                   help="Minimum HSP coverage (optional, default 50)")
 parser.add_option("--nr", dest="nr", default=False, action="store_true",
-                  help="Preprocess FASTA files to collapse identifical "
+                  help="Preprocess FASTA files to collapse identitical "
                   "entries (make sequences non-redundant)")
 parser.add_option("-o", "--output", dest="output",
                   default=None, metavar="FILE",
-                  help="Output filename")
+                  help="Output filename (required)")
+parser.add_option("--threads", dest="threads",
+                  default=threads,
+                  help="Number of threads when running BLAST. Defaults to the "
+                       "$GALAXY_SLOTS environment variable if set, or 1.")
 options, args = parser.parse_args()
 
 if len(args) != 2:
-    stop_err("Expects two input FASTA filenames")
+    sys.exit("Expects two input FASTA filenames")
 fasta_a, fasta_b = args
 if not os.path.isfile(fasta_a):
-    stop_err("Missing input file for species A: %r" % fasta_a)
+    sys.exit("Missing input file for species A: %r" % fasta_a)
 if not os.path.isfile(fasta_b):
-    stop_err("Missing input file for species B: %r" % fasta_b)
+    sys.exit("Missing input file for species B: %r" % fasta_b)
 if os.path.abspath(fasta_a) == os.path.abspath(fasta_b):
     self_comparison = True
     print("Doing self comparison; ignoring self matches.")
@@ -79,48 +97,49 @@
     self_comparison = False
 
 if not options.output:
-    stop_err("Output filename required, e.g. -o example.tab")
+    sys.exit("Output filename required, e.g. -o example.tab")
 out_file = options.output
 
 try:
     min_identity = float(options.min_identity)
 except ValueError:
-    stop_err("Expected number between 0 and 100 for minimum identity, not %r" % min_identity)
+    sys.exit("Expected number between 0 and 100 for minimum identity, not %r" % min_identity)
 if not (0 <= min_identity <= 100):
-    stop_err("Expected minimum identity between 0 and 100, not %0.2f" % min_identity)
+    sys.exit("Expected minimum identity between 0 and 100, not %0.2f" % min_identity)
 try:
     min_coverage = float(options.min_coverage)
 except ValueError:
-    stop_err("Expected number between 0 and 100 for minimum coverage, not %r" % min_coverage)
+    sys.exit("Expected number between 0 and 100 for minimum coverage, not %r" % min_coverage)
 if not (0 <= min_coverage <= 100):
-    stop_err("Expected minimum coverage between 0 and 100, not %0.2f" % min_coverage)
+    sys.exit("Expected minimum coverage between 0 and 100, not %0.2f" % min_coverage)
 
 if not options.task:
-    stop_err("Missing BLAST task, e.g. -t blastp")
+    sys.exit("Missing BLAST task, e.g. -t blastp")
 blast_type = options.task
 
 if not options.dbtype:
-    stop_err("Missing database type, -a nucl, or -a prot")
+    sys.exit("Missing database type, -a nucl, or -a prot")
 dbtype = options.dbtype
 if dbtype == "nucl":
     if blast_type in ["megablast", "blastn", "blastn-short", "dc-megablast"]:
-         blast_cmd = "blastn -task %s" % blast_type
+        blast_cmd = "blastn -task %s" % blast_type
     elif blast_type == "tblastx":
         blast_cmd = "tblastx"
     else:
-        stop_err("Invalid BLAST type for BLASTN: %r" % blast_type)
+        sys.exit("Invalid BLAST type for BLASTN: %r" % blast_type)
 elif dbtype == "prot":
     if blast_type not in ["blastp", "blastp-fast", "blastp-short"]:
-        stop_err("Invalid BLAST type for BLASTP: %r" % blast_type)
+        sys.exit("Invalid BLAST type for BLASTP: %r" % blast_type)
     blast_cmd = "blastp -task %s" % blast_type
 else:
-    stop_err("Expected 'nucl' or 'prot' for BLAST database type, not %r" % blast_type)
+    sys.exit("Expected 'nucl' or 'prot' for BLAST database type, not %r" % blast_type)
 
 try:
-    threads = int(os.environ.get("GALAXY_SLOTS", "1"))
-except:
-    threads = 1
-assert 1 <= threads, threads
+    threads = int(options.threads)
+except ValueError:
+    sys.exit("Expected positive integer for number of threads, not %r" % options.threads)
+if threads < 1:
+    sys.exit("Expected positive integer for number of threads, not %r" % threads)
 
 makeblastdb_exe = "makeblastdb"
 
@@ -138,7 +157,7 @@
     b_vs_a = os.path.join(base_path, "B_vs_A.tabular")
 log = os.path.join(base_path, "blast.log")
 
-cols = "qseqid sseqid bitscore pident qcovhsp qlen length" #Or qcovs?
+cols = "qseqid sseqid bitscore pident qcovhsp qlen length"  # Or qcovs?
 c_query = 0
 c_match = 1
 c_score = 2
@@ -149,6 +168,7 @@
 
 tie_warning = 0
 
+
 def best_hits(blast_tabular, ignore_self=False):
     """Iterate over BLAST tabular output, returns best hits as 2-tuples.
 
@@ -163,11 +183,18 @@
     current = None
     best_score = None
     best = None
+    col_count = len(cols.split())
     with open(blast_tabular) as h:
         for line in h:
             if line.startswith("#"):
                 continue
             parts = line.rstrip("\n").split("\t")
+            if len(parts) != col_count:
+                # Using NCBI BLAST+ 2.2.27 the undefined field is ignored
+                # Even NCBI BLAST+ 2.5.0 silently ignores unknown fields :(
+                sys.exit("Old version of NCBI BLAST? Expected %i columns, got %i:\n%s\n"
+                         "Note the qcovhsp field was only added in version 2.2.28\n"
+                         % (col_count, len(parts), line))
             if float(parts[c_identity]) < min_identity or float(parts[c_coverage]) < min_coverage:
                 continue
             a = parts[c_query]
@@ -177,71 +204,75 @@
             score = float(parts[c_score])
             qlen = int(parts[c_qlen])
             length = int(parts[c_length])
-            #print("Considering hit for %s to %s with score %s..." % (a, b, score))
+            # print("Considering hit for %s to %s with score %s..." % (a, b, score))
             if current is None:
-                #First hit
+                # First hit
                 assert best is None
                 assert best_score is None
                 best = dict()
-                #Now append this hit...
+                # Now append this hit...
             elif a != current:
-                #New hit
+                # New hit
                 if len(best) == 1:
-                    #Unambiguous (no tied matches)
+                    # Unambiguous (no tied matches)
                     yield current, list(best.values())[0]
                 else:
-                    #print("%s has %i equally good hits: %s" % (a, len(best), ", ".join(best)))
+                    # print("%s has %i equally good hits: %s" % (a, len(best), ", ".join(best)))
                     tie_warning += 1
                 best = dict()
-                #Now append this hit...
+                # Now append this hit...
             elif score < best_score:
-                #print("No improvement for %s, %s < %s" % (a, score, best_score))
+                # print("No improvement for %s, %s < %s" % (a, score, best_score))
                 continue
             elif score > best_score:
-                #This is better, discard old best
+                # This is better, discard old best
                 best = dict()
-                #Now append this hit...
+                # Now append this hit...
             else:
-                #print("Tied best hits for %s" % a)
+                # print("Tied best hits for %s" % a)
                 assert best_score == score
-                #Now append this hit...
+                # Now append this hit...
             current = a
             best_score = score
-            #This will collapse two equally good hits to the same target (e.g. duplicated domain)
+            # This will collapse two equally good hits to the same target (e.g. duplicated domain)
             best[b] = (b, score, parts[c_score], parts[c_identity], parts[c_coverage], qlen, length)
-    #Best hit for final query, if unambiguous:
+    # Best hit for final query, if unambiguous:
     if current is not None:
-        if len(best)==1:
+        if len(best) == 1:
             yield current, list(best.values())[0]
         else:
-            #print("%s has %i equally good hits: %s" % (a, len(best), ", ".join(best)))
+            # print("%s has %i equally good hits: %s" % (a, len(best), ", ".join(best)))
             tie_warning += 1
 
+
 def check_duplicate_ids(filename):
     # Copied from tools/ncbi_blast_plus/check_no_duplicates.py
     # TODO - just use Biopython's FASTA parser?
     if not os.path.isfile(filename):
-        stop_err("Missing FASTA file %r" % filename, 2)
+        sys.stderr.write("Missing FASTA file %r\n" % filename)
+        sys.exit(2)
     identifiers = set()
     handle = open(filename)
     for line in handle:
         if line.startswith(">"):
-        # The split will also take care of the new line character,
-        # e.g. ">test\n" and ">test description here\n" both give "test"
+            # The split will also take care of the new line character,
+            # e.g. ">test\n" and ">test description here\n" both give "test"
             seq_id = line[1:].split(None, 1)[0]
             if seq_id in identifiers:
                 handle.close()
-                stop_err("Repeated identifiers, e.g. %r" % seq_id, 3)
+                sys.stderr.write("Repeated identifiers, e.g. %r\n" % seq_id)
+                sys.exit(3)
             identifiers.add(seq_id)
     handle.close()
 
+
 def make_nr(input_fasta, output_fasta, sep=";"):
-    #TODO - seq-hash based to avoid loading everything into RAM?
+    # TODO - seq-hash based to avoid loading everything into RAM?
     by_seq = dict()
     try:
         from Bio import SeqIO
     except KeyError:
-        stop_err("Missing Biopython")
+        sys.exit("Missing Biopython")
     for record in SeqIO.parse(input_fasta, "fasta"):
         s = str(record.seq).upper()
         try:
@@ -259,7 +290,7 @@
             unique += 1
     del by_seq
     if duplicates:
-        #TODO - refactor as a generator with single SeqIO.write(...) call
+        # TODO - refactor as a generator with single SeqIO.write(...) call
         with open(output_fasta, "w") as handle:
             for record in SeqIO.parse(input_fasta, "fasta"):
                 if record.id in representatives:
@@ -269,12 +300,14 @@
                 elif record.id in duplicates:
                     continue
                 SeqIO.write(record, handle, "fasta")
-        print("%i unique entries; removed %i duplicates leaving %i representative records" % (unique, len(duplicates), len(representatives)))
+        print("%i unique entries; removed %i duplicates leaving %i representative records"
+              % (unique, len(duplicates), len(representatives)))
     else:
         os.symlink(os.path.abspath(input_fasta), output_fasta)
         print("No perfect duplicates in file, %i unique entries" % unique)
 
-#print("Starting...")
+
+# print("Starting...")
 check_duplicate_ids(fasta_a)
 if not self_comparison:
     check_duplicate_ids(fasta_b)
@@ -286,18 +319,18 @@
     fasta_a = tmp_a
     fasta_b = tmp_b
 
-#TODO - Report log in case of error?
+# TODO - Report log in case of error?
 run('%s -dbtype %s -in "%s" -out "%s" -logfile "%s"' % (makeblastdb_exe, dbtype, fasta_a, db_a, log))
 if not self_comparison:
     run('%s -dbtype %s -in "%s" -out "%s" -logfile "%s"' % (makeblastdb_exe, dbtype, fasta_b, db_b, log))
-#print("BLAST databases prepared.")
+# print("BLAST databases prepared.")
 run('%s -query "%s" -db "%s" -out "%s" -outfmt "6 %s" -num_threads %i'
     % (blast_cmd, fasta_a, db_b, a_vs_b, cols, threads))
-#print("BLAST species A vs species B done.")
+# print("BLAST species A vs species B done.")
 if not self_comparison:
     run('%s -query "%s" -db "%s" -out "%s" -outfmt "6 %s" -num_threads %i'
         % (blast_cmd, fasta_b, db_a, b_vs_a, cols, threads))
-    #print("BLAST species B vs species A done.")
+    # print("BLAST species B vs species A done.")
 
 
 best_b_vs_a = dict(best_hits(b_vs_a, self_comparison))
@@ -306,19 +339,20 @@
 count = 0
 outfile = open(out_file, 'w')
 outfile.write("#A_id\tB_id\tA_length\tB_length\tA_qcovhsp\tB_qcovhsp\tlength\tpident\tbitscore\n")
-for a, (b, a_score_float, a_score_str, a_identity_str, a_coverage_str, a_qlen, a_length) in best_hits(a_vs_b, self_comparison):
+for a, (b, a_score_float, a_score_str,
+        a_identity_str, a_coverage_str, a_qlen, a_length) in best_hits(a_vs_b, self_comparison):
     if b not in best_b_vs_a:
-        #Match b has no best hit
+        # Match b has no best hit
         continue
     a2, b_score_float, b_score_str, b_identity_str, b_coverage_str, b_qlen, b_length = best_b_vs_a[b]
     if a != a2:
-        #Not an RBH
+        # Not an RBH
         continue
-    #Start with IDs, lengths, coverage
+    # Start with IDs, lengths, coverage
     values = [a, b, a_qlen, b_qlen, a_coverage_str, b_coverage_str]
-    #Alignment length was an integer so don't care about original string
+    # Alignment length was an integer so don't care about original string
     values.append(min(a_length, b_length))
-    #Output the original string versions of the scores
+    # Output the original string versions of the scores
     if float(a_identity_str) < float(b_identity_str):
         values.append(a_identity_str)
     else:
@@ -330,9 +364,9 @@
     outfile.write("%s\t%s\t%i\t%i\t%s\t%s\t%i\t%s\t%s\n" % tuple(values))
     count += 1
 outfile.close()
-print "Done, %i RBH found" % count
+print("Done, %i RBH found" % count)
 if tie_warning:
     sys.stderr.write("Warning: Sequences with tied best hits found, you may have duplicates/clusters\n")
 
-#Remove temp files...
+# Remove temp files...
 shutil.rmtree(base_path)
author	peterjc
date	Wed, 19 Apr 2017 07:44:47 -0400
parents	14b2e159b310
children	8f4500f6f2aa