# HG changeset patch
# User peterjc
# Date 1416571034 18000
# Node ID ff0b814c1320172de6eb020b9a58e73dda75cf64
# Parent b828ca44a313c626849188bfaac69eb45096a3a3
Uploaded v0.1.5, NCBI BLAST+ 2.2.30 etc
diff -r b828ca44a313 -r ff0b814c1320 test-data/rbh_tblastx_rhodopsin_nucs_vs_three_human_mRNA.tabular
--- a/test-data/rbh_tblastx_rhodopsin_nucs_vs_three_human_mRNA.tabular Mon Aug 04 08:13:39 2014 -0400
+++ b/test-data/rbh_tblastx_rhodopsin_nucs_vs_three_human_mRNA.tabular Fri Nov 21 06:57:14 2014 -0500
@@ -1,2 +1,2 @@
#A_id B_id A_length B_length A_qcovhsp B_qcovhsp length pident bitscore
-gi|57163782|ref|NM_001009242.1| ENA|BC112106|BC112106.1 1047 1213 66 57 230 97.39 559
+gi|57163782|ref|NM_001009242.1| ENA|BC112106|BC112106.1 1047 1213 22 19 230 97.39 559
diff -r b828ca44a313 -r ff0b814c1320 tools/blast_rbh/README.rst
--- a/tools/blast_rbh/README.rst Mon Aug 04 08:13:39 2014 -0400
+++ b/tools/blast_rbh/README.rst Fri Nov 21 06:57:14 2014 -0500
@@ -1,16 +1,19 @@
-Galaxy tool to find BLAST Reciprocal Best Hits (RBH)
-====================================================
+Find BLAST Reciprocal Best Hits (RBH), with Galaxy wrapper
+==========================================================
This tool is copyright 2011-2014 by Peter Cock, The James Hutton Institute
(formerly SCRI, Scottish Crop Research Institute), UK. All rights reserved.
See the licence text below.
This tool is a short Python script to run reciprocal BLAST searches on a
-pair of sequence files, and extract the reciprocal best hits.
+pair of sequence files, and extract the reciprocal best hits. The script
+``blast_rbh.py`` can be used directly (without Galaxy) as long as NCBI
+BLAST+ is installed.
-This is a work in progress, and builds on an earlier implementation which
-prequired the two BLAST searches be prepared in advance. Integration allows
-a much simpler user experience, and can ensure sensible filters are used.
+It comes with an optional Galaxy tool definition file ``blast_rbh.xml``
+allowing the Python script to be run from within Galaxy. It is available
+from the Galaxy Tool Shed at:
+http://toolshed.g2.bx.psu.edu/view/peterjc/blast_rbh
Automated Installation
@@ -34,11 +37,10 @@
-If you want to run the functional tests, include the same line in your
-``tool_conf.xml.sample`` file, and the sample test files under Galaxy's
-``test-data/`` directory. Then::
+If you want to run the functional tests, copy the sample test files under
+sample test files under Galaxy's ``test-data/`` directory. Then::
- ./run_functional_tests.sh -id blast_reciprocal_best_hits
+ ./run_tests.sh -id blast_reciprocal_best_hits
You will need to have the NCBI BLAST+ binaries installed and on the ``$PATH``.
@@ -52,7 +54,14 @@
v0.1.0 - Initial Test Tool Shed release, targetting NCBI BLAST+ 2.2.29
v0.1.1 - Supports self-comparison, sometimes useful for spotting duplicates.
v0.1.2 - Using optparse for command line API.
+ - Tool definition now embeds citation information.
- Fixed Tool Shed dependency definition.
+v0.1.3 - Option to make FASTA files non-redundant (via Biopython dependency).
+ - Avoid extra database and BLAST search in self-comparison mode.
+v0.1.4 - Check for duplicate FASTA identifiers (workaround for makeblastdb
+ not treating this as an error, leading to confusing RBH output).
+v0.1.5 - Clarify documentation for using the Python script outside Galaxy.
+ - Updated to depend on NCBI BLAST+ 2.2.30 via ToolShed install.
======= ======================================================================
diff -r b828ca44a313 -r ff0b814c1320 tools/blast_rbh/blast_rbh.py
--- a/tools/blast_rbh/blast_rbh.py Mon Aug 04 08:13:39 2014 -0400
+++ b/tools/blast_rbh/blast_rbh.py Fri Nov 21 06:57:14 2014 -0500
@@ -1,14 +1,16 @@
#!/usr/bin/env python
"""BLAST Reciprocal Best Hit (RBH) from two FASTA input files.
-Takes the following command line options,
-1. FASTA filename of species A
-2. FASTA filename of species B
-3. Sequence type (prot/nucl)
-4. BLAST type (e.g. blastn, or blastp) consistent with sequence type
-5. Minimum BLAST Percentage identity
-6. Minimum BLAST query coverage
-7. Output filename
+Run "blast_rbh.py -h" to see the help text, or read the associated
+README.rst file which is also available on GitHub at:
+https://github.com/peterjc/galaxy_blast/tree/master/tools/blast_rbh
+
+This requires Python and the NCBI BLAST+ tools to be installed
+and on the $PATH.
+
+You can also run this tool via Galaxy using the "blast_rbh.xml"
+definition file. This is available as a package on the Galaxy
+Tool Shed: http://toolshed.g2.bx.psu.edu/view/peterjc/blast_rbh
"""
# TODO - Output more columns, e.g. pident, qcovs, descriptions?
@@ -30,7 +32,7 @@
if "--version" in sys.argv[1:]:
#TODO - Capture version of BLAST+ binaries too?
- print "BLAST RBH v0.1.2"
+ print "BLAST RBH v0.1.5"
sys.exit(0)
#Parse Command Line
@@ -47,11 +49,14 @@
default=None,
help="BLAST task (e.g. blastp, blastn, megablast)")
parser.add_option("-i","--identity", dest="min_identity",
- default="0",
- help="Minimum percentage identity (optional, default 0)")
+ default="70",
+ help="Minimum percentage identity (optional, default 70)")
parser.add_option("-c", "--coverage", dest="min_coverage",
- default="0",
- help="Minimum HSP coverage (optional, default 0)")
+ default="50",
+ help="Minimum HSP coverage (optional, default 50)")
+parser.add_option("--nr", dest="nr", default=False, action="store_true",
+ help="Preprocess FASTA files to collapse identifical "
+ "entries (make sequences non-redundant)")
parser.add_option("-o", "--output", dest="output",
default=None, metavar="FILE",
help="Output filename")
@@ -117,10 +122,17 @@
makeblastdb_exe = "makeblastdb"
base_path = tempfile.mkdtemp()
+tmp_a = os.path.join(base_path, "SpeciesA.fasta")
db_a = os.path.join(base_path, "SpeciesA")
-db_b = os.path.join(base_path, "SpeciesB")
a_vs_b = os.path.join(base_path, "A_vs_B.tabular")
-b_vs_a = os.path.join(base_path, "B_vs_A.tabular")
+if self_comparison:
+ tmp_b = tmp_a
+ db_b = db_a
+ b_vs_a = a_vs_b
+else:
+ tmp_b = os.path.join(base_path, "SpeciesB.fasta")
+ db_b = os.path.join(base_path, "SpeciesB")
+ b_vs_a = os.path.join(base_path, "B_vs_A.tabular")
log = os.path.join(base_path, "blast.log")
cols = "qseqid sseqid bitscore pident qcovhsp qlen length" #Or qcovs?
@@ -202,18 +214,87 @@
#print("%s has %i equally good hits: %s" % (a, len(best), ", ".join(best)))
tie_warning += 1
+def check_duplicate_ids(filename):
+ # Copied from tools/ncbi_blast_plus/check_no_duplicates.py
+ # TODO - just use Biopython's FASTA parser?
+ if not os.path.isfile(filename):
+ stop_err("Missing FASTA file %r" % filename, 2)
+ identifiers = set()
+ handle = open(filename)
+ for line in handle:
+ if line.startswith(">"):
+ # The split will also take care of the new line character,
+ # e.g. ">test\n" and ">test description here\n" both give "test"
+ seq_id = line[1:].split(None, 1)[0]
+ if seq_id in identifiers:
+ handle.close()
+ stop_err("Repeated identifiers, e.g. %r" % seq_id, 3)
+ identifiers.add(seq_id)
+ handle.close()
+
+def make_nr(input_fasta, output_fasta, sep=";"):
+ #TODO - seq-hash based to avoid loading everything into RAM?
+ by_seq = dict()
+ try:
+ from Bio import SeqIO
+ except KeyError:
+ stop_err("Missing Biopython")
+ for record in SeqIO.parse(input_fasta, "fasta"):
+ s = str(record.seq).upper()
+ try:
+ by_seq[s].append(record.id)
+ except KeyError:
+ by_seq[s] = [record.id]
+ unique = 0
+ representatives = dict()
+ duplicates = set()
+ for cluster in by_seq.values():
+ if len(cluster) > 1:
+ representatives[cluster[0]] = cluster
+ duplicates.update(cluster[1:])
+ else:
+ unique += 1
+ del by_seq
+ if duplicates:
+ #TODO - refactor as a generator with single SeqIO.write(...) call
+ with open(output_fasta, "w") as handle:
+ for record in SeqIO.parse(input_fasta, "fasta"):
+ if record.id in representatives:
+ cluster = representatives[record.id]
+ record.id = sep.join(cluster)
+ record.description = "representing %i records" % len(cluster)
+ elif record.id in duplicates:
+ continue
+ SeqIO.write(record, handle, "fasta")
+ print("%i unique entries; removed %i duplicates leaving %i representative records" % (unique, len(duplicates), len(representatives)))
+ else:
+ os.symlink(os.path.abspath(input_fasta), output_fasta)
+ print("No perfect duplicates in file, %i unique entries" % unique)
#print("Starting...")
+check_duplicate_ids(fasta_a)
+if not self_comparison:
+ check_duplicate_ids(fasta_b)
+
+if options.nr:
+ make_nr(fasta_a, tmp_a)
+ if not self_comparison:
+ make_nr(fasta_b, tmp_b)
+ fasta_a = tmp_a
+ fasta_b = tmp_b
+
#TODO - Report log in case of error?
run('%s -dbtype %s -in "%s" -out "%s" -logfile "%s"' % (makeblastdb_exe, dbtype, fasta_a, db_a, log))
-run('%s -dbtype %s -in "%s" -out "%s" -logfile "%s"' % (makeblastdb_exe, dbtype, fasta_b, db_b, log))
+if not self_comparison:
+ run('%s -dbtype %s -in "%s" -out "%s" -logfile "%s"' % (makeblastdb_exe, dbtype, fasta_b, db_b, log))
#print("BLAST databases prepared.")
run('%s -query "%s" -db "%s" -out "%s" -outfmt "6 %s" -num_threads %i'
% (blast_cmd, fasta_a, db_b, a_vs_b, cols, threads))
#print("BLAST species A vs species B done.")
-run('%s -query "%s" -db "%s" -out "%s" -outfmt "6 %s" -num_threads %i'
- % (blast_cmd, fasta_b, db_a, b_vs_a, cols, threads))
-#print("BLAST species B vs species A done.")
+if not self_comparison:
+ run('%s -query "%s" -db "%s" -out "%s" -outfmt "6 %s" -num_threads %i'
+ % (blast_cmd, fasta_b, db_a, b_vs_a, cols, threads))
+ #print("BLAST species B vs species A done.")
best_b_vs_a = dict(best_hits(b_vs_a, self_comparison))
diff -r b828ca44a313 -r ff0b814c1320 tools/blast_rbh/blast_rbh.xml
--- a/tools/blast_rbh/blast_rbh.xml Mon Aug 04 08:13:39 2014 -0400
+++ b/tools/blast_rbh/blast_rbh.xml Fri Nov 21 06:57:14 2014 -0500
@@ -1,10 +1,12 @@
-
+
from two FASTA files
- makeblastdb
- blastp
- blastn
- blast+
+ biopython
+ Bio
+ makeblastdb
+ blastp
+ blastn
+ blast+
blast_rbh.py --version
@@ -17,6 +19,7 @@
#else
-t $seq.prot_type
#end if
+$make_nr
-i $identity
-c $q_cover
-o "$output"
@@ -61,6 +64,9 @@
+
@@ -201,6 +207,13 @@
can happen following gene duplication, or for (near) identical gene
duplicates.
+The tool can optionally make the FASTA files non-redundant by replacing
+repeated identical sequences with a single representative before building
+the databases and running BLAST.
+
+Finally, the tool can be run using the same FASTA input file to look for
+RBH within the dataset. In this case, self matches are discarded.
+
.. class:: warningmark
**Note**
diff -r b828ca44a313 -r ff0b814c1320 tools/blast_rbh/tool_dependencies.xml
--- a/tools/blast_rbh/tool_dependencies.xml Mon Aug 04 08:13:39 2014 -0400
+++ b/tools/blast_rbh/tool_dependencies.xml Fri Nov 21 06:57:14 2014 -0500
@@ -1,6 +1,9 @@
-
-
+
+
+
+
+