Mercurial > repos > peterjc > blast_rbh

--- a/test-data/rbh_tblastx_rhodopsin_nucs_vs_three_human_mRNA.tabular	Mon Aug 04 08:13:39 2014 -0400
+++ b/test-data/rbh_tblastx_rhodopsin_nucs_vs_three_human_mRNA.tabular	Fri Nov 21 06:57:14 2014 -0500
@@ -1,2 +1,2 @@
 #A_id	B_id	A_length	B_length	A_qcovhsp	B_qcovhsp	length	pident	bitscore
-gi|57163782|ref|NM_001009242.1|	ENA|BC112106|BC112106.1	1047	1213	66	57	230	97.39	  559
+gi|57163782|ref|NM_001009242.1|	ENA|BC112106|BC112106.1	1047	1213	22	19	230	97.39	  559
--- a/tools/blast_rbh/README.rst	Mon Aug 04 08:13:39 2014 -0400
+++ b/tools/blast_rbh/README.rst	Fri Nov 21 06:57:14 2014 -0500
@@ -1,16 +1,19 @@
-Galaxy tool to find BLAST Reciprocal Best Hits (RBH)
-====================================================
+Find BLAST Reciprocal Best Hits (RBH), with Galaxy wrapper
+==========================================================

 This tool is copyright 2011-2014 by Peter Cock, The James Hutton Institute
 (formerly SCRI, Scottish Crop Research Institute), UK. All rights reserved.
 See the licence text below.

 This tool is a short Python script to run reciprocal BLAST searches on a
-pair of sequence files, and extract the reciprocal best hits.
+pair of sequence files, and extract the reciprocal best hits. The script
+``blast_rbh.py`` can be used directly (without Galaxy) as long as NCBI
+BLAST+ is installed.

-This is a work in progress, and builds on an earlier implementation which
-prequired the two BLAST searches be prepared in advance. Integration allows
-a much simpler user experience, and can ensure sensible filters are used.
+It comes with an optional Galaxy tool definition file ``blast_rbh.xml``
+allowing the Python script to be run from within Galaxy. It is available
+from the Galaxy Tool Shed at:
+http://toolshed.g2.bx.psu.edu/view/peterjc/blast_rbh


 Automated Installation
@@ -34,11 +37,10 @@

     <tool file="blast_rbh/blast_rbh.xml" />

-If you want to run the functional tests, include the same line in your
-``tool_conf.xml.sample`` file, and the sample test files under Galaxy's
-``test-data/`` directory. Then::
+If you want to run the functional tests, copy the sample test files under
+sample test files under Galaxy's ``test-data/`` directory. Then::

-    ./run_functional_tests.sh -id blast_reciprocal_best_hits
+    ./run_tests.sh -id blast_reciprocal_best_hits

 You will need to have the NCBI BLAST+ binaries installed and on the ``$PATH``.

@@ -52,7 +54,14 @@
 v0.1.0  - Initial Test Tool Shed release, targetting NCBI BLAST+ 2.2.29
 v0.1.1  - Supports self-comparison, sometimes useful for spotting duplicates.
 v0.1.2  - Using optparse for command line API.
+        - Tool definition now embeds citation information.
         - Fixed Tool Shed dependency definition.
+v0.1.3  - Option to make FASTA files non-redundant (via Biopython dependency).
+        - Avoid extra database and BLAST search in self-comparison mode.
+v0.1.4  - Check for duplicate FASTA identifiers (workaround for makeblastdb
+          not treating this as an error, leading to confusing RBH output).
+v0.1.5  - Clarify documentation for using the Python script outside Galaxy.
+        - Updated to depend on NCBI BLAST+ 2.2.30 via ToolShed install.
 ======= ======================================================================
--- a/tools/blast_rbh/blast_rbh.py	Mon Aug 04 08:13:39 2014 -0400
+++ b/tools/blast_rbh/blast_rbh.py	Fri Nov 21 06:57:14 2014 -0500
@@ -1,14 +1,16 @@
 #!/usr/bin/env python
 """BLAST Reciprocal Best Hit (RBH) from two FASTA input files.

-Takes the following command line options,
-1. FASTA filename of species A
-2. FASTA filename of species B
-3. Sequence type (prot/nucl)
-4. BLAST type (e.g. blastn, or blastp) consistent with sequence type
-5. Minimum BLAST Percentage identity
-6. Minimum BLAST query coverage
-7. Output filename
+Run "blast_rbh.py -h" to see the help text, or read the associated
+README.rst file which is also available on GitHub at:
+https://github.com/peterjc/galaxy_blast/tree/master/tools/blast_rbh
+
+This requires Python and the NCBI BLAST+ tools to be installed
+and on the $PATH.
+
+You can also run this tool via Galaxy using the "blast_rbh.xml"
+definition file. This is available as a package on the Galaxy
+Tool Shed: http://toolshed.g2.bx.psu.edu/view/peterjc/blast_rbh
 """

 # TODO - Output more columns, e.g. pident, qcovs, descriptions?
@@ -30,7 +32,7 @@

 if "--version" in sys.argv[1:]:
     #TODO - Capture version of BLAST+ binaries too?
-    print "BLAST RBH v0.1.2"
+    print "BLAST RBH v0.1.5"
     sys.exit(0)

 #Parse Command Line
@@ -47,11 +49,14 @@
                   default=None,
                   help="BLAST task (e.g. blastp, blastn, megablast)")
 parser.add_option("-i","--identity", dest="min_identity",
-                  default="0",
-                  help="Minimum percentage identity (optional, default 0)")
+                  default="70",
+                  help="Minimum percentage identity (optional, default 70)")
 parser.add_option("-c", "--coverage", dest="min_coverage",
-                  default="0",
-                  help="Minimum HSP coverage (optional, default 0)")
+                  default="50",
+                  help="Minimum HSP coverage (optional, default 50)")
+parser.add_option("--nr", dest="nr", default=False, action="store_true",
+                  help="Preprocess FASTA files to collapse identifical "
+                  "entries (make sequences non-redundant)")
 parser.add_option("-o", "--output", dest="output",
                   default=None, metavar="FILE",
                   help="Output filename")
@@ -117,10 +122,17 @@
 makeblastdb_exe = "makeblastdb"

 base_path = tempfile.mkdtemp()
+tmp_a = os.path.join(base_path, "SpeciesA.fasta")
 db_a = os.path.join(base_path, "SpeciesA")
-db_b = os.path.join(base_path, "SpeciesB")
 a_vs_b = os.path.join(base_path, "A_vs_B.tabular")
-b_vs_a = os.path.join(base_path, "B_vs_A.tabular")
+if self_comparison:
+    tmp_b = tmp_a
+    db_b = db_a
+    b_vs_a = a_vs_b
+else:
+    tmp_b = os.path.join(base_path, "SpeciesB.fasta")
+    db_b = os.path.join(base_path, "SpeciesB")
+    b_vs_a = os.path.join(base_path, "B_vs_A.tabular")
 log = os.path.join(base_path, "blast.log")

 cols = "qseqid sseqid bitscore pident qcovhsp qlen length" #Or qcovs?
@@ -202,18 +214,87 @@
             #print("%s has %i equally good hits: %s" % (a, len(best), ", ".join(best)))
             tie_warning += 1

+def check_duplicate_ids(filename):
+    # Copied from tools/ncbi_blast_plus/check_no_duplicates.py
+    # TODO - just use Biopython's FASTA parser?
+    if not os.path.isfile(filename):
+        stop_err("Missing FASTA file %r" % filename, 2)
+    identifiers = set()
+    handle = open(filename)
+    for line in handle:
+        if line.startswith(">"):
+        # The split will also take care of the new line character,
+        # e.g. ">test\n" and ">test description here\n" both give "test"
+            seq_id = line[1:].split(None, 1)[0]
+            if seq_id in identifiers:
+                handle.close()
+                stop_err("Repeated identifiers, e.g. %r" % seq_id, 3)
+            identifiers.add(seq_id)
+    handle.close()
+
+def make_nr(input_fasta, output_fasta, sep=";"):
+    #TODO - seq-hash based to avoid loading everything into RAM?
+    by_seq = dict()
+    try:
+        from Bio import SeqIO
+    except KeyError:
+        stop_err("Missing Biopython")
+    for record in SeqIO.parse(input_fasta, "fasta"):
+        s = str(record.seq).upper()
+        try:
+            by_seq[s].append(record.id)
+        except KeyError:
+            by_seq[s] = [record.id]
+    unique = 0
+    representatives = dict()
+    duplicates = set()
+    for cluster in by_seq.values():
+        if len(cluster) > 1:
+            representatives[cluster[0]] = cluster
+            duplicates.update(cluster[1:])
+        else:
+            unique += 1
+    del by_seq
+    if duplicates:
+        #TODO - refactor as a generator with single SeqIO.write(...) call
+        with open(output_fasta, "w") as handle:
+            for record in SeqIO.parse(input_fasta, "fasta"):
+                if record.id in representatives:
+                    cluster = representatives[record.id]
+                    record.id = sep.join(cluster)
+                    record.description = "representing %i records" % len(cluster)
+                elif record.id in duplicates:
+                    continue
+                SeqIO.write(record, handle, "fasta")
+        print("%i unique entries; removed %i duplicates leaving %i representative records" % (unique, len(duplicates), len(representatives)))
+    else:
+        os.symlink(os.path.abspath(input_fasta), output_fasta)
+        print("No perfect duplicates in file, %i unique entries" % unique)

 #print("Starting...")
+check_duplicate_ids(fasta_a)
+if not self_comparison:
+    check_duplicate_ids(fasta_b)
+
+if options.nr:
+    make_nr(fasta_a, tmp_a)
+    if not self_comparison:
+        make_nr(fasta_b, tmp_b)
+    fasta_a = tmp_a
+    fasta_b = tmp_b
+
 #TODO - Report log in case of error?
 run('%s -dbtype %s -in "%s" -out "%s" -logfile "%s"' % (makeblastdb_exe, dbtype, fasta_a, db_a, log))
-run('%s -dbtype %s -in "%s" -out "%s" -logfile "%s"' % (makeblastdb_exe, dbtype, fasta_b, db_b, log))
+if not self_comparison:
+    run('%s -dbtype %s -in "%s" -out "%s" -logfile "%s"' % (makeblastdb_exe, dbtype, fasta_b, db_b, log))
 #print("BLAST databases prepared.")
 run('%s -query "%s" -db "%s" -out "%s" -outfmt "6 %s" -num_threads %i'
     % (blast_cmd, fasta_a, db_b, a_vs_b, cols, threads))
 #print("BLAST species A vs species B done.")
-run('%s -query "%s" -db "%s" -out "%s" -outfmt "6 %s" -num_threads %i'
-    % (blast_cmd, fasta_b, db_a, b_vs_a, cols, threads))
-#print("BLAST species B vs species A done.")
+if not self_comparison:
+    run('%s -query "%s" -db "%s" -out "%s" -outfmt "6 %s" -num_threads %i'
+        % (blast_cmd, fasta_b, db_a, b_vs_a, cols, threads))
+    #print("BLAST species B vs species A done.")


 best_b_vs_a = dict(best_hits(b_vs_a, self_comparison))
--- a/tools/blast_rbh/blast_rbh.xml	Mon Aug 04 08:13:39 2014 -0400
+++ b/tools/blast_rbh/blast_rbh.xml	Fri Nov 21 06:57:14 2014 -0500
@@ -1,10 +1,12 @@
-<tool id="blast_reciprocal_best_hits" name="BLAST Reciprocal Best Hits (RBH)" version="0.1.2">
+<tool id="blast_reciprocal_best_hits" name="BLAST Reciprocal Best Hits (RBH)" version="0.1.5">
     <description>from two FASTA files</description>
     <requirements>
-            <requirement type="binary">makeblastdb</requirement>
-            <requirement type="binary">blastp</requirement>
-            <requirement type="binary">blastn</requirement>
-            <requirement type="package" version="2.2.29">blast+</requirement>
+        <requirement type="package" version="1.64">biopython</requirement>
+        <requirement type="python-module">Bio</requirement>
+        <requirement type="binary">makeblastdb</requirement>
+        <requirement type="binary">blastp</requirement>
+        <requirement type="binary">blastn</requirement>
+        <requirement type="package" version="2.2.30">blast+</requirement>
     </requirements>
     <version_command interpreter="python">
 blast_rbh.py --version
@@ -17,6 +19,7 @@
 #else
 -t $seq.prot_type
 #end if
+$make_nr
 -i $identity
 -c $q_cover
 -o "$output"
@@ -61,6 +64,9 @@
         <param name="q_cover" type="float" value="50" min="0" max="100"
 	       label="Minimum percentage query coverage for BLAST matches"
 	       help="Default is 50%, use 0 for no filtering." />
+        <param name="make_nr" type="boolean" checked="false" truevalue="--nr" falsevalue=""
+	       label="Process input FASTA files to collapse identical sequences"
+               help="i.e. First make the input non-redundant" />
     </inputs>
     <outputs>
         <data name="output" format="tabular" label="BLAST RBH: $fasta_a.name vs $fasta_b.name" />
@@ -201,6 +207,13 @@
 can happen following gene duplication, or for (near) identical gene
 duplicates.

+The tool can optionally make the FASTA files non-redundant by replacing
+repeated identical sequences with a single representative before building
+the databases and running BLAST.
+
+Finally, the tool can be run using the same FASTA input file to look for
+RBH within the dataset. In this case, self matches are discarded.
+
 .. class:: warningmark

 **Note**
--- a/tools/blast_rbh/tool_dependencies.xml	Mon Aug 04 08:13:39 2014 -0400
+++ b/tools/blast_rbh/tool_dependencies.xml	Fri Nov 21 06:57:14 2014 -0500
@@ -1,6 +1,9 @@
 <?xml version="1.0"?>
 <tool_dependency>
-    <package name="blast+" version="2.2.29">
-        <repository changeset_revision="a2ec897aac2c" name="package_blast_plus_2_2_29" owner="iuc" toolshed="https://toolshed.g2.bx.psu.edu" />
+    <package name="biopython" version="1.64">
+        <repository changeset_revision="5477a05cc158" name="package_biopython_1_64" owner="biopython" toolshed="https://toolshed.g2.bx.psu.edu" />
+    </package>
+    <package name="blast+" version="2.2.30">
+        <repository changeset_revision="0fe5d5c28ea2" name="package_blast_plus_2_2_30" owner="iuc" toolshed="https://toolshed.g2.bx.psu.edu" />
     </package>
 </tool_dependency>