Mercurial > repos > peterjc > blast_rbh
changeset 5:8f4500f6f2aa draft
Refactored to use more than one Python file (internal change only).
author | peterjc |
---|---|
date | Tue, 06 Dec 2022 15:53:36 +0000 |
parents | d8d9a9069586 |
children | b2f91cbed8d9 |
files | tools/blast_rbh/README.rst tools/blast_rbh/best_hits.py tools/blast_rbh/blast_rbh.py tools/blast_rbh/blast_rbh.xml tools/blast_rbh/tool_dependencies.xml |
diffstat | 5 files changed, 270 insertions(+), 165 deletions(-) [+] |
line wrap: on
line diff
--- a/tools/blast_rbh/README.rst Wed Apr 19 07:44:47 2017 -0400 +++ b/tools/blast_rbh/README.rst Tue Dec 06 15:53:36 2022 +0000 @@ -24,13 +24,13 @@ NCBI BLAST+ integrated into Galaxy. P.J.A. Cock, J.M. Chilton, B. Gruening, J.E. Johnson, N. Soranzo *GigaScience* 2015, 4:39 -DOI: http://dx.doi.org/10.1186/s13742-015-0080-7 +https://doi.org/10.1186/s13742-015-0080-7 You should also cite the NCBI BLAST+ tools: BLAST+: architecture and applications. C. Camacho et al. *BMC Bioinformatics* 2009, 10:421. -DOI: http://dx.doi.org/10.1186/1471-2105-10-421 +https://doi.org/10.1186/1471-2105-10-421 Automated Installation @@ -43,9 +43,10 @@ Manual Installation =================== -There are just two files to install: +There are just three files to install: - ``blast_rbh.py`` (the Python script) +- ``best_hits.py`` (helper script, put in same directory) - ``blast_rbh.xml`` (the Galaxy tool definition) The suggested location is in a ``tools/blast_rbh/`` folder. You will then @@ -91,6 +92,9 @@ v0.1.11 - Updated to depend on NCBI BLAST+ 2.5.0 via ToolShed or BioConda. - Update Biopython dependency. - Tweak Python script to work under Python 2 or Python 3. +v0.1.12 - Use ``<command detect_errors="aggressive">`` (internal change only). + - Single quote command line arguments (internal change only). +v0.2.0 - Refactored to use more than one Python file (internal change only). ======= ====================================================================== @@ -116,7 +120,7 @@ $ planemo shed_upload --tar_only tools/blast_rbh/ ... - $ tar -tzf shed_upload.tar.gz + $ tar -tzf shed_upload.tar.gz test-data/four_human_proteins.fasta test-data/k12_edited_proteins.fasta test-data/k12_ten_proteins.fasta
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tools/blast_rbh/best_hits.py Tue Dec 06 15:53:36 2022 +0000 @@ -0,0 +1,124 @@ +#!/usr/bin/env python +"""Helper code for BLAST Reciprocal Best Hit (RBH) from two BLAST tabular files. + +This module defines a function best_hits to return the best hit in a BLAST +tabular file, intended for use in finding BLAST Reciprocal Best Hits (RBH). + +The code in this module originated from blast_rbh.py in +https://github.com/peterjc/galaxy_blast/tree/master/tools/blast_rbh + +Please cite the author per instructions in +https://github.com/peterjc/galaxy_blast/blob/master/tools/blast_rbh/README.rst +""" + +import sys + +tie_warning = 0 + +c_query = 0 +c_match = 1 +c_score = 2 +c_identity = 3 +c_coverage = 4 +c_qlen = 5 +c_length = 6 + +cols = "qseqid sseqid bitscore pident qcovhsp qlen length" + + +def best_hits(blast_tabular, min_identity=70, min_coverage=50, ignore_self=False): + """Parse BLAST tabular output returning the best hit. + + Argument blast_tabular should be a filename, containing BLAST tabular + output with "qseqid sseqid bitscore pident qcovhsp qlen length" as + the columns. + + Parses the tabular file, iterating over the results as 2-tuples + of (query name, tuple of values for the best hit). + + One hit is returned for tied best hits to the same sequence + (e.g. repeated domains). + + Tied best hits to different sequences are NOT returned. Instead, + global variable tie_warning is incremented. + """ + global tie_warning + current = None + best_score = None + best = None + col_count = len(cols.split()) + with open(blast_tabular) as h: + for line in h: + if line.startswith("#"): + continue + parts = line.rstrip("\n").split("\t") + if len(parts) != col_count: + # Using NCBI BLAST+ 2.2.27 the undefined field is ignored + # Even NCBI BLAST+ 2.5.0 silently ignores unknown fields :( + sys.exit( + "Old version of NCBI BLAST? Expected %i columns, got %i:\n%s\n" + "Note the qcovhsp field was only added in version 2.2.28\n" + % (col_count, len(parts), line) + ) + if ( + float(parts[c_identity]) < min_identity + or float(parts[c_coverage]) < min_coverage + ): + continue + a = parts[c_query] + b = parts[c_match] + if ignore_self and a == b: + continue + score = float(parts[c_score]) + qlen = int(parts[c_qlen]) + length = int(parts[c_length]) + # print("Considering hit for %s to %s with score %s..." % (a, b, score)) + if current is None: + # First hit + assert best is None + assert best_score is None + best = dict() + # Now append this hit... + elif a != current: + # New hit + if len(best) == 1: + # Unambiguous (no tied matches) + yield current, list(best.values())[0] + else: + # print("%s has %i equally good hits: %s" + # % (a, len(best), ", ".join(best))) + tie_warning += 1 + best = dict() + # Now append this hit... + elif score < best_score: + # print("No improvement for %s, %s < %s" % (a, score, best_score)) + continue + elif score > best_score: + # This is better, discard old best + best = dict() + # Now append this hit... + else: + # print("Tied best hits for %s" % a) + assert best_score == score + # Now append this hit... + current = a + best_score = score + # This will collapse two equally good hits + # to the same target (e.g. duplicated domain) + best[b] = ( + b, + score, + parts[c_score], + parts[c_identity], + parts[c_coverage], + qlen, + length, + ) + # Best hit for final query, if unambiguous: + if current is not None: + if len(best) == 1: + yield current, list(best.values())[0] + else: + # print("%s has %i equally good hits: %s" + # % (a, len(best), ", ".join(best))) + tie_warning += 1
--- a/tools/blast_rbh/blast_rbh.py Wed Apr 19 07:44:47 2017 -0400 +++ b/tools/blast_rbh/blast_rbh.py Tue Dec 06 15:53:36 2022 +0000 @@ -24,11 +24,14 @@ import shutil import sys import tempfile +import best_hits + from optparse import OptionParser def run(cmd): + """Run the given command line string.""" return_code = os.system(cmd) if return_code: sys.exit("Error %i from: %s" % (return_code, cmd)) @@ -36,7 +39,7 @@ if "--version" in sys.argv[1:]: # TODO - Capture version of BLAST+ binaries too? - print("BLAST RBH v0.1.11") + print("BLAST RBH v0.2.0") sys.exit(0) try: @@ -54,33 +57,62 @@ $ python blast_rbh.py -a prot -t blasp -o output.tsv protA.fasta protB.fasta -There is additional guideance in the help text in the blast_rbh.xml file, +There is additional guidance in the help text in the blast_rbh.xml file, which is shown to the user via the Galaxy interface to this tool. """ parser = OptionParser(usage=usage) -parser.add_option("-a", "--alphabet", dest="dbtype", - default=None, - help="Alphabet type (nucl or prot; required)") -parser.add_option("-t", "--task", dest="task", - default=None, - help="BLAST task (e.g. blastp, blastn, megablast; required)") -parser.add_option("-i", "--identity", dest="min_identity", - default="70", - help="Minimum percentage identity (optional, default 70)") -parser.add_option("-c", "--coverage", dest="min_coverage", - default="50", - help="Minimum HSP coverage (optional, default 50)") -parser.add_option("--nr", dest="nr", default=False, action="store_true", - help="Preprocess FASTA files to collapse identitical " - "entries (make sequences non-redundant)") -parser.add_option("-o", "--output", dest="output", - default=None, metavar="FILE", - help="Output filename (required)") -parser.add_option("--threads", dest="threads", - default=threads, - help="Number of threads when running BLAST. Defaults to the " - "$GALAXY_SLOTS environment variable if set, or 1.") +parser.add_option( + "-a", + "--alphabet", + dest="dbtype", + default=None, + help="Alphabet type (nucl or prot; required)", +) +parser.add_option( + "-t", + "--task", + dest="task", + default=None, + help="BLAST task (e.g. blastp, blastn, megablast; required)", +) +parser.add_option( + "-i", + "--identity", + dest="min_identity", + default="70", + help="Minimum percentage identity (optional, default 70)", +) +parser.add_option( + "-c", + "--coverage", + dest="min_coverage", + default="50", + help="Minimum HSP coverage (optional, default 50)", +) +parser.add_option( + "--nr", + dest="nr", + default=False, + action="store_true", + help="Preprocess FASTA files to collapse identitical " + "entries (make sequences non-redundant)", +) +parser.add_option( + "-o", + "--output", + dest="output", + default=None, + metavar="FILE", + help="Output filename (required)", +) +parser.add_option( + "--threads", + dest="threads", + default=threads, + help="Number of threads when running BLAST. Defaults to the " + "$GALAXY_SLOTS environment variable if set, or 1.", +) options, args = parser.parse_args() if len(args) != 2: @@ -103,13 +135,17 @@ try: min_identity = float(options.min_identity) except ValueError: - sys.exit("Expected number between 0 and 100 for minimum identity, not %r" % min_identity) + sys.exit( + "Expected number between 0 and 100 for minimum identity, not %r" % min_identity + ) if not (0 <= min_identity <= 100): sys.exit("Expected minimum identity between 0 and 100, not %0.2f" % min_identity) try: min_coverage = float(options.min_coverage) except ValueError: - sys.exit("Expected number between 0 and 100 for minimum coverage, not %r" % min_coverage) + sys.exit( + "Expected number between 0 and 100 for minimum coverage, not %r" % min_coverage + ) if not (0 <= min_coverage <= 100): sys.exit("Expected minimum coverage between 0 and 100, not %0.2f" % min_coverage) @@ -137,7 +173,9 @@ try: threads = int(options.threads) except ValueError: - sys.exit("Expected positive integer for number of threads, not %r" % options.threads) + sys.exit( + "Expected positive integer for number of threads, not %r" % options.threads + ) if threads < 1: sys.exit("Expected positive integer for number of threads, not %r" % threads) @@ -157,95 +195,9 @@ b_vs_a = os.path.join(base_path, "B_vs_A.tabular") log = os.path.join(base_path, "blast.log") -cols = "qseqid sseqid bitscore pident qcovhsp qlen length" # Or qcovs? -c_query = 0 -c_match = 1 -c_score = 2 -c_identity = 3 -c_coverage = 4 -c_qlen = 5 -c_length = 6 - -tie_warning = 0 - - -def best_hits(blast_tabular, ignore_self=False): - """Iterate over BLAST tabular output, returns best hits as 2-tuples. - - Each return value is (query name, tuple of value for the best hit). - - Tied best hits to different sequences are NOT returned. - - One hit is returned for tied best hits to the same sequence - (e.g. repeated domains). - """ - global tie_warning - current = None - best_score = None - best = None - col_count = len(cols.split()) - with open(blast_tabular) as h: - for line in h: - if line.startswith("#"): - continue - parts = line.rstrip("\n").split("\t") - if len(parts) != col_count: - # Using NCBI BLAST+ 2.2.27 the undefined field is ignored - # Even NCBI BLAST+ 2.5.0 silently ignores unknown fields :( - sys.exit("Old version of NCBI BLAST? Expected %i columns, got %i:\n%s\n" - "Note the qcovhsp field was only added in version 2.2.28\n" - % (col_count, len(parts), line)) - if float(parts[c_identity]) < min_identity or float(parts[c_coverage]) < min_coverage: - continue - a = parts[c_query] - b = parts[c_match] - if ignore_self and a == b: - continue - score = float(parts[c_score]) - qlen = int(parts[c_qlen]) - length = int(parts[c_length]) - # print("Considering hit for %s to %s with score %s..." % (a, b, score)) - if current is None: - # First hit - assert best is None - assert best_score is None - best = dict() - # Now append this hit... - elif a != current: - # New hit - if len(best) == 1: - # Unambiguous (no tied matches) - yield current, list(best.values())[0] - else: - # print("%s has %i equally good hits: %s" % (a, len(best), ", ".join(best))) - tie_warning += 1 - best = dict() - # Now append this hit... - elif score < best_score: - # print("No improvement for %s, %s < %s" % (a, score, best_score)) - continue - elif score > best_score: - # This is better, discard old best - best = dict() - # Now append this hit... - else: - # print("Tied best hits for %s" % a) - assert best_score == score - # Now append this hit... - current = a - best_score = score - # This will collapse two equally good hits to the same target (e.g. duplicated domain) - best[b] = (b, score, parts[c_score], parts[c_identity], parts[c_coverage], qlen, length) - # Best hit for final query, if unambiguous: - if current is not None: - if len(best) == 1: - yield current, list(best.values())[0] - else: - # print("%s has %i equally good hits: %s" % (a, len(best), ", ".join(best))) - tie_warning += 1 - def check_duplicate_ids(filename): + """Check for duplicate identifiers in a FASTA file.""" # Copied from tools/ncbi_blast_plus/check_no_duplicates.py # TODO - just use Biopython's FASTA parser? if not os.path.isfile(filename): @@ -267,11 +219,12 @@ def make_nr(input_fasta, output_fasta, sep=";"): + """Make the sequences in a FASTA file non-redundant.""" # TODO - seq-hash based to avoid loading everything into RAM? by_seq = dict() try: from Bio import SeqIO - except KeyError: + except ImportError: sys.exit("Missing Biopython") for record in SeqIO.parse(input_fasta, "fasta"): s = str(record.seq).upper() @@ -300,8 +253,10 @@ elif record.id in duplicates: continue SeqIO.write(record, handle, "fasta") - print("%i unique entries; removed %i duplicates leaving %i representative records" - % (unique, len(duplicates), len(representatives))) + print( + "%i unique entries; removed %i duplicates leaving %i representative records" + % (unique, len(duplicates), len(representatives)) + ) else: os.symlink(os.path.abspath(input_fasta), output_fasta) print("No perfect duplicates in file, %i unique entries" % unique) @@ -320,31 +275,55 @@ fasta_b = tmp_b # TODO - Report log in case of error? -run('%s -dbtype %s -in "%s" -out "%s" -logfile "%s"' % (makeblastdb_exe, dbtype, fasta_a, db_a, log)) +run( + '%s -dbtype %s -in "%s" -out "%s" -logfile "%s"' + % (makeblastdb_exe, dbtype, fasta_a, db_a, log) +) if not self_comparison: - run('%s -dbtype %s -in "%s" -out "%s" -logfile "%s"' % (makeblastdb_exe, dbtype, fasta_b, db_b, log)) + run( + '%s -dbtype %s -in "%s" -out "%s" -logfile "%s"' + % (makeblastdb_exe, dbtype, fasta_b, db_b, log) + ) # print("BLAST databases prepared.") -run('%s -query "%s" -db "%s" -out "%s" -outfmt "6 %s" -num_threads %i' - % (blast_cmd, fasta_a, db_b, a_vs_b, cols, threads)) +run( + '%s -query "%s" -db "%s" -out "%s" -outfmt "6 %s" -num_threads %i' + % (blast_cmd, fasta_a, db_b, a_vs_b, best_hits.cols, threads) +) # print("BLAST species A vs species B done.") if not self_comparison: - run('%s -query "%s" -db "%s" -out "%s" -outfmt "6 %s" -num_threads %i' - % (blast_cmd, fasta_b, db_a, b_vs_a, cols, threads)) + run( + '%s -query "%s" -db "%s" -out "%s" -outfmt "6 %s" -num_threads %i' + % (blast_cmd, fasta_b, db_a, b_vs_a, best_hits.cols, threads) + ) # print("BLAST species B vs species A done.") -best_b_vs_a = dict(best_hits(b_vs_a, self_comparison)) +best_b_vs_a = dict( + best_hits.best_hits(b_vs_a, min_identity, min_coverage, self_comparison) +) count = 0 -outfile = open(out_file, 'w') -outfile.write("#A_id\tB_id\tA_length\tB_length\tA_qcovhsp\tB_qcovhsp\tlength\tpident\tbitscore\n") -for a, (b, a_score_float, a_score_str, - a_identity_str, a_coverage_str, a_qlen, a_length) in best_hits(a_vs_b, self_comparison): +outfile = open(out_file, "w") +outfile.write( + "#A_id\tB_id\tA_length\tB_length\tA_qcovhsp\tB_qcovhsp\tlength\tpident\tbitscore\n" +) +for ( + a, + (b, a_score_float, a_score_str, a_identity_str, a_coverage_str, a_qlen, a_length), +) in best_hits.best_hits(a_vs_b, min_identity, min_coverage, self_comparison): if b not in best_b_vs_a: # Match b has no best hit continue - a2, b_score_float, b_score_str, b_identity_str, b_coverage_str, b_qlen, b_length = best_b_vs_a[b] + ( + a2, + b_score_float, + b_score_str, + b_identity_str, + b_coverage_str, + b_qlen, + b_length, + ) = best_b_vs_a[b] if a != a2: # Not an RBH continue @@ -365,8 +344,11 @@ count += 1 outfile.close() print("Done, %i RBH found" % count) -if tie_warning: - sys.stderr.write("Warning: Sequences with tied best hits found, you may have duplicates/clusters\n") +if best_hits.tie_warning: + sys.stderr.write( + "Warning: Sequences with tied best hits found, " + "you may have duplicates/clusters\n" + ) # Remove temp files... shutil.rmtree(base_path)
--- a/tools/blast_rbh/blast_rbh.xml Wed Apr 19 07:44:47 2017 -0400 +++ b/tools/blast_rbh/blast_rbh.xml Tue Dec 06 15:53:36 2022 +0000 @@ -1,19 +1,14 @@ -<tool id="blast_reciprocal_best_hits" name="BLAST Reciprocal Best Hits (RBH)" version="0.1.11"> +<tool id="blast_reciprocal_best_hits" name="BLAST Reciprocal Best Hits (RBH)" version="0.2.0"> <description>from two FASTA files</description> <requirements> <requirement type="package" version="1.67">biopython</requirement> <requirement type="package" version="2.5.0">blast</requirement> </requirements> - <stdio> - <!-- Anything other than zero is an error --> - <exit_code range="1:" /> - <exit_code range=":-1" /> - </stdio> - <version_command interpreter="python"> -blast_rbh.py --version + <version_command> +python $__tool_directory__/blast_rbh.py --version </version_command> - <command interpreter="python"> -blast_rbh.py "$fasta_a" "$fasta_b" + <command detect_errors="aggressive"> +python $__tool_directory__/blast_rbh.py '$fasta_a' '$fasta_b' -a $seq.dbtype #if $seq.dbtype=="nucl" -t $seq.nucl_type @@ -23,16 +18,16 @@ $make_nr -i $identity -c $q_cover --o "$output" +-o '$output' </command> <inputs> <!-- Galaxy does not have sub-types for protein vs nucletide FASTA --> <param name="fasta_a" type="data" format="fasta" - label="Genes/proteins from species A" - help="FASTA file, one sequence per gene/protein." /> + label="Genes/proteins from species A" + help="FASTA file, one sequence per gene/protein."/> <param name="fasta_b" type="data" format="fasta" - label="Genes/proteins from species B" - help="FASTA file, one sequence per gene/protein." /> + label="Genes/proteins from species B" + help="FASTA file, one sequence per gene/protein."/> <conditional name="seq"> <param name="dbtype" type="select" label="Molecule type of FASTA inputs"> <option value="prot">protein</option> @@ -55,14 +50,14 @@ </param> </when> </conditional> - <param name="identity" type="float" value="70" min="0" max="100" - label="Minimum percentage identity for BLAST matches" - help="Default is 70%, use 0 for no filtering." /> + <param name="identity" type="float" value="70" min="0" max="100" + label="Minimum percentage identity for BLAST matches" + help="Default is 70%, use 0 for no filtering." /> <param name="q_cover" type="float" value="50" min="0" max="100" - label="Minimum percentage query coverage for BLAST matches" - help="Default is 50%, use 0 for no filtering." /> + label="Minimum percentage query coverage for BLAST matches" + help="Default is 50%, use 0 for no filtering." /> <param name="make_nr" type="boolean" checked="false" truevalue="--nr" falsevalue="" - label="Process input FASTA files to collapse identical sequences" + label="Process input FASTA files to collapse identical sequences" help="i.e. First make the input non-redundant" /> </inputs> <outputs> @@ -106,7 +101,7 @@ <param name="q_cover" value="86"/> <output name="output" file="rbh_none.tabular" ftype="tabular"/> </test> - <!-- push the coverage over the 86% level --> + <!-- push the coverage over the 86% level --> <test> <param name="fasta_a" value="rhodopsin_nucs.fasta" ftype="fasta"/> <param name="fasta_b" value="three_human_mRNA.fasta" ftype="fasta"/> @@ -134,7 +129,7 @@ <param name="q_cover" value="0.0"/> <output name="output" file="rbh_blastn_three_human_mRNA_vs_rhodopsin_nucs.tabular" ftype="tabular"/> </test> - <!-- this pair of examples test tied best hits --> + <!-- this pair of examples test tied best hits --> <test> <param name="fasta_a" value="k12_ten_proteins.fasta" ftype="fasta"/> <param name="fasta_b" value="k12_edited_proteins.fasta" ftype="fasta"/> @@ -220,7 +215,7 @@ Punta and Ofran (2008) The Rough Guide to In Silico Function Prediction, or How To Use Sequence and Structure Information To Predict Protein Function. PLoS Comput Biol 4(10): e1000160. -http://dx.doi.org/10.1371/journal.pcbi.1000160 +https://doi.org/10.1371/journal.pcbi.1000160 The defaults are to require 70% sequence identity over the aligned region (using ``pident`` in the BLAST+ tabular output), and that the HSP alignment @@ -235,12 +230,12 @@ P.J.A. Cock, J.M. Chilton, B. Gruening, J.E. Johnson, N. Soranzo (2015). NCBI BLAST+ integrated into Galaxy. *GigaScience* 4:39 -http://dx.doi.org/10.1186/s13742-015-0080-7 +https://doi.org/10.1186/s13742-015-0080-7 Christiam Camacho et al. (2009). BLAST+: architecture and applications. *BMC Bioinformatics* 15;10:421. -http://dx.doi.org/10.1186/1471-2105-10-421 +https://doi.org/10.1186/1471-2105-10-421 This wrapper is available to install into other Galaxy Instances via the Galaxy Tool Shed at http://toolshed.g2.bx.psu.edu/view/peterjc/blast_rbh
--- a/tools/blast_rbh/tool_dependencies.xml Wed Apr 19 07:44:47 2017 -0400 +++ b/tools/blast_rbh/tool_dependencies.xml Tue Dec 06 15:53:36 2022 +0000 @@ -1,9 +1,9 @@ -<?xml version="1.0"?> +<?xml version="1.0" ?> <tool_dependency> <package name="biopython" version="1.67"> - <repository changeset_revision="a42f244cce44" name="package_biopython_1_67" owner="biopython" toolshed="https://toolshed.g2.bx.psu.edu" /> + <repository name="package_biopython_1_67" owner="biopython" toolshed="https://toolshed.g2.bx.psu.edu" changeset_revision="a12f73c3b116"/> </package> <package name="blast" version="2.5.0"> - <repository changeset_revision="5dd2b68c7d04" name="package_blast_plus_2_5_0" owner="iuc" toolshed="https://toolshed.g2.bx.psu.edu" /> + <repository name="package_blast_plus_2_5_0" owner="iuc" toolshed="https://toolshed.g2.bx.psu.edu" changeset_revision="5dd2b68c7d04"/> </package> -</tool_dependency> +</tool_dependency> \ No newline at end of file