Mercurial > repos > devteam > ncbi_blast_plus
changeset 26:2889433c7ae1 draft
v0.3.3 - fixed legacy dependecy definition
author | peterjc |
---|---|
date | Sat, 20 Jul 2019 18:36:36 -0400 |
parents | e25d3acf6e68 |
children | 6f8ea4b9a2c4 |
files | test-data/cd00003_and_cd00008.pin test-data/rhodopsin_nucs.blastdbcmd.txt tools/ncbi_blast_plus/README.rst tools/ncbi_blast_plus/blastxml_to_tabular.py tools/ncbi_blast_plus/check_no_duplicates.py tools/ncbi_blast_plus/ncbi_blastdbcmd_wrapper.xml tools/ncbi_blast_plus/ncbi_macros.xml tools/ncbi_blast_plus/ncbi_makeprofiledb.xml tools/ncbi_blast_plus/ncbi_tblastn_wrapper.xml tools/ncbi_blast_plus/repository_dependencies.xml tools/ncbi_blast_plus/tool_dependencies.xml |
diffstat | 11 files changed, 178 insertions(+), 82 deletions(-) [+] |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/rhodopsin_nucs.blastdbcmd.txt Sat Jul 20 18:36:36 2019 -0400 @@ -0,0 +1,7 @@ +gi|57163782|ref|NM_001009242.1| +gi|2734705|gb|U59921.1|BBU59921 "1 -" + +gi|283855845|gb|GQ290303.1| 1-4301 + +gi|283855822|gb|GQ290312.1| "1-983" +gi|18148870|dbj|AB062417.1| "1 -" + +gi|12583664|dbj|AB043817.1| "1--" +
--- a/tools/ncbi_blast_plus/README.rst Tue Oct 23 08:48:19 2018 -0400 +++ b/tools/ncbi_blast_plus/README.rst Sat Jul 20 18:36:36 2019 -0400 @@ -264,6 +264,10 @@ output format it must be mapped to different command line arguments. - Extend gzipped query support to all the command line tools. - Workaround for gzipped support under Galaxy release 16.01 or older. +v0.3.2 - Fixed incomplete ``@CLI_OPTIONS@`` macro in the help text for the + ``tblastn`` and ``blastdbcmd`` wrappers. +v0.3.3 - Fixed ``tool_dependencies.xml`` to use BLAST+ 2.7.1 (useful only for + older Galaxy instances - we recommend conda for dependencies now). ======= ======================================================================
--- a/tools/ncbi_blast_plus/blastxml_to_tabular.py Tue Oct 23 08:48:19 2018 -0400 +++ b/tools/ncbi_blast_plus/blastxml_to_tabular.py Sat Jul 20 18:36:36 2019 -0400 @@ -81,12 +81,14 @@ else: from galaxy import eggs # noqa - ignore flake8 F401 import pkg_resources + pkg_resources.require("elementtree") from elementtree import ElementTree if len(sys.argv) == 4 and sys.argv[3] in ["std", "x22", "ext"]: # False positive if user really has a BLAST XML file called 'std' or 'ext'... - sys.exit("""ERROR: The script API has changed, sorry. + sys.exit( + """ERROR: The script API has changed, sorry. Instead of the old style: @@ -99,7 +101,8 @@ For more information, use: $ python blastxml_to_tabular.py -h -""") +""" + ) usage = """usage: %prog [options] blastxml[,...] @@ -113,16 +116,29 @@ extended column names are supported. """ parser = OptionParser(usage=usage) -parser.add_option('-o', '--output', dest='output', default=None, - help='output filename (defaults to stdout)', - metavar="FILE") -parser.add_option("-c", "--columns", dest="columns", default='std', - help="[std|ext|col1,col2,...] standard 12 columns, extended 25 columns, or list of column names") +parser.add_option( + "-o", + "--output", + dest="output", + default=None, + help="output filename (defaults to stdout)", + metavar="FILE", +) +parser.add_option( + "-c", + "--columns", + dest="columns", + default="std", + help="[std|ext|col1,col2,...] standard 12 columns, " + "extended 25 columns, or list of column names", +) (options, args) = parser.parse_args() -colnames = ('qseqid,sseqid,pident,length,mismatch,gapopen,qstart,qend,' - 'sstart,send,evalue,bitscore,sallseqid,score,nident,positive,' - 'gaps,ppos,qframe,sframe,qseq,sseq,qlen,slen,salltitles').split(',') +colnames = ( + "qseqid,sseqid,pident,length,mismatch,gapopen,qstart,qend," + "sstart,send,evalue,bitscore,sallseqid,score,nident,positive," + "gaps,ppos,qframe,sframe,qseq,sseq,qlen,slen,salltitles" +).split(",") if len(args) < 1: sys.exit("ERROR: No BLASTXML input files given; run with --help to see options.") @@ -148,7 +164,9 @@ assert set(colnames).issuperset(cols), cols if not cols: sys.exit("No columns selected!") - extended = max(colnames.index(c) for c in cols) >= 12 # Do we need any higher columns? + extended = ( + max(colnames.index(c) for c in cols) >= 12 + ) # Do we need any higher columns? del out_fmt for in_file in args: @@ -156,15 +174,15 @@ sys.exit("Input BLAST XML file not found: %s" % in_file) -re_default_query_id = re.compile("^Query_\d+$") -assert re_default_query_id.match("Query_101") -assert not re_default_query_id.match("Query_101a") -assert not re_default_query_id.match("MyQuery_101") -re_default_subject_id = re.compile("^Subject_\d+$") -assert re_default_subject_id.match("Subject_1") -assert not re_default_subject_id.match("Subject_") -assert not re_default_subject_id.match("Subject_12a") -assert not re_default_subject_id.match("TheSubject_1") +re_default_query_id = re.compile(r"^Query_\d+$") +assert re_default_query_id.match(r"Query_101") +assert not re_default_query_id.match(r"Query_101a") +assert not re_default_query_id.match(r"MyQuery_101") +re_default_subject_id = re.compile(r"^Subject_\d+$") +assert re_default_subject_id.match(r"Subject_1") +assert not re_default_subject_id.match(r"Subject_") +assert not re_default_subject_id.match(r"Subject_12a") +assert not re_default_subject_id.match(r"TheSubject_1") def convert(blastxml_filename, output_handle): @@ -213,7 +231,8 @@ # <Hit_accession>P56514</Hit_accession> # or, # <Hit_id>Subject_1</Hit_id> - # <Hit_def>gi|57163783|ref|NP_001009242.1| rhodopsin [Felis catus]</Hit_def> + # <Hit_def>gi|57163783|ref|NP_001009242.1| + # rhodopsin [Felis catus]</Hit_def> # <Hit_accession>Subject_1</Hit_accession> # # apparently depending on the parse_deflines switch @@ -225,11 +244,15 @@ # <Hit_accession>2</Hit_accession> sseqid = hit.findtext("Hit_id").split(None, 1)[0] hit_def = sseqid + " " + hit.findtext("Hit_def") - if re_default_subject_id.match(sseqid) and sseqid == hit.findtext("Hit_accession"): + if re_default_subject_id.match(sseqid) and sseqid == hit.findtext( + "Hit_accession" + ): # Place holder ID, take the first word of the subject definition hit_def = hit.findtext("Hit_def") sseqid = hit_def.split(None, 1)[0] - if sseqid.startswith("gnl|BL_ORD_ID|") and sseqid == "gnl|BL_ORD_ID|" + hit.findtext("Hit_accession"): + if sseqid.startswith( + "gnl|BL_ORD_ID|" + ) and sseqid == "gnl|BL_ORD_ID|" + hit.findtext("Hit_accession"): # Alternative place holder ID, again take the first word of hit_def hit_def = hit.findtext("Hit_def") sseqid = hit_def.split(None, 1)[0] @@ -244,27 +267,61 @@ h_seq = hsp.findtext("Hsp_hseq") m_seq = hsp.findtext("Hsp_midline") assert len(q_seq) == len(h_seq) == len(m_seq) == int(length) - gapopen = str(len(q_seq.replace('-', ' ').split()) - 1 + - len(h_seq.replace('-', ' ').split()) - 1) + gapopen = str( + len(q_seq.replace("-", " ").split()) + - 1 + + len(h_seq.replace("-", " ").split()) + - 1 + ) - mismatch = m_seq.count(' ') + m_seq.count('+') - q_seq.count('-') - h_seq.count('-') + mismatch = ( + m_seq.count(" ") + + m_seq.count("+") + - q_seq.count("-") + - h_seq.count("-") + ) # TODO - Remove this alternative mismatch calculation and test # once satisifed there are no problems - expected_mismatch = len(q_seq) - sum(1 for q, h in zip(q_seq, h_seq) - if q == h or q == "-" or h == "-") + expected_mismatch = len(q_seq) - sum( + 1 + for q, h in zip(q_seq, h_seq) + if q == h or q == "-" or h == "-" + ) xx = sum(1 for q, h in zip(q_seq, h_seq) if q == "X" and h == "X") - if not (expected_mismatch - q_seq.count("X") <= int(mismatch) <= expected_mismatch + xx): - sys.exit("%s vs %s mismatches, expected %i <= %i <= %i" - % (qseqid, sseqid, expected_mismatch - q_seq.count("X"), - int(mismatch), expected_mismatch)) + if not ( + expected_mismatch - q_seq.count("X") + <= int(mismatch) + <= expected_mismatch + xx + ): + sys.exit( + "%s vs %s mismatches, expected %i <= %i <= %i" + % ( + qseqid, + sseqid, + expected_mismatch - q_seq.count("X"), + int(mismatch), + expected_mismatch, + ) + ) # TODO - Remove this alternative identity calculation and test # once satisifed there are no problems expected_identity = sum(1 for q, h in zip(q_seq, h_seq) if q == h) - if not (expected_identity - xx <= int(nident) <= expected_identity + q_seq.count("X")): - sys.exit("%s vs %s identities, expected %i <= %i <= %i" - % (qseqid, sseqid, expected_identity, int(nident), - expected_identity + q_seq.count("X"))) + if not ( + expected_identity - xx + <= int(nident) + <= expected_identity + q_seq.count("X") + ): + sys.exit( + "%s vs %s identities, expected %i <= %i <= %i" + % ( + qseqid, + sseqid, + expected_identity, + int(nident), + expected_identity + q_seq.count("X"), + ) + ) evalue = hsp.findtext("Hsp_evalue") if evalue == "0": @@ -280,53 +337,66 @@ # Note BLAST does not round to nearest int, it truncates bitscore = "%i" % bitscore - values = [qseqid, - sseqid, - pident, - length, # hsp.findtext("Hsp_align-len") - str(mismatch), - gapopen, - hsp.findtext("Hsp_query-from"), # qstart, - hsp.findtext("Hsp_query-to"), # qend, - hsp.findtext("Hsp_hit-from"), # sstart, - hsp.findtext("Hsp_hit-to"), # send, - evalue, # hsp.findtext("Hsp_evalue") in scientific notation - bitscore, # hsp.findtext("Hsp_bit-score") rounded - ] + values = [ + qseqid, + sseqid, + pident, + length, # hsp.findtext("Hsp_align-len") + str(mismatch), + gapopen, + hsp.findtext("Hsp_query-from"), # qstart, + hsp.findtext("Hsp_query-to"), # qend, + hsp.findtext("Hsp_hit-from"), # sstart, + hsp.findtext("Hsp_hit-to"), # send, + evalue, # hsp.findtext("Hsp_evalue") in scientific notation + bitscore, # hsp.findtext("Hsp_bit-score") rounded + ] if extended: try: - sallseqid = ";".join(name.split(None, 1)[0] for name in hit_def.split(" >")) - salltitles = "<>".join(name.split(None, 1)[1] for name in hit_def.split(" >")) + sallseqid = ";".join( + name.split(None, 1)[0] for name in hit_def.split(" >") + ) + salltitles = "<>".join( + name.split(None, 1)[1] for name in hit_def.split(" >") + ) except IndexError as e: - sys.exit("Problem splitting multuple hits?\n%r\n--> %s" % (hit_def, e)) + sys.exit( + "Problem splitting multuple hits?\n%r\n--> %s" + % (hit_def, e) + ) # print(hit_def, "-->", sallseqid) positive = hsp.findtext("Hsp_positive") ppos = "%0.2f" % (100 * float(positive) / float(length)) qframe = hsp.findtext("Hsp_query-frame") sframe = hsp.findtext("Hsp_hit-frame") if blast_program == "blastp": - # Probably a bug in BLASTP that they use 0 or 1 depending on format + # Probably a bug in BLASTP that they use 0 or 1 + # depending on format if qframe == "0": qframe = "1" if sframe == "0": sframe = "1" slen = int(hit.findtext("Hit_len")) - values.extend([sallseqid, - hsp.findtext("Hsp_score"), # score, - nident, - positive, - hsp.findtext("Hsp_gaps"), # gaps, - ppos, - qframe, - sframe, - # NOTE - for blastp, XML shows original seq, tabular uses XXX masking - q_seq, - h_seq, - str(qlen), - str(slen), - salltitles, - ]) + values.extend( + [ + sallseqid, + hsp.findtext("Hsp_score"), # score, + nident, + positive, + hsp.findtext("Hsp_gaps"), # gaps, + ppos, + qframe, + sframe, + # NOTE - for blastp, XML shows original seq, + # tabular uses XXX masking + q_seq, + h_seq, + str(qlen), + str(slen), + salltitles, + ] + ) if cols: # Only a subset of the columns are needed values = [values[colnames.index(c)] for c in cols]
--- a/tools/ncbi_blast_plus/check_no_duplicates.py Tue Oct 23 08:48:19 2018 -0400 +++ b/tools/ncbi_blast_plus/check_no_duplicates.py Sat Jul 20 18:36:36 2019 -0400 @@ -31,7 +31,7 @@ if not magic: # Empty file, special case continue - elif magic == b'\x1f\x8b': + elif magic == b"\x1f\x8b": # Gzipped handle = gzip.open(filename, "rt") elif magic[0:1] == b">":
--- a/tools/ncbi_blast_plus/ncbi_blastdbcmd_wrapper.xml Tue Oct 23 08:48:19 2018 -0400 +++ b/tools/ncbi_blast_plus/ncbi_blastdbcmd_wrapper.xml Sat Jul 20 18:36:36 2019 -0400 @@ -8,7 +8,7 @@ <command detect_errors="aggressive" strict="true"> ## The command is a Cheetah template which allows some Python based syntax. ## Lines starting hash hash are comments. Galaxy will turn newlines into spaces -blastdbcmd -dbtype $db_opts.db_type -db "${db_opts.database.fields.path}" +blastdbcmd -dbtype $db_opts.db_type -db "${db_opts.database.fields.path.replace(',',' ')}" ##TODO: What about -ctrl_a and -target_only as advanced options? @@ -55,7 +55,7 @@ <option value="prompt">User entered</option> </param> <when value="file"> - <param name="entries" argument="-entry_batch" type="data" format="txt,tabular" label="Sequence identifier(s)" help="Plain text file with one ID per line (i.e. single column tabular file)"/> + <param name="entries" argument="-entry_batch" type="data" format="txt,tabular" label="Sequence identifier(s)" help="Plain text file with one ID per line, optionally with space separated range, strand, and algorithm."/> </when> <when value="prompt"> <param name="entries" argument="-entry" type="text" optional="false" area="true" size="10x30" label="Sequence identifier(s)" help="Comma or new line separated list"/> @@ -88,6 +88,15 @@ <output name="seq" file="rhodopsin_nucs.no_gi.fasta" ftype="fasta" /> </test> <test> + <!-- This uses various start end frame combinations but all recover full sequence --> + <param name="db_opts|db_type" value="nucl" /> + <param name="db_opts|database" value="rhodopsin_nucs" /> + <param name="id_opts|id_type" value="file" /> + <param name="id_opts|entries" value="rhodopsin_nucs.blastdbcmd.txt" ftype="txt" /> + <param name="outfmt" value="original" /> + <output name="seq" file="rhodopsin_nucs.no_gi.fasta" ftype="fasta" /> + </test> + <test> <param name="db_opts|db_type" value="nucl" /> <param name="db_opts|database" value="rhodopsin_nucs" /> <param name="id_opts|id_type" value="prompt" /> @@ -111,6 +120,12 @@ Extracts FASTA formatted sequences from a BLAST database using the NCBI BLAST+ blastdbcmd command line tool. +When giving a text file of entries, use one line per sequence. +Optional valies should be space separate - the simplest syntax +is ``identifier start-end`` (where ``end`` can be just ``-``), +or ``identifier start-end strand`` (wheere the strand given as +either ``+`` or ``-``). + .. class:: warningmark **BLAST assigned identifiers** @@ -131,7 +146,7 @@ ------- -@CLI_OPTIONS +@CLI_OPTIONS@ -------
--- a/tools/ncbi_blast_plus/ncbi_macros.xml Tue Oct 23 08:48:19 2018 -0400 +++ b/tools/ncbi_blast_plus/ncbi_macros.xml Sat Jul 20 18:36:36 2019 -0400 @@ -1,5 +1,5 @@ <macros> - <token name="@WRAPPER_VERSION@">0.3.1</token> + <token name="@WRAPPER_VERSION@">0.3.3</token> <xml name="parallelism"> <!-- If job splitting is enabled, break up the query file into parts --> <parallelism method="multi" split_inputs="query" split_mode="to_size" split_size="1000" merge_outputs="output1" />
--- a/tools/ncbi_blast_plus/ncbi_makeprofiledb.xml Tue Oct 23 08:48:19 2018 -0400 +++ b/tools/ncbi_blast_plus/ncbi_makeprofiledb.xml Sat Jul 20 18:36:36 2019 -0400 @@ -90,7 +90,7 @@ <param name="contain_pssm_scores_type" value="yes" /> <output name="outfile" file="empty_file.dat" ftype="blastdbd" > <extra_files type="file" value="cd00003_and_cd00008.phr" name="blastdb.phr" /> - <extra_files type="file" value="cd00003_and_cd00008.pin" name="blastdb.pin" compare="sim_size" delta="0" /> + <extra_files type="file" value="cd00003_and_cd00008.pin" name="blastdb.pin" compare="sim_size" delta="8" /> <extra_files type="file" value="cd00003_and_cd00008.psq" name="blastdb.psq" /> <extra_files type="file" value="cd00003_and_cd00008.freq" name="blastdb.freq" /> <extra_files type="file" value="cd00003_and_cd00008.loo" name="blastdb.loo" />
--- a/tools/ncbi_blast_plus/ncbi_tblastn_wrapper.xml Tue Oct 23 08:48:19 2018 -0400 +++ b/tools/ncbi_blast_plus/ncbi_tblastn_wrapper.xml Sat Jul 20 18:36:36 2019 -0400 @@ -166,7 +166,7 @@ ------ -@CLI_OPTIONS +@CLI_OPTIONS@ ------
--- a/tools/ncbi_blast_plus/repository_dependencies.xml Tue Oct 23 08:48:19 2018 -0400 +++ b/tools/ncbi_blast_plus/repository_dependencies.xml Sat Jul 20 18:36:36 2019 -0400 @@ -1,4 +1,4 @@ -<?xml version="1.0"?> +<?xml version="1.0" ?> <repositories description="This requires the BLAST datatype definitions (e.g. the BLAST XML format)."> - <repository changeset_revision="01b38f20197e" name="blast_datatypes" owner="devteam" toolshed="https://toolshed.g2.bx.psu.edu" /> -</repositories> + <repository changeset_revision="01b38f20197e" name="blast_datatypes" owner="devteam" toolshed="https://toolshed.g2.bx.psu.edu"/> +</repositories> \ No newline at end of file
--- a/tools/ncbi_blast_plus/tool_dependencies.xml Tue Oct 23 08:48:19 2018 -0400 +++ b/tools/ncbi_blast_plus/tool_dependencies.xml Sat Jul 20 18:36:36 2019 -0400 @@ -1,6 +1,6 @@ -<?xml version="1.0"?> +<?xml version="1.0" ?> <tool_dependency> - <package name="blast" version="2.5.0"> - <repository changeset_revision="5dd2b68c7d04" name="package_blast_plus_2_5_0" owner="iuc" toolshed="https://toolshed.g2.bx.psu.edu" /> + <package name="blast" version="2.7.1"> + <repository changeset_revision="2e9109a8924f" name="package_blast_plus_2_7_1" owner="iuc" toolshed="https://toolshed.g2.bx.psu.edu"/> </package> -</tool_dependency> +</tool_dependency> \ No newline at end of file