changeset 26:2889433c7ae1 draft

v0.3.3 - fixed legacy dependecy definition
author peterjc
date Sat, 20 Jul 2019 18:36:36 -0400
parents e25d3acf6e68
children 6f8ea4b9a2c4
files test-data/cd00003_and_cd00008.pin test-data/rhodopsin_nucs.blastdbcmd.txt tools/ncbi_blast_plus/README.rst tools/ncbi_blast_plus/blastxml_to_tabular.py tools/ncbi_blast_plus/check_no_duplicates.py tools/ncbi_blast_plus/ncbi_blastdbcmd_wrapper.xml tools/ncbi_blast_plus/ncbi_macros.xml tools/ncbi_blast_plus/ncbi_makeprofiledb.xml tools/ncbi_blast_plus/ncbi_tblastn_wrapper.xml tools/ncbi_blast_plus/repository_dependencies.xml tools/ncbi_blast_plus/tool_dependencies.xml
diffstat 11 files changed, 178 insertions(+), 82 deletions(-) [+]
line wrap: on
line diff
Binary file test-data/cd00003_and_cd00008.pin has changed
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/rhodopsin_nucs.blastdbcmd.txt	Sat Jul 20 18:36:36 2019 -0400
@@ -0,0 +1,7 @@
+gi|57163782|ref|NM_001009242.1|
+gi|2734705|gb|U59921.1|BBU59921 "1 -" +
+gi|283855845|gb|GQ290303.1| 1-4301 +
+gi|283855822|gb|GQ290312.1| "1-983"
+gi|18148870|dbj|AB062417.1| "1 -" +
+gi|12583664|dbj|AB043817.1| "1--"
+
--- a/tools/ncbi_blast_plus/README.rst	Tue Oct 23 08:48:19 2018 -0400
+++ b/tools/ncbi_blast_plus/README.rst	Sat Jul 20 18:36:36 2019 -0400
@@ -264,6 +264,10 @@
           output format it must be mapped to different command line arguments.
         - Extend gzipped query support to all the command line tools.
         - Workaround for gzipped support under Galaxy release 16.01 or older.
+v0.3.2  - Fixed incomplete ``@CLI_OPTIONS@`` macro in the help text for the
+          ``tblastn`` and ``blastdbcmd`` wrappers.
+v0.3.3  - Fixed ``tool_dependencies.xml`` to use BLAST+ 2.7.1 (useful only for
+          older Galaxy instances - we recommend conda for dependencies now).
 ======= ======================================================================
 
 
--- a/tools/ncbi_blast_plus/blastxml_to_tabular.py	Tue Oct 23 08:48:19 2018 -0400
+++ b/tools/ncbi_blast_plus/blastxml_to_tabular.py	Sat Jul 20 18:36:36 2019 -0400
@@ -81,12 +81,14 @@
 else:
     from galaxy import eggs  # noqa - ignore flake8 F401
     import pkg_resources
+
     pkg_resources.require("elementtree")
     from elementtree import ElementTree
 
 if len(sys.argv) == 4 and sys.argv[3] in ["std", "x22", "ext"]:
     # False positive if user really has a BLAST XML file called 'std' or 'ext'...
-    sys.exit("""ERROR: The script API has changed, sorry.
+    sys.exit(
+        """ERROR: The script API has changed, sorry.
 
 Instead of the old style:
 
@@ -99,7 +101,8 @@
 For more information, use:
 
 $ python blastxml_to_tabular.py -h
-""")
+"""
+    )
 
 usage = """usage: %prog [options] blastxml[,...]
 
@@ -113,16 +116,29 @@
 extended column names are supported.
 """
 parser = OptionParser(usage=usage)
-parser.add_option('-o', '--output', dest='output', default=None,
-                  help='output filename (defaults to stdout)',
-                  metavar="FILE")
-parser.add_option("-c", "--columns", dest="columns", default='std',
-                  help="[std|ext|col1,col2,...] standard 12 columns, extended 25 columns, or list of column names")
+parser.add_option(
+    "-o",
+    "--output",
+    dest="output",
+    default=None,
+    help="output filename (defaults to stdout)",
+    metavar="FILE",
+)
+parser.add_option(
+    "-c",
+    "--columns",
+    dest="columns",
+    default="std",
+    help="[std|ext|col1,col2,...] standard 12 columns, "
+    "extended 25 columns, or list of column names",
+)
 (options, args) = parser.parse_args()
 
-colnames = ('qseqid,sseqid,pident,length,mismatch,gapopen,qstart,qend,'
-            'sstart,send,evalue,bitscore,sallseqid,score,nident,positive,'
-            'gaps,ppos,qframe,sframe,qseq,sseq,qlen,slen,salltitles').split(',')
+colnames = (
+    "qseqid,sseqid,pident,length,mismatch,gapopen,qstart,qend,"
+    "sstart,send,evalue,bitscore,sallseqid,score,nident,positive,"
+    "gaps,ppos,qframe,sframe,qseq,sseq,qlen,slen,salltitles"
+).split(",")
 
 if len(args) < 1:
     sys.exit("ERROR: No BLASTXML input files given; run with --help to see options.")
@@ -148,7 +164,9 @@
     assert set(colnames).issuperset(cols), cols
     if not cols:
         sys.exit("No columns selected!")
-    extended = max(colnames.index(c) for c in cols) >= 12  # Do we need any higher columns?
+    extended = (
+        max(colnames.index(c) for c in cols) >= 12
+    )  # Do we need any higher columns?
 del out_fmt
 
 for in_file in args:
@@ -156,15 +174,15 @@
         sys.exit("Input BLAST XML file not found: %s" % in_file)
 
 
-re_default_query_id = re.compile("^Query_\d+$")
-assert re_default_query_id.match("Query_101")
-assert not re_default_query_id.match("Query_101a")
-assert not re_default_query_id.match("MyQuery_101")
-re_default_subject_id = re.compile("^Subject_\d+$")
-assert re_default_subject_id.match("Subject_1")
-assert not re_default_subject_id.match("Subject_")
-assert not re_default_subject_id.match("Subject_12a")
-assert not re_default_subject_id.match("TheSubject_1")
+re_default_query_id = re.compile(r"^Query_\d+$")
+assert re_default_query_id.match(r"Query_101")
+assert not re_default_query_id.match(r"Query_101a")
+assert not re_default_query_id.match(r"MyQuery_101")
+re_default_subject_id = re.compile(r"^Subject_\d+$")
+assert re_default_subject_id.match(r"Subject_1")
+assert not re_default_subject_id.match(r"Subject_")
+assert not re_default_subject_id.match(r"Subject_12a")
+assert not re_default_subject_id.match(r"TheSubject_1")
 
 
 def convert(blastxml_filename, output_handle):
@@ -213,7 +231,8 @@
                 # <Hit_accession>P56514</Hit_accession>
                 # or,
                 # <Hit_id>Subject_1</Hit_id>
-                # <Hit_def>gi|57163783|ref|NP_001009242.1| rhodopsin [Felis catus]</Hit_def>
+                # <Hit_def>gi|57163783|ref|NP_001009242.1|
+                # rhodopsin [Felis catus]</Hit_def>
                 # <Hit_accession>Subject_1</Hit_accession>
                 #
                 # apparently depending on the parse_deflines switch
@@ -225,11 +244,15 @@
                 # <Hit_accession>2</Hit_accession>
                 sseqid = hit.findtext("Hit_id").split(None, 1)[0]
                 hit_def = sseqid + " " + hit.findtext("Hit_def")
-                if re_default_subject_id.match(sseqid) and sseqid == hit.findtext("Hit_accession"):
+                if re_default_subject_id.match(sseqid) and sseqid == hit.findtext(
+                    "Hit_accession"
+                ):
                     # Place holder ID, take the first word of the subject definition
                     hit_def = hit.findtext("Hit_def")
                     sseqid = hit_def.split(None, 1)[0]
-                if sseqid.startswith("gnl|BL_ORD_ID|") and sseqid == "gnl|BL_ORD_ID|" + hit.findtext("Hit_accession"):
+                if sseqid.startswith(
+                    "gnl|BL_ORD_ID|"
+                ) and sseqid == "gnl|BL_ORD_ID|" + hit.findtext("Hit_accession"):
                     # Alternative place holder ID, again take the first word of hit_def
                     hit_def = hit.findtext("Hit_def")
                     sseqid = hit_def.split(None, 1)[0]
@@ -244,27 +267,61 @@
                     h_seq = hsp.findtext("Hsp_hseq")
                     m_seq = hsp.findtext("Hsp_midline")
                     assert len(q_seq) == len(h_seq) == len(m_seq) == int(length)
-                    gapopen = str(len(q_seq.replace('-', ' ').split()) - 1 +
-                                  len(h_seq.replace('-', ' ').split()) - 1)
+                    gapopen = str(
+                        len(q_seq.replace("-", " ").split())
+                        - 1
+                        + len(h_seq.replace("-", " ").split())
+                        - 1
+                    )
 
-                    mismatch = m_seq.count(' ') + m_seq.count('+') - q_seq.count('-') - h_seq.count('-')
+                    mismatch = (
+                        m_seq.count(" ")
+                        + m_seq.count("+")
+                        - q_seq.count("-")
+                        - h_seq.count("-")
+                    )
                     # TODO - Remove this alternative mismatch calculation and test
                     # once satisifed there are no problems
-                    expected_mismatch = len(q_seq) - sum(1 for q, h in zip(q_seq, h_seq)
-                                                         if q == h or q == "-" or h == "-")
+                    expected_mismatch = len(q_seq) - sum(
+                        1
+                        for q, h in zip(q_seq, h_seq)
+                        if q == h or q == "-" or h == "-"
+                    )
                     xx = sum(1 for q, h in zip(q_seq, h_seq) if q == "X" and h == "X")
-                    if not (expected_mismatch - q_seq.count("X") <= int(mismatch) <= expected_mismatch + xx):
-                        sys.exit("%s vs %s mismatches, expected %i <= %i <= %i"
-                                 % (qseqid, sseqid, expected_mismatch - q_seq.count("X"),
-                                    int(mismatch), expected_mismatch))
+                    if not (
+                        expected_mismatch - q_seq.count("X")
+                        <= int(mismatch)
+                        <= expected_mismatch + xx
+                    ):
+                        sys.exit(
+                            "%s vs %s mismatches, expected %i <= %i <= %i"
+                            % (
+                                qseqid,
+                                sseqid,
+                                expected_mismatch - q_seq.count("X"),
+                                int(mismatch),
+                                expected_mismatch,
+                            )
+                        )
 
                     # TODO - Remove this alternative identity calculation and test
                     # once satisifed there are no problems
                     expected_identity = sum(1 for q, h in zip(q_seq, h_seq) if q == h)
-                    if not (expected_identity - xx <= int(nident) <= expected_identity + q_seq.count("X")):
-                        sys.exit("%s vs %s identities, expected %i <= %i <= %i"
-                                 % (qseqid, sseqid, expected_identity, int(nident),
-                                    expected_identity + q_seq.count("X")))
+                    if not (
+                        expected_identity - xx
+                        <= int(nident)
+                        <= expected_identity + q_seq.count("X")
+                    ):
+                        sys.exit(
+                            "%s vs %s identities, expected %i <= %i <= %i"
+                            % (
+                                qseqid,
+                                sseqid,
+                                expected_identity,
+                                int(nident),
+                                expected_identity + q_seq.count("X"),
+                            )
+                        )
 
                     evalue = hsp.findtext("Hsp_evalue")
                     if evalue == "0":
@@ -280,53 +337,66 @@
                         # Note BLAST does not round to nearest int, it truncates
                         bitscore = "%i" % bitscore
 
-                    values = [qseqid,
-                              sseqid,
-                              pident,
-                              length,  # hsp.findtext("Hsp_align-len")
-                              str(mismatch),
-                              gapopen,
-                              hsp.findtext("Hsp_query-from"),  # qstart,
-                              hsp.findtext("Hsp_query-to"),  # qend,
-                              hsp.findtext("Hsp_hit-from"),  # sstart,
-                              hsp.findtext("Hsp_hit-to"),  # send,
-                              evalue,  # hsp.findtext("Hsp_evalue") in scientific notation
-                              bitscore,  # hsp.findtext("Hsp_bit-score") rounded
-                              ]
+                    values = [
+                        qseqid,
+                        sseqid,
+                        pident,
+                        length,  # hsp.findtext("Hsp_align-len")
+                        str(mismatch),
+                        gapopen,
+                        hsp.findtext("Hsp_query-from"),  # qstart,
+                        hsp.findtext("Hsp_query-to"),  # qend,
+                        hsp.findtext("Hsp_hit-from"),  # sstart,
+                        hsp.findtext("Hsp_hit-to"),  # send,
+                        evalue,  # hsp.findtext("Hsp_evalue") in scientific notation
+                        bitscore,  # hsp.findtext("Hsp_bit-score") rounded
+                    ]
 
                     if extended:
                         try:
-                            sallseqid = ";".join(name.split(None, 1)[0] for name in hit_def.split(" >"))
-                            salltitles = "<>".join(name.split(None, 1)[1] for name in hit_def.split(" >"))
+                            sallseqid = ";".join(
+                                name.split(None, 1)[0] for name in hit_def.split(" >")
+                            )
+                            salltitles = "<>".join(
+                                name.split(None, 1)[1] for name in hit_def.split(" >")
+                            )
                         except IndexError as e:
-                            sys.exit("Problem splitting multuple hits?\n%r\n--> %s" % (hit_def, e))
+                            sys.exit(
+                                "Problem splitting multuple hits?\n%r\n--> %s"
+                                % (hit_def, e)
+                            )
                         # print(hit_def, "-->", sallseqid)
                         positive = hsp.findtext("Hsp_positive")
                         ppos = "%0.2f" % (100 * float(positive) / float(length))
                         qframe = hsp.findtext("Hsp_query-frame")
                         sframe = hsp.findtext("Hsp_hit-frame")
                         if blast_program == "blastp":
-                            # Probably a bug in BLASTP that they use 0 or 1 depending on format
+                            # Probably a bug in BLASTP that they use 0 or 1
+                            # depending on format
                             if qframe == "0":
                                 qframe = "1"
                             if sframe == "0":
                                 sframe = "1"
                         slen = int(hit.findtext("Hit_len"))
-                        values.extend([sallseqid,
-                                       hsp.findtext("Hsp_score"),  # score,
-                                       nident,
-                                       positive,
-                                       hsp.findtext("Hsp_gaps"),  # gaps,
-                                       ppos,
-                                       qframe,
-                                       sframe,
-                                       # NOTE - for blastp, XML shows original seq, tabular uses XXX masking
-                                       q_seq,
-                                       h_seq,
-                                       str(qlen),
-                                       str(slen),
-                                       salltitles,
-                                       ])
+                        values.extend(
+                            [
+                                sallseqid,
+                                hsp.findtext("Hsp_score"),  # score,
+                                nident,
+                                positive,
+                                hsp.findtext("Hsp_gaps"),  # gaps,
+                                ppos,
+                                qframe,
+                                sframe,
+                                # NOTE - for blastp, XML shows original seq,
+                                # tabular uses XXX masking
+                                q_seq,
+                                h_seq,
+                                str(qlen),
+                                str(slen),
+                                salltitles,
+                            ]
+                        )
                     if cols:
                         # Only a subset of the columns are needed
                         values = [values[colnames.index(c)] for c in cols]
--- a/tools/ncbi_blast_plus/check_no_duplicates.py	Tue Oct 23 08:48:19 2018 -0400
+++ b/tools/ncbi_blast_plus/check_no_duplicates.py	Sat Jul 20 18:36:36 2019 -0400
@@ -31,7 +31,7 @@
     if not magic:
         # Empty file, special case
         continue
-    elif magic == b'\x1f\x8b':
+    elif magic == b"\x1f\x8b":
         # Gzipped
         handle = gzip.open(filename, "rt")
     elif magic[0:1] == b">":
--- a/tools/ncbi_blast_plus/ncbi_blastdbcmd_wrapper.xml	Tue Oct 23 08:48:19 2018 -0400
+++ b/tools/ncbi_blast_plus/ncbi_blastdbcmd_wrapper.xml	Sat Jul 20 18:36:36 2019 -0400
@@ -8,7 +8,7 @@
     <command detect_errors="aggressive" strict="true">
 ## The command is a Cheetah template which allows some Python based syntax.
 ## Lines starting hash hash are comments. Galaxy will turn newlines into spaces
-blastdbcmd -dbtype $db_opts.db_type -db "${db_opts.database.fields.path}"
+blastdbcmd -dbtype $db_opts.db_type -db "${db_opts.database.fields.path.replace(',',' ')}"
 
 ##TODO: What about -ctrl_a and -target_only as advanced options?
 
@@ -55,7 +55,7 @@
               <option value="prompt">User entered</option>
             </param>
             <when value="file">
-                <param name="entries" argument="-entry_batch" type="data" format="txt,tabular" label="Sequence identifier(s)" help="Plain text file with one ID per line (i.e. single column tabular file)"/>
+                <param name="entries" argument="-entry_batch" type="data" format="txt,tabular" label="Sequence identifier(s)" help="Plain text file with one ID per line, optionally with space separated range, strand, and algorithm."/>
             </when>
             <when value="prompt">
                 <param name="entries" argument="-entry" type="text" optional="false" area="true" size="10x30" label="Sequence identifier(s)" help="Comma or new line separated list"/>
@@ -88,6 +88,15 @@
             <output name="seq" file="rhodopsin_nucs.no_gi.fasta" ftype="fasta" />
         </test>
         <test>
+            <!-- This uses various start end frame combinations but all recover full sequence -->
+            <param name="db_opts|db_type" value="nucl" />
+            <param name="db_opts|database" value="rhodopsin_nucs" />
+            <param name="id_opts|id_type" value="file" />
+            <param name="id_opts|entries" value="rhodopsin_nucs.blastdbcmd.txt" ftype="txt" />
+            <param name="outfmt" value="original" />
+            <output name="seq" file="rhodopsin_nucs.no_gi.fasta" ftype="fasta" />
+        </test>
+        <test>
             <param name="db_opts|db_type" value="nucl" />
             <param name="db_opts|database" value="rhodopsin_nucs" />
             <param name="id_opts|id_type" value="prompt" />
@@ -111,6 +120,12 @@
 Extracts FASTA formatted sequences from a BLAST database
 using the NCBI BLAST+ blastdbcmd command line tool.
 
+When giving a text file of entries, use one line per sequence.
+Optional valies should be space separate - the simplest syntax
+is ``identifier start-end`` (where ``end`` can be just ``-``),
+or ``identifier start-end strand`` (wheere the strand given as
+either ``+`` or ``-``).
+
 .. class:: warningmark
 
 **BLAST assigned identifiers**
@@ -131,7 +146,7 @@
 
 -------
 
-@CLI_OPTIONS
+@CLI_OPTIONS@
 
 -------
 
--- a/tools/ncbi_blast_plus/ncbi_macros.xml	Tue Oct 23 08:48:19 2018 -0400
+++ b/tools/ncbi_blast_plus/ncbi_macros.xml	Sat Jul 20 18:36:36 2019 -0400
@@ -1,5 +1,5 @@
 <macros>
-    <token name="@WRAPPER_VERSION@">0.3.1</token>
+    <token name="@WRAPPER_VERSION@">0.3.3</token>
     <xml name="parallelism">
         <!-- If job splitting is enabled, break up the query file into parts -->
         <parallelism method="multi" split_inputs="query" split_mode="to_size" split_size="1000" merge_outputs="output1" />
--- a/tools/ncbi_blast_plus/ncbi_makeprofiledb.xml	Tue Oct 23 08:48:19 2018 -0400
+++ b/tools/ncbi_blast_plus/ncbi_makeprofiledb.xml	Sat Jul 20 18:36:36 2019 -0400
@@ -90,7 +90,7 @@
             <param name="contain_pssm_scores_type" value="yes" />
             <output name="outfile" file="empty_file.dat" ftype="blastdbd" >
                 <extra_files type="file" value="cd00003_and_cd00008.phr" name="blastdb.phr" />
-                <extra_files type="file" value="cd00003_and_cd00008.pin" name="blastdb.pin" compare="sim_size" delta="0" />
+                <extra_files type="file" value="cd00003_and_cd00008.pin" name="blastdb.pin" compare="sim_size" delta="8" />
                 <extra_files type="file" value="cd00003_and_cd00008.psq" name="blastdb.psq" />
                 <extra_files type="file" value="cd00003_and_cd00008.freq" name="blastdb.freq" />
                 <extra_files type="file" value="cd00003_and_cd00008.loo" name="blastdb.loo" />
--- a/tools/ncbi_blast_plus/ncbi_tblastn_wrapper.xml	Tue Oct 23 08:48:19 2018 -0400
+++ b/tools/ncbi_blast_plus/ncbi_tblastn_wrapper.xml	Sat Jul 20 18:36:36 2019 -0400
@@ -166,7 +166,7 @@
 
 ------
 
-@CLI_OPTIONS
+@CLI_OPTIONS@
 
 ------
 
--- a/tools/ncbi_blast_plus/repository_dependencies.xml	Tue Oct 23 08:48:19 2018 -0400
+++ b/tools/ncbi_blast_plus/repository_dependencies.xml	Sat Jul 20 18:36:36 2019 -0400
@@ -1,4 +1,4 @@
-<?xml version="1.0"?>
+<?xml version="1.0" ?>
 <repositories description="This requires the BLAST datatype definitions (e.g. the BLAST XML format).">
-    <repository changeset_revision="01b38f20197e" name="blast_datatypes" owner="devteam" toolshed="https://toolshed.g2.bx.psu.edu" />
-</repositories>
+    <repository changeset_revision="01b38f20197e" name="blast_datatypes" owner="devteam" toolshed="https://toolshed.g2.bx.psu.edu"/>
+</repositories>
\ No newline at end of file
--- a/tools/ncbi_blast_plus/tool_dependencies.xml	Tue Oct 23 08:48:19 2018 -0400
+++ b/tools/ncbi_blast_plus/tool_dependencies.xml	Sat Jul 20 18:36:36 2019 -0400
@@ -1,6 +1,6 @@
-<?xml version="1.0"?>
+<?xml version="1.0" ?>
 <tool_dependency>
-    <package name="blast" version="2.5.0">
-        <repository changeset_revision="5dd2b68c7d04" name="package_blast_plus_2_5_0" owner="iuc" toolshed="https://toolshed.g2.bx.psu.edu" />
+    <package name="blast" version="2.7.1">
+        <repository changeset_revision="2e9109a8924f" name="package_blast_plus_2_7_1" owner="iuc" toolshed="https://toolshed.g2.bx.psu.edu"/>
     </package>
-</tool_dependency>
+</tool_dependency>
\ No newline at end of file