Repository 'blastxml_to_gapped_gff3'
hg clone https://toolshed.g2.bx.psu.edu/repos/iuc/blastxml_to_gapped_gff3

Changeset 0:bd47051afe98 (2016-12-20)
Next changeset 1:877cd0833221 (2017-02-15)
Commit message:
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/blastxml_to_gapped_gff3 commit 8f38145c94ecb1e23c3ff6f0243213dc49d2287e
added:
blastxml_to_gapped_gff3.py
blastxml_to_gapped_gff3.xml
macros.xml
static/images/blast-extended.png
static/images/blast2html.png
test-data/blast.gff
test-data/input.xml
b
diff -r 000000000000 -r bd47051afe98 blastxml_to_gapped_gff3.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/blastxml_to_gapped_gff3.py Tue Dec 20 09:21:11 2016 -0500
[
b'@@ -0,0 +1,263 @@\n+#!/usr/bin/perl\n+import argparse\n+import copy\n+import logging\n+import re\n+import sys\n+\n+from BCBio import GFF\n+\n+logging.basicConfig(level=logging.INFO)\n+log = logging.getLogger(name=\'blastxml2gff3\')\n+\n+__author__ = "Eric Rasche"\n+__version__ = "0.4.0"\n+__maintainer__ = "Eric Rasche"\n+__email__ = "esr@tamu.edu"\n+\n+__doc__ = """\n+BlastXML files, when transformed to GFF3, do not normally show gaps in the\n+blast hits. This tool aims to fill that "gap".\n+"""\n+\n+\n+def blastxml2gff3(blastxml, min_gap=3, trim=False, trim_end=False):\n+    from Bio.Blast import NCBIXML\n+    from Bio.Seq import Seq\n+    from Bio.SeqRecord import SeqRecord\n+    from Bio.SeqFeature import SeqFeature, FeatureLocation\n+\n+    blast_records = NCBIXML.parse(blastxml)\n+    records = []\n+    for record in blast_records:\n+        # http://www.sequenceontology.org/browser/release_2.4/term/SO:0000343\n+        match_type = {  # Currently we can only handle BLASTN, BLASTP\n+            \'BLASTN\': \'nucleotide_match\',\n+            \'BLASTP\': \'protein_match\',\n+        }.get(record.application, \'match\')\n+\n+        rec = SeqRecord(Seq("ACTG"), id=record.query)\n+        for hit in record.alignments:\n+            for hsp in hit.hsps:\n+                qualifiers = {\n+                    "source": "blast",\n+                    "score": hsp.expect,\n+                    "accession": hit.accession,\n+                    "hit_id": hit.hit_id,\n+                    "length": hit.length,\n+                    "hit_titles": hit.title.split(\' >\')\n+                }\n+                desc = hit.title.split(\' >\')[0]\n+                qualifiers[\'description\'] = desc[desc.index(\' \'):]\n+\n+                # This required a fair bit of sketching out/match to figure out\n+                # the first time.\n+                #\n+                # the match_start location must account for queries and\n+                # subjecst that start at 
locations other than 1\n+                parent_match_start = hsp.query_start - hsp.sbjct_start\n+                # The end is the start + hit.length because the match itself\n+                # may be longer than the parent feature, so we use the supplied\n+                # subject/hit length to calculate the real ending of the target\n+                # protein.\n+                parent_match_end = hsp.query_start + hit.length + hsp.query.count(\'-\')\n+\n+                # However, if the user requests that we trim the feature, then\n+                # we need to cut the ``match`` start to 0 to match the parent feature.\n+                # We\'ll also need to cut the end to match the query\'s end. It (maybe)\n+                # should be the feature end? But we don\'t have access to that data, so\n+                # We settle for this.\n+                if trim:\n+                    if parent_match_start < 1:\n+                        parent_match_start = 0\n+\n+                if trim or trim_end:\n+                    if parent_match_end > hsp.query_end:\n+                        parent_match_end = hsp.query_end + 1\n+\n+                # The ``match`` feature will hold one or more ``match_part``s\n+                top_feature = SeqFeature(\n+                    FeatureLocation(parent_match_start, parent_match_end),\n+                    type=match_type, strand=0,\n+                    qualifiers=qualifiers\n+                )\n+\n+                # Unlike the parent feature, ``match_part``s have sources.\n+                part_qualifiers = {\n+                    "source": "blast",\n+                }\n+                top_feature.sub_features = []\n+                for start, end, cigar in generate_parts(hsp.query, hsp.match,\n+                                                        hsp.sbjct,\n+                                                        ignore_under=min_gap):\n+                    part_qualifiers[\'Gap\'] = cigar\n+                    
part_qualifiers[\'ID\'] = hit.hit_id\n+\n+                    if trim:\n+                        # If trimming, then we start relative to th'..b'ome::\n+\n+        Q:ACTGACTGACTG\n+        S:ACTGAC---CTG\n+\n+    which greatly simplifies the process of identifying the correct location\n+    for a match_part\n+    """\n+    prev = 0\n+    fq = \'\'\n+    fm = \'\'\n+    fs = \'\'\n+    for position in re.finditer(\'-\', query):\n+        fq += query[prev:position.start()]\n+        fm += match[prev:position.start()]\n+        fs += subject[prev:position.start()]\n+        prev = position.start() + 1\n+    fq += query[prev:]\n+    fm += match[prev:]\n+    fs += subject[prev:]\n+\n+    return (fq, fm, fs)\n+\n+\n+def generate_parts(query, match, subject, ignore_under=3):\n+    region_q = []\n+    region_m = []\n+    region_s = []\n+\n+    (query, match, subject) = __remove_query_gaps(query, match, subject)\n+\n+    region_start = -1\n+    region_end = -1\n+    mismatch_count = 0\n+    for i, (q, m, s) in enumerate(zip(query, match, subject)):\n+\n+        # If we have a match\n+        if m != \' \' or m == \'+\':\n+            if region_start == -1:\n+                region_start = i\n+                # It\'s a new region, we need to reset or it\'s pre-seeded with\n+                # spaces\n+                region_q = []\n+                region_m = []\n+                region_s = []\n+            region_end = i\n+            mismatch_count = 0\n+        else:\n+            mismatch_count += 1\n+\n+        region_q.append(q)\n+        region_m.append(m)\n+        region_s.append(s)\n+\n+        if mismatch_count >= ignore_under and region_start != -1 and region_end != -1:\n+            region_q = region_q[0:-ignore_under]\n+            region_m = region_m[0:-ignore_under]\n+            region_s = region_s[0:-ignore_under]\n+            yield region_start, region_end + 1, \\\n+                cigar_from_string(region_q, region_m, region_s, strict_m=True)\n+           
 region_q = []\n+            region_m = []\n+            region_s = []\n+\n+            region_start = -1\n+            region_end = -1\n+            mismatch_count = 0\n+\n+    yield region_start, region_end + 1, \\\n+        cigar_from_string(region_q, region_m, region_s, strict_m=True)\n+\n+\n+def _qms_to_matches(query, match, subject, strict_m=True):\n+    matchline = []\n+\n+    for (q, m, s) in zip(query, match, subject):\n+        ret = \'\'\n+\n+        if m != \' \' or m == \'+\':\n+            ret = \'=\'\n+        elif m == \' \':\n+            if q == \'-\':\n+                ret = \'D\'\n+            elif s == \'-\':\n+                ret = \'I\'\n+            else:\n+                ret = \'X\'\n+        else:\n+            log.warn("Bad data: \\n\\t%s\\n\\t%s\\n\\t%s\\n" % (query, match, subject))\n+\n+        if strict_m:\n+            if ret == \'=\' or ret == \'X\':\n+                ret = \'M\'\n+\n+        matchline.append(ret)\n+    return matchline\n+\n+\n+def _matchline_to_cigar(matchline):\n+    cigar_line = []\n+    last_char = matchline[0]\n+    count = 0\n+    for char in matchline:\n+        if char == last_char:\n+            count += 1\n+        else:\n+            cigar_line.append("%s%s" % (last_char, count))\n+            count = 1\n+        last_char = char\n+    cigar_line.append("%s%s" % (last_char, count))\n+    return \' \'.join(cigar_line)\n+\n+\n+def cigar_from_string(query, match, subject, strict_m=True):\n+    matchline = _qms_to_matches(query, match, subject, strict_m=strict_m)\n+    if len(matchline) > 0:\n+        return _matchline_to_cigar(matchline)\n+    else:\n+        return ""\n+\n+\n+if __name__ == \'__main__\':\n+    parser = argparse.ArgumentParser(description=\'Convert Blast XML to gapped GFF3\', epilog=\'\')\n+    parser.add_argument(\'blastxml\', type=open, help=\'Blast XML Output\')\n+    parser.add_argument(\'--min_gap\', type=int, help=\'Maximum gap size before generating a new match_part\', default=3)\n+  
  parser.add_argument(\'--trim\', action=\'store_true\', help=\'Trim blast hits to be only as long as the parent feature\')\n+    parser.add_argument(\'--trim_end\', action=\'store_true\', help=\'Cut blast results off at end of gene\')\n+    args = parser.parse_args()\n+\n+    result = blastxml2gff3(**vars(args))\n+    GFF.write(result, sys.stdout)\n'
b
diff -r 000000000000 -r bd47051afe98 blastxml_to_gapped_gff3.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/blastxml_to_gapped_gff3.xml Tue Dec 20 09:21:11 2016 -0500
[
@@ -0,0 +1,73 @@
+<tool id="blastxml_to_gapped_gff3" name="BlastXML to gapped GFF3" version="1.1">
+  <description></description>
+  <macros>
+    <import>macros.xml</import>
+  </macros>
+  <expand macro="requirements"/>
+  <expand macro="stdio"/>
+  <version_command>python blastxml_to_gapped_gff3.py --version</version_command>
+  <command>
+  <![CDATA[
+    python '$__tool_directory__/blastxml_to_gapped_gff3.py'
+      '$blastxml'
+      --min_gap $min_gap
+      $trim
+      > '$output'
+  ]]></command>
+  <inputs>
+    <param label="Blast XML" name="blastxml" type="data" format="blastxml"/>
+    <param label="Maximum gap size before generating a new match_part" name="min_gap" type="integer" value="3"/>
+
+    <param type="select" label="Trim sides of blast hits" name="trim">
+        <option value="">None</option>
+        <option value="--trim">Trim start and end</option>
+        <option value="--trim_end" selected="true">Trim only end</option>
+    </param>
+  </inputs>
+  <outputs>
+    <data format="gff3" name="output"/>
+  </outputs>
+  <tests>
+      <test>
+          <param name="blastxml" ftype="blastxml" value="input.xml"/>
+          <output name="output" file="blast.gff"/>
+      </test>
+  </tests>
+  <help><![CDATA[
+**What it does**
+
+Convert BlastXML results into GFF3 format.
+
+**Options**
+
+The trimming option captures an important feature provided in this tool that
+isn't provided in most other BlastXML visualization tools: the fact that blast
+captures complete alignment location information.
+
+This means that when most blast visualization tools produce output which looks
+like this:
+
+.. image:: $PATH_TO_IMAGES/blast2html.png
+
+This tool produces output which shows where the real subject sequence starts and ends relative to your sequence:
+
+.. image:: $PATH_TO_IMAGES/blast-extended.png
+
+This can be a useful feature for examining alternate start locations that are
+used by sequences found from your blast query.
+
+The green bars on the very top row of the picture indicate start sites; as you
+can see, the blast hits and the genome in the visualization share an upstream
+start site. You don't lose the information present in your blastxml data.
+
+So, to the end of useful functionality, this option is controllable:
+
+-  you can trim neither end, to see where the real protein alignments are.
+-  you can trim both ends, like BLAST's HTML reports
+-  you can trim just the end of the sequence, as upstream is generally more
+   interesting, and having long tails can result in poor visualizations.
+]]></help>
+    <citations>
+    </citations>
+</tool>
+
b
diff -r 000000000000 -r bd47051afe98 macros.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/macros.xml Tue Dec 20 09:21:11 2016 -0500
b
@@ -0,0 +1,17 @@
+<?xml version="1.0"?>
+<macros>
+  <xml name="requirements">
+    <requirements>
+      <requirement type="package" version="0.6.4">bcbiogff</requirement>
+      <yield/>
+    </requirements>
+  </xml>
+  <xml name="stdio">
+    <stdio>
+      <exit_code range="1:"/>
+      <exit_code range=":-1"/>
+      <regex match="Error:"/>
+      <regex match="Exception:"/>
+    </stdio>
+  </xml>
+</macros>
b
diff -r 000000000000 -r bd47051afe98 static/images/blast-extended.png
b
Binary file static/images/blast-extended.png has changed
b
diff -r 000000000000 -r bd47051afe98 static/images/blast2html.png
b
Binary file static/images/blast2html.png has changed
b
diff -r 000000000000 -r bd47051afe98 test-data/blast.gff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/blast.gff Tue Dec 20 09:21:11 2016 -0500
b
b'@@ -0,0 +1,1963 @@\n+##gff-version 3\n+##sequence-region Merlin_1 1 4\n+Merlin_1\tblast\tprotein_match\t-471\t230\t3.74548e-55\t.\t.\tID=biopygen1;accession=YP_007004572;description=hypothetical protein %5BEnterobacteria phage ime09%5D;hit_id=gi%7C422934611%7Cref%7CYP_007004572.1%7C;hit_titles=gi%7C422934611%7Cref%7CYP_007004572.1%7C hypothetical protein %5BEnterobacteria phage ime09%5D,gi%7C339791394%7Cgb%7CAEK12451.1%7C hypothetical protein %5BEnterobacteria phage ime09%5D;length=685\n+Merlin_1\tblast\tmatch_part\t2\t14\t.\t.\t.\tGap=M13;ID=gi%7C422934611%7Cref%7CYP_007004572.1%7C;Parent=biopygen1\n+Merlin_1\tblast\tmatch_part\t18\t55\t.\t.\t.\tGap=M38;ID=gi%7C422934611%7Cref%7CYP_007004572.1%7C;Parent=biopygen1\n+Merlin_1\tblast\tmatch_part\t59\t63\t.\t.\t.\tGap=M5;ID=gi%7C422934611%7Cref%7CYP_007004572.1%7C;Parent=biopygen1\n+Merlin_1\tblast\tmatch_part\t73\t82\t.\t.\t.\tGap=M10;ID=gi%7C422934611%7Cref%7CYP_007004572.1%7C;Parent=biopygen1\n+Merlin_1\tblast\tmatch_part\t87\t90\t.\t.\t.\tGap=M4;ID=gi%7C422934611%7Cref%7CYP_007004572.1%7C;Parent=biopygen1\n+Merlin_1\tblast\tmatch_part\t103\t207\t.\t.\t.\tGap=M72 I2 M26 I2 M3;ID=gi%7C422934611%7Cref%7CYP_007004572.1%7C;Parent=biopygen1\n+Merlin_1\tblast\tmatch_part\t212\t229\t.\t.\t.\tGap=M18;ID=gi%7C422934611%7Cref%7CYP_007004572.1%7C;Parent=biopygen1\n+Merlin_1\tblast\tprotein_match\t-471\t230\t4.31042e-55\t.\t.\tID=biopygen2;accession=YP_004415089;description=hypothetical protein Shfl2p198 %5BShigella phage Shfl2%5D;hit_id=gi%7C330858714%7Cref%7CYP_004415089.1%7C;hit_titles=gi%7C330858714%7Cref%7CYP_004415089.1%7C hypothetical protein Shfl2p198 %5BShigella phage Shfl2%5D,gi%7C327397648%7Cgb%7CAEA73150.1%7C hypothetical protein Shfl2p198 %5BShigella phage 
Shfl2%5D;length=685\n+Merlin_1\tblast\tmatch_part\t2\t14\t.\t.\t.\tGap=M13;ID=gi%7C330858714%7Cref%7CYP_004415089.1%7C;Parent=biopygen2\n+Merlin_1\tblast\tmatch_part\t18\t55\t.\t.\t.\tGap=M38;ID=gi%7C330858714%7Cref%7CYP_004415089.1%7C;Parent=biopygen2\n+Merlin_1\tblast\tmatch_part\t59\t63\t.\t.\t.\tGap=M5;ID=gi%7C330858714%7Cref%7CYP_004415089.1%7C;Parent=biopygen2\n+Merlin_1\tblast\tmatch_part\t73\t82\t.\t.\t.\tGap=M10;ID=gi%7C330858714%7Cref%7CYP_004415089.1%7C;Parent=biopygen2\n+Merlin_1\tblast\tmatch_part\t87\t90\t.\t.\t.\tGap=M4;ID=gi%7C330858714%7Cref%7CYP_004415089.1%7C;Parent=biopygen2\n+Merlin_1\tblast\tmatch_part\t103\t207\t.\t.\t.\tGap=M72 I2 M26 I2 M3;ID=gi%7C330858714%7Cref%7CYP_004415089.1%7C;Parent=biopygen2\n+Merlin_1\tblast\tmatch_part\t212\t229\t.\t.\t.\tGap=M18;ID=gi%7C330858714%7Cref%7CYP_004415089.1%7C;Parent=biopygen2\n+Merlin_1\tblast\tprotein_match\t-471\t230\t4.35388e-55\t.\t.\tID=biopygen3;accession=YP_002854530;description=alt.-2 hypothetical protein %5BEnterobacteria phage RB14%5D;hit_id=gi%7C228861509%7Cref%7CYP_002854530.1%7C;hit_titles=gi%7C228861509%7Cref%7CYP_002854530.1%7C alt.-2 hypothetical protein %5BEnterobacteria phage RB14%5D,gi%7C227438525%7Cgb%7CACP30838.1%7C alt.-2 hypothetical protein %5BEnterobacteria phage 
RB14%5D;length=685\n+Merlin_1\tblast\tmatch_part\t2\t14\t.\t.\t.\tGap=M13;ID=gi%7C228861509%7Cref%7CYP_002854530.1%7C;Parent=biopygen3\n+Merlin_1\tblast\tmatch_part\t18\t55\t.\t.\t.\tGap=M38;ID=gi%7C228861509%7Cref%7CYP_002854530.1%7C;Parent=biopygen3\n+Merlin_1\tblast\tmatch_part\t59\t63\t.\t.\t.\tGap=M5;ID=gi%7C228861509%7Cref%7CYP_002854530.1%7C;Parent=biopygen3\n+Merlin_1\tblast\tmatch_part\t73\t82\t.\t.\t.\tGap=M10;ID=gi%7C228861509%7Cref%7CYP_002854530.1%7C;Parent=biopygen3\n+Merlin_1\tblast\tmatch_part\t96\t99\t.\t.\t.\tGap=M4;ID=gi%7C228861509%7Cref%7CYP_002854530.1%7C;Parent=biopygen3\n+Merlin_1\tblast\tmatch_part\t103\t148\t.\t.\t.\tGap=M46;ID=gi%7C228861509%7Cref%7CYP_002854530.1%7C;Parent=biopygen3\n+Merlin_1\tblast\tmatch_part\t152\t207\t.\t.\t.\tGap=M23 I2 M26 I2 M3;ID=gi%7C228861509%7Cref%7CYP_002854530.1%7C;Parent=biopygen3\n+Merlin_1\tblast\tmatch_part\t212\t229\t.\t.\t.\tGap=M18;ID=gi%7C228861509%7Cref%7CYP_002854530.1%7C;Parent=biopygen3\n+##sequence-region Merlin_2 1 4\n+Merlin_2\tblast\tprotein_match\t-10\t96\t9.23754e-17\t.\t.\tID=biopygen4;accession=YP_003934833;description=hypothetical protein SP18_gp210 %5BShigella phage 
SP18%5D;hit_id=gi%7C308814559%7Cref%7CYP_003934833.1%7C;hit_ti'..b'ref%7CNP_932538.1%7C;Parent=biopygen156\n+Merlin_5\tblast\tmatch_part\t175\t178\t.\t.\t.\tGap=M4;ID=gi%7C37651664%7Cref%7CNP_932538.1%7C;Parent=biopygen156\n+Merlin_5\tblast\tmatch_part\t182\t188\t.\t.\t.\tGap=M7;ID=gi%7C37651664%7Cref%7CNP_932538.1%7C;Parent=biopygen156\n+Merlin_5\tblast\tmatch_part\t192\t200\t.\t.\t.\tGap=M9;ID=gi%7C37651664%7Cref%7CNP_932538.1%7C;Parent=biopygen156\n+Merlin_5\tblast\tmatch_part\t204\t226\t.\t.\t.\tGap=M23;ID=gi%7C37651664%7Cref%7CNP_932538.1%7C;Parent=biopygen156\n+Merlin_5\tblast\tmatch_part\t231\t258\t.\t.\t.\tGap=M28;ID=gi%7C37651664%7Cref%7CNP_932538.1%7C;Parent=biopygen156\n+Merlin_5\tblast\tmatch_part\t262\t265\t.\t.\t.\tGap=M4;ID=gi%7C37651664%7Cref%7CNP_932538.1%7C;Parent=biopygen156\n+Merlin_5\tblast\tmatch_part\t270\t273\t.\t.\t.\tGap=M4;ID=gi%7C37651664%7Cref%7CNP_932538.1%7C;Parent=biopygen156\n+Merlin_5\tblast\tmatch_part\t277\t346\t.\t.\t.\tGap=M70;ID=gi%7C37651664%7Cref%7CNP_932538.1%7C;Parent=biopygen156\n+Merlin_5\tblast\tmatch_part\t351\t375\t.\t.\t.\tGap=M25;ID=gi%7C37651664%7Cref%7CNP_932538.1%7C;Parent=biopygen156\n+Merlin_5\tblast\tmatch_part\t385\t387\t.\t.\t.\tGap=M3;ID=gi%7C37651664%7Cref%7CNP_932538.1%7C;Parent=biopygen156\n+Merlin_5\tblast\tmatch_part\t393\t393\t.\t.\t.\tGap=M1;ID=gi%7C37651664%7Cref%7CNP_932538.1%7C;Parent=biopygen156\n+Merlin_5\tblast\tmatch_part\t397\t402\t.\t.\t.\tGap=M6;ID=gi%7C37651664%7Cref%7CNP_932538.1%7C;Parent=biopygen156\n+Merlin_5\tblast\tmatch_part\t409\t413\t.\t.\t.\tGap=M5;ID=gi%7C37651664%7Cref%7CNP_932538.1%7C;Parent=biopygen156\n+Merlin_5\tblast\tmatch_part\t417\t419\t.\t.\t.\tGap=M3;ID=gi%7C37651664%7Cref%7CNP_932538.1%7C;Parent=biopygen156\n+Merlin_5\tblast\tmatch_part\t423\t431\t.\t.\t.\tGap=M9;ID=gi%7C37651664%7Cref%7CNP_932538.1%7C;Parent=biopygen156\n+Merlin_5\tblast\tmatch_part\t437\t446\t.\t.\t.\tGap=M10;ID=gi%7C37651664%7Cref%7CNP_932538.1%7C;Parent=biopygen156\n+Merlin_5\tblast\tmatch_part\t4
51\t461\t.\t.\t.\tGap=M11;ID=gi%7C37651664%7Cref%7CNP_932538.1%7C;Parent=biopygen156\n+Merlin_5\tblast\tmatch_part\t467\t472\t.\t.\t.\tGap=M6;ID=gi%7C37651664%7Cref%7CNP_932538.1%7C;Parent=biopygen156\n+Merlin_5\tblast\tmatch_part\t476\t478\t.\t.\t.\tGap=M3;ID=gi%7C37651664%7Cref%7CNP_932538.1%7C;Parent=biopygen156\n+Merlin_5\tblast\tmatch_part\t482\t499\t.\t.\t.\tGap=M18;ID=gi%7C37651664%7Cref%7CNP_932538.1%7C;Parent=biopygen156\n+Merlin_5\tblast\tmatch_part\t506\t511\t.\t.\t.\tGap=M6;ID=gi%7C37651664%7Cref%7CNP_932538.1%7C;Parent=biopygen156\n+Merlin_5\tblast\tmatch_part\t515\t515\t.\t.\t.\tGap=M1;ID=gi%7C37651664%7Cref%7CNP_932538.1%7C;Parent=biopygen156\n+Merlin_5\tblast\tmatch_part\t519\t526\t.\t.\t.\tGap=M8;ID=gi%7C37651664%7Cref%7CNP_932538.1%7C;Parent=biopygen156\n+Merlin_5\tblast\tmatch_part\t530\t534\t.\t.\t.\tGap=M5;ID=gi%7C37651664%7Cref%7CNP_932538.1%7C;Parent=biopygen156\n+Merlin_5\tblast\tmatch_part\t540\t557\t.\t.\t.\tGap=M2 I1 M15;ID=gi%7C37651664%7Cref%7CNP_932538.1%7C;Parent=biopygen156\n+Merlin_5\tblast\tmatch_part\t562\t569\t.\t.\t.\tGap=M8;ID=gi%7C37651664%7Cref%7CNP_932538.1%7C;Parent=biopygen156\n+Merlin_5\tblast\tprotein_match\t0\t190\t1.49556e-13\t.\t.\tID=biopygen157;accession=CCI89086;description=phage baseplate hub %5BYersinia phage phiD1%5D;hit_id=gi%7C398313739%7Cemb%7CCCI89086.1%7C;hit_titles=gi%7C398313739%7Cemb%7CCCI89086.1%7C phage baseplate hub %5BYersinia phage phiD1%5D;length=191\n+Merlin_5\tblast\tmatch_part\t2\t82\t.\t.\t.\tGap=M10 I1 
M70;ID=gi%7C398313739%7Cemb%7CCCI89086.1%7C;Parent=biopygen157\n+Merlin_5\tblast\tmatch_part\t89\t89\t.\t.\t.\tGap=M1;ID=gi%7C398313739%7Cemb%7CCCI89086.1%7C;Parent=biopygen157\n+Merlin_5\tblast\tmatch_part\t94\t114\t.\t.\t.\tGap=M21;ID=gi%7C398313739%7Cemb%7CCCI89086.1%7C;Parent=biopygen157\n+Merlin_5\tblast\tmatch_part\t120\t124\t.\t.\t.\tGap=M5;ID=gi%7C398313739%7Cemb%7CCCI89086.1%7C;Parent=biopygen157\n+Merlin_5\tblast\tmatch_part\t128\t142\t.\t.\t.\tGap=M15;ID=gi%7C398313739%7Cemb%7CCCI89086.1%7C;Parent=biopygen157\n+Merlin_5\tblast\tmatch_part\t149\t157\t.\t.\t.\tGap=M9;ID=gi%7C398313739%7Cemb%7CCCI89086.1%7C;Parent=biopygen157\n+Merlin_5\tblast\tmatch_part\t163\t163\t.\t.\t.\tGap=M1;ID=gi%7C398313739%7Cemb%7CCCI89086.1%7C;Parent=biopygen157\n+Merlin_5\tblast\tmatch_part\t168\t189\t.\t.\t.\tGap=M10 I1 M11;ID=gi%7C398313739%7Cemb%7CCCI89086.1%7C;Parent=biopygen157\n'
b
diff -r 000000000000 -r bd47051afe98 test-data/input.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/input.xml Tue Dec 20 09:21:11 2016 -0500
[
b'@@ -0,0 +1,4514 @@\n+<?xml version="1.0"?>\n+<!DOCTYPE BlastOutput PUBLIC "-//NCBI//NCBI BlastOutput/EN" "http://www.ncbi.nlm.nih.gov/dtd/NCBI_BlastOutput.dtd">\n+<BlastOutput>\n+  <BlastOutput_program>blastp</BlastOutput_program>\n+  <BlastOutput_version>BLASTP 2.2.28+</BlastOutput_version>\n+  <BlastOutput_reference>Stephen F. Altschul, Thomas L. Madden, Alejandro A. Sch&amp;auml;ffer, Jinghui Zhang, Zheng Zhang, Webb Miller, and David J. Lipman (1997), &quot;Gapped BLAST and PSI-BLAST: a new generation of protein database search programs&quot;, Nucleic Acids Res. 25:3389-3402.</BlastOutput_reference>\n+  <BlastOutput_db>/usr/local/syncdb/community/nr/nr</BlastOutput_db>\n+  <BlastOutput_query-ID>Query_1</BlastOutput_query-ID>\n+  <BlastOutput_query-def>Merlin_1</BlastOutput_query-def>\n+  <BlastOutput_query-len>229</BlastOutput_query-len>\n+  <BlastOutput_param>\n+    <Parameters>\n+      <Parameters_matrix>BLOSUM62</Parameters_matrix>\n+      <Parameters_expect>0.001</Parameters_expect>\n+      <Parameters_gap-open>11</Parameters_gap-open>\n+      <Parameters_gap-extend>1</Parameters_gap-extend>\n+      <Parameters_filter>F</Parameters_filter>\n+    </Parameters>\n+  </BlastOutput_param>\n+<BlastOutput_iterations>\n+<Iteration>\n+  <Iteration_iter-num>1</Iteration_iter-num>\n+  <Iteration_query-ID>Query_1</Iteration_query-ID>\n+  <Iteration_query-def>Merlin_1</Iteration_query-def>\n+  <Iteration_query-len>229</Iteration_query-len>\n+<Iteration_hits>\n+<Hit>\n+  <Hit_num>1</Hit_num>\n+  <Hit_id>gi|422934611|ref|YP_007004572.1|</Hit_id>\n+  <Hit_def>hypothetical protein [Enterobacteria phage ime09] &gt;gi|339791394|gb|AEK12451.1| hypothetical protein [Enterobacteria phage ime09]</Hit_def>\n+  <Hit_accession>YP_007004572</Hit_accession>\n+  <Hit_len>685</Hit_len>\n+  <Hit_hsps>\n+    <Hsp>\n+      <Hsp_num>1</Hsp_num>\n+      <Hsp_bit-score>197.593</Hsp_bit-score>\n+      <Hsp_score>501</Hsp_score>\n+      <Hsp_evalue>3.74548e-55</Hsp_evalue>\n+      
<Hsp_query-from>2</Hsp_query-from>\n+      <Hsp_query-to>229</Hsp_query-to>\n+      <Hsp_hit-from>474</Hsp_hit-from>\n+      <Hsp_hit-to>684</Hsp_hit-to>\n+      <Hsp_query-frame>0</Hsp_query-frame>\n+      <Hsp_hit-frame>0</Hsp_hit-frame>\n+      <Hsp_identity>106</Hsp_identity>\n+      <Hsp_positive>154</Hsp_positive>\n+      <Hsp_gaps>21</Hsp_gaps>\n+      <Hsp_align-len>230</Hsp_align-len>\n+      <Hsp_qseq>LDKGTLLYRGQKLDLPTFEHNAENKLFYFRNYVSTSLKPLIFGEFGRMFMALDDDTTIYTAETPDDYNRFANPEDIIDIGATQKDSFDDNNNDGTSINIGKQVNLGFVISGAENVRVIVPGSLTEYPEEAEVILPRGTLLKINKITTQVDKRS--NKFMVEGSIVPPSEQIDESVEIYDGDLFMETGEVVKLSGFMQFVNESAYDEEQNQMAAEILSGFLDIDDMPRKFR</Hsp_qseq>\n+      <Hsp_hseq>LPPGTTLYRGQEVTFKTLRHNIENKMFYFKNFVSTSLKPNIFGEHGKNYMALDDSGAVFSGEGEGS----VDAEDLMHMGSHSAYANED-----------AETSVGMVIKGAERIKVIVPGHLSGFPSEAEVILPRGILLKINKVSTYMMKETAYNKYLIEGTIVPPSEQLEESV--YDGDHLMETGEVRPMAGFNQFLVEES--KEEENEVSQILASLVNINGMSKKFK</Hsp_hseq>\n+      <Hsp_midline>L  GT LYRGQ++   T  HN ENK+FYF+N+VSTSLKP IFGE G+ +MALDD   +++ E         + ED++ +G+    + +D            + ++G VI GAE ++VIVPG L+ +P EAEVILPRG LLKINK++T + K +  NK+++EG+IVPPSEQ++ESV  YDGD  METGEV  ++GF QF+ E +  +E+    ++IL+  ++I+ M +KF+</Hsp_midline>\n+    </Hsp>\n+  </Hit_hsps>\n+</Hit>\n+<Hit>\n+  <Hit_num>2</Hit_num>\n+  <Hit_id>gi|330858714|ref|YP_004415089.1|</Hit_id>\n+  <Hit_def>hypothetical protein Shfl2p198 [Shigella phage Shfl2] &gt;gi|327397648|gb|AEA73150.1| hypothetical protein Shfl2p198 [Shigella phage Shfl2]</Hit_def>\n+  <Hit_accession>YP_004415089</Hit_accession>\n+  <Hit_len>685</Hit_len>\n+  <Hit_hsps>\n+    <Hsp>\n+      <Hsp_num>1</Hsp_num>\n+      <Hsp_bit-score>197.593</Hsp_bit-score>\n+      <Hsp_score>501</Hsp_score>\n+      <Hsp_evalue>4.31042e-55</Hsp_evalue>\n+      <Hsp_query-from>2</Hsp_query-from>\n+      <Hsp_query-to>229</Hsp_query-to>\n+      <Hsp_hit-from>474</Hsp_hit-from>\n+      <Hsp_hit-to>684</Hsp_hit-to>\n+      <Hsp_query-frame>0</Hsp_query-frame>\n+      <Hsp_hit-frame>0</Hsp_hit-frame>\n+      
<Hsp_identity>106</Hsp_identity>\n+      <Hsp_positive>154</Hsp_positive>\n+      <Hsp_gaps>21</Hsp_gap'..b'e>0</Hsp_hit-frame>\n+      <Hsp_identity>150</Hsp_identity>\n+      <Hsp_positive>268</Hsp_positive>\n+      <Hsp_gaps>53</Hsp_gaps>\n+      <Hsp_align-len>553</Hsp_align-len>\n+      <Hsp_qseq>DVQSANELVAEVIEEKGNNL------IDSVDNVAEGTELAAEASERTTESIKTLTGVASTISDKLSKLASMLESKVQA--VEQKVQESGASASTGLSVIEDKLPDPDEPESPGLPERILPPLDDNNNLPDEDFFPPVPQEPENNKKDQKKDDKKPTDMLGD-LLKTTKGGFKATISITDKISSMLFKYTVTALAEAAKMAAMLFALVLGIDLLRIHFKYWTDKFMSNFDEFSAEAGEWGGLLQSIFGMLGDIKKFWEAGDWSGLAVAIVKGLADVIYNLSEIMSLGISKISASILDALGFENAATTIRGSALEGFQERTGNSLSEDDQKALAKYQSKRIEEGPGIIDKAGEFKTRAFDWVLGRENKIDSTQASDRDQETQNLKAMAPEKR---EETLIKQNEARAAVQRLEKYIGDVDPENPTNMQSLEKAYNSAKKSISDSAISDQPA---------TKKELDKRFQRVESKYQKLKEDNTPKPAA---PATSEDNQRVQNIQKAENAKE--QSKKSTGDMNVANTQVNNV-NNSKTIHQVQTVTATPAPGV</Hsp_qseq>\n+      <Hsp_hseq>DSLAAQELIAETVEQGNNELRQIKANTASLHDTAAATELGAESTEMSNTILREISETGKQTFSKLSEFAERLKGSFSADDVEQTPIRAASSSDQAIQIINEENPEPENPLVG-----YLRTISEDIKFLRENKNEPSDPKDPDVVPDDKDDLKTMIDRIGDQIVKSVDSGFKRTVNIADSISSTLFKYTITAALNFAKMAALVLSLIIAFDVLSRHFSHWTQMFQEQYAEFKETLGSFGTPFENLTGIVTDLVNYFKSDEYLKMFVRLAEGAADQMIYIVNMMMVGLAKLGAAILRALGADDKADTLEASAISVATKTVGYTPSEEEEATIGRVRKRQAQE---------EAEQSEASWWEKKKREWDG-----KPIETDEEKAVRERKKSIAENTTAEQFGKHDALSQKIQHVGVTAEKNETSNELLGKHRELLEKRASDVEQAKQSGEITTESYKQLKVEIEKQREFLDAHEQKL-----LKPKASIKPAPEPEIGVVGSIAKEEKRVEASQTAKQEAASNY-NTNANIVKNNNQTLVQAPR-TSSPGPGI</Hsp_hseq>\n+      <Hsp_midline>D  +A EL+AE +E+  N L        S+ + A  TEL AE++E +   ++ ++        KLS+ A  L+    A  VEQ    + +S+   + +I ++ P+P+ P         L  + ++     E+   P   +  +   D K D K   D +GD ++K+   GFK T++I D ISS LFKYT+TA    AKMAA++ +L++  D+L  HF +WT  F   + EF    G +G   +++ G++ D+  ++++ ++  + V + +G AD +  +  +M +G++K+ A+IL ALG ++ A T+  SA+    +  G + SE+++  + + + ++ +E         E +     W   ++ + D      +  ET   KA+   K+   E T  +Q     A+ +  +++G    +N T+ + L K     +K  SD   + Q            K E++K+ + +++  QKL      KP A   PA   +   V +I K E   E  Q+ K     N  NT 
 N V NN++T+ Q    T++P PG+</Hsp_midline>\n+    </Hsp>\n+  </Hit_hsps>\n+</Hit>\n+<Hit>\n+  <Hit_num>43</Hit_num>\n+  <Hit_id>gi|398313739|emb|CCI89086.1|</Hit_id>\n+  <Hit_def>phage baseplate hub [Yersinia phage phiD1]</Hit_def>\n+  <Hit_accession>CCI89086</Hit_accession>\n+  <Hit_len>191</Hit_len>\n+  <Hit_hsps>\n+    <Hsp>\n+      <Hsp_num>1</Hsp_num>\n+      <Hsp_bit-score>79.7221</Hsp_bit-score>\n+      <Hsp_score>195</Hsp_score>\n+      <Hsp_evalue>1.49556e-13</Hsp_evalue>\n+      <Hsp_query-from>2</Hsp_query-from>\n+      <Hsp_query-to>189</Hsp_query-to>\n+      <Hsp_hit-from>3</Hsp_hit-from>\n+      <Hsp_hit-to>187</Hsp_hit-to>\n+      <Hsp_query-frame>0</Hsp_query-frame>\n+      <Hsp_hit-frame>0</Hsp_hit-frame>\n+      <Hsp_identity>69</Hsp_identity>\n+      <Hsp_positive>102</Hsp_positive>\n+      <Hsp_gaps>17</Hsp_gaps>\n+      <Hsp_align-len>195</Hsp_align-len>\n+      <Hsp_qseq>KSENMSTMRRRKVIADSKGERDAASTASDQVDSLELIGLKLDDVQSANELVAEVIEEKGNNLIDSVDNV-------AEGTELAAEASERTTESIKTLTGVASTISDKLSKLASMLESKVQAVEQKVQESGASASTGLSVIEDKLPDPDEPESPGLPERILPPLDDNNNLPDEDFFPPVPQEPENNKKDQKKDDKK</Hsp_qseq>\n+      <Hsp_hseq>KPQEMQTMRR-KVISDNKPTQEAAKSASNTLSGLNDISTKLDDTQAASELIAQTVEEKSNEIVGAIGNVESAVSDTTAGSELIAETVEIGNNINKE---IGESLGSKLDKLTSLLEQKIQTA--GIQQTGTXLATVESAIPVKVVEDDTDRXXVLXYRXLKQLIMILTLI---FSLPLSQLSQ-SKNHQKKNRKK</Hsp_hseq>\n+      <Hsp_midline>K + M TMRR KVI+D+K  ++AA +AS+ +  L  I  KLDD Q+A+EL+A+ +EEK N ++ ++ NV         G+EL AE  E      K    +  ++  KL KL S+LE K+Q     +Q++G   +T  S I  K+ + D      L  R L  L     L    F  P+ Q  + +K  QKK+ KK</Hsp_midline>\n+    </Hsp>\n+  </Hit_hsps>\n+</Hit>\n+</Iteration_hits>\n+  <Iteration_stat>\n+    <Statistics>\n+      <Statistics_db-num>48094830</Statistics_db-num>\n+      <Statistics_db-len>17186091396</Statistics_db-len>\n+      <Statistics_hsp-len>153</Statistics_hsp-len>\n+      <Statistics_eff-space>4157067357738</Statistics_eff-space>\n+      <Statistics_kappa>0.041</Statistics_kappa>\n+      
<Statistics_lambda>0.267</Statistics_lambda>\n+      <Statistics_entropy>0.14</Statistics_entropy>\n+    </Statistics>\n+  </Iteration_stat>\n+</Iteration>\n+</BlastOutput_iterations>\n+</BlastOutput>\n+\n'