Galaxy |

Changeset 0:bd47051afe98 (2016-12-20)

Next changeset 1:877cd0833221 (2017-02-15)

Commit message:
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/blastxml_to_gapped_gff3 commit 8f38145c94ecb1e23c3ff6f0243213dc49d2287e

added:
blastxml_to_gapped_gff3.py
blastxml_to_gapped_gff3.xml
macros.xml
static/images/blast-extended.png
static/images/blast2html.png
test-data/blast.gff
test-data/input.xml

diff -r 000000000000 -r bd47051afe98 blastxml_to_gapped_gff3.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/blastxml_to_gapped_gff3.py Tue Dec 20 09:21:11 2016 -0500

[

b'@@ -0,0 +1,263 @@\n+#!/usr/bin/perl\n+import argparse\n+import copy\n+import logging\n+import re\n+import sys\n+\n+from BCBio import GFF\n+\n+logging.basicConfig(level=logging.INFO)\n+log = logging.getLogger(name=\'blastxml2gff3\')\n+\n+__author__ = "Eric Rasche"\n+__version__ = "0.4.0"\n+__maintainer__ = "Eric Rasche"\n+__email__ = "esr@tamu.edu"\n+\n+__doc__ = """\n+BlastXML files, when transformed to GFF3, do not normally show gaps in the\n+blast hits. This tool aims to fill that "gap".\n+"""\n+\n+\n+def blastxml2gff3(blastxml, min_gap=3, trim=False, trim_end=False):\n+ from Bio.Blast import NCBIXML\n+ from Bio.Seq import Seq\n+ from Bio.SeqRecord import SeqRecord\n+ from Bio.SeqFeature import SeqFeature, FeatureLocation\n+\n+ blast_records = NCBIXML.parse(blastxml)\n+ records = []\n+ for record in blast_records:\n+ # http://www.sequenceontology.org/browser/release_2.4/term/SO:0000343\n+ match_type = { # Currently we can only handle BLASTN, BLASTP\n+ \'BLASTN\': \'nucleotide_match\',\n+ \'BLASTP\': \'protein_match\',\n+ }.get(record.application, \'match\')\n+\n+ rec = SeqRecord(Seq("ACTG"), id=record.query)\n+ for hit in record.alignments:\n+ for hsp in hit.hsps:\n+ qualifiers = {\n+ "source": "blast",\n+ "score": hsp.expect,\n+ "accession": hit.accession,\n+ "hit_id": hit.hit_id,\n+ "length": hit.length,\n+ "hit_titles": hit.title.split(\' >\')\n+ }\n+ desc = hit.title.split(\' >\')[0]\n+ qualifiers[\'description\'] = desc[desc.index(\' \'):]\n+\n+ # This required a fair bit of sketching out/match to figure out\n+ # the first time.\n+ #\n+ # the match_start location must account for queries and\n+ # subjecst that start at locations other than 1\n+ parent_match_start = hsp.query_start - hsp.sbjct_start\n+ # The end is the start + hit.length because the match itself\n+ # may be longer than the parent feature, so we use the supplied\n+ # subject/hit length to calculate the real ending of the target\n+ # protein.\n+ parent_match_end = hsp.query_start + hit.length 
+ hsp.query.count(\'-\')\n+\n+ # However, if the user requests that we trim the feature, then\n+ # we need to cut the ``match`` start to 0 to match the parent feature.\n+ # We\'ll also need to cut the end to match the query\'s end. It (maybe)\n+ # should be the feature end? But we don\'t have access to that data, so\n+ # We settle for this.\n+ if trim:\n+ if parent_match_start < 1:\n+ parent_match_start = 0\n+\n+ if trim or trim_end:\n+ if parent_match_end > hsp.query_end:\n+ parent_match_end = hsp.query_end + 1\n+\n+ # The ``match`` feature will hold one or more ``match_part``s\n+ top_feature = SeqFeature(\n+ FeatureLocation(parent_match_start, parent_match_end),\n+ type=match_type, strand=0,\n+ qualifiers=qualifiers\n+ )\n+\n+ # Unlike the parent feature, ``match_part``s have sources.\n+ part_qualifiers = {\n+ "source": "blast",\n+ }\n+ top_feature.sub_features = []\n+ for start, end, cigar in generate_parts(hsp.query, hsp.match,\n+ hsp.sbjct,\n+ ignore_under=min_gap):\n+ part_qualifiers[\'Gap\'] = cigar\n+ part_qualifiers[\'ID\'] = hit.hit_id\n+\n+ if trim:\n+ # If trimming, then we start relative to th'..b'ome::\n+\n+ Q:ACTGACTGACTG\n+ S:ACTGAC---CTG\n+\n+ which greatly simplifies the process of identifying the correct location\n+ for a match_part\n+ """\n+ prev = 0\n+ fq = \'\'\n+ fm = \'\'\n+ fs = \'\'\n+ for position in re.finditer(\'-\', query):\n+ fq += query[prev:position.start()]\n+ fm += match[prev:position.start()]\n+ fs += subject[prev:position.start()]\n+ prev = position.start() + 1\n+ fq += query[prev:]\n+ fm += match[prev:]\n+ fs += subject[prev:]\n+\n+ return (fq, fm, fs)\n+\n+\n+def generate_parts(query, match, subject, ignore_under=3):\n+ region_q = []\n+ region_m = []\n+ region_s = []\n+\n+ (query, match, subject) = __remove_query_gaps(query, match, subject)\n+\n+ region_start = -1\n+ region_end = -1\n+ mismatch_count = 0\n+ for i, (q, m, s) in enumerate(zip(query, match, subject)):\n+\n+ # If we have a match\n+ if m != \' \' or m == \'+\':\n+ 
if region_start == -1:\n+ region_start = i\n+ # It\'s a new region, we need to reset or it\'s pre-seeded with\n+ # spaces\n+ region_q = []\n+ region_m = []\n+ region_s = []\n+ region_end = i\n+ mismatch_count = 0\n+ else:\n+ mismatch_count += 1\n+\n+ region_q.append(q)\n+ region_m.append(m)\n+ region_s.append(s)\n+\n+ if mismatch_count >= ignore_under and region_start != -1 and region_end != -1:\n+ region_q = region_q[0:-ignore_under]\n+ region_m = region_m[0:-ignore_under]\n+ region_s = region_s[0:-ignore_under]\n+ yield region_start, region_end + 1, \\\n+ cigar_from_string(region_q, region_m, region_s, strict_m=True)\n+ region_q = []\n+ region_m = []\n+ region_s = []\n+\n+ region_start = -1\n+ region_end = -1\n+ mismatch_count = 0\n+\n+ yield region_start, region_end + 1, \\\n+ cigar_from_string(region_q, region_m, region_s, strict_m=True)\n+\n+\n+def _qms_to_matches(query, match, subject, strict_m=True):\n+ matchline = []\n+\n+ for (q, m, s) in zip(query, match, subject):\n+ ret = \'\'\n+\n+ if m != \' \' or m == \'+\':\n+ ret = \'=\'\n+ elif m == \' \':\n+ if q == \'-\':\n+ ret = \'D\'\n+ elif s == \'-\':\n+ ret = \'I\'\n+ else:\n+ ret = \'X\'\n+ else:\n+ log.warn("Bad data: \\n\\t%s\\n\\t%s\\n\\t%s\\n" % (query, match, subject))\n+\n+ if strict_m:\n+ if ret == \'=\' or ret == \'X\':\n+ ret = \'M\'\n+\n+ matchline.append(ret)\n+ return matchline\n+\n+\n+def _matchline_to_cigar(matchline):\n+ cigar_line = []\n+ last_char = matchline[0]\n+ count = 0\n+ for char in matchline:\n+ if char == last_char:\n+ count += 1\n+ else:\n+ cigar_line.append("%s%s" % (last_char, count))\n+ count = 1\n+ last_char = char\n+ cigar_line.append("%s%s" % (last_char, count))\n+ return \' \'.join(cigar_line)\n+\n+\n+def cigar_from_string(query, match, subject, strict_m=True):\n+ matchline = _qms_to_matches(query, match, subject, strict_m=strict_m)\n+ if len(matchline) > 0:\n+ return _matchline_to_cigar(matchline)\n+ else:\n+ return ""\n+\n+\n+if __name__ == \'__main__\':\n+ parser = 
argparse.ArgumentParser(description=\'Convert Blast XML to gapped GFF3\', epilog=\'\')\n+ parser.add_argument(\'blastxml\', type=open, help=\'Blast XML Output\')\n+ parser.add_argument(\'--min_gap\', type=int, help=\'Maximum gap size before generating a new match_part\', default=3)\n+ parser.add_argument(\'--trim\', action=\'store_true\', help=\'Trim blast hits to be only as long as the parent feature\')\n+ parser.add_argument(\'--trim_end\', action=\'store_true\', help=\'Cut blast results off at end of gene\')\n+ args = parser.parse_args()\n+\n+ result = blastxml2gff3(**vars(args))\n+ GFF.write(result, sys.stdout)\n'

diff -r 000000000000 -r bd47051afe98 blastxml_to_gapped_gff3.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/blastxml_to_gapped_gff3.xml Tue Dec 20 09:21:11 2016 -0500

[

@@ -0,0 +1,73 @@
+<tool id="blastxml_to_gapped_gff3" name="BlastXML to gapped GFF3" version="1.1">
+  <description></description>
+  <macros>
+    <import>macros.xml</import>
+  </macros>
+  <expand macro="requirements"/>
+  <expand macro="stdio"/>
+  <version_command>python blastxml_to_gapped_gff3.py --version</version_command>
+  <command>
+  <![CDATA[
+    python '$__tool_directory__/blastxml_to_gapped_gff3.py'
+      '$blastxml'
+      --min_gap $min_gap
+      $trim
+      > '$output'
+  ]]></command>
+  <inputs>
+    <param label="Blast XML" name="blastxml" type="data" format="blastxml"/>
+    <param label="Maximum gap size before generating a new match_part" name="min_gap" type="integer" value="3"/>
+
+    <param type="select" label="Trim sides of blast hits" name="trim">
+        <option value="">None</option>
+        <option value="--trim">Trim start and end</option>
+        <option value="--trim_end" selected="true">Trim only end</option>
+    </param>
+  </inputs>
+  <outputs>
+    <data format="gff3" name="output"/>
+  </outputs>
+  <tests>
+      <test>
+          <param name="blastxml" ftype="blastxml" value="input.xml"/>
+          <output name="output" file="blast.gff"/>
+      </test>
+  </tests>
+  <help><![CDATA[
+**What it does**
+
+Convert BlastXML results into GFF3 format.
+
+**Options**
+
+The trimming option captures an important feature provided in this tool that
+isn't provided in most other BlastXML visualization tools: the fact that blast
+captures complete alignment location information.
+
+This means that when most blast visualization tools produce output which looks
+like this:
+
+.. image:: $PATH_TO_IMAGES/blast2html.png
+
+This tool produces output which shows where the real subject sequence starts and ends relative to your sequence:
+
+.. image:: $PATH_TO_IMAGES/blast-extended.png
+
+This can be a useful feature for examining alternate start locations that are
+used by sequences found from your blast query.
+
+The green bars on the very top row of the picture indicate start sites; as you
+can see, the blast hits and the genome in the visualization share an upstream
+start site. You don't lose the information present in your blastxml data.
+
+So, to the end of useful functionality, this option is controllable:
+
+-  you can trim neither end, to see where the real protein alignments are.
+-  you can trim both ends, like BLAST's HTML reports
+-  you can trim just the end of the sequence, as upstream is generally more
+   interesting, and having long tails can result in poor visualizations.
+]]></help>
+    <citations>
+    </citations>
+</tool>
+

diff -r 000000000000 -r bd47051afe98 macros.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/macros.xml Tue Dec 20 09:21:11 2016 -0500

@@ -0,0 +1,17 @@
+<?xml version="1.0"?>
+<macros>
+  <xml name="requirements">
+    <requirements>
+      <requirement type="package" version="0.6.4">bcbiogff</requirement>
+      <yield/>
+    </requirements>
+  </xml>
+  <xml name="stdio">
+    <stdio>
+      <exit_code range="1:"/>
+      <exit_code range=":-1"/>
+      <regex match="Error:"/>
+      <regex match="Exception:"/>
+    </stdio>
+  </xml>
+</macros>

diff -r 000000000000 -r bd47051afe98 static/images/blast-extended.png

Binary file static/images/blast-extended.png has changed

diff -r 000000000000 -r bd47051afe98 static/images/blast2html.png

Binary file static/images/blast2html.png has changed

diff -r 000000000000 -r bd47051afe98 test-data/blast.gff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/blast.gff Tue Dec 20 09:21:11 2016 -0500

b'@@ -0,0 +1,1963 @@\n+##gff-version 3\n+##sequence-region Merlin_1 1 4\n+Merlin_1\tblast\tprotein_match\t-471\t230\t3.74548e-55\t.\t.\tID=biopygen1;accession=YP_007004572;description=hypothetical protein %5BEnterobacteria phage ime09%5D;hit_id=gi%7C422934611%7Cref%7CYP_007004572.1%7C;hit_titles=gi%7C422934611%7Cref%7CYP_007004572.1%7C hypothetical protein %5BEnterobacteria phage ime09%5D,gi%7C339791394%7Cgb%7CAEK12451.1%7C hypothetical protein %5BEnterobacteria phage ime09%5D;length=685\n+Merlin_1\tblast\tmatch_part\t2\t14\t.\t.\t.\tGap=M13;ID=gi%7C422934611%7Cref%7CYP_007004572.1%7C;Parent=biopygen1\n+Merlin_1\tblast\tmatch_part\t18\t55\t.\t.\t.\tGap=M38;ID=gi%7C422934611%7Cref%7CYP_007004572.1%7C;Parent=biopygen1\n+Merlin_1\tblast\tmatch_part\t59\t63\t.\t.\t.\tGap=M5;ID=gi%7C422934611%7Cref%7CYP_007004572.1%7C;Parent=biopygen1\n+Merlin_1\tblast\tmatch_part\t73\t82\t.\t.\t.\tGap=M10;ID=gi%7C422934611%7Cref%7CYP_007004572.1%7C;Parent=biopygen1\n+Merlin_1\tblast\tmatch_part\t87\t90\t.\t.\t.\tGap=M4;ID=gi%7C422934611%7Cref%7CYP_007004572.1%7C;Parent=biopygen1\n+Merlin_1\tblast\tmatch_part\t103\t207\t.\t.\t.\tGap=M72 I2 M26 I2 M3;ID=gi%7C422934611%7Cref%7CYP_007004572.1%7C;Parent=biopygen1\n+Merlin_1\tblast\tmatch_part\t212\t229\t.\t.\t.\tGap=M18;ID=gi%7C422934611%7Cref%7CYP_007004572.1%7C;Parent=biopygen1\n+Merlin_1\tblast\tprotein_match\t-471\t230\t4.31042e-55\t.\t.\tID=biopygen2;accession=YP_004415089;description=hypothetical protein Shfl2p198 %5BShigella phage Shfl2%5D;hit_id=gi%7C330858714%7Cref%7CYP_004415089.1%7C;hit_titles=gi%7C330858714%7Cref%7CYP_004415089.1%7C hypothetical protein Shfl2p198 %5BShigella phage Shfl2%5D,gi%7C327397648%7Cgb%7CAEA73150.1%7C hypothetical protein Shfl2p198 %5BShigella phage 
Shfl2%5D;length=685\n+Merlin_1\tblast\tmatch_part\t2\t14\t.\t.\t.\tGap=M13;ID=gi%7C330858714%7Cref%7CYP_004415089.1%7C;Parent=biopygen2\n+Merlin_1\tblast\tmatch_part\t18\t55\t.\t.\t.\tGap=M38;ID=gi%7C330858714%7Cref%7CYP_004415089.1%7C;Parent=biopygen2\n+Merlin_1\tblast\tmatch_part\t59\t63\t.\t.\t.\tGap=M5;ID=gi%7C330858714%7Cref%7CYP_004415089.1%7C;Parent=biopygen2\n+Merlin_1\tblast\tmatch_part\t73\t82\t.\t.\t.\tGap=M10;ID=gi%7C330858714%7Cref%7CYP_004415089.1%7C;Parent=biopygen2\n+Merlin_1\tblast\tmatch_part\t87\t90\t.\t.\t.\tGap=M4;ID=gi%7C330858714%7Cref%7CYP_004415089.1%7C;Parent=biopygen2\n+Merlin_1\tblast\tmatch_part\t103\t207\t.\t.\t.\tGap=M72 I2 M26 I2 M3;ID=gi%7C330858714%7Cref%7CYP_004415089.1%7C;Parent=biopygen2\n+Merlin_1\tblast\tmatch_part\t212\t229\t.\t.\t.\tGap=M18;ID=gi%7C330858714%7Cref%7CYP_004415089.1%7C;Parent=biopygen2\n+Merlin_1\tblast\tprotein_match\t-471\t230\t4.35388e-55\t.\t.\tID=biopygen3;accession=YP_002854530;description=alt.-2 hypothetical protein %5BEnterobacteria phage RB14%5D;hit_id=gi%7C228861509%7Cref%7CYP_002854530.1%7C;hit_titles=gi%7C228861509%7Cref%7CYP_002854530.1%7C alt.-2 hypothetical protein %5BEnterobacteria phage RB14%5D,gi%7C227438525%7Cgb%7CACP30838.1%7C alt.-2 hypothetical protein %5BEnterobacteria phage 
RB14%5D;length=685\n+Merlin_1\tblast\tmatch_part\t2\t14\t.\t.\t.\tGap=M13;ID=gi%7C228861509%7Cref%7CYP_002854530.1%7C;Parent=biopygen3\n+Merlin_1\tblast\tmatch_part\t18\t55\t.\t.\t.\tGap=M38;ID=gi%7C228861509%7Cref%7CYP_002854530.1%7C;Parent=biopygen3\n+Merlin_1\tblast\tmatch_part\t59\t63\t.\t.\t.\tGap=M5;ID=gi%7C228861509%7Cref%7CYP_002854530.1%7C;Parent=biopygen3\n+Merlin_1\tblast\tmatch_part\t73\t82\t.\t.\t.\tGap=M10;ID=gi%7C228861509%7Cref%7CYP_002854530.1%7C;Parent=biopygen3\n+Merlin_1\tblast\tmatch_part\t96\t99\t.\t.\t.\tGap=M4;ID=gi%7C228861509%7Cref%7CYP_002854530.1%7C;Parent=biopygen3\n+Merlin_1\tblast\tmatch_part\t103\t148\t.\t.\t.\tGap=M46;ID=gi%7C228861509%7Cref%7CYP_002854530.1%7C;Parent=biopygen3\n+Merlin_1\tblast\tmatch_part\t152\t207\t.\t.\t.\tGap=M23 I2 M26 I2 M3;ID=gi%7C228861509%7Cref%7CYP_002854530.1%7C;Parent=biopygen3\n+Merlin_1\tblast\tmatch_part\t212\t229\t.\t.\t.\tGap=M18;ID=gi%7C228861509%7Cref%7CYP_002854530.1%7C;Parent=biopygen3\n+##sequence-region Merlin_2 1 4\n+Merlin_2\tblast\tprotein_match\t-10\t96\t9.23754e-17\t.\t.\tID=biopygen4;accession=YP_003934833;description=hypothetical protein SP18_gp210 %5BShigella phage 
SP18%5D;hit_id=gi%7C308814559%7Cref%7CYP_003934833.1%7C;hit_ti'..b'ref%7CNP_932538.1%7C;Parent=biopygen156\n+Merlin_5\tblast\tmatch_part\t175\t178\t.\t.\t.\tGap=M4;ID=gi%7C37651664%7Cref%7CNP_932538.1%7C;Parent=biopygen156\n+Merlin_5\tblast\tmatch_part\t182\t188\t.\t.\t.\tGap=M7;ID=gi%7C37651664%7Cref%7CNP_932538.1%7C;Parent=biopygen156\n+Merlin_5\tblast\tmatch_part\t192\t200\t.\t.\t.\tGap=M9;ID=gi%7C37651664%7Cref%7CNP_932538.1%7C;Parent=biopygen156\n+Merlin_5\tblast\tmatch_part\t204\t226\t.\t.\t.\tGap=M23;ID=gi%7C37651664%7Cref%7CNP_932538.1%7C;Parent=biopygen156\n+Merlin_5\tblast\tmatch_part\t231\t258\t.\t.\t.\tGap=M28;ID=gi%7C37651664%7Cref%7CNP_932538.1%7C;Parent=biopygen156\n+Merlin_5\tblast\tmatch_part\t262\t265\t.\t.\t.\tGap=M4;ID=gi%7C37651664%7Cref%7CNP_932538.1%7C;Parent=biopygen156\n+Merlin_5\tblast\tmatch_part\t270\t273\t.\t.\t.\tGap=M4;ID=gi%7C37651664%7Cref%7CNP_932538.1%7C;Parent=biopygen156\n+Merlin_5\tblast\tmatch_part\t277\t346\t.\t.\t.\tGap=M70;ID=gi%7C37651664%7Cref%7CNP_932538.1%7C;Parent=biopygen156\n+Merlin_5\tblast\tmatch_part\t351\t375\t.\t.\t.\tGap=M25;ID=gi%7C37651664%7Cref%7CNP_932538.1%7C;Parent=biopygen156\n+Merlin_5\tblast\tmatch_part\t385\t387\t.\t.\t.\tGap=M3;ID=gi%7C37651664%7Cref%7CNP_932538.1%7C;Parent=biopygen156\n+Merlin_5\tblast\tmatch_part\t393\t393\t.\t.\t.\tGap=M1;ID=gi%7C37651664%7Cref%7CNP_932538.1%7C;Parent=biopygen156\n+Merlin_5\tblast\tmatch_part\t397\t402\t.\t.\t.\tGap=M6;ID=gi%7C37651664%7Cref%7CNP_932538.1%7C;Parent=biopygen156\n+Merlin_5\tblast\tmatch_part\t409\t413\t.\t.\t.\tGap=M5;ID=gi%7C37651664%7Cref%7CNP_932538.1%7C;Parent=biopygen156\n+Merlin_5\tblast\tmatch_part\t417\t419\t.\t.\t.\tGap=M3;ID=gi%7C37651664%7Cref%7CNP_932538.1%7C;Parent=biopygen156\n+Merlin_5\tblast\tmatch_part\t423\t431\t.\t.\t.\tGap=M9;ID=gi%7C37651664%7Cref%7CNP_932538.1%7C;Parent=biopygen156\n+Merlin_5\tblast\tmatch_part\t437\t446\t.\t.\t.\tGap=M10;ID=gi%7C37651664%7Cref%7CNP_932538.1%7C;Parent=biopygen156\n+Merlin_5\tblast\tmatch_part\t4
51\t461\t.\t.\t.\tGap=M11;ID=gi%7C37651664%7Cref%7CNP_932538.1%7C;Parent=biopygen156\n+Merlin_5\tblast\tmatch_part\t467\t472\t.\t.\t.\tGap=M6;ID=gi%7C37651664%7Cref%7CNP_932538.1%7C;Parent=biopygen156\n+Merlin_5\tblast\tmatch_part\t476\t478\t.\t.\t.\tGap=M3;ID=gi%7C37651664%7Cref%7CNP_932538.1%7C;Parent=biopygen156\n+Merlin_5\tblast\tmatch_part\t482\t499\t.\t.\t.\tGap=M18;ID=gi%7C37651664%7Cref%7CNP_932538.1%7C;Parent=biopygen156\n+Merlin_5\tblast\tmatch_part\t506\t511\t.\t.\t.\tGap=M6;ID=gi%7C37651664%7Cref%7CNP_932538.1%7C;Parent=biopygen156\n+Merlin_5\tblast\tmatch_part\t515\t515\t.\t.\t.\tGap=M1;ID=gi%7C37651664%7Cref%7CNP_932538.1%7C;Parent=biopygen156\n+Merlin_5\tblast\tmatch_part\t519\t526\t.\t.\t.\tGap=M8;ID=gi%7C37651664%7Cref%7CNP_932538.1%7C;Parent=biopygen156\n+Merlin_5\tblast\tmatch_part\t530\t534\t.\t.\t.\tGap=M5;ID=gi%7C37651664%7Cref%7CNP_932538.1%7C;Parent=biopygen156\n+Merlin_5\tblast\tmatch_part\t540\t557\t.\t.\t.\tGap=M2 I1 M15;ID=gi%7C37651664%7Cref%7CNP_932538.1%7C;Parent=biopygen156\n+Merlin_5\tblast\tmatch_part\t562\t569\t.\t.\t.\tGap=M8;ID=gi%7C37651664%7Cref%7CNP_932538.1%7C;Parent=biopygen156\n+Merlin_5\tblast\tprotein_match\t0\t190\t1.49556e-13\t.\t.\tID=biopygen157;accession=CCI89086;description=phage baseplate hub %5BYersinia phage phiD1%5D;hit_id=gi%7C398313739%7Cemb%7CCCI89086.1%7C;hit_titles=gi%7C398313739%7Cemb%7CCCI89086.1%7C phage baseplate hub %5BYersinia phage phiD1%5D;length=191\n+Merlin_5\tblast\tmatch_part\t2\t82\t.\t.\t.\tGap=M10 I1 
M70;ID=gi%7C398313739%7Cemb%7CCCI89086.1%7C;Parent=biopygen157\n+Merlin_5\tblast\tmatch_part\t89\t89\t.\t.\t.\tGap=M1;ID=gi%7C398313739%7Cemb%7CCCI89086.1%7C;Parent=biopygen157\n+Merlin_5\tblast\tmatch_part\t94\t114\t.\t.\t.\tGap=M21;ID=gi%7C398313739%7Cemb%7CCCI89086.1%7C;Parent=biopygen157\n+Merlin_5\tblast\tmatch_part\t120\t124\t.\t.\t.\tGap=M5;ID=gi%7C398313739%7Cemb%7CCCI89086.1%7C;Parent=biopygen157\n+Merlin_5\tblast\tmatch_part\t128\t142\t.\t.\t.\tGap=M15;ID=gi%7C398313739%7Cemb%7CCCI89086.1%7C;Parent=biopygen157\n+Merlin_5\tblast\tmatch_part\t149\t157\t.\t.\t.\tGap=M9;ID=gi%7C398313739%7Cemb%7CCCI89086.1%7C;Parent=biopygen157\n+Merlin_5\tblast\tmatch_part\t163\t163\t.\t.\t.\tGap=M1;ID=gi%7C398313739%7Cemb%7CCCI89086.1%7C;Parent=biopygen157\n+Merlin_5\tblast\tmatch_part\t168\t189\t.\t.\t.\tGap=M10 I1 M11;ID=gi%7C398313739%7Cemb%7CCCI89086.1%7C;Parent=biopygen157\n'

diff -r 000000000000 -r bd47051afe98 test-data/input.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/input.xml Tue Dec 20 09:21:11 2016 -0500

[

b'@@ -0,0 +1,4514 @@\n+<?xml version="1.0"?>\n+<!DOCTYPE BlastOutput PUBLIC "-//NCBI//NCBI BlastOutput/EN" "http://www.ncbi.nlm.nih.gov/dtd/NCBI_BlastOutput.dtd">\n+<BlastOutput>\n+ <BlastOutput_program>blastp</BlastOutput_program>\n+ <BlastOutput_version>BLASTP 2.2.28+</BlastOutput_version>\n+ <BlastOutput_reference>Stephen F. Altschul, Thomas L. Madden, Alejandro A. Sch&auml;ffer, Jinghui Zhang, Zheng Zhang, Webb Miller, and David J. Lipman (1997), "Gapped BLAST and PSI-BLAST: a new generation of protein database search programs", Nucleic Acids Res. 25:3389-3402.</BlastOutput_reference>\n+ <BlastOutput_db>/usr/local/syncdb/community/nr/nr</BlastOutput_db>\n+ <BlastOutput_query-ID>Query_1</BlastOutput_query-ID>\n+ <BlastOutput_query-def>Merlin_1</BlastOutput_query-def>\n+ <BlastOutput_query-len>229</BlastOutput_query-len>\n+ <BlastOutput_param>\n+ <Parameters>\n+ <Parameters_matrix>BLOSUM62</Parameters_matrix>\n+ <Parameters_expect>0.001</Parameters_expect>\n+ <Parameters_gap-open>11</Parameters_gap-open>\n+ <Parameters_gap-extend>1</Parameters_gap-extend>\n+ <Parameters_filter>F</Parameters_filter>\n+ </Parameters>\n+ </BlastOutput_param>\n+<BlastOutput_iterations>\n+<Iteration>\n+ <Iteration_iter-num>1</Iteration_iter-num>\n+ <Iteration_query-ID>Query_1</Iteration_query-ID>\n+ <Iteration_query-def>Merlin_1</Iteration_query-def>\n+ <Iteration_query-len>229</Iteration_query-len>\n+<Iteration_hits>\n+<Hit>\n+ <Hit_num>1</Hit_num>\n+ <Hit_id>gi|422934611|ref|YP_007004572.1|</Hit_id>\n+ <Hit_def>hypothetical protein [Enterobacteria phage ime09] >gi|339791394|gb|AEK12451.1| hypothetical protein [Enterobacteria phage ime09]</Hit_def>\n+ <Hit_accession>YP_007004572</Hit_accession>\n+ <Hit_len>685</Hit_len>\n+ <Hit_hsps>\n+ <Hsp>\n+ <Hsp_num>1</Hsp_num>\n+ <Hsp_bit-score>197.593</Hsp_bit-score>\n+ <Hsp_score>501</Hsp_score>\n+ <Hsp_evalue>3.74548e-55</Hsp_evalue>\n+ <Hsp_query-from>2</Hsp_query-from>\n+ <Hsp_query-to>229</Hsp_query-to>\n+ 
<Hsp_hit-from>474</Hsp_hit-from>\n+ <Hsp_hit-to>684</Hsp_hit-to>\n+ <Hsp_query-frame>0</Hsp_query-frame>\n+ <Hsp_hit-frame>0</Hsp_hit-frame>\n+ <Hsp_identity>106</Hsp_identity>\n+ <Hsp_positive>154</Hsp_positive>\n+ <Hsp_gaps>21</Hsp_gaps>\n+ <Hsp_align-len>230</Hsp_align-len>\n+ <Hsp_qseq>LDKGTLLYRGQKLDLPTFEHNAENKLFYFRNYVSTSLKPLIFGEFGRMFMALDDDTTIYTAETPDDYNRFANPEDIIDIGATQKDSFDDNNNDGTSINIGKQVNLGFVISGAENVRVIVPGSLTEYPEEAEVILPRGTLLKINKITTQVDKRS--NKFMVEGSIVPPSEQIDESVEIYDGDLFMETGEVVKLSGFMQFVNESAYDEEQNQMAAEILSGFLDIDDMPRKFR</Hsp_qseq>\n+ <Hsp_hseq>LPPGTTLYRGQEVTFKTLRHNIENKMFYFKNFVSTSLKPNIFGEHGKNYMALDDSGAVFSGEGEGS----VDAEDLMHMGSHSAYANED-----------AETSVGMVIKGAERIKVIVPGHLSGFPSEAEVILPRGILLKINKVSTYMMKETAYNKYLIEGTIVPPSEQLEESV--YDGDHLMETGEVRPMAGFNQFLVEES--KEEENEVSQILASLVNINGMSKKFK</Hsp_hseq>\n+ <Hsp_midline>L GT LYRGQ++ T HN ENK+FYF+N+VSTSLKP IFGE G+ +MALDD +++ E + ED++ +G+ + +D + ++G VI GAE ++VIVPG L+ +P EAEVILPRG LLKINK++T + K + NK+++EG+IVPPSEQ++ESV YDGD METGEV ++GF QF+ E + +E+ ++IL+ ++I+ M +KF+</Hsp_midline>\n+ </Hsp>\n+ </Hit_hsps>\n+</Hit>\n+<Hit>\n+ <Hit_num>2</Hit_num>\n+ <Hit_id>gi|330858714|ref|YP_004415089.1|</Hit_id>\n+ <Hit_def>hypothetical protein Shfl2p198 [Shigella phage Shfl2] >gi|327397648|gb|AEA73150.1| hypothetical protein Shfl2p198 [Shigella phage Shfl2]</Hit_def>\n+ <Hit_accession>YP_004415089</Hit_accession>\n+ <Hit_len>685</Hit_len>\n+ <Hit_hsps>\n+ <Hsp>\n+ <Hsp_num>1</Hsp_num>\n+ <Hsp_bit-score>197.593</Hsp_bit-score>\n+ <Hsp_score>501</Hsp_score>\n+ <Hsp_evalue>4.31042e-55</Hsp_evalue>\n+ <Hsp_query-from>2</Hsp_query-from>\n+ <Hsp_query-to>229</Hsp_query-to>\n+ <Hsp_hit-from>474</Hsp_hit-from>\n+ <Hsp_hit-to>684</Hsp_hit-to>\n+ <Hsp_query-frame>0</Hsp_query-frame>\n+ <Hsp_hit-frame>0</Hsp_hit-frame>\n+ <Hsp_identity>106</Hsp_identity>\n+ <Hsp_positive>154</Hsp_positive>\n+ <Hsp_gaps>21</Hsp_gap'..b'e>0</Hsp_hit-frame>\n+ <Hsp_identity>150</Hsp_identity>\n+ <Hsp_positive>268</Hsp_positive>\n+ <Hsp_gaps>53</Hsp_gaps>\n+ 
<Hsp_align-len>553</Hsp_align-len>\n+ <Hsp_qseq>DVQSANELVAEVIEEKGNNL------IDSVDNVAEGTELAAEASERTTESIKTLTGVASTISDKLSKLASMLESKVQA--VEQKVQESGASASTGLSVIEDKLPDPDEPESPGLPERILPPLDDNNNLPDEDFFPPVPQEPENNKKDQKKDDKKPTDMLGD-LLKTTKGGFKATISITDKISSMLFKYTVTALAEAAKMAAMLFALVLGIDLLRIHFKYWTDKFMSNFDEFSAEAGEWGGLLQSIFGMLGDIKKFWEAGDWSGLAVAIVKGLADVIYNLSEIMSLGISKISASILDALGFENAATTIRGSALEGFQERTGNSLSEDDQKALAKYQSKRIEEGPGIIDKAGEFKTRAFDWVLGRENKIDSTQASDRDQETQNLKAMAPEKR---EETLIKQNEARAAVQRLEKYIGDVDPENPTNMQSLEKAYNSAKKSISDSAISDQPA---------TKKELDKRFQRVESKYQKLKEDNTPKPAA---PATSEDNQRVQNIQKAENAKE--QSKKSTGDMNVANTQVNNV-NNSKTIHQVQTVTATPAPGV</Hsp_qseq>\n+ <Hsp_hseq>DSLAAQELIAETVEQGNNELRQIKANTASLHDTAAATELGAESTEMSNTILREISETGKQTFSKLSEFAERLKGSFSADDVEQTPIRAASSSDQAIQIINEENPEPENPLVG-----YLRTISEDIKFLRENKNEPSDPKDPDVVPDDKDDLKTMIDRIGDQIVKSVDSGFKRTVNIADSISSTLFKYTITAALNFAKMAALVLSLIIAFDVLSRHFSHWTQMFQEQYAEFKETLGSFGTPFENLTGIVTDLVNYFKSDEYLKMFVRLAEGAADQMIYIVNMMMVGLAKLGAAILRALGADDKADTLEASAISVATKTVGYTPSEEEEATIGRVRKRQAQE---------EAEQSEASWWEKKKREWDG-----KPIETDEEKAVRERKKSIAENTTAEQFGKHDALSQKIQHVGVTAEKNETSNELLGKHRELLEKRASDVEQAKQSGEITTESYKQLKVEIEKQREFLDAHEQKL-----LKPKASIKPAPEPEIGVVGSIAKEEKRVEASQTAKQEAASNY-NTNANIVKNNNQTLVQAPR-TSSPGPGI</Hsp_hseq>\n+ <Hsp_midline>D +A EL+AE +E+ N L S+ + A TEL AE++E + ++ ++ KLS+ A L+ A VEQ + +S+ + +I ++ P+P+ P L + ++ E+ P + + D K D K D +GD ++K+ GFK T++I D ISS LFKYT+TA AKMAA++ +L++ D+L HF +WT F + EF G +G +++ G++ D+ ++++ ++ + V + +G AD + + +M +G++K+ A+IL ALG ++ A T+ SA+ + G + SE+++ + + + ++ +E E + W ++ + D + ET KA+ K+ E T +Q A+ + +++G +N T+ + L K +K SD + Q K E++K+ + +++ QKL KP A PA + V +I K E E Q+ K N NT N V NN++T+ Q T++P PG+</Hsp_midline>\n+ </Hsp>\n+ </Hit_hsps>\n+</Hit>\n+<Hit>\n+ <Hit_num>43</Hit_num>\n+ <Hit_id>gi|398313739|emb|CCI89086.1|</Hit_id>\n+ <Hit_def>phage baseplate hub [Yersinia phage phiD1]</Hit_def>\n+ <Hit_accession>CCI89086</Hit_accession>\n+ <Hit_len>191</Hit_len>\n+ <Hit_hsps>\n+ <Hsp>\n+ <Hsp_num>1</Hsp_num>\n+ <Hsp_bit-score>79.7221</Hsp_bit-score>\n+ <Hsp_score>195</Hsp_score>\n+ 
<Hsp_evalue>1.49556e-13</Hsp_evalue>\n+ <Hsp_query-from>2</Hsp_query-from>\n+ <Hsp_query-to>189</Hsp_query-to>\n+ <Hsp_hit-from>3</Hsp_hit-from>\n+ <Hsp_hit-to>187</Hsp_hit-to>\n+ <Hsp_query-frame>0</Hsp_query-frame>\n+ <Hsp_hit-frame>0</Hsp_hit-frame>\n+ <Hsp_identity>69</Hsp_identity>\n+ <Hsp_positive>102</Hsp_positive>\n+ <Hsp_gaps>17</Hsp_gaps>\n+ <Hsp_align-len>195</Hsp_align-len>\n+ <Hsp_qseq>KSENMSTMRRRKVIADSKGERDAASTASDQVDSLELIGLKLDDVQSANELVAEVIEEKGNNLIDSVDNV-------AEGTELAAEASERTTESIKTLTGVASTISDKLSKLASMLESKVQAVEQKVQESGASASTGLSVIEDKLPDPDEPESPGLPERILPPLDDNNNLPDEDFFPPVPQEPENNKKDQKKDDKK</Hsp_qseq>\n+ <Hsp_hseq>KPQEMQTMRR-KVISDNKPTQEAAKSASNTLSGLNDISTKLDDTQAASELIAQTVEEKSNEIVGAIGNVESAVSDTTAGSELIAETVEIGNNINKE---IGESLGSKLDKLTSLLEQKIQTA--GIQQTGTXLATVESAIPVKVVEDDTDRXXVLXYRXLKQLIMILTLI---FSLPLSQLSQ-SKNHQKKNRKK</Hsp_hseq>\n+ <Hsp_midline>K + M TMRR KVI+D+K ++AA +AS+ + L I KLDD Q+A+EL+A+ +EEK N ++ ++ NV G+EL AE E K + ++ KL KL S+LE K+Q +Q++G +T S I K+ + D L R L L L F P+ Q + +K QKK+ KK</Hsp_midline>\n+ </Hsp>\n+ </Hit_hsps>\n+</Hit>\n+</Iteration_hits>\n+ <Iteration_stat>\n+ <Statistics>\n+ <Statistics_db-num>48094830</Statistics_db-num>\n+ <Statistics_db-len>17186091396</Statistics_db-len>\n+ <Statistics_hsp-len>153</Statistics_hsp-len>\n+ <Statistics_eff-space>4157067357738</Statistics_eff-space>\n+ <Statistics_kappa>0.041</Statistics_kappa>\n+ <Statistics_lambda>0.267</Statistics_lambda>\n+ <Statistics_entropy>0.14</Statistics_entropy>\n+ </Statistics>\n+ </Iteration_stat>\n+</Iteration>\n+</BlastOutput_iterations>\n+</BlastOutput>\n+\n'