Repository 'blast_datatypes'
hg clone https://toolshed.g2.bx.psu.edu/repos/devteam/blast_datatypes

Changeset 3:6ef523b390e0 (2012-09-20)
Previous changeset 2:45ba7c750bc8 (2012-09-20) Next changeset 4:f9a7783ed7b6 (2012-11-09)
Commit message:
Uploaded correct file.
added:
blast.py
blast_datatypes.txt
datatypes_conf.xml
removed:
test-data/blastp_four_human_vs_rhodopsin.tabular
test-data/blastp_four_human_vs_rhodopsin.xml
test-data/blastp_four_human_vs_rhodopsin_converted.tabular
test-data/blastp_four_human_vs_rhodopsin_converted_ext.tabular
test-data/blastp_four_human_vs_rhodopsin_ext.tabular
test-data/blastp_human_vs_pdb_seg_no.xml
test-data/blastp_human_vs_pdb_seg_no_converted_ext.tabular
test-data/blastp_human_vs_pdb_seg_no_converted_std.tabular
test-data/blastp_rhodopsin_vs_four_human.tabular
test-data/blastp_sample.xml
test-data/blastp_sample_converted.tabular
test-data/blastx_rhodopsin_vs_four_human.tabular
test-data/blastx_rhodopsin_vs_four_human.xml
test-data/blastx_rhodopsin_vs_four_human_converted.tabular
test-data/blastx_rhodopsin_vs_four_human_converted_ext.tabular
test-data/blastx_rhodopsin_vs_four_human_ext.tabular
test-data/blastx_sample.xml
test-data/blastx_sample_converted.tabular
test-data/four_human_proteins.fasta
test-data/rhodopsin_nucs.fasta
test-data/rhodopsin_proteins.fasta
test-data/tblastn_four_human_vs_rhodopsin.html
test-data/tblastn_four_human_vs_rhodopsin.tabular
test-data/tblastn_four_human_vs_rhodopsin.xml
test-data/tblastn_four_human_vs_rhodopsin_ext.tabular
test-data/tblastn_four_human_vs_rhodopsin_parse_deflines.tabular
tools/ncbi_blast_plus/blastdb.loc.sample
tools/ncbi_blast_plus/blastdb_p.loc.sample
tools/ncbi_blast_plus/blastxml_to_tabular.py
tools/ncbi_blast_plus/blastxml_to_tabular.xml
tools/ncbi_blast_plus/ncbi_blast_plus.txt
tools/ncbi_blast_plus/ncbi_blastn_wrapper.xml
tools/ncbi_blast_plus/ncbi_blastp_wrapper.xml
tools/ncbi_blast_plus/ncbi_blastx_wrapper.xml
tools/ncbi_blast_plus/ncbi_tblastn_wrapper.xml
tools/ncbi_blast_plus/ncbi_tblastx_wrapper.xml
tools/ncbi_blast_plus/tool_dependencies.xml
b
diff -r 45ba7c750bc8 -r 6ef523b390e0 blast.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/blast.py Thu Sep 20 10:13:16 2012 -0400
[
@@ -0,0 +1,124 @@
+"""
+BlastXml class
+"""
+
+from galaxy.datatypes.data import get_file_peek
+from galaxy.datatypes.data import Text
+from galaxy.datatypes.xml import GenericXml
+
+class BlastXml( GenericXml ):
+    """NCBI Blast XML Output data"""
+    file_ext = "blastxml"
+    #DOCTYPE lines accepted as the second line of a BLAST XML file
+    _doctypes = ('<!DOCTYPE BlastOutput PUBLIC "-//NCBI//NCBI BlastOutput/EN" "http://www.ncbi.nlm.nih.gov/dtd/NCBI_BlastOutput.dtd">',
+                 '<!DOCTYPE BlastOutput PUBLIC "-//NCBI//NCBI BlastOutput/EN" "NCBI_BlastOutput.dtd">')
+
+    def set_peek( self, dataset, is_multi_byte=False ):
+        """Set the peek and blurb text
+
+        A purged dataset gets fixed placeholder text instead of a file peek.
+        """
+        if not dataset.dataset.purged:
+            dataset.peek = get_file_peek( dataset.file_name, is_multi_byte=is_multi_byte )
+            dataset.blurb = 'NCBI Blast XML data'
+        else:
+            dataset.peek = 'file does not exist'
+            dataset.blurb = 'file purged from disk'
+
+    def sniff( self, filename ):
+        """
+        Determines whether the file is blastxml, i.e. its first three lines
+        (stripped) are the XML declaration, a known BlastOutput DOCTYPE line
+        and the <BlastOutput> root tag.
+
+        >>> fname = get_test_fname( 'megablast_xml_parser_test1.blastxml' )
+        >>> BlastXml().sniff( fname )
+        True
+        >>> fname = get_test_fname( 'tblastn_four_human_vs_rhodopsin.xml' )
+        >>> BlastXml().sniff( fname )
+        True
+        >>> fname = get_test_fname( 'interval.interval' )
+        >>> BlastXml().sniff( fname )
+        False
+        """
+        handle = open(filename)
+        try:
+            #try/finally closes the handle on every path without needing 'with' (Python 2.5+)
+            if handle.readline().strip() != '<?xml version="1.0"?>':
+                return False
+            if handle.readline().strip() not in self._doctypes:
+                return False
+            return handle.readline().strip() == '<BlastOutput>'
+        finally:
+            handle.close()
+
+    def merge(split_files, output_file):
+        """Merge several BLAST XML files into one (output_file).
+
+        Merging multiple XML files is non-trivial and must be done in subclasses.
+        The header (all lines up to the first <Iteration>) is copied from the
+        first file only; every later file must agree with it over the first 300
+        characters and then contributes just its iterations.
+
+        Raises ValueError if an input is empty, truncated, not BLAST XML, or has
+        a mismatched header (the partial header may be written for diagnosis).
+        """
+        if len(split_files) == 1:
+            #For one file only, use base class method (move/copy)
+            return Text.merge(split_files, output_file)
+        out = open(output_file, "w")
+        try:
+            old_header = None
+            for index, f in enumerate(split_files):
+                h = open(f)
+                try:
+                    header = h.readline()
+                    if not header:
+                        raise ValueError("BLAST XML file %s was empty" % f)
+                    if header.strip() != '<?xml version="1.0"?>':
+                        out.write(header) #for diagnosis
+                        raise ValueError("%s is not an XML file!" % f)
+                    line = h.readline()
+                    header += line
+                    if line.strip() not in BlastXml._doctypes:
+                        out.write(header) #for diagnosis
+                        raise ValueError("%s is not a BLAST XML file!" % f)
+                    while True:
+                        line = h.readline()
+                        if not line:
+                            out.write(header) #for diagnosis
+                            raise ValueError("BLAST XML file %s ended prematurely" % f)
+                        header += line
+                        if "<Iteration>" in line:
+                            break
+                        if len(header) > 10000:
+                            #Something has gone wrong, don't load too much into memory!
+                            #Write what we have to the merged file for diagnostics
+                            out.write(header)
+                            raise ValueError("BLAST XML file %s has too long a header!" % f)
+                    if "<BlastOutput>" not in header:
+                        raise ValueError("%s is not a BLAST XML file:\n%s\n..." % (f, header))
+                    #Test position, not path, so a repeated path is not mistaken for the first file
+                    if index == 0:
+                        out.write(header)
+                        old_header = header
+                    elif old_header[:300] != header[:300]:
+                        #Enough to check <BlastOutput_program> and <BlastOutput_version> match
+                        raise ValueError("BLAST XML headers don't match for %s and %s - have:\n%s\n...\n\nAnd:\n%s\n...\n" \
+                                         % (split_files[0], f, old_header[:300], header[:300]))
+                    else:
+                        out.write("    <Iteration>\n")
+                    for line in h:
+                        if "</BlastOutput_iterations>" in line:
+                            break
+                        #TODO - Increment <Iteration_iter-num> and if required automatic query names
+                        #like <Iteration_query-ID>Query_3</Iteration_query-ID> to be increasing?
+                        out.write(line)
+                finally:
+                    h.close()
+            out.write("  </BlastOutput_iterations>\n")
+            out.write("</BlastOutput>\n")
+        finally:
+            out.close()
+    merge = staticmethod(merge)
+
b
diff -r 45ba7c750bc8 -r 6ef523b390e0 blast_datatypes.txt
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/blast_datatypes.txt Thu Sep 20 10:13:16 2012 -0400
b
@@ -0,0 +1,98 @@
+Galaxy datatypes for NCBI BLAST+ suite
+======================================
+
+These Galaxy datatypes are copyright 2010-2012 by Peter Cock, The James Hutton
+Institute (formerly SCRI, Scottish Crop Research Institute), UK. All rights reserved.
+See the licence text below.
+
+Note that these files (and the associated BLAST+ wrappers) were originally
+distributed as part of the main Galaxy repository, but as of August 2012 moved
+to the Galaxy Tool Shed as 'blast_datatypes' (and 'ncbi_blast_plus' for the
+wrappers). My thanks to Dannon Baker from the Galaxy development team for his
+assistance with this.
+
+
+History
+=======
+
+These version numbers match those for 'ncbi_blast_plus', but are not used
+explicitly in the datatypes themselves.
+
+v0.0.11 - Final revision as part of the Galaxy main repository, and the
+          first release via the Tool Shed
+v0.0.13 - Uses blast.py instead of xml.py to define the datatypes
+
+
+Installation
+============
+
+Doing this automatically via the Galaxy Tool Shed is probably simplest.
+
+
+Manual Installation
+===================
+
+Normally you would install this via the Galaxy ToolShed, which would move
+the provided blast.py file into a suitable location and process the
+datatypes_conf.xml entry to be combined with your local configuration.
+
+However, if you really want to, this should work for a manual install. Add
+the following line to the datatypes_conf.xml file in the Galaxy main folder:
+
+   <datatype extension="blastxml" type="galaxy.datatypes.blast:BlastXml" mimetype="application/xml" display_in_upload="true"/>
+
+Also create the file lib/galaxy/datatypes/blast.py by moving, copying or linking
+the blast.py file provided in this tar-ball.  Finally add 'import blast' near
+the start of file lib/galaxy/datatypes/registry.py (after the other import
+lines).
+
+
+Developers
+==========
+
+BLAST+ datatypes and wrappers, and other tools are being developed on the
+following hg branch: http://bitbucket.org/peterjc/galaxy-central/src/tools
+
+For making the "Galaxy Tool Shed" http://community.g2.bx.psu.edu/ tarball I use
+the following command from the Galaxy tools/ncbi_blast_plus folder:
+
+$ tar -czf blast_datatypes.tar.gz blast_datatypes.txt datatypes_conf.xml blast.py
+
+Check this worked:
+
+$ tar -tzf blast_datatypes.tar.gz
+blast_datatypes.txt
+datatypes_conf.xml
+blast.py
+
+Note that the placement of these three files under tools/ncbi_blast_plus is
+arbitrary - this just puts them next to the tool wrappers which use them.
+
+For development, rather than having a local ToolShed running, I currently
+use a symlink from lib/galaxy/datatypes/blast.py to the actual file
+tools/ncbi_blast_plus/blast.py as described above.
+
+
+Licence (MIT/BSD style)
+=======================
+
+Permission to use, copy, modify, and distribute this software and its
+documentation with or without modifications and for any purpose and
+without fee is hereby granted, provided that any copyright notices
+appear in all copies and that both those copyright notices and this
+permission notice appear in supporting documentation, and that the
+names of the contributors or copyright holders not be used in
+advertising or publicity pertaining to distribution of the software
+without specific prior permission.
+
+THE CONTRIBUTORS AND COPYRIGHT HOLDERS OF THIS SOFTWARE DISCLAIM ALL
+WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING ALL IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT SHALL THE
+CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY SPECIAL, INDIRECT
+OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS
+OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
+OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE
+OR PERFORMANCE OF THIS SOFTWARE.
+
+NOTE: This is the licence for the Galaxy BLAST datatypes only. BLAST+
+and associated data files are available and licenced separately.
b
diff -r 45ba7c750bc8 -r 6ef523b390e0 datatypes_conf.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/datatypes_conf.xml Thu Sep 20 10:13:16 2012 -0400
b
@@ -0,0 +1,12 @@
+<?xml version="1.0"?>
+<datatypes>
+    <datatype_files>
+        <datatype_file name="blast.py"/>
+    </datatype_files>
+    <registration>
+        <datatype extension="blastxml" type="galaxy.datatypes.blast:BlastXml" mimetype="application/xml" display_in_upload="true"/>
+    </registration>
+    <sniffers>
+        <sniffer type="galaxy.datatypes.blast:BlastXml"/>
+    </sniffers>
+</datatypes>
b
diff -r 45ba7c750bc8 -r 6ef523b390e0 test-data/blastp_four_human_vs_rhodopsin.tabular
--- a/test-data/blastp_four_human_vs_rhodopsin.tabular Thu Sep 20 10:12:43 2012 -0400
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
b
@@ -1,6 +0,0 @@
-sp|P08100|OPSD_HUMAN gi|57163783|ref|NP_001009242.1| 96.55 348 12 0 1 348 1 348 0.0  701
-sp|P08100|OPSD_HUMAN gi|3024260|sp|P56514.1|OPSD_BUFBU 84.80 342 51 1 1 341 1 342 0.0  619
-sp|P08100|OPSD_HUMAN gi|283855846|gb|ADB45242.1| 94.82 328 17 0 11 338 1 328 0.0  653
-sp|P08100|OPSD_HUMAN gi|283855823|gb|ADB45229.1| 94.82 328 17 0 11 338 1 328 0.0  631
-sp|P08100|OPSD_HUMAN gi|223523|prf||0811197A 93.10 348 23 1 1 348 1 347 0.0  673
-sp|P08100|OPSD_HUMAN gi|12583665|dbj|BAB21486.1| 82.16 342 60 1 1 341 1 342 3e-176  599
b
diff -r 45ba7c750bc8 -r 6ef523b390e0 test-data/blastp_four_human_vs_rhodopsin.xml
--- a/test-data/blastp_four_human_vs_rhodopsin.xml Thu Sep 20 10:12:43 2012 -0400
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
[
b'@@ -1,646 +0,0 @@\n-<?xml version="1.0"?>\n-<!DOCTYPE BlastOutput PUBLIC "-//NCBI//NCBI BlastOutput/EN" "NCBI_BlastOutput.dtd">\n-<BlastOutput>\n-  <BlastOutput_program>blastp</BlastOutput_program>\n-  <BlastOutput_version>BLASTP 2.2.25+</BlastOutput_version>\n-  <BlastOutput_reference>Stephen F. Altschul, Thomas L. Madden, Alejandro A. Sch&amp;auml;ffer, Jinghui Zhang, Zheng Zhang, Webb Miller, and David J. Lipman (1997), &quot;Gapped BLAST and PSI-BLAST: a new generation of protein database search programs&quot;, Nucleic Acids Res. 25:3389-3402.</BlastOutput_reference>\n-  <BlastOutput_db></BlastOutput_db>\n-  <BlastOutput_query-ID>sp|Q9BS26|ERP44_HUMAN</BlastOutput_query-ID>\n-  <BlastOutput_query-def>Endoplasmic reticulum resident protein 44 OS=Homo sapiens GN=ERP44 PE=1 SV=1</BlastOutput_query-def>\n-  <BlastOutput_query-len>406</BlastOutput_query-len>\n-  <BlastOutput_param>\n-    <Parameters>\n-      <Parameters_matrix>BLOSUM62</Parameters_matrix>\n-      <Parameters_expect>1e-08</Parameters_expect>\n-      <Parameters_gap-open>11</Parameters_gap-open>\n-      <Parameters_gap-extend>1</Parameters_gap-extend>\n-      <Parameters_filter>F</Parameters_filter>\n-    </Parameters>\n-  </BlastOutput_param>\n-  <BlastOutput_iterations>\n-    <Iteration>\n-      <Iteration_iter-num>1</Iteration_iter-num>\n-      <Iteration_query-ID>sp|Q9BS26|ERP44_HUMAN</Iteration_query-ID>\n-      <Iteration_query-def>Endoplasmic reticulum resident protein 44 OS=Homo sapiens GN=ERP44 PE=1 SV=1</Iteration_query-def>\n-      <Iteration_query-len>406</Iteration_query-len>\n-      <Iteration_hits></Iteration_hits>\n-      <Iteration_stat>\n-        <Statistics>\n-          <Statistics_db-num>0</Statistics_db-num>\n-          <Statistics_db-len>0</Statistics_db-len>\n-          <Statistics_hsp-len>30</Statistics_hsp-len>\n-          <Statistics_eff-space>119568</Statistics_eff-space>\n-          <Statistics_kappa>0.041</Statistics_kappa>\n-          
<Statistics_lambda>0.267</Statistics_lambda>\n-          <Statistics_entropy>0.14</Statistics_entropy>\n-        </Statistics>\n-      </Iteration_stat>\n-      <Iteration_message>No hits found</Iteration_message>\n-    </Iteration>\n-    <Iteration>\n-      <Iteration_iter-num>2</Iteration_iter-num>\n-      <Iteration_query-ID>sp|Q9BS26|ERP44_HUMAN</Iteration_query-ID>\n-      <Iteration_query-def>Endoplasmic reticulum resident protein 44 OS=Homo sapiens GN=ERP44 PE=1 SV=1</Iteration_query-def>\n-      <Iteration_query-len>406</Iteration_query-len>\n-      <Iteration_hits></Iteration_hits>\n-      <Iteration_stat>\n-        <Statistics>\n-          <Statistics_db-num>0</Statistics_db-num>\n-          <Statistics_db-len>0</Statistics_db-len>\n-          <Statistics_hsp-len>30</Statistics_hsp-len>\n-          <Statistics_eff-space>119568</Statistics_eff-space>\n-          <Statistics_kappa>0.041</Statistics_kappa>\n-          <Statistics_lambda>0.267</Statistics_lambda>\n-          <Statistics_entropy>0.14</Statistics_entropy>\n-        </Statistics>\n-      </Iteration_stat>\n-      <Iteration_message>No hits found</Iteration_message>\n-    </Iteration>\n-    <Iteration>\n-      <Iteration_iter-num>3</Iteration_iter-num>\n-      <Iteration_query-ID>sp|Q9BS26|ERP44_HUMAN</Iteration_query-ID>\n-      <Iteration_query-def>Endoplasmic reticulum resident protein 44 OS=Homo sapiens GN=ERP44 PE=1 SV=1</Iteration_query-def>\n-      <Iteration_query-len>406</Iteration_query-len>\n-      <Iteration_hits></Iteration_hits>\n-      <Iteration_stat>\n-        <Statistics>\n-          <Statistics_db-num>0</Statistics_db-num>\n-          <Statistics_db-len>0</Statistics_db-len>\n-          <Statistics_hsp-len>30</Statistics_hsp-len>\n-          <Statistics_eff-space>119568</Statistics_eff-space>\n-          <Statistics_kappa>0.041</Statistics_kappa>\n-          <Statistics_lambda>0.267</Statistics_lambda>\n-          <Statistics_entropy>0.14</Statistics_entropy>\n-        
</Statistics>\n-      </Iteration_stat>\n-      <Iteration_message>No hits found</Iteration_message>\n-    </Iteration>\n-  '..b'PFSN TGVVRSPFE PQYYLAEPWQFSMLAAYMFLLI+LGFPINFLTLYVTVQHKKLRTPLNYILLNLAVADLFMV GGFT+TLYTSLHGYFVFGPTGCNLEGFFATLGGEIALWSLVVLAIERYVVVCKPMSNFRFGENHAIMGVAFTWVMALACAAPPL GWSRYIPEG+QCSCGID YT   E NNESFVIYMFVVHF IP+I+IFFCYGQLVFTVKEAAAQQQESATTQKAEKEVTRMVIIMVIAFLICW+PYA VAFYIFTHQGS+FGPIFMTIPAFFAK++A+YNPVIYIMMNKQFRNCM+TT+CCGKNPLGDDEAS TVSKTETSQVAPA</Hsp_midline>\n-            </Hsp>\n-          </Hit_hsps>\n-        </Hit>\n-      </Iteration_hits>\n-      <Iteration_stat>\n-        <Statistics>\n-          <Statistics_db-num>0</Statistics_db-num>\n-          <Statistics_db-len>0</Statistics_db-len>\n-          <Statistics_hsp-len>29</Statistics_hsp-len>\n-          <Statistics_eff-space>101761</Statistics_eff-space>\n-          <Statistics_kappa>0.041</Statistics_kappa>\n-          <Statistics_lambda>0.267</Statistics_lambda>\n-          <Statistics_entropy>0.14</Statistics_entropy>\n-        </Statistics>\n-      </Iteration_stat>\n-    </Iteration>\n-    <Iteration>\n-      <Iteration_iter-num>24</Iteration_iter-num>\n-      <Iteration_query-ID>sp|P08100|OPSD_HUMAN</Iteration_query-ID>\n-      <Iteration_query-def>Rhodopsin OS=Homo sapiens GN=RHO PE=1 SV=1</Iteration_query-def>\n-      <Iteration_query-len>348</Iteration_query-len>\n-      <Iteration_hits>\n-        <Hit>\n-          <Hit_num>1</Hit_num>\n-          <Hit_id>gi|12583665|dbj|BAB21486.1|</Hit_id>\n-          <Hit_def>fresh water form rod opsin [Conger myriaster]</Hit_def>\n-          <Hit_accession>BAB21486</Hit_accession>\n-          <Hit_len>354</Hit_len>\n-          <Hit_hsps>\n-            <Hsp>\n-              <Hsp_num>1</Hsp_num>\n-              <Hsp_bit-score>599.356377496438</Hsp_bit-score>\n-              <Hsp_score>1544</Hsp_score>\n-              <Hsp_evalue>3.49521227372659e-176</Hsp_evalue>\n-              <Hsp_query-from>1</Hsp_query-from>\n-              
<Hsp_query-to>341</Hsp_query-to>\n-              <Hsp_hit-from>1</Hsp_hit-from>\n-              <Hsp_hit-to>342</Hsp_hit-to>\n-              <Hsp_query-frame>0</Hsp_query-frame>\n-              <Hsp_hit-frame>0</Hsp_hit-frame>\n-              <Hsp_identity>281</Hsp_identity>\n-              <Hsp_positive>314</Hsp_positive>\n-              <Hsp_gaps>1</Hsp_gaps>\n-              <Hsp_align-len>342</Hsp_align-len>\n-              <Hsp_qseq>MNGTEGPNFYVPFSNATGVVRSPFEYPQYYLAEPWQFSMLAAYMFLLIVLGFPINFLTLYVTVQHKKLRTPLNYILLNLAVADLFMVLGGFTSTLYTSLHGYFVFGPTGCNLEGFFATLGGEIALWSLVVLAIERYVVVCKPMSNFRFGENHAIMGVAFTWVMALACAAPPLAGWSRYIPEGLQCSCGIDYYTLKPEVNNESFVIYMFVVHFTIPMIIIFFCYGQLVFTVKEAAAQQQESATTQKAEKEVTRMVIIMVIAFLICWVPYASVAFYIFTHQGSNFGPIFMTIPAFFAKSAAIYNPVIYIMMNKQFRNCMLTTICCGKNPL-GDDEASATVSKTE</Hsp_qseq>\n-              <Hsp_hseq>MNGTEGPNFYIPMSNATGVVRSPFEYPQYYLAEPWAFSALSAYMFFLIIAGFPINFLTLYVTIEHKKLRTPLNYILLNLAVADLFMVFGGFTTTMYTSMHGYFVFGPTGCNIEGFFATLGGEIALWCLVVLAIERWMVVCKPVTNFRFGESHAIMGVMVTWTMALACALPPLFGWSRYIPEGLQCSCGIDYYTRAPGINNESFVIYMFTCHFSIPLAVISFCYGRLVCTVKEAAAQQQESETTQRAEREVTRMVVIMVISFLVCWVPYASVAWYIFTHQGSTFGPIFMTIPSFFAKSSALYNPMIYICMNKQFRHCMITTLCCGKNPFEEEDGASATSSKTE</Hsp_hseq>\n-              <Hsp_midline>MNGTEGPNFY+P SNATGVVRSPFEYPQYYLAEPW FS L+AYMF LI+ GFPINFLTLYVT++HKKLRTPLNYILLNLAVADLFMV GGFT+T+YTS+HGYFVFGPTGCN+EGFFATLGGEIALW LVVLAIER++VVCKP++NFRFGE+HAIMGV  TW MALACA PPL GWSRYIPEGLQCSCGIDYYT  P +NNESFVIYMF  HF+IP+ +I FCYG+LV TVKEAAAQQQES TTQ+AE+EVTRMV+IMVI+FL+CWVPYASVA+YIFTHQGS FGPIFMTIP+FFAKS+A+YNP+IYI MNKQFR+CM+TT+CCGKNP   +D ASAT SKTE</Hsp_midline>\n-            </Hsp>\n-          </Hit_hsps>\n-        </Hit>\n-      </Iteration_hits>\n-      <Iteration_stat>\n-        <Statistics>\n-          <Statistics_db-num>0</Statistics_db-num>\n-          <Statistics_db-len>0</Statistics_db-len>\n-          <Statistics_hsp-len>29</Statistics_hsp-len>\n-          <Statistics_eff-space>101761</Statistics_eff-space>\n-          <Statistics_kappa>0.041</Statistics_kappa>\n-          
<Statistics_lambda>0.267</Statistics_lambda>\n-          <Statistics_entropy>0.14</Statistics_entropy>\n-        </Statistics>\n-      </Iteration_stat>\n-    </Iteration>\n-  </BlastOutput_iterations>\n-</BlastOutput>\n'
b
diff -r 45ba7c750bc8 -r 6ef523b390e0 test-data/blastp_four_human_vs_rhodopsin_converted.tabular
--- a/test-data/blastp_four_human_vs_rhodopsin_converted.tabular Thu Sep 20 10:12:43 2012 -0400
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
b
@@ -1,6 +0,0 @@
-sp|P08100|OPSD_HUMAN gi|57163783|ref|NP_001009242.1| 96.55 348 12 0 1 348 1 348 0.0 701
-sp|P08100|OPSD_HUMAN gi|3024260|sp|P56514.1|OPSD_BUFBU 84.80 342 51 1 1 341 1 342 0.0 619
-sp|P08100|OPSD_HUMAN gi|283855846|gb|ADB45242.1| 94.82 328 17 0 11 338 1 328 0.0 653
-sp|P08100|OPSD_HUMAN gi|283855823|gb|ADB45229.1| 94.82 328 17 0 11 338 1 328 0.0 631
-sp|P08100|OPSD_HUMAN gi|223523|prf||0811197A 93.10 348 23 1 1 348 1 347 0.0 673
-sp|P08100|OPSD_HUMAN gi|12583665|dbj|BAB21486.1| 82.16 342 60 1 1 341 1 342 3e-176 599
b
diff -r 45ba7c750bc8 -r 6ef523b390e0 test-data/blastp_four_human_vs_rhodopsin_converted_ext.tabular
--- a/test-data/blastp_four_human_vs_rhodopsin_converted_ext.tabular Thu Sep 20 10:12:43 2012 -0400
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
b
@@ -1,6 +0,0 @@
-sp|P08100|OPSD_HUMAN gi|57163783|ref|NP_001009242.1| 96.55 348 12 0 1 348 1 348 0.0 701 gi|57163783|ref|NP_001009242.1| 1808 336 343 0 98.56 1 1 MNGTEGPNFYVPFSNATGVVRSPFEYPQYYLAEPWQFSMLAAYMFLLIVLGFPINFLTLYVTVQHKKLRTPLNYILLNLAVADLFMVLGGFTSTLYTSLHGYFVFGPTGCNLEGFFATLGGEIALWSLVVLAIERYVVVCKPMSNFRFGENHAIMGVAFTWVMALACAAPPLAGWSRYIPEGLQCSCGIDYYTLKPEVNNESFVIYMFVVHFTIPMIIIFFCYGQLVFTVKEAAAQQQESATTQKAEKEVTRMVIIMVIAFLICWVPYASVAFYIFTHQGSNFGPIFMTIPAFFAKSAAIYNPVIYIMMNKQFRNCMLTTICCGKNPLGDDEASATVSKTETSQVAPA MNGTEGPNFYVPFSNKTGVVRSPFEYPQYYLAEPWQFSMLAAYMFLLIVLGFPINFLTLYVTVQHKKLRTPLNYILLNLAVADLFMVFGGFTTTLYTSLHGYFVFGPTGCNLEGFFATLGGEIALWSLVVLAIERYVVVCKPMSNFRFGENHAIMGVAFTWVMALACAAPPLVGWSRYIPEGMQCSCGIDYYTLKPEVNNESFVIYMFVVHFTIPMIVIFFCYGQLVFTVKEAAAQQQESATTQKAEKEVTRMVIIMVIAFLICWVPYASVAFYIFTHQGSNFGPIFMTLPAFFAKSSSIYNPVIYIMMNKQFRNCMLTTLCCGKNPLGDDEASTTGSKTETSQVAPA 348 348
-sp|P08100|OPSD_HUMAN gi|3024260|sp|P56514.1|OPSD_BUFBU 84.80 342 51 1 1 341 1 342 0.0 619 gi|3024260|sp|P56514.1|OPSD_BUFBU 1595 290 322 1 94.15 1 1 MNGTEGPNFYVPFSNATGVVRSPFEYPQYYLAEPWQFSMLAAYMFLLIVLGFPINFLTLYVTVQHKKLRTPLNYILLNLAVADLFMVLGGFTSTLYTSLHGYFVFGPTGCNLEGFFATLGGEIALWSLVVLAIERYVVVCKPMSNFRFGENHAIMGVAFTWVMALACAAPPLAGWSRYIPEGLQCSCGIDYYTLKPEVNNESFVIYMFVVHFTIPMIIIFFCYGQLVFTVKEAAAQQQESATTQKAEKEVTRMVIIMVIAFLICWVPYASVAFYIFTHQGSNFGPIFMTIPAFFAKSAAIYNPVIYIMMNKQFRNCMLTTICCGKNPLGDDEA-SATVSKTE MNGTEGPNFYIPMSNKTGVVRSPFEYPQYYLAEPWQYSILCAYMFLLILLGFPINFMTLYVTIQHKKLRTPLNYILLNLAFANHFMVLCGFTVTMYSSMNGYFILGATGCYVEGFFATLGGEIALWSLVVLAIERYVVVCKPMSNFRFSENHAVMGVAFTWIMALSCAVPPLLGWSRYIPEGMQCSCGVDYYTLKPEVNNESFVIYMFVVHFTIPLIIIFFCYGRLVCTVKEAAAQQQESATTQKAEKEVTRMVIIMVVFFLICWVPYASVAFFIFSNQGSEFGPIFMTVPAFFAKSSSIYNPVIYIMLNKQFRNCMITTLCCGKNPFGEDDASSAATSKTE 348 354
-sp|P08100|OPSD_HUMAN gi|283855846|gb|ADB45242.1| 94.82 328 17 0 11 338 1 328 0.0 653 gi|283855846|gb|ADB45242.1| 1684 311 321 0 97.87 1 1 VPFSNATGVVRSPFEYPQYYLAEPWQFSMLAAYMFLLIVLGFPINFLTLYVTVQHKKLRTPLNYILLNLAVADLFMVLGGFTSTLYTSLHGYFVFGPTGCNLEGFFATLGGEIALWSLVVLAIERYVVVCKPMSNFRFGENHAIMGVAFTWVMALACAAPPLAGWSRYIPEGLQCSCGIDYYTLKPEVNNESFVIYMFVVHFTIPMIIIFFCYGQLVFTVKEAAAQQQESATTQKAEKEVTRMVIIMVIAFLICWVPYASVAFYIFTHQGSNFGPIFMTIPAFFAKSAAIYNPVIYIMMNKQFRNCMLTTICCGKNPLGDDEASATVS VPFSNKTGVVRSPFEHPQYYLAEPWQFSMLAAYMFLLIVLGFPINFLTLYVTVQHKKLRTPLNYILLNLAVADLFMVFGGFTTTLYTSLHGYFVFGPTGCNLEGFFATLGGEIALWSLVVLAIERYVVVCKPMSNFRFGENHAIMGLALTWVMALACAAPPLVGWSRYIPEGMQCSCGIDYYTLKPEVNNESFVIYMFVVHFTIPMIVIFFCYGQLVFTVKEAAAQQQESATTQKAEKEVTRMVIIMVIAFLICWLPYAGVAFYIFTHQGSNFGPIFMTLPAFFAKSSSIYNPVIYIMMNKQFRNCMLTTLCCGKNPLGDDEASTTAS 348 328
-sp|P08100|OPSD_HUMAN gi|283855823|gb|ADB45229.1| 94.82 328 17 0 11 338 1 328 0.0 631 gi|283855823|gb|ADB45229.1| 1627 311 323 0 98.48 1 1 VPFSNATGVVRSPFEYPQYYLAEPWQFSMLAAYMFLLIVLGFPINFLTLYVTVQHKKLRTPLNYILLNLAVADLFMVLGGFTSTLYTSLHGYFVFGPTGCNLEGFFATLGGEIALWSLVVLAIERYVVVCKPMSNFRFGENHAIMGVAFTWVMALACAAPPLAGWSRYIPEGLQCSCGIDYYTLKPEVNNESFVIYMFVVHFTIPMIIIFFCYGQLVFTVKEAAAQQQESATTQKAEKEVTRMVIIMVIAFLICWVPYASVAFYIFTHQGSNFGPIFMTIPAFFAKSAAIYNPVIYIMMNKQFRNCMLTTICCGKNPLGDDEASATVS VPFSNKTGVVRSPFEYPQYYLAEPWQFSMLAAYMFLLIVLGFPINFLTLYVTVQHKKLRTPLNYILLNLAVANLFMVFGGFTTTLYTSMHGYFVFGATGCNLEGFFATLGGEIALWSLVVLAIERYVVVCKPMSNFRFGENHAIMGLAFTWVMALACAAPPLAGWSRYIPEGMQCSCGIDYYTLKPEVNNESFVIYMFVVHFTIPMIVIFFCYGQLVFTVKEAAAQQQESATTQKAEKEVTRMVIIMVVAFLICWLPYASVAFYIFTHQGSNFGPVFMTIPAFFAKSSSIYNPVIYIMMNKQFRNCMLTTLCCGKNPLGDDEASTTAS 348 328
-sp|P08100|OPSD_HUMAN gi|223523|prf||0811197A 93.10 348 23 1 1 348 1 347 0.0 673 gi|223523|prf||0811197A 1736 324 336 1 96.55 1 1 MNGTEGPNFYVPFSNATGVVRSPFEYPQYYLAEPWQFSMLAAYMFLLIVLGFPINFLTLYVTVQHKKLRTPLNYILLNLAVADLFMVLGGFTSTLYTSLHGYFVFGPTGCNLEGFFATLGGEIALWSLVVLAIERYVVVCKPMSNFRFGENHAIMGVAFTWVMALACAAPPLAGWSRYIPEGLQCSCGIDYYTLKPEVNNESFVIYMFVVHFTIPMIIIFFCYGQLVFTVKEAAAQQQESATTQKAEKEVTRMVIIMVIAFLICWVPYASVAFYIFTHQGSNFGPIFMTIPAFFAKSAAIYNPVIYIMMNKQFRNCMLTTICCGKNPLGDDEASATVSKTETSQVAPA MNGTEGPNFYVPFSNKTGVVRSPFEAPQYYLAEPWQFSMLAAYMFLLIMLGFPINFLTLYVTVQHKKLRTPLNYILLNLAVADLFMVFGGFTTTLYTSLHGYFVFGPTGCNLEGFFATLGGEIALWSLVVLAIERYVVVCKPMSNFRFGENHAIMGVAFTWVMALACAAPPLVGWSRYIPEGMQCSCGID-YTPHEETNNESFVIYMFVVHFIIPLIVIFFCYGQLVFTVKEAAAQQQESATTQKAEKEVTRMVIIMVIAFLICWLPYAGVAFYIFTHQGSDFGPIFMTIPAFFAKTSAVYNPVIYIMMNKQFRNCMVTTLCCGKNPLGDDEASTTVSKTETSQVAPA 348 347
-sp|P08100|OPSD_HUMAN gi|12583665|dbj|BAB21486.1| 82.16 342 60 1 1 341 1 342 3e-176 599 gi|12583665|dbj|BAB21486.1| 1544 281 314 1 91.81 1 1 MNGTEGPNFYVPFSNATGVVRSPFEYPQYYLAEPWQFSMLAAYMFLLIVLGFPINFLTLYVTVQHKKLRTPLNYILLNLAVADLFMVLGGFTSTLYTSLHGYFVFGPTGCNLEGFFATLGGEIALWSLVVLAIERYVVVCKPMSNFRFGENHAIMGVAFTWVMALACAAPPLAGWSRYIPEGLQCSCGIDYYTLKPEVNNESFVIYMFVVHFTIPMIIIFFCYGQLVFTVKEAAAQQQESATTQKAEKEVTRMVIIMVIAFLICWVPYASVAFYIFTHQGSNFGPIFMTIPAFFAKSAAIYNPVIYIMMNKQFRNCMLTTICCGKNPL-GDDEASATVSKTE MNGTEGPNFYIPMSNATGVVRSPFEYPQYYLAEPWAFSALSAYMFFLIIAGFPINFLTLYVTIEHKKLRTPLNYILLNLAVADLFMVFGGFTTTMYTSMHGYFVFGPTGCNIEGFFATLGGEIALWCLVVLAIERWMVVCKPVTNFRFGESHAIMGVMVTWTMALACALPPLFGWSRYIPEGLQCSCGIDYYTRAPGINNESFVIYMFTCHFSIPLAVISFCYGRLVCTVKEAAAQQQESETTQRAEREVTRMVVIMVISFLVCWVPYASVAWYIFTHQGSTFGPIFMTIPSFFAKSSALYNPMIYICMNKQFRHCMITTLCCGKNPFEEEDGASATSSKTE 348 354
b
diff -r 45ba7c750bc8 -r 6ef523b390e0 test-data/blastp_four_human_vs_rhodopsin_ext.tabular
--- a/test-data/blastp_four_human_vs_rhodopsin_ext.tabular Thu Sep 20 10:12:43 2012 -0400
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
b
@@ -1,6 +0,0 @@
-sp|P08100|OPSD_HUMAN gi|57163783|ref|NP_001009242.1| 96.55 348 12 0 1 348 1 348 0.0  701 gi|57163783|ref|NP_001009242.1| 1808 336 343 0 98.56 1 1 MNGTEGPNFYVPFSNATGVVRSPFEYPQYYLAEPWQFSMLAAYMFLLIVLGFPINFLTLYVTVQHKKLRTPLNYILLNLAVADLFMVLGGFTSTLYTSLHGYFVFGPTGCNLEGFFATLGGEIALWSLVVLAIERYVVVCKPMSNFRFGENHAIMGVAFTWVMALACAAPPLAGWSRYIPEGLQCSCGIDYYTLKPEVNNESFVIYMFVVHFTIPMIIIFFCYGQLVFTVKEAAAQQQESATTQKAEKEVTRMVIIMVIAFLICWVPYASVAFYIFTHQGSNFGPIFMTIPAFFAKSAAIYNPVIYIMMNKQFRNCMLTTICCGKNPLGDDEASATVSKTETSQVAPA MNGTEGPNFYVPFSNKTGVVRSPFEYPQYYLAEPWQFSMLAAYMFLLIVLGFPINFLTLYVTVQHKKLRTPLNYILLNLAVADLFMVFGGFTTTLYTSLHGYFVFGPTGCNLEGFFATLGGEIALWSLVVLAIERYVVVCKPMSNFRFGENHAIMGVAFTWVMALACAAPPLVGWSRYIPEGMQCSCGIDYYTLKPEVNNESFVIYMFVVHFTIPMIVIFFCYGQLVFTVKEAAAQQQESATTQKAEKEVTRMVIIMVIAFLICWVPYASVAFYIFTHQGSNFGPIFMTLPAFFAKSSSIYNPVIYIMMNKQFRNCMLTTLCCGKNPLGDDEASTTGSKTETSQVAPA 348 348
-sp|P08100|OPSD_HUMAN gi|3024260|sp|P56514.1|OPSD_BUFBU 84.80 342 51 1 1 341 1 342 0.0  619 gi|3024260|sp|P56514.1|OPSD_BUFBU 1595 290 322 1 94.15 1 1 MNGTEGPNFYVPFSNATGVVRSPFEYPQYYLAEPWQFSMLAAYMFLLIVLGFPINFLTLYVTVQHKKLRTPLNYILLNLAVADLFMVLGGFTSTLYTSLHGYFVFGPTGCNLEGFFATLGGEIALWSLVVLAIERYVVVCKPMSNFRFGENHAIMGVAFTWVMALACAAPPLAGWSRYIPEGLQCSCGIDYYTLKPEVNNESFVIYMFVVHFTIPMIIIFFCYGQLVFTVKEAAAQQQESATTQKAEKEVTRMVIIMVIAFLICWVPYASVAFYIFTHQGSNFGPIFMTIPAFFAKSAAIYNPVIYIMMNKQFRNCMLTTICCGKNPLGDDEA-SATVSKTE MNGTEGPNFYIPMSNKTGVVRSPFEYPQYYLAEPWQYSILCAYMFLLILLGFPINFMTLYVTIQHKKLRTPLNYILLNLAFANHFMVLCGFTVTMYSSMNGYFILGATGCYVEGFFATLGGEIALWSLVVLAIERYVVVCKPMSNFRFSENHAVMGVAFTWIMALSCAVPPLLGWSRYIPEGMQCSCGVDYYTLKPEVNNESFVIYMFVVHFTIPLIIIFFCYGRLVCTVKEAAAQQQESATTQKAEKEVTRMVIIMVVFFLICWVPYASVAFFIFSNQGSEFGPIFMTVPAFFAKSSSIYNPVIYIMLNKQFRNCMITTLCCGKNPFGEDDASSAATSKTE 348 354
-sp|P08100|OPSD_HUMAN gi|283855846|gb|ADB45242.1| 94.82 328 17 0 11 338 1 328 0.0  653 gi|283855846|gb|ADB45242.1| 1684 311 321 0 97.87 1 1 VPFSNATGVVRSPFEYPQYYLAEPWQFSMLAAYMFLLIVLGFPINFLTLYVTVQHKKLRTPLNYILLNLAVADLFMVLGGFTSTLYTSLHGYFVFGPTGCNLEGFFATLGGEIALWSLVVLAIERYVVVCKPMSNFRFGENHAIMGVAFTWVMALACAAPPLAGWSRYIPEGLQCSCGIDYYTLKPEVNNESFVIYMFVVHFTIPMIIIFFCYGQLVFTVKEAAAQQQESATTQKAEKEVTRMVIIMVIAFLICWVPYASVAFYIFTHQGSNFGPIFMTIPAFFAKSAAIYNPVIYIMMNKQFRNCMLTTICCGKNPLGDDEASATVS VPFSNKTGVVRSPFEHPQYYLAEPWQFSMLAAYMFLLIVLGFPINFLTLYVTVQHKKLRTPLNYILLNLAVADLFMVFGGFTTTLYTSLHGYFVFGPTGCNLEGFFATLGGEIALWSLVVLAIERYVVVCKPMSNFRFGENHAIMGLALTWVMALACAAPPLVGWSRYIPEGMQCSCGIDYYTLKPEVNNESFVIYMFVVHFTIPMIVIFFCYGQLVFTVKEAAAQQQESATTQKAEKEVTRMVIIMVIAFLICWLPYAGVAFYIFTHQGSNFGPIFMTLPAFFAKSSSIYNPVIYIMMNKQFRNCMLTTLCCGKNPLGDDEASTTAS 348 328
-sp|P08100|OPSD_HUMAN gi|283855823|gb|ADB45229.1| 94.82 328 17 0 11 338 1 328 0.0  631 gi|283855823|gb|ADB45229.1| 1627 311 323 0 98.48 1 1 VPFSNATGVVRSPFEYPQYYLAEPWQFSMLAAYMFLLIVLGFPINFLTLYVTVQHKKLRTPLNYILLNLAVADLFMVLGGFTSTLYTSLHGYFVFGPTGCNLEGFFATLGGEIALWSLVVLAIERYVVVCKPMSNFRFGENHAIMGVAFTWVMALACAAPPLAGWSRYIPEGLQCSCGIDYYTLKPEVNNESFVIYMFVVHFTIPMIIIFFCYGQLVFTVKEAAAQQQESATTQKAEKEVTRMVIIMVIAFLICWVPYASVAFYIFTHQGSNFGPIFMTIPAFFAKSAAIYNPVIYIMMNKQFRNCMLTTICCGKNPLGDDEASATVS VPFSNKTGVVRSPFEYPQYYLAEPWQFSMLAAYMFLLIVLGFPINFLTLYVTVQHKKLRTPLNYILLNLAVANLFMVFGGFTTTLYTSMHGYFVFGATGCNLEGFFATLGGEIALWSLVVLAIERYVVVCKPMSNFRFGENHAIMGLAFTWVMALACAAPPLAGWSRYIPEGMQCSCGIDYYTLKPEVNNESFVIYMFVVHFTIPMIVIFFCYGQLVFTVKEAAAQQQESATTQKAEKEVTRMVIIMVVAFLICWLPYASVAFYIFTHQGSNFGPVFMTIPAFFAKSSSIYNPVIYIMMNKQFRNCMLTTLCCGKNPLGDDEASTTAS 348 328
-sp|P08100|OPSD_HUMAN gi|223523|prf||0811197A 93.10 348 23 1 1 348 1 347 0.0  673 gi|223523|prf||0811197A 1736 324 336 1 96.55 1 1 MNGTEGPNFYVPFSNATGVVRSPFEYPQYYLAEPWQFSMLAAYMFLLIVLGFPINFLTLYVTVQHKKLRTPLNYILLNLAVADLFMVLGGFTSTLYTSLHGYFVFGPTGCNLEGFFATLGGEIALWSLVVLAIERYVVVCKPMSNFRFGENHAIMGVAFTWVMALACAAPPLAGWSRYIPEGLQCSCGIDYYTLKPEVNNESFVIYMFVVHFTIPMIIIFFCYGQLVFTVKEAAAQQQESATTQKAEKEVTRMVIIMVIAFLICWVPYASVAFYIFTHQGSNFGPIFMTIPAFFAKSAAIYNPVIYIMMNKQFRNCMLTTICCGKNPLGDDEASATVSKTETSQVAPA MNGTEGPNFYVPFSNKTGVVRSPFEAPQYYLAEPWQFSMLAAYMFLLIMLGFPINFLTLYVTVQHKKLRTPLNYILLNLAVADLFMVFGGFTTTLYTSLHGYFVFGPTGCNLEGFFATLGGEIALWSLVVLAIERYVVVCKPMSNFRFGENHAIMGVAFTWVMALACAAPPLVGWSRYIPEGMQCSCGID-YTPHEETNNESFVIYMFVVHFIIPLIVIFFCYGQLVFTVKEAAAQQQESATTQKAEKEVTRMVIIMVIAFLICWLPYAGVAFYIFTHQGSDFGPIFMTIPAFFAKTSAVYNPVIYIMMNKQFRNCMVTTLCCGKNPLGDDEASTTVSKTETSQVAPA 348 347
-sp|P08100|OPSD_HUMAN gi|12583665|dbj|BAB21486.1| 82.16 342 60 1 1 341 1 342 3e-176  599 gi|12583665|dbj|BAB21486.1| 1544 281 314 1 91.81 1 1 MNGTEGPNFYVPFSNATGVVRSPFEYPQYYLAEPWQFSMLAAYMFLLIVLGFPINFLTLYVTVQHKKLRTPLNYILLNLAVADLFMVLGGFTSTLYTSLHGYFVFGPTGCNLEGFFATLGGEIALWSLVVLAIERYVVVCKPMSNFRFGENHAIMGVAFTWVMALACAAPPLAGWSRYIPEGLQCSCGIDYYTLKPEVNNESFVIYMFVVHFTIPMIIIFFCYGQLVFTVKEAAAQQQESATTQKAEKEVTRMVIIMVIAFLICWVPYASVAFYIFTHQGSNFGPIFMTIPAFFAKSAAIYNPVIYIMMNKQFRNCMLTTICCGKNPL-GDDEASATVSKTE MNGTEGPNFYIPMSNATGVVRSPFEYPQYYLAEPWAFSALSAYMFFLIIAGFPINFLTLYVTIEHKKLRTPLNYILLNLAVADLFMVFGGFTTTMYTSMHGYFVFGPTGCNIEGFFATLGGEIALWCLVVLAIERWMVVCKPVTNFRFGESHAIMGVMVTWTMALACALPPLFGWSRYIPEGLQCSCGIDYYTRAPGINNESFVIYMFTCHFSIPLAVISFCYGRLVCTVKEAAAQQQESETTQRAEREVTRMVVIMVISFLVCWVPYASVAWYIFTHQGSTFGPIFMTIPSFFAKSSALYNPMIYICMNKQFRHCMITTLCCGKNPFEEEDGASATSSKTE 348 354
b
diff -r 45ba7c750bc8 -r 6ef523b390e0 test-data/blastp_human_vs_pdb_seg_no.xml
--- a/test-data/blastp_human_vs_pdb_seg_no.xml Thu Sep 20 10:12:43 2012 -0400
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
b
b'@@ -1,322 +0,0 @@\n-<?xml version="1.0"?>\n-<!DOCTYPE BlastOutput PUBLIC "-//NCBI//NCBI BlastOutput/EN" "NCBI_BlastOutput.dtd">\n-<BlastOutput>\n-  <BlastOutput_program>blastp</BlastOutput_program>\n-  <BlastOutput_version>BLASTP 2.2.24+</BlastOutput_version>\n-  <BlastOutput_reference>Stephen F. Altschul, Thomas L. Madden, Alejandro A. Sch&amp;auml;ffer, Jinghui Zhang, Zheng Zhang, Webb Miller, and David J. Lipman (1997), &quot;Gapped BLAST and PSI-BLAST: a new generation of protein database search programs&quot;, Nucleic Acids Res. 25:3389-3402.</BlastOutput_reference>\n-  <BlastOutput_db>/data/blastdb/pdbaa</BlastOutput_db>\n-  <BlastOutput_query-ID>Query_1</BlastOutput_query-ID>\n-  <BlastOutput_query-def>sp|Q9BS26|ERP44_HUMAN Endoplasmic reticulum resident protein 44 OS=Homo sapiens GN=ERP44 PE=1 SV=1</BlastOutput_query-def>\n-  <BlastOutput_query-len>406</BlastOutput_query-len>\n-  <BlastOutput_param>\n-    <Parameters>\n-      <Parameters_matrix>BLOSUM62</Parameters_matrix>\n-      <Parameters_expect>1e-08</Parameters_expect>\n-      <Parameters_gap-open>11</Parameters_gap-open>\n-      <Parameters_gap-extend>1</Parameters_gap-extend>\n-      <Parameters_filter>F</Parameters_filter>\n-    </Parameters>\n-  </BlastOutput_param>\n-  <BlastOutput_iterations>\n-    <Iteration>\n-      <Iteration_iter-num>1</Iteration_iter-num>\n-      <Iteration_query-ID>Query_1</Iteration_query-ID>\n-      <Iteration_query-def>sp|Q9BS26|ERP44_HUMAN Endoplasmic reticulum resident protein 44 OS=Homo sapiens GN=ERP44 PE=1 SV=1</Iteration_query-def>\n-      <Iteration_query-len>406</Iteration_query-len>\n-      <Iteration_hits>\n-        <Hit>\n-          <Hit_num>1</Hit_num>\n-          <Hit_id>gi|193885198|pdb|2R2J|A</Hit_id>\n-          <Hit_def>Chain A, Crystal Structure Of Human Erp44</Hit_def>\n-          <Hit_accession>2R2J_A</Hit_accession>\n-          <Hit_len>382</Hit_len>\n-          <Hit_hsps>\n-            <Hsp>\n-              <Hsp_num>1</Hsp_num>\n-              
<Hsp_bit-score>768.073791748238</Hsp_bit-score>\n-              <Hsp_score>1982</Hsp_score>\n-              <Hsp_evalue>0</Hsp_evalue>\n-              <Hsp_query-from>26</Hsp_query-from>\n-              <Hsp_query-to>406</Hsp_query-to>\n-              <Hsp_hit-from>2</Hsp_hit-from>\n-              <Hsp_hit-to>382</Hsp_hit-to>\n-              <Hsp_query-frame>0</Hsp_query-frame>\n-              <Hsp_hit-frame>0</Hsp_hit-frame>\n-              <Hsp_identity>370</Hsp_identity>\n-              <Hsp_positive>372</Hsp_positive>\n-              <Hsp_gaps>0</Hsp_gaps>\n-              <Hsp_align-len>381</Hsp_align-len>\n-              <Hsp_qseq>PVTTEITSLDTENIDEILNNADVALVNFYADWCRFSQMLHPIFEEASDVIKEEFPNENQVVFARVDCDQHSDIAQRYRISKYPTLKLFRNGMMMKREYRGQRSVKALADYIRQQKSDPIQEIRDLAEITTLDRSKRNIIGYFEQKDSDNYRVFERVANILHDDCAFLSAFGDVSKPERYSGDNIIYKPPGHSAPDMVYLGAMTNFDVTYNWIQDKCVPLVREITFENGEELTEEGLPFLILFHMKEDTESLEIFQNEVARQLISEKGTINFLHADCDKFRHPLLHIQKTPADCPVIAIDSFRHMYVFGDFKDVLIPGKLKQFVFDLHSGKLHREFHHGPDPTDTAPGEQAQDVASSPPESSFQKLAPSEYRYTLLRDRDEL</Hsp_qseq>\n-              <Hsp_hseq>PLGSEITSLDTENIDEILNNADVALVNFYADWCRFSQXLHPIFEEASDVIKEEFPNENQVVFARVDCDQHSDIAQRYRISKYPTLKLFRNGXXXKREYRGQRSVKALADYIRQQKSDPIQEIRDLAEITTLDRSKRNIIGYFEQKDSDNYRVFERVANILHDDCAFLSAFGDVSKPERYSGDNIIYKPPGHSAPDXVYLGAXTNFDVTYNWIQDKCVPLVREITFENGEELTEEGLPFLILFHXKEDTESLEIFQNEVARQLISEKGTINFLHADCDKFRHPLLHIQKTPADCPVIAIDSFRHXYVFGDFKDVLIPGKLKQFVFDLHSGKLHREFHHGPDPTDTAPGEQAQDVASSPPESSFQKLAPSEYRYTLLRDRDEL</Hsp_hseq>\n-              <Hsp_midline>P+ +EITSLDTENIDEILNNADVALVNFYADWCRFSQ LHPIFEEASDVIKEEFPNENQVVFARVDCDQHSDIAQRYRISKYPTLKLFRNG   KREYRGQRSVKALADYIRQQKSDPIQEIRDLAEITTLDRSKRNIIGYFEQKDSDNYRVFERVANILHDDCAFLSAFGDVSKPERYSGDNIIYKPPGHSAPD VYLGA TNFDVTYNWIQDKCVPLVREITFENGEELTEEGLPFLILFH KEDTESLEIFQNEVARQLISEKGTINFLHADCDKFRHPLLHIQKTPADCPVIAIDSFRH YVFGDFKDVLIPGKLKQFVFDLHSGKLHREFHHGPDPTDTAPGEQAQDVASSPPESSFQKLAPSEYRYTLLRDRDEL</Hsp_midline>\n-            </Hsp>\n-          </Hit_hsps>\n-        </Hit>\n-        <Hit>\n-          <Hit_num>2</Hit_num>\n-      
    <Hit_id>gi|88192228|pdb|2B5E|A</Hit_id>\n-          <Hit_d'..b'      <Hsp_qseq>MNGTEGPNFYVPFSNATGVVRSPFEYPQYYLAEPWQFSMLAAYMFLLIVLGFPINFLTLYVTVQHKKLRTPLNYILLNLAVADLFMVLGGFTSTLYTSLHGYFVFGPTGCNLEGFFATLGGEIALWSLVVLAIERYVVVCKPMSNFRFGENHAIMGVAFTWVMALACAAPPLAGWSRYIPEGLQCSCGIDYYTLKPEVNNESFVIYMFVVHFTIPMIIIFFCYGQLVFTVKEAAAQQQESATTQKAEKEVTRMVIIMVIAFLICWVPYASVAFYIFTHQGSNFGPIFMTIPAFFAKSAAIYNPVIYIMMNKQFRNCMLTTICCGKNPLGDDEASATVSKTETSQVAPA</Hsp_qseq>\n-              <Hsp_hseq>MNGTEGPNFYVPFSNKTGVVRSPFEAPQYYLAEPWQFSMLAAYMFLLIMLGFPINFLTLYVTVQHKKLRTPLNYILLNLAVADLFMVFGGFTTTLYTSLHGYFVFGPTGCNLEGFFATLGGEIALWSLVVLAIERYVVVCKPMSNFRFGENHAIMGVAFTWVMALACAAPPLVGWSRYIPEGMQCSCGIDYYTPHEETNNESFVIYMFVVHFIIPLIVIFFCYGQLVFTVKEAAAQQQESATTQKAEKEVTRMVIIMVIAFLICWLPYAGVAFYIFTHQGSDFGPIFMTIPAFFAKTSAVYNPVIYIMMNKQFRNCMVTTLCCGKNPLGDDEASTTVSKTETSQVAPA</Hsp_hseq>\n-              <Hsp_midline>MNGTEGPNFYVPFSN TGVVRSPFE PQYYLAEPWQFSMLAAYMFLLI+LGFPINFLTLYVTVQHKKLRTPLNYILLNLAVADLFMV GGFT+TLYTSLHGYFVFGPTGCNLEGFFATLGGEIALWSLVVLAIERYVVVCKPMSNFRFGENHAIMGVAFTWVMALACAAPPL GWSRYIPEG+QCSCGIDYYT   E NNESFVIYMFVVHF IP+I+IFFCYGQLVFTVKEAAAQQQESATTQKAEKEVTRMVIIMVIAFLICW+PYA VAFYIFTHQGS+FGPIFMTIPAFFAK++A+YNPVIYIMMNKQFRNCM+TT+CCGKNPLGDDEAS TVSKTETSQVAPA</Hsp_midline>\n-            </Hsp>\n-          </Hit_hsps>\n-        </Hit>\n-        <Hit>\n-          <Hit_num>2</Hit_num>\n-          <Hit_id>gi|195927458|pdb|3C9M|A</Hit_id>\n-          <Hit_def>Chain A, Structure Of A Mutant Bovine Rhodopsin In Hexagonal Crystal Form</Hit_def>\n-          <Hit_accession>3C9M_A</Hit_accession>\n-          <Hit_len>348</Hit_len>\n-          <Hit_hsps>\n-            <Hsp>\n-              <Hsp_num>1</Hsp_num>\n-              <Hsp_bit-score>674.085095224404</Hsp_bit-score>\n-              <Hsp_score>1738</Hsp_score>\n-              <Hsp_evalue>0</Hsp_evalue>\n-              <Hsp_query-from>1</Hsp_query-from>\n-              <Hsp_query-to>348</Hsp_query-to>\n-              <Hsp_hit-from>1</Hsp_hit-from>\n-              
<Hsp_hit-to>348</Hsp_hit-to>\n-              <Hsp_query-frame>0</Hsp_query-frame>\n-              <Hsp_hit-frame>0</Hsp_hit-frame>\n-              <Hsp_identity>324</Hsp_identity>\n-              <Hsp_positive>335</Hsp_positive>\n-              <Hsp_gaps>0</Hsp_gaps>\n-              <Hsp_align-len>348</Hsp_align-len>\n-              <Hsp_qseq>MNGTEGPNFYVPFSNATGVVRSPFEYPQYYLAEPWQFSMLAAYMFLLIVLGFPINFLTLYVTVQHKKLRTPLNYILLNLAVADLFMVLGGFTSTLYTSLHGYFVFGPTGCNLEGFFATLGGEIALWSLVVLAIERYVVVCKPMSNFRFGENHAIMGVAFTWVMALACAAPPLAGWSRYIPEGLQCSCGIDYYTLKPEVNNESFVIYMFVVHFTIPMIIIFFCYGQLVFTVKEAAAQQQESATTQKAEKEVTRMVIIMVIAFLICWVPYASVAFYIFTHQGSNFGPIFMTIPAFFAKSAAIYNPVIYIMMNKQFRNCMLTTICCGKNPLGDDEASATVSKTETSQVAPA</Hsp_qseq>\n-              <Hsp_hseq>MCGTEGPNFYVPFSNKTGVVRSPFEAPQYYLAEPWQFSMLAAYMFLLIMLGFPINFLTLYVTVQHKKLRTPLNYILLNLAVADLFMVFGGFTTTLYTSLHGYFVFGPTGCNLEGFFATLGGEIALWSLVVLAIERYVVVCKPMSNFRFGENHAIMGVAFTWVMALACAAPPLVGWSRYIPEGMQCSCGIDYYTPHEETNNESFVIYMFVVHFIIPLIVIFFCYGQLVFTVKEAAAQQQESATTQKAEKEVTRMVIIMVIAFLICWLPYAGVAFYIFTHQGSCFGPIFMTIPAFFAKTSAVYNPVIYIMMNKQFRNCMVTTLCCGKNPLGDDEASTTVSKTETSQVAPA</Hsp_hseq>\n-              <Hsp_midline>M GTEGPNFYVPFSN TGVVRSPFE PQYYLAEPWQFSMLAAYMFLLI+LGFPINFLTLYVTVQHKKLRTPLNYILLNLAVADLFMV GGFT+TLYTSLHGYFVFGPTGCNLEGFFATLGGEIALWSLVVLAIERYVVVCKPMSNFRFGENHAIMGVAFTWVMALACAAPPL GWSRYIPEG+QCSCGIDYYT   E NNESFVIYMFVVHF IP+I+IFFCYGQLVFTVKEAAAQQQESATTQKAEKEVTRMVIIMVIAFLICW+PYA VAFYIFTHQGS FGPIFMTIPAFFAK++A+YNPVIYIMMNKQFRNCM+TT+CCGKNPLGDDEAS TVSKTETSQVAPA</Hsp_midline>\n-            </Hsp>\n-          </Hit_hsps>\n-        </Hit>\n-      </Iteration_hits>\n-      <Iteration_stat>\n-        <Statistics>\n-          <Statistics_db-num>49615</Statistics_db-num>\n-          <Statistics_db-len>11554246</Statistics_db-len>\n-          <Statistics_hsp-len>0</Statistics_hsp-len>\n-          <Statistics_eff-space>1672994000</Statistics_eff-space>\n-          <Statistics_kappa>0.041</Statistics_kappa>\n-          <Statistics_lambda>0.267</Statistics_lambda>\n-          
<Statistics_entropy>0.14</Statistics_entropy>\n-        </Statistics>\n-      </Iteration_stat>\n-    </Iteration>\n-  </BlastOutput_iterations>\n-</BlastOutput>\n'
b
diff -r 45ba7c750bc8 -r 6ef523b390e0 test-data/blastp_human_vs_pdb_seg_no_converted_ext.tabular
--- a/test-data/blastp_human_vs_pdb_seg_no_converted_ext.tabular Thu Sep 20 10:12:43 2012 -0400
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
b
b'@@ -1,8 +0,0 @@\n-sp|Q9BS26|ERP44_HUMAN\tgi|193885198|pdb|2R2J|A\t97.11\t381\t11\t0\t26\t406\t2\t382\t0.0\t768\tgi|193885198|pdb|2R2J|A\t1982\t370\t372\t0\t97.64\t1\t1\tPVTTEITSLDTENIDEILNNADVALVNFYADWCRFSQMLHPIFEEASDVIKEEFPNENQVVFARVDCDQHSDIAQRYRISKYPTLKLFRNGMMMKREYRGQRSVKALADYIRQQKSDPIQEIRDLAEITTLDRSKRNIIGYFEQKDSDNYRVFERVANILHDDCAFLSAFGDVSKPERYSGDNIIYKPPGHSAPDMVYLGAMTNFDVTYNWIQDKCVPLVREITFENGEELTEEGLPFLILFHMKEDTESLEIFQNEVARQLISEKGTINFLHADCDKFRHPLLHIQKTPADCPVIAIDSFRHMYVFGDFKDVLIPGKLKQFVFDLHSGKLHREFHHGPDPTDTAPGEQAQDVASSPPESSFQKLAPSEYRYTLLRDRDEL\tPLGSEITSLDTENIDEILNNADVALVNFYADWCRFSQXLHPIFEEASDVIKEEFPNENQVVFARVDCDQHSDIAQRYRISKYPTLKLFRNGXXXKREYRGQRSVKALADYIRQQKSDPIQEIRDLAEITTLDRSKRNIIGYFEQKDSDNYRVFERVANILHDDCAFLSAFGDVSKPERYSGDNIIYKPPGHSAPDXVYLGAXTNFDVTYNWIQDKCVPLVREITFENGEELTEEGLPFLILFHXKEDTESLEIFQNEVARQLISEKGTINFLHADCDKFRHPLLHIQKTPADCPVIAIDSFRHXYVFGDFKDVLIPGKLKQFVFDLHSGKLHREFHHGPDPTDTAPGEQAQDVASSPPESSFQKLAPSEYRYTLLRDRDEL\t406\t382\n-sp|Q9BS26|ERP44_HUMAN\tgi|88192228|pdb|2B5E|A\t25.17\t290\t193\t8\t25\t306\t10\t283\t4e-20\t95.1\tgi|88192228|pdb|2B5E|A;gi|206581884|pdb|3BOA|A\t235\t73\t133\t24\t45.86\t1\t1\tTPVTTEITSLDTENIDEILNNADVALVNFYADWCRFSQMLHPIFEEASDVIKEEFPNENQVVFARVDCDQHSDIAQRYRISKYPTLKLFRNGMMMKR-EYRGQRSVKALADYIRQQKSDPIQEIRDLAEITTLDRSKRNIIGYFEQKDSDNYRVFERVANILHDDCAFLSAFGDVSKPERYSGDNI---IYKPPGHSAPDMVYLGA---MTNFDVTYNWIQDKCVPLVREITFENGEELTEEGLPFLILFHMKEDTESLEIFQNEVARQLISEKGTINFLHADCDKF-RH\tAPEDSAVVKLATDSFNEYIQSHDLVLAEFFAPWCGHCKNMAPEYVKAAETLVEK-----NITLAQIDCTENQDLCMEHNIPGFPSLKIFKNSDVNNSIDYEGPRTAEAIVQFMIKQSQPAVAVVADLPAYLANETFVTPVIVQSGKIDADFNATFYSMANKHFNDYDFVSA--------ENADDDFKLSIYLPSAMDEP-VVYNGKKADIADADVFEKWLQVEALPYFGEIDGSVFAQYVESGLPLGYLFY--NDEEELEEYKPLFTELAKKNRGLMNFVSIDARKFGRH\t406\t504\n-sp|Q9NSY1|BMP2K_HUMAN\tgi|73536291|pdb|2BUJ|A\t29.39\t279\t182\t8\t40\t308\t21\t294\t1e-22\t105\tgi|73536291|pdb|2BUJ|A;gi|73536292|pdb|2BUJ|B\t262\t82\t130\t15\t46.59\t1\t1\tGVRVFAVGRHQVTLEESLAEGGFSTVFLVR-THGGIRCALKRMYVNNMPDLNVCKREITIMKELSGHKNIVGYLDCAVNSISDNVWEVLILMEYCRAGQVVNQMN
KKLQTG--FTEPEVLQIFCDTCEAVARLHQCKTPIIHRDLKVENILLNDGGNYVLCDFGSATNKFLNPQKDG-VNVVEEEIKKYTTLSYRAPEMINLYGGKPITTKADIWALGCLLYKLCFFTLPF------GESQVAICDGNFTIPDNSRYSRNIHCLIRFMLEPDPEHRPDI\tGHMVIIDNKHYLFIQK-LGEGGFSYVDLVEGLHDGHFYALKRILCHEQQDREEAQREAD-MHRLFNHPNILRLVAYCLRERGAKH-EAWLLLPFFKRGTLWNEIERLKDKGNFLTEDQILWLLLGICRGLEAIH--AKGYAHRDLKPTNILLGDEGQPVLMDLGSMNQACIHVEGSRQALTLQDWAAQRCTISYRAPELFSVQSHCVIDERTDVWSLGCVLYAMMFGEGPYDMVFQKGDSVALAVQNQLSIPQSPRHSSALWQLLNSMMTVDPHQRPHI\t1161\t317\n-sp|Q9NSY1|BMP2K_HUMAN\tgi|270346335|pdb|2WQM|A\t27.21\t272\t166\t12\t53\t311\t36\t288\t6e-17\t86.3\tgi|270346335|pdb|2WQM|A;gi|270346336|pdb|2WQN|A\t212\t74\t129\t32\t47.43\t1\t1\tLEESLAEGGFSTVFLVRTH-GGIRCALKRMYVNNMPDLNV---CKREITIMKELSGHKNIVGYLDCAVNSISDNVWEVLILMEYCRAGQVVNQMN--KKLQTGFTEPEVLQIFCDTCEAVARLHQCKTPIIHRDLKVENILLNDGGNYVLCDFGSATNKFLNPQKDGVNVVEEEIKKYTTLSYRAPEMINLYGGKPITTKADIWALGCLLYKLCFFTLPFGESQV---AICD----GNFTIPDNSRYSRNIHCLIRFMLEPDPEHRPDIFQV\tIEKKIGRGQFSEVYRAACLLDGVPVALKKVQIFDLMDAKARADCIKEIDLLKQLN-HPNVIKYY---ASFIEDN--ELNIVLELADAGDLSRMIKHFKKQKRLIPERTVWKYFVQLCSALEHMHSRR--VMHRDIKPANVFITATGVVKLGDLG--LGRFFSSKTTAAHSL------VGTPYYMSPERIHENG---YNFKSDIWSLGCLLYEMAALQSPFYGDKMNLYSLCKKIEQCDYPPLPSDHYSEELRQLVNMCINPDPEKRPDVTYV\t1161\t310\n-sp|P06213|INSR_HUMAN\tgi|116667097|pdb|2DTG|E\t95.91\t928\t7\t2\t28\t955\t1\t897\t0.0\t1846\tgi|116667097|pdb|2DTG|E\t4781\t890\t893\t31\t96.23\t1\t1\tHLYPGEVCPGMDIRNNLTRLHELENCSVIEGHLQILLMFKTRPEDFRDLSFPKLIMITDYLLLFRVYGLESLKDLFPNLTVIRGSRLFFNYALVIFEMVHLKELGLYNLMNITRGSVRIEKNNELCYLATIDWSRILDSVEDNYIVLNKDDNEECGDICPGTAKGKTNCPATVINGQFVERCWTHSHCQKVCPTICKSHGCTAEGLCCHSECLGNCSQPDDPTKCVACRNFYLDGRCVETCPPPYYHFQDWRCVNFSFCQDLHHKCKNSRRQGCHQYVIHNNKCIPECPSGYTMNSSNLLCTPCLGPCPKVCHLLEGEKTIDSVTSAQELRGCTVINGSLIINIRGGNNLAAELEANLGLIEEISGYLKIRRSYALVSLSFFRKLRLIRGETLEIGNYSFYALDNQNLRQLWDWSKHNLTITQGKLFFHYNPKLCLSEIHKMEEVSGTKGRQERNDIALKTNGDQASCENELLKFSYIRTSFDKILLRWEPYWPPDFRDLLGFMLFYKEAPYQNVTEFDGQDACGSNSWTVVDIDPPLRSNDPKSQNHPGWLMRGLKPWTQYAIFVKTLVTFSDERRTYGAKSDIIYVQTDATNPSVPLDPISVSNSSSQIILKWKPPSDPNGNITHYLVFWERQAEDSE
LFELDYCLKGLKLPSRTWSPPFESEDSQKHNQSEYEDSAGECCSCPKTDSQILKELEESSFRKTFEDYLHNVVFVPRKTSSGTGAEDPRPSRKRRSLGDVGNVTVAVPTVAAFPNTSS'..b'NLTITQGKLFFHYNPKLCLSEIHKMEEVSGTKGRQERNDIALKTNGDQASCENELLKFSYIRTSFDKILLRWEPYWPPDFRDLLGFMLFYKEAPYQNVTEFDGQDACGSNSWTVVDIDPPLRSNDPKSQNHPGWLMRGLKPWTQYAIFVKTLVTFSDERRTYGAKSDIIYVQTDATNPSVPLDPISVSNSSSQIILKWKPPSDPNGNITHYLVFWERQAEDSELFELDYCLKGLKLPSRTWSPPFESEDSQKHNQSEYEDSAGECCSCPKTDSQILKELEESSFRKTFEDYLHNVVFV------------PRPSRKRRSLGDVGNA-------------------GNNEEHRPFEKVVNKESLVISGLRHFTGYRIELQACNQDTPEERCSVAAYVSARTMPEAKADDIVGPVTHEIFENNVVHLMWQEPKEPNGLIVLYEVSYRRYGDEELHLCDTRKHFALERGCRLRGLSPGNYSVRIRATSLAGNGSWTEPTYFYVTDYLDVPSNIA\t1382\t897\n-sp|P06213|INSR_HUMAN\tgi|114794482|pdb|2HR7|A\t99.59\t485\t2\t0\t28\t512\t1\t485\t0.0\t1016\tgi|114794482|pdb|2HR7|A;gi|114794483|pdb|2HR7|B\t2628\t483\t485\t0\t100.00\t1\t1\tHLYPGEVCPGMDIRNNLTRLHELENCSVIEGHLQILLMFKTRPEDFRDLSFPKLIMITDYLLLFRVYGLESLKDLFPNLTVIRGSRLFFNYALVIFEMVHLKELGLYNLMNITRGSVRIEKNNELCYLATIDWSRILDSVEDNYIVLNKDDNEECGDICPGTAKGKTNCPATVINGQFVERCWTHSHCQKVCPTICKSHGCTAEGLCCHSECLGNCSQPDDPTKCVACRNFYLDGRCVETCPPPYYHFQDWRCVNFSFCQDLHHKCKNSRRQGCHQYVIHNNKCIPECPSGYTMNSSNLLCTPCLGPCPKVCHLLEGEKTIDSVTSAQELRGCTVINGSLIINIRGGNNLAAELEANLGLIEEISGYLKIRRSYALVSLSFFRKLRLIRGETLEIGNYSFYALDNQNLRQLWDWSKHNLTITQGKLFFHYNPKLCLSEIHKMEEVSGTKGRQERNDIALKTNGDQASCENELLKFSYIRTSFDKI\tHLYPGEVCPGMDIRNNLTRLHELENCSVIEGHLQILLMFKTRPEDFRDLSFPKLIMITDYLLLFRVYGLESLKDLFPNLTVIRGSRLFFNYALVIFEMVHLKELGLYNLMNITRGSVRIEKNNELCYLATIDWSRILDSVEDNHIVLNKDDNEECGDICPGTAKGKTNCPATVINGQFVERCWTHSHCQKVCPTICKSHGCTAEGLCCHSECLGNCSQPDDPTKCVACRNFYLDGRCVETCPPPYYHFQDWRCVNFSFCQDLHHKCKNSRRQGCHQYVIHNNKCIPECPSGYTMNSSNLLCTPCLGPCPKVCHLLEGEKTIDSVTSAQELRGCTVINGSLIINIRGGNNLAAELEANLGLIEEISGYLKIRRSYALVSLSFFRKLRLIRGETLEIGNYSFYALDNQNLRQLWDWSKHNLTITQGKLFFHYNPKLCLSEIHKMEEVSGTKGRQERNDIALKTNGDKASCENELLKFSYIRTSFDKI\t1382\t486\n-sp|P08100|OPSD_HUMAN\tgi|16975387|pdb|1JFP|A\t93.39\t348\t23\t0\t1\t348\t1\t348\t0.0\t681\tgi|16975387|pdb|1JFP|A;gi|22219255|pdb|1LN6|A;gi|157878065|pdb|1GZM|A;gi|157878066|pdb|1GZM|B;gi|157
878298|pdb|1HZX|A;gi|157878299|pdb|1HZX|B;gi|157878979|pdb|1L9H|A;gi|157878980|pdb|1L9H|B;gi|157880263|pdb|1U19|A;gi|157880264|pdb|1U19|B;gi|157883606|pdb|2G87|A;gi|157883607|pdb|2G87|B;gi|157883830|pdb|2HPY|A;gi|157883831|pdb|2HPY|B;gi|157883860|pdb|2I35|A;gi|157883861|pdb|2I36|A;gi|157883862|pdb|2I36|B;gi|157883863|pdb|2I36|C;gi|157883864|pdb|2I37|A;gi|157883865|pdb|2I37|B;gi|157883866|pdb|2I37|C;gi|159795066|pdb|2PED|A;gi|159795067|pdb|2PED|B;gi|192988480|pdb|3CAP|A;gi|192988481|pdb|3CAP|B;gi|195927457|pdb|3C9L|A;gi|197107530|pdb|1F88|A;gi|197107531|pdb|1F88|B;gi|206582030|pdb|3DQB|A\t1756\t325\t337\t0\t96.84\t1\t1\tMNGTEGPNFYVPFSNATGVVRSPFEYPQYYLAEPWQFSMLAAYMFLLIVLGFPINFLTLYVTVQHKKLRTPLNYILLNLAVADLFMVLGGFTSTLYTSLHGYFVFGPTGCNLEGFFATLGGEIALWSLVVLAIERYVVVCKPMSNFRFGENHAIMGVAFTWVMALACAAPPLAGWSRYIPEGLQCSCGIDYYTLKPEVNNESFVIYMFVVHFTIPMIIIFFCYGQLVFTVKEAAAQQQESATTQKAEKEVTRMVIIMVIAFLICWVPYASVAFYIFTHQGSNFGPIFMTIPAFFAKSAAIYNPVIYIMMNKQFRNCMLTTICCGKNPLGDDEASATVSKTETSQVAPA\tMNGTEGPNFYVPFSNKTGVVRSPFEAPQYYLAEPWQFSMLAAYMFLLIMLGFPINFLTLYVTVQHKKLRTPLNYILLNLAVADLFMVFGGFTTTLYTSLHGYFVFGPTGCNLEGFFATLGGEIALWSLVVLAIERYVVVCKPMSNFRFGENHAIMGVAFTWVMALACAAPPLVGWSRYIPEGMQCSCGIDYYTPHEETNNESFVIYMFVVHFIIPLIVIFFCYGQLVFTVKEAAAQQQESATTQKAEKEVTRMVIIMVIAFLICWLPYAGVAFYIFTHQGSDFGPIFMTIPAFFAKTSAVYNPVIYIMMNKQFRNCMVTTLCCGKNPLGDDEASTTVSKTETSQVAPA\t348\t348\n-sp|P08100|OPSD_HUMAN\tgi|195927458|pdb|3C9M|A\t93.10\t348\t24\t0\t1\t348\t1\t348\t0.0\t674\tgi|195927458|pdb|3C9M|A\t1738\t324\t335\t0\t96.26\t1\t1\tMNGTEGPNFYVPFSNATGVVRSPFEYPQYYLAEPWQFSMLAAYMFLLIVLGFPINFLTLYVTVQHKKLRTPLNYILLNLAVADLFMVLGGFTSTLYTSLHGYFVFGPTGCNLEGFFATLGGEIALWSLVVLAIERYVVVCKPMSNFRFGENHAIMGVAFTWVMALACAAPPLAGWSRYIPEGLQCSCGIDYYTLKPEVNNESFVIYMFVVHFTIPMIIIFFCYGQLVFTVKEAAAQQQESATTQKAEKEVTRMVIIMVIAFLICWVPYASVAFYIFTHQGSNFGPIFMTIPAFFAKSAAIYNPVIYIMMNKQFRNCMLTTICCGKNPLGDDEASATVSKTETSQVAPA\tMCGTEGPNFYVPFSNKTGVVRSPFEAPQYYLAEPWQFSMLAAYMFLLIMLGFPINFLTLYVTVQHKKLRTPLNYILLNLAVADLFMVFGGFTTTLYTSLHGYFVFGPTGCNLEGFFATLGGEIALWSLVVLAIERYVVVCKPMSNFRFGENHAIMGVAFTWVM
ALACAAPPLVGWSRYIPEGMQCSCGIDYYTPHEETNNESFVIYMFVVHFIIPLIVIFFCYGQLVFTVKEAAAQQQESATTQKAEKEVTRMVIIMVIAFLICWLPYAGVAFYIFTHQGSCFGPIFMTIPAFFAKTSAVYNPVIYIMMNKQFRNCMVTTLCCGKNPLGDDEASTTVSKTETSQVAPA\t348\t348\n'
b
diff -r 45ba7c750bc8 -r 6ef523b390e0 test-data/blastp_human_vs_pdb_seg_no_converted_std.tabular
--- a/test-data/blastp_human_vs_pdb_seg_no_converted_std.tabular Thu Sep 20 10:12:43 2012 -0400
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
b
@@ -1,8 +0,0 @@
-sp|Q9BS26|ERP44_HUMAN gi|193885198|pdb|2R2J|A 97.11 381 11 0 26 406 2 382 0.0 768
-sp|Q9BS26|ERP44_HUMAN gi|88192228|pdb|2B5E|A 25.17 290 193 8 25 306 10 283 4e-20 95.1
-sp|Q9NSY1|BMP2K_HUMAN gi|73536291|pdb|2BUJ|A 29.39 279 182 8 40 308 21 294 1e-22 105
-sp|Q9NSY1|BMP2K_HUMAN gi|270346335|pdb|2WQM|A 27.21 272 166 12 53 311 36 288 6e-17 86.3
-sp|P06213|INSR_HUMAN gi|116667097|pdb|2DTG|E 95.91 928 7 2 28 955 1 897 0.0 1846
-sp|P06213|INSR_HUMAN gi|114794482|pdb|2HR7|A 99.59 485 2 0 28 512 1 485 0.0 1016
-sp|P08100|OPSD_HUMAN gi|16975387|pdb|1JFP|A 93.39 348 23 0 1 348 1 348 0.0 681
-sp|P08100|OPSD_HUMAN gi|195927458|pdb|3C9M|A 93.10 348 24 0 1 348 1 348 0.0 674
b
diff -r 45ba7c750bc8 -r 6ef523b390e0 test-data/blastp_rhodopsin_vs_four_human.tabular
--- a/test-data/blastp_rhodopsin_vs_four_human.tabular Thu Sep 20 10:12:43 2012 -0400
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
b
@@ -1,6 +0,0 @@
-gi|57163783|ref|NP_001009242.1| sp|P08100|OPSD_HUMAN 96.55 348 12 0 1 348 1 348 0.0  679
-gi|3024260|sp|P56514.1|OPSD_BUFBU sp|P08100|OPSD_HUMAN 83.33 354 53 2 1 354 1 348 6e-178  605
-gi|283855846|gb|ADB45242.1| sp|P08100|OPSD_HUMAN 94.82 328 17 0 1 328 11 338 0.0  630
-gi|283855823|gb|ADB45229.1| sp|P08100|OPSD_HUMAN 94.82 328 17 0 1 328 11 338 0.0  630
-gi|223523|prf||0811197A sp|P08100|OPSD_HUMAN 93.10 348 23 1 1 347 1 348 0.0  651
-gi|12583665|dbj|BAB21486.1| sp|P08100|OPSD_HUMAN 81.09 349 65 1 1 349 1 348 2e-172  587
b
diff -r 45ba7c750bc8 -r 6ef523b390e0 test-data/blastp_sample.xml
--- a/test-data/blastp_sample.xml Thu Sep 20 10:12:43 2012 -0400
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
[
b'@@ -1,293 +0,0 @@\n-<?xml version="1.0"?>\n-<!DOCTYPE BlastOutput PUBLIC "-//NCBI//NCBI BlastOutput/EN" "NCBI_BlastOutput.dtd">\n-<BlastOutput>\n-  <BlastOutput_program>blastp</BlastOutput_program>\n-  <BlastOutput_version>BLASTP 2.2.24+</BlastOutput_version>\n-  <BlastOutput_reference>Stephen F. Altschul, Thomas L. Madden, Alejandro A. Sch&amp;auml;ffer, Jinghui Zhang, Zheng Zhang, Webb Miller, and David J. Lipman (1997), &quot;Gapped BLAST and PSI-BLAST: a new generation of protein database search programs&quot;, Nucleic Acids Res. 25:3389-3402.</BlastOutput_reference>\n-  <BlastOutput_db>nr</BlastOutput_db>\n-  <BlastOutput_query-ID>Query_1</BlastOutput_query-ID>\n-  <BlastOutput_query-def>Sample</BlastOutput_query-def>\n-  <BlastOutput_query-len>516</BlastOutput_query-len>\n-  <BlastOutput_param>\n-    <Parameters>\n-      <Parameters_matrix>BLOSUM62</Parameters_matrix>\n-      <Parameters_expect>1e-30</Parameters_expect>\n-      <Parameters_gap-open>11</Parameters_gap-open>\n-      <Parameters_gap-extend>1</Parameters_gap-extend>\n-      <Parameters_filter>F</Parameters_filter>\n-    </Parameters>\n-  </BlastOutput_param>\n-  <BlastOutput_iterations>\n-    <Iteration>\n-      <Iteration_iter-num>1</Iteration_iter-num>\n-      <Iteration_query-ID>Query_1</Iteration_query-ID>\n-      <Iteration_query-def>Sample</Iteration_query-def>\n-      <Iteration_query-len>516</Iteration_query-len>\n-      <Iteration_hits>\n-        <Hit>\n-          <Hit_num>1</Hit_num>\n-          <Hit_id>gi|119953746|ref|YP_950551.1|</Hit_id>\n-          <Hit_def>tail tape measure protein [Streptococcus phage SMP] &gt;gi|118430558|gb|ABK91882.1| tail tape measure protein [Streptococcus suis phage SMP]</Hit_def>\n-          <Hit_accession>YP_950551</Hit_accession>\n-          <Hit_len>659</Hit_len>\n-          <Hit_hsps>\n-            <Hsp>\n-              <Hsp_num>1</Hsp_num>\n-              <Hsp_bit-score>949.117592429394</Hsp_bit-score>\n-              <Hsp_score>2452</Hsp_score>\n-  
            <Hsp_evalue>0</Hsp_evalue>\n-              <Hsp_query-from>1</Hsp_query-from>\n-              <Hsp_query-to>516</Hsp_query-to>\n-              <Hsp_hit-from>27</Hsp_hit-from>\n-              <Hsp_hit-to>542</Hsp_hit-to>\n-              <Hsp_query-frame>0</Hsp_query-frame>\n-              <Hsp_hit-frame>0</Hsp_hit-frame>\n-              <Hsp_identity>500</Hsp_identity>\n-              <Hsp_positive>500</Hsp_positive>\n-              <Hsp_gaps>0</Hsp_gaps>\n-              <Hsp_align-len>516</Hsp_align-len>\n-              <Hsp_qseq>FHLLNSGGSALSVMFAKLVGIIAGISAPIWXXXXXXXXXXXXXXXXYNTNEEFRTKVQAAWEAIKSAISTAVEAVVSFVMDLWGQMVAWWNENQELIRQTAETVWNAIRTVVETVMTALIPIVQTAWDLILAVVTTVLNVIKTVVDTGLKVVLGIIKAVMQMINGDWSGAWETLKGVAGTIWEGIKSLVQVAIDGLVQIFQTGLAFLKSIWDTVWGTIMAVVGPIWDWIKTTVSNAITAVWEIIQNIMTSIQTTWDTVWNAISTVASNIWTAISTTVMSVLTTIWGYIQTYLELIKTVWSAAWEIIKAVFAAILLTIVGLVTGNFDLIKQAISNAWEIIKTKTSEIWNAITTFLSGIWEGIKTAASTAWEWIKTTISNVMTTIKSNIETAWNNIKTSISNALNNIKSAAENAWNNIKSAISTAIENIKSTVSNGWNNLVSTVTNAGPRIVSAVRTGFDNAVNAARNFISNAISVGGDLINGFVEGVKGAAGRLIDAVGGAVSGAIDWAKGLLGIKS</Hsp_qseq>\n-              <Hsp_hseq>FHLLNSGGSALSVMFAKLVGIIAGISAPIWAVIGVIAALVAGFVLLYNTNEEFRTKVQAAWEAIKSAISTAVEAVVSFVMDLWGQMVAWWNENQELIRQTAETVWNAIRTVVETVMTALIPIVQTAWDLILAVVTTVLNVIKTVVDTGLKVVLGIIKAVMQMINGDWSGAWETLKGVAGTIWEGIKSLVQVAIDGLVQIFQTGLAFLKSIWDTVWGTIMAVVGPIWDWIKTTVSNAITAVWEIIQNIMTSIQTTWDTVWNAISTVASNIWTAISTTVMSVLTTIWGYIQTYLELIKTVWSAAWEIIKAVFAAILLTIVGLVTGNFDLIKQAISNAWEIIKTKTSEIWNAITTFLSGIWEGIKTAASTAWEWIKTTISNVMTTIKSNIETAWNNIKTSISNALNNIKSAAENAWNNIKSAISTAIENIKSTVSNGWNNLVSTVTNAGPRIVSAVRTGFDNAVNAARNFISNAISVGGDLINGFVEGVKGAAGRLIDAVGGAVSGAIDWAKGLLGIKS</Hsp_hseq>\n-              <Hsp_midline>FHLLNSGGSALSVMFAKLVGIIAGISAPIW                
YNTNEEFRTKVQAAWEAIKSAISTAVEAVVSFVMDLWGQMVAWWNENQELIRQTAETVWNAIRTVVETVMTALIPIVQTAWDLILAVVTTVLNVIKTVVDTGLKVVLGIIKAVMQMINGDWSGAWETLKGVAGTIWEGIKSLVQVAIDGLVQIFQTGLAFLKSIWDTVWGTIMAVVGPIWDWIKTTVSNAITAVWEIIQNIMTSIQTTWDTVWNAISTVASNIWTAISTTVMSVLTTIWGYIQTYLELIKTVWSAAWEIIKAVFAAILLTIVGLVTGNFDLIKQAISNAWEIIKTKTSEIWNAITTFLSGIWEGIKTAASTAWEWIKTTISNVMTTIKSNIETAWNN'..b'NVFNAIKNTATNVWNAIKTTISNVVQTILNF---------------------------------VTPIFNTMKNTITNIFNAIRNTASSVWNSIKTTISNIVTSVKNTVINIFNALKNSITNIFNAIRNTASTVWNSIKSTVSNIVSATVNTVKNLFNGMKNTVSSIWDGVRNTISNVVNAVKNTISNVWGGITGTVSN----IFNGVKNAIDGPMNAAKNLVKNVV----DAIKGF</Hsp_hseq>\n-              <Hsp_midline>+++V     L G +V  WN+    + +         +  ++  +  VE V   +   +QT W++I AVV  ++    N+ K + D          KA  Q +       W+ +K +A  +WE I   V   I+G + + +      K+ +  +W  ++  V   W+ IK TV++  TA+   +  I  +I+TT   V+NAI   A+N+W AI TT+ +V+ TI  +                                 VT  F+ +K  I+N +  I+   S +WN+I T +S I   +K      +  +K +I+N+   I++   T WN+IK+++S           N  N +K+   + W+ +++ IS  +  +K+T+SN W  +  TV+N    I + V+   D  +NAA+N + N +    D I GF</Hsp_midline>\n-            </Hsp>\n-          </Hit_hsps>\n-        </Hit>\n-        <Hit>\n-          <Hit_num>9</Hit_num>\n-          <Hit_id>gi|163941333|ref|YP_001646217.1|</Hit_id>\n-          <Hit_def>prophage LambdaBa01, membrane protein, putative [Bacillus weihenstephanensis KBAB4] &gt;gi|163863530|gb|ABY44589.1| prophage LambdaBa01, membrane protein, putative [Bacillus weihenstephanensis KBAB4]</Hit_def>\n-          <Hit_accession>YP_001646217</Hit_accession>\n-          <Hit_len>725</Hit_len>\n-          <Hit_hsps>\n-            <Hsp>\n-              <Hsp_num>1</Hsp_num>\n-              <Hsp_bit-score>138.657684699283</Hsp_bit-score>\n-              <Hsp_score>348</Hsp_score>\n-              <Hsp_evalue>8.15996781441799e-31</Hsp_evalue>\n-              <Hsp_query-from>61</Hsp_query-from>\n-              <Hsp_query-to>480</Hsp_query-to>\n-              <Hsp_hit-from>142</Hsp_hit-from>\n-              
<Hsp_hit-to>560</Hsp_hit-to>\n-              <Hsp_query-frame>0</Hsp_query-frame>\n-              <Hsp_hit-frame>0</Hsp_hit-frame>\n-              <Hsp_identity>118</Hsp_identity>\n-              <Hsp_positive>203</Hsp_positive>\n-              <Hsp_gaps>29</Hsp_gaps>\n-              <Hsp_align-len>434</Hsp_align-len>\n-              <Hsp_qseq>WEAIKSAISTAVEAVVSFVMDLWGQMVAWWNENQELIRQTAETVWNAIRTVVETVMTALIPIVQTAWDLILAVVTTVLNVIKTVVDTGLKVVLGIIK---AVMQMINGDWSGAWETLKGVAGTIWEGIKSLVQVAIDGLVQIFQTGLAFLKSIWDTVWGTIMAVVGPIWDWIKTTVSNAITAVWEIIQNIMTSIQTTWDTVWNAISTVASNIWTAISTTVMSVLTTIWGYIQTYLELIKT----VWS-------AAWEIIKAVFAAILLTIVGLVTGNFDLIKQAISNAWEIIKTKTSEIWNAITTFLSGIWEGIKTAASTAWEWIKTTISNVMTTIKSNIETAWNNIKTSISNALNNIKSAAENAWNNIKSAISTAIENIKSTVSNGWNNLVSTVTNAGPRIVSAVRTGFDNAVNAARNFISNAISVGGDLIN</Hsp_qseq>\n-              <Hsp_hseq>WDAIKQWTIDAWNAIGEFLVGIWDGIVQWASEAWNSISESTSAVWNSIKEFLIGIWNGIVEFVVT-WGT--AILETYVGIWTSIFNFCMEIWNGIVEYLTSVLQGIATFFTEIWTSISTFFQEIWNGLVAFITPVLQGIADFFAM-----------IWNGISTVIQTVWNFITQYLQAIWTAILYFATPLFESIKNFISECWNKISSTTSLVWETIKNFLVSCWNGLVSFVTPIFEKIKSWIISVWDTISSATMAVWNAVKNFLQACWNGLVSIVTPIFDAIKNWIVNVWNAISSTTSAVWNAIKSYLSSLWNSIVSTASSIFNSIKSAISTVWNMISSASSSVWNGIKSTLSSIWNGIKSTASSVWNGLKDAIMTPVRWVTSAVSGAFNGMKSAVLGVWDGIKSGIRTAINGIIRIINKFI-DGFNTPAELLN</Hsp_hseq>\n-              <Hsp_midline>W+AIK     A  A+  F++ +W  +V W +E    I ++   VWN+I+  +  +   ++  V T W    A++ T + +  ++ +  +++  GI++   +V+Q I   ++  W ++      IW G+ + +   + G+   F             +W  I  V+  +W++I   +    TA+      +  SI+      WN IS+  S +W  I   ++S    +  ++    E IK+    VW        A W  +K    A    +V +VT  FD IK  I N W  I + TS +WNAI ++LS +W  I + AS+ +  IK+ IS V   I S   + WN IK+++S+  N IKS A + WN +K AI T +  + S VS  +N + S V      I S +RT  +  +     FI +  +   +L+N</Hsp_midline>\n-            </Hsp>\n-          </Hit_hsps>\n-        </Hit>\n-      </Iteration_hits>\n-      <Iteration_stat>\n-        <Statistics>\n-          <Statistics_db-num>6589360</Statistics_db-num>\n-          
<Statistics_db-len>-2041834015</Statistics_db-len>\n-          <Statistics_hsp-len>0</Statistics_hsp-len>\n-          <Statistics_eff-space>504129014857</Statistics_eff-space>\n-          <Statistics_kappa>0.041</Statistics_kappa>\n-          <Statistics_lambda>0.267</Statistics_lambda>\n-          <Statistics_entropy>0.14</Statistics_entropy>\n-        </Statistics>\n-      </Iteration_stat>\n-    </Iteration>\n-  </BlastOutput_iterations>\n-</BlastOutput>\n'
b
diff -r 45ba7c750bc8 -r 6ef523b390e0 test-data/blastp_sample_converted.tabular
--- a/test-data/blastp_sample_converted.tabular Thu Sep 20 10:12:43 2012 -0400
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
b
@@ -1,9 +0,0 @@
-Sample gi|119953746|ref|YP_950551.1| 96.90 516 16 0 1 516 27 542 0.0 949
-Sample gi|148986157|ref|ZP_01819143.1| 41.27 252 115 3 49 300 679 897 2e-41 174
-Sample gi|77411259|ref|ZP_00787609.1| 41.00 261 143 2 50 310 655 904 8e-39 165
-Sample gi|76786754|ref|YP_329383.1| 39.46 261 147 2 50 310 655 904 7e-37 159
-Sample gi|153811333|ref|ZP_01964001.1| 29.98 557 277 18 3 516 573 1059 2e-36 157
-Sample gi|56962696|ref|YP_174422.1| 28.79 389 228 8 48 433 123 465 3e-33 146
-Sample gi|50914476|ref|YP_060448.1| 43.82 178 100 0 50 227 655 832 5e-33 146
-Sample gi|29374987|ref|NP_814140.1| 25.46 432 244 8 73 482 545 920 7e-31 139
-Sample gi|163941333|ref|YP_001646217.1| 27.19 434 287 7 61 480 142 560 8e-31 138
b
diff -r 45ba7c750bc8 -r 6ef523b390e0 test-data/blastx_rhodopsin_vs_four_human.tabular
--- a/test-data/blastx_rhodopsin_vs_four_human.tabular Thu Sep 20 10:12:43 2012 -0400
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
b
@@ -1,10 +0,0 @@
-gi|57163782|ref|NM_001009242.1| sp|P08100|OPSD_HUMAN 96.55 348 12 0 1 1044 1 348 0.0  662
-gi|2734705|gb|U59921.1|BBU59921 sp|P08100|OPSD_HUMAN 85.24 332 49 0 42 1037 1 332 1e-168  575
-gi|283855845|gb|GQ290303.1| sp|P08100|OPSD_HUMAN 96.40 111 4 0 1 333 11 121 2e-62  224
-gi|283855845|gb|GQ290303.1| sp|P08100|OPSD_HUMAN 92.31 65 5 0 3174 3368 248 312 5e-34  129
-gi|283855845|gb|GQ290303.1| sp|P08100|OPSD_HUMAN 96.43 56 2 0 2855 3022 177 232 2e-31  120
-gi|283855845|gb|GQ290303.1| sp|P08100|OPSD_HUMAN 93.22 59 4 0 1404 1580 119 177 1e-30  118
-gi|283855845|gb|GQ290303.1| sp|P08100|OPSD_HUMAN 92.00 25 2 0 4222 4296 312 336 6e-12 56.2
-gi|283855822|gb|GQ290312.1| sp|P08100|OPSD_HUMAN 95.09 326 16 0 1 978 11 336 2e-180  613
-gi|18148870|dbj|AB062417.1| sp|P08100|OPSD_HUMAN 93.39 348 23 0 1 1044 1 348 0.0  641
-gi|12583664|dbj|AB043817.1| sp|P08100|OPSD_HUMAN 81.93 332 60 0 23 1018 1 332 6e-164  559
b
diff -r 45ba7c750bc8 -r 6ef523b390e0 test-data/blastx_rhodopsin_vs_four_human.xml
--- a/test-data/blastx_rhodopsin_vs_four_human.xml Thu Sep 20 10:12:43 2012 -0400
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
b
b'@@ -1,722 +0,0 @@\n-<?xml version="1.0"?>\n-<!DOCTYPE BlastOutput PUBLIC "-//NCBI//NCBI BlastOutput/EN" "NCBI_BlastOutput.dtd">\n-<BlastOutput>\n-  <BlastOutput_program>blastx</BlastOutput_program>\n-  <BlastOutput_version>BLASTX 2.2.25+</BlastOutput_version>\n-  <BlastOutput_reference>Stephen F. Altschul, Thomas L. Madden, Alejandro A. Sch&amp;auml;ffer, Jinghui Zhang, Zheng Zhang, Webb Miller, and David J. Lipman (1997), &quot;Gapped BLAST and PSI-BLAST: a new generation of protein database search programs&quot;, Nucleic Acids Res. 25:3389-3402.</BlastOutput_reference>\n-  <BlastOutput_db></BlastOutput_db>\n-  <BlastOutput_query-ID>Query_1</BlastOutput_query-ID>\n-  <BlastOutput_query-def>gi|57163782|ref|NM_001009242.1| Felis catus rhodopsin (RHO), mRNA</BlastOutput_query-def>\n-  <BlastOutput_query-len>1047</BlastOutput_query-len>\n-  <BlastOutput_param>\n-    <Parameters>\n-      <Parameters_matrix>BLOSUM62</Parameters_matrix>\n-      <Parameters_expect>1e-10</Parameters_expect>\n-      <Parameters_gap-open>11</Parameters_gap-open>\n-      <Parameters_gap-extend>1</Parameters_gap-extend>\n-      <Parameters_filter>L;</Parameters_filter>\n-    </Parameters>\n-  </BlastOutput_param>\n-  <BlastOutput_iterations>\n-    <Iteration>\n-      <Iteration_iter-num>1</Iteration_iter-num>\n-      <Iteration_query-ID>Query_1</Iteration_query-ID>\n-      <Iteration_query-def>gi|57163782|ref|NM_001009242.1| Felis catus rhodopsin (RHO), mRNA</Iteration_query-def>\n-      <Iteration_query-len>1047</Iteration_query-len>\n-      <Iteration_hits></Iteration_hits>\n-      <Iteration_stat>\n-        <Statistics>\n-          <Statistics_db-num>0</Statistics_db-num>\n-          <Statistics_db-len>0</Statistics_db-len>\n-          <Statistics_hsp-len>29</Statistics_hsp-len>\n-          <Statistics_eff-space>102080</Statistics_eff-space>\n-          <Statistics_kappa>0.041</Statistics_kappa>\n-          <Statistics_lambda>0.267</Statistics_lambda>\n-          
<Statistics_entropy>0.14</Statistics_entropy>\n-        </Statistics>\n-      </Iteration_stat>\n-      <Iteration_message>No hits found</Iteration_message>\n-    </Iteration>\n-    <Iteration>\n-      <Iteration_iter-num>2</Iteration_iter-num>\n-      <Iteration_query-ID>Query_1</Iteration_query-ID>\n-      <Iteration_query-def>gi|57163782|ref|NM_001009242.1| Felis catus rhodopsin (RHO), mRNA</Iteration_query-def>\n-      <Iteration_query-len>1047</Iteration_query-len>\n-      <Iteration_hits></Iteration_hits>\n-      <Iteration_stat>\n-        <Statistics>\n-          <Statistics_db-num>0</Statistics_db-num>\n-          <Statistics_db-len>0</Statistics_db-len>\n-          <Statistics_hsp-len>29</Statistics_hsp-len>\n-          <Statistics_eff-space>102080</Statistics_eff-space>\n-          <Statistics_kappa>0.041</Statistics_kappa>\n-          <Statistics_lambda>0.267</Statistics_lambda>\n-          <Statistics_entropy>0.14</Statistics_entropy>\n-        </Statistics>\n-      </Iteration_stat>\n-      <Iteration_message>No hits found</Iteration_message>\n-    </Iteration>\n-    <Iteration>\n-      <Iteration_iter-num>3</Iteration_iter-num>\n-      <Iteration_query-ID>Query_1</Iteration_query-ID>\n-      <Iteration_query-def>gi|57163782|ref|NM_001009242.1| Felis catus rhodopsin (RHO), mRNA</Iteration_query-def>\n-      <Iteration_query-len>1047</Iteration_query-len>\n-      <Iteration_hits></Iteration_hits>\n-      <Iteration_stat>\n-        <Statistics>\n-          <Statistics_db-num>0</Statistics_db-num>\n-          <Statistics_db-len>0</Statistics_db-len>\n-          <Statistics_hsp-len>29</Statistics_hsp-len>\n-          <Statistics_eff-space>102080</Statistics_eff-space>\n-          <Statistics_kappa>0.041</Statistics_kappa>\n-          <Statistics_lambda>0.267</Statistics_lambda>\n-          <Statistics_entropy>0.14</Statistics_entropy>\n-        </Statistics>\n-      </Iteration_stat>\n-      <Iteration_message>No hits found</Iteration_message>\n-    
</Iteration>\n-    <Iteration>\n-      <Iteration_iter-num>4</Iteration_iter-num>\n-      <Iteration_query-ID>Quer'..b'ion>\n-      <Iteration_iter-num>23</Iteration_iter-num>\n-      <Iteration_query-ID>Query_6</Iteration_query-ID>\n-      <Iteration_query-def>gi|12583664|dbj|AB043817.1| Conger myriaster conf gene for fresh water form rod opsin, complete cds</Iteration_query-def>\n-      <Iteration_query-len>1344</Iteration_query-len>\n-      <Iteration_hits></Iteration_hits>\n-      <Iteration_stat>\n-        <Statistics>\n-          <Statistics_db-num>0</Statistics_db-num>\n-          <Statistics_db-len>0</Statistics_db-len>\n-          <Statistics_hsp-len>31</Statistics_hsp-len>\n-          <Statistics_eff-space>132189</Statistics_eff-space>\n-          <Statistics_kappa>0.041</Statistics_kappa>\n-          <Statistics_lambda>0.267</Statistics_lambda>\n-          <Statistics_entropy>0.14</Statistics_entropy>\n-        </Statistics>\n-      </Iteration_stat>\n-      <Iteration_message>No hits found</Iteration_message>\n-    </Iteration>\n-    <Iteration>\n-      <Iteration_iter-num>24</Iteration_iter-num>\n-      <Iteration_query-ID>Query_6</Iteration_query-ID>\n-      <Iteration_query-def>gi|12583664|dbj|AB043817.1| Conger myriaster conf gene for fresh water form rod opsin, complete cds</Iteration_query-def>\n-      <Iteration_query-len>1344</Iteration_query-len>\n-      <Iteration_hits>\n-        <Hit>\n-          <Hit_num>1</Hit_num>\n-          <Hit_id>Subject_4</Hit_id>\n-          <Hit_def>sp|P08100|OPSD_HUMAN Rhodopsin OS=Homo sapiens GN=RHO PE=1 SV=1</Hit_def>\n-          <Hit_accession>Subject_4</Hit_accession>\n-          <Hit_len>348</Hit_len>\n-          <Hit_hsps>\n-            <Hsp>\n-              <Hsp_num>1</Hsp_num>\n-              <Hsp_bit-score>559.295621601033</Hsp_bit-score>\n-              <Hsp_score>1440</Hsp_score>\n-              <Hsp_evalue>6.32632556748138e-164</Hsp_evalue>\n-              <Hsp_query-from>23</Hsp_query-from>\n-         
     <Hsp_query-to>1018</Hsp_query-to>\n-              <Hsp_hit-from>1</Hsp_hit-from>\n-              <Hsp_hit-to>332</Hsp_hit-to>\n-              <Hsp_query-frame>2</Hsp_query-frame>\n-              <Hsp_hit-frame>0</Hsp_hit-frame>\n-              <Hsp_identity>272</Hsp_identity>\n-              <Hsp_positive>307</Hsp_positive>\n-              <Hsp_gaps>0</Hsp_gaps>\n-              <Hsp_align-len>332</Hsp_align-len>\n-              <Hsp_qseq>MNGTEGPNFYIPMSNATGVVRSPFEYPQYYLAEPWAFSALSAYMFFLIIAGFPINFLTLYVTIEHKKLRTPLNYILLNLAVADLFMVFGGFTTTMYTSMHGYFVFGPTGCNIEGFFATLGGEIALWCLVVLAIERWMVVCKPVTNFRFGESHAIMGVMVTWTMALACALPPLFGWSRYIPEGLQCSCGIDYYTRAPGINNESFVIYMFTCHFSIPLAVISFCYGRLVCTVKXXXXXXXXXXXXXXXXXXVTRMVVIMVISFLVCWVPYASVAWYIFTHQGSTFGPIFMTIPSFFAKSSALYNPMIYICMNKQFRHCMITTLCCGKNPFEEED</Hsp_qseq>\n-              <Hsp_hseq>MNGTEGPNFYVPFSNATGVVRSPFEYPQYYLAEPWQFSMLAAYMFLLIVLGFPINFLTLYVTVQHKKLRTPLNYILLNLAVADLFMVLGGFTSTLYTSLHGYFVFGPTGCNLEGFFATLGGEIALWSLVVLAIERYVVVCKPMSNFRFGENHAIMGVAFTWVMALACAAPPLAGWSRYIPEGLQCSCGIDYYTLKPEVNNESFVIYMFVVHFTIPMIIIFFCYGQLVFTVKEAAAQQQESATTQKAEKEVTRMVIIMVIAFLICWVPYASVAFYIFTHQGSNFGPIFMTIPAFFAKSAAIYNPVIYIMMNKQFRNCMLTTICCGKNPLGDDE</Hsp_hseq>\n-              <Hsp_midline>MNGTEGPNFY+P SNATGVVRSPFEYPQYYLAEPW FS L+AYMF LI+ GFPINFLTLYVT++HKKLRTPLNYILLNLAVADLFMV GGFT+T+YTS+HGYFVFGPTGCN+EGFFATLGGEIALW LVVLAIER++VVCKP++NFRFGE+HAIMGV  TW MALACA PPL GWSRYIPEGLQCSCGIDYYT  P +NNESFVIYMF  HF+IP+ +I FCYG+LV TVKEAAAQQQES TTQ+AE+EVTRMV+IMVI+FL+CWVPYASVA+YIFTHQGS FGPIFMTIP+FFAKS+A+YNP+IYI MNKQFR+CM+TT+CCGKNP  +++</Hsp_midline>\n-            </Hsp>\n-          </Hit_hsps>\n-        </Hit>\n-      </Iteration_hits>\n-      <Iteration_stat>\n-        <Statistics>\n-          <Statistics_db-num>0</Statistics_db-num>\n-          <Statistics_db-len>0</Statistics_db-len>\n-          <Statistics_hsp-len>31</Statistics_hsp-len>\n-          <Statistics_eff-space>132189</Statistics_eff-space>\n-          <Statistics_kappa>0.041</Statistics_kappa>\n-          
<Statistics_lambda>0.267</Statistics_lambda>\n-          <Statistics_entropy>0.14</Statistics_entropy>\n-        </Statistics>\n-      </Iteration_stat>\n-    </Iteration>\n-  </BlastOutput_iterations>\n-</BlastOutput>\n'
b
diff -r 45ba7c750bc8 -r 6ef523b390e0 test-data/blastx_rhodopsin_vs_four_human_converted.tabular
--- a/test-data/blastx_rhodopsin_vs_four_human_converted.tabular Thu Sep 20 10:12:43 2012 -0400
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
b
@@ -1,10 +0,0 @@
-gi|57163782|ref|NM_001009242.1| sp|P08100|OPSD_HUMAN 96.55 348 12 0 1 1044 1 348 0.0 662
-gi|2734705|gb|U59921.1|BBU59921 sp|P08100|OPSD_HUMAN 85.24 332 49 0 42 1037 1 332 1e-168 575
-gi|283855845|gb|GQ290303.1| sp|P08100|OPSD_HUMAN 96.40 111 4 0 1 333 11 121 2e-62 224
-gi|283855845|gb|GQ290303.1| sp|P08100|OPSD_HUMAN 92.31 65 5 0 3174 3368 248 312 5e-34 129
-gi|283855845|gb|GQ290303.1| sp|P08100|OPSD_HUMAN 96.43 56 2 0 2855 3022 177 232 2e-31 120
-gi|283855845|gb|GQ290303.1| sp|P08100|OPSD_HUMAN 93.22 59 4 0 1404 1580 119 177 1e-30 118
-gi|283855845|gb|GQ290303.1| sp|P08100|OPSD_HUMAN 92.00 25 2 0 4222 4296 312 336 6e-12 56.2
-gi|283855822|gb|GQ290312.1| sp|P08100|OPSD_HUMAN 95.09 326 16 0 1 978 11 336 2e-180 613
-gi|18148870|dbj|AB062417.1| sp|P08100|OPSD_HUMAN 93.39 348 23 0 1 1044 1 348 0.0 641
-gi|12583664|dbj|AB043817.1| sp|P08100|OPSD_HUMAN 81.93 332 60 0 23 1018 1 332 6e-164 559
b
diff -r 45ba7c750bc8 -r 6ef523b390e0 test-data/blastx_rhodopsin_vs_four_human_converted_ext.tabular
--- a/test-data/blastx_rhodopsin_vs_four_human_converted_ext.tabular Thu Sep 20 10:12:43 2012 -0400
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
b
@@ -1,10 +0,0 @@
-gi|57163782|ref|NM_001009242.1| sp|P08100|OPSD_HUMAN 96.55 348 12 0 1 1044 1 348 0.0 662 sp|P08100|OPSD_HUMAN 1707 336 343 0 98.56 1 0 MNGTEGPNFYVPFSNKTGVVRSPFEYPQYYLAEPWQFSMLAAYMFLLIVLGFPINFLTLYVTVQHKKLRTPLNYILLNLAVADLFMVFGGFTTTLYTSLHGYFVFGPTGCNLEGFFATLGGEIALWSLVVLAIERYVVVCKPMSNFRFGENHAIMGVAFTWVMALACAAPPLVGWSRYIPEGMQCSCGIDYYTLKPEVNNESFVIYMFVVHFTIPMIVIFFCYGQLVFTVXXXXXXXXXXXXXXXXXKEVTRMVIIMVIAFLICWVPYASVAFYIFTHQGSNFGPIFMTLPAFFAKSSSIYNPVIYIMMNKQFRNCMLTTLCCGKNPLGDDEASTTGSKTETSQVAPA MNGTEGPNFYVPFSNATGVVRSPFEYPQYYLAEPWQFSMLAAYMFLLIVLGFPINFLTLYVTVQHKKLRTPLNYILLNLAVADLFMVLGGFTSTLYTSLHGYFVFGPTGCNLEGFFATLGGEIALWSLVVLAIERYVVVCKPMSNFRFGENHAIMGVAFTWVMALACAAPPLAGWSRYIPEGLQCSCGIDYYTLKPEVNNESFVIYMFVVHFTIPMIIIFFCYGQLVFTVKEAAAQQQESATTQKAEKEVTRMVIIMVIAFLICWVPYASVAFYIFTHQGSNFGPIFMTIPAFFAKSAAIYNPVIYIMMNKQFRNCMLTTICCGKNPLGDDEASATVSKTETSQVAPA 1047 348
-gi|2734705|gb|U59921.1|BBU59921 sp|P08100|OPSD_HUMAN 85.24 332 49 0 42 1037 1 332 1e-168 575 sp|P08100|OPSD_HUMAN 1481 283 315 0 94.88 3 0 MNGTEGPNFYIPMSNKTGVVRSPFEYPQYYLAEPWQYSILCAYMFLLILLGFPINFMTLYVTIQHKKLRTPLNYILLNLAFANHFMVLCGFTVTMYSSMNGYFILGATGCYVEGFFATLGGEIALWSLVVLAIERYVVVCKPMSNFRFSENHAVMGVAFTWIMALSCAVPPLLGWSRYIPEGMQCSCGVDYYTLKPEVNNESFVIYMFVVHFTIPLIIIFFCYGRLVCTVXXXXXXXXXXXXXXXXXKEVTRMVIIMVVFFLICWVPYASVAFFIFSNQGSEFGPIFMTVPAFFAKSSSIYNPVIYIMLNKQFRNCMITTLCCGKNPFGEDD MNGTEGPNFYVPFSNATGVVRSPFEYPQYYLAEPWQFSMLAAYMFLLIVLGFPINFLTLYVTVQHKKLRTPLNYILLNLAVADLFMVLGGFTSTLYTSLHGYFVFGPTGCNLEGFFATLGGEIALWSLVVLAIERYVVVCKPMSNFRFGENHAIMGVAFTWVMALACAAPPLAGWSRYIPEGLQCSCGIDYYTLKPEVNNESFVIYMFVVHFTIPMIIIFFCYGQLVFTVKEAAAQQQESATTQKAEKEVTRMVIIMVIAFLICWVPYASVAFYIFTHQGSNFGPIFMTIPAFFAKSAAIYNPVIYIMMNKQFRNCMLTTICCGKNPLGDDE 1574 348
-gi|283855845|gb|GQ290303.1| sp|P08100|OPSD_HUMAN 96.40 111 4 0 1 333 11 121 2e-62 224 sp|P08100|OPSD_HUMAN 570 107 109 0 98.20 1 0 VPFSNKTGVVRSPFEHPQYYLAEPWQFSMLAAYMFLLIVLGFPINFLTLYVTVQHKKLRTPLNYILLNLAVADLFMVFGGFTTTLYTSLHGYFVFGPTGCNLEGFFATLGG VPFSNATGVVRSPFEYPQYYLAEPWQFSMLAAYMFLLIVLGFPINFLTLYVTVQHKKLRTPLNYILLNLAVADLFMVLGGFTSTLYTSLHGYFVFGPTGCNLEGFFATLGG 4301 348
-gi|283855845|gb|GQ290303.1| sp|P08100|OPSD_HUMAN 92.31 65 5 0 3174 3368 248 312 5e-34 129 sp|P08100|OPSD_HUMAN 324 60 64 0 98.46 3 0 KEVTRMVIIMVIAFLICWLPYAGVAFYIFTHQGSNFGPIFMTLPAFFAKSSSIYNPVIYIMMNKQ KEVTRMVIIMVIAFLICWVPYASVAFYIFTHQGSNFGPIFMTIPAFFAKSAAIYNPVIYIMMNKQ 4301 348
-gi|283855845|gb|GQ290303.1| sp|P08100|OPSD_HUMAN 96.43 56 2 0 2855 3022 177 232 2e-31 120 sp|P08100|OPSD_HUMAN 302 54 56 0 100.00 2 0 RYIPEGMQCSCGIDYYTLKPEVNNESFVIYMFVVHFTIPMIVIFFCYGQLVFTVKE RYIPEGLQCSCGIDYYTLKPEVNNESFVIYMFVVHFTIPMIIIFFCYGQLVFTVKE 4301 348
-gi|283855845|gb|GQ290303.1| sp|P08100|OPSD_HUMAN 93.22 59 4 0 1404 1580 119 177 1e-30 118 sp|P08100|OPSD_HUMAN 295 55 56 0 94.92 3 0 LAGEIALWSLVVLAIERYVVVCKPMSNFRFGENHAIMGLALTWVMALACAAPPLVGWSR LGGEIALWSLVVLAIERYVVVCKPMSNFRFGENHAIMGVAFTWVMALACAAPPLAGWSR 4301 348
-gi|283855845|gb|GQ290303.1| sp|P08100|OPSD_HUMAN 92.00 25 2 0 4222 4296 312 336 6e-12 56.2 sp|P08100|OPSD_HUMAN 134 23 24 0 96.00 1 0 QFRNCMLTTLCCGKNPLGDDEASTT QFRNCMLTTICCGKNPLGDDEASAT 4301 348
-gi|283855822|gb|GQ290312.1| sp|P08100|OPSD_HUMAN 95.09 326 16 0 1 978 11 336 2e-180 613 sp|P08100|OPSD_HUMAN 1582 310 322 0 98.77 1 0 VPFSNKTGVVRSPFEYPQYYLAEPWQFSMLAAYMFLLIVLGFPINFLTLYVTVQHKKLRTPLNYILLNLAVANLFMVFGGFTTTLYTSMHGYFVFGATGCNLEGFFATLGGEIALWSLVVLAIERYVVVCKPMSNFRFGENHAIMGLAFTWVMALACAAPPLAGWSRYIPEGMQCSCGIDYYTLKPEVNNESFVIYMFVVHFTIPMIVIFFCYGQLVFTVXXXXXXXXXXXXXXXXXKEVTRMVIIMVVAFLICWLPYASVAFYIFTHQGSNFGPVFMTIPAFFAKSSSIYNPVIYIMMNKQFRNCMLTTLCCGKNPLGDDEASTT VPFSNATGVVRSPFEYPQYYLAEPWQFSMLAAYMFLLIVLGFPINFLTLYVTVQHKKLRTPLNYILLNLAVADLFMVLGGFTSTLYTSLHGYFVFGPTGCNLEGFFATLGGEIALWSLVVLAIERYVVVCKPMSNFRFGENHAIMGVAFTWVMALACAAPPLAGWSRYIPEGLQCSCGIDYYTLKPEVNNESFVIYMFVVHFTIPMIIIFFCYGQLVFTVKEAAAQQQESATTQKAEKEVTRMVIIMVIAFLICWVPYASVAFYIFTHQGSNFGPIFMTIPAFFAKSAAIYNPVIYIMMNKQFRNCMLTTICCGKNPLGDDEASAT 983 348
-gi|18148870|dbj|AB062417.1| sp|P08100|OPSD_HUMAN 93.39 348 23 0 1 1044 1 348 0.0 641 sp|P08100|OPSD_HUMAN 1654 325 337 0 96.84 1 0 MNGTEGPNFYVPFSNKTGVVRSPFEAPQYYLAEPWQFSMLAAYMFLLIMLGFPINFLTLYVTVQHKKLRTPLNYILLNLAVADLFMVFGGFTTTLYTSLHGYFVFGPTGCNLEGFFATLGGEIALWSLVVLAIERYVVVCKPMSNFRFGENHAIMGVAFTWVMALACAAPPLVGWSRYIPEGMQCSCGIDYYTPHEETNNESFVIYMFVVHFIIPLIVIFFCYGQLVFTVXXXXXXXXXXXXXXXXXKEVTRMVIIMVIAFLICWLPYAGVAFYIFTHQGSDFGPIFMTIPAFFAKTSAVYNPVIYIMMNKQFRNCMVTTLCCGKNPLGDDEASTTVSKTETSQVAPA MNGTEGPNFYVPFSNATGVVRSPFEYPQYYLAEPWQFSMLAAYMFLLIVLGFPINFLTLYVTVQHKKLRTPLNYILLNLAVADLFMVLGGFTSTLYTSLHGYFVFGPTGCNLEGFFATLGGEIALWSLVVLAIERYVVVCKPMSNFRFGENHAIMGVAFTWVMALACAAPPLAGWSRYIPEGLQCSCGIDYYTLKPEVNNESFVIYMFVVHFTIPMIIIFFCYGQLVFTVKEAAAQQQESATTQKAEKEVTRMVIIMVIAFLICWVPYASVAFYIFTHQGSNFGPIFMTIPAFFAKSAAIYNPVIYIMMNKQFRNCMLTTICCGKNPLGDDEASATVSKTETSQVAPA 1047 348
-gi|12583664|dbj|AB043817.1| sp|P08100|OPSD_HUMAN 81.93 332 60 0 23 1018 1 332 6e-164 559 sp|P08100|OPSD_HUMAN 1440 272 307 0 92.47 2 0 MNGTEGPNFYIPMSNATGVVRSPFEYPQYYLAEPWAFSALSAYMFFLIIAGFPINFLTLYVTIEHKKLRTPLNYILLNLAVADLFMVFGGFTTTMYTSMHGYFVFGPTGCNIEGFFATLGGEIALWCLVVLAIERWMVVCKPVTNFRFGESHAIMGVMVTWTMALACALPPLFGWSRYIPEGLQCSCGIDYYTRAPGINNESFVIYMFTCHFSIPLAVISFCYGRLVCTVKXXXXXXXXXXXXXXXXXXVTRMVVIMVISFLVCWVPYASVAWYIFTHQGSTFGPIFMTIPSFFAKSSALYNPMIYICMNKQFRHCMITTLCCGKNPFEEED MNGTEGPNFYVPFSNATGVVRSPFEYPQYYLAEPWQFSMLAAYMFLLIVLGFPINFLTLYVTVQHKKLRTPLNYILLNLAVADLFMVLGGFTSTLYTSLHGYFVFGPTGCNLEGFFATLGGEIALWSLVVLAIERYVVVCKPMSNFRFGENHAIMGVAFTWVMALACAAPPLAGWSRYIPEGLQCSCGIDYYTLKPEVNNESFVIYMFVVHFTIPMIIIFFCYGQLVFTVKEAAAQQQESATTQKAEKEVTRMVIIMVIAFLICWVPYASVAFYIFTHQGSNFGPIFMTIPAFFAKSAAIYNPVIYIMMNKQFRNCMLTTICCGKNPLGDDE 1344 348
b
diff -r 45ba7c750bc8 -r 6ef523b390e0 test-data/blastx_rhodopsin_vs_four_human_ext.tabular
--- a/test-data/blastx_rhodopsin_vs_four_human_ext.tabular Thu Sep 20 10:12:43 2012 -0400
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
b
@@ -1,10 +0,0 @@
-gi|57163782|ref|NM_001009242.1| sp|P08100|OPSD_HUMAN 96.55 348 12 0 1 1044 1 348 0.0  662 sp|P08100|OPSD_HUMAN 1707 336 343 0 98.56 1 0 MNGTEGPNFYVPFSNKTGVVRSPFEYPQYYLAEPWQFSMLAAYMFLLIVLGFPINFLTLYVTVQHKKLRTPLNYILLNLAVADLFMVFGGFTTTLYTSLHGYFVFGPTGCNLEGFFATLGGEIALWSLVVLAIERYVVVCKPMSNFRFGENHAIMGVAFTWVMALACAAPPLVGWSRYIPEGMQCSCGIDYYTLKPEVNNESFVIYMFVVHFTIPMIVIFFCYGQLVFTVKEAAAQQQESATTQKAEKEVTRMVIIMVIAFLICWVPYASVAFYIFTHQGSNFGPIFMTLPAFFAKSSSIYNPVIYIMMNKQFRNCMLTTLCCGKNPLGDDEASTTGSKTETSQVAPA MNGTEGPNFYVPFSNATGVVRSPFEYPQYYLAEPWQFSMLAAYMFLLIVLGFPINFLTLYVTVQHKKLRTPLNYILLNLAVADLFMVLGGFTSTLYTSLHGYFVFGPTGCNLEGFFATLGGEIALWSLVVLAIERYVVVCKPMSNFRFGENHAIMGVAFTWVMALACAAPPLAGWSRYIPEGLQCSCGIDYYTLKPEVNNESFVIYMFVVHFTIPMIIIFFCYGQLVFTVKEAAAQQQESATTQKAEKEVTRMVIIMVIAFLICWVPYASVAFYIFTHQGSNFGPIFMTIPAFFAKSAAIYNPVIYIMMNKQFRNCMLTTICCGKNPLGDDEASATVSKTETSQVAPA 1047 348
-gi|2734705|gb|U59921.1|BBU59921 sp|P08100|OPSD_HUMAN 85.24 332 49 0 42 1037 1 332 1e-168  575 sp|P08100|OPSD_HUMAN 1481 283 315 0 94.88 3 0 MNGTEGPNFYIPMSNKTGVVRSPFEYPQYYLAEPWQYSILCAYMFLLILLGFPINFMTLYVTIQHKKLRTPLNYILLNLAFANHFMVLCGFTVTMYSSMNGYFILGATGCYVEGFFATLGGEIALWSLVVLAIERYVVVCKPMSNFRFSENHAVMGVAFTWIMALSCAVPPLLGWSRYIPEGMQCSCGVDYYTLKPEVNNESFVIYMFVVHFTIPLIIIFFCYGRLVCTVKEAAAQQQESATTQKAEKEVTRMVIIMVVFFLICWVPYASVAFFIFSNQGSEFGPIFMTVPAFFAKSSSIYNPVIYIMLNKQFRNCMITTLCCGKNPFGEDD MNGTEGPNFYVPFSNATGVVRSPFEYPQYYLAEPWQFSMLAAYMFLLIVLGFPINFLTLYVTVQHKKLRTPLNYILLNLAVADLFMVLGGFTSTLYTSLHGYFVFGPTGCNLEGFFATLGGEIALWSLVVLAIERYVVVCKPMSNFRFGENHAIMGVAFTWVMALACAAPPLAGWSRYIPEGLQCSCGIDYYTLKPEVNNESFVIYMFVVHFTIPMIIIFFCYGQLVFTVKEAAAQQQESATTQKAEKEVTRMVIIMVIAFLICWVPYASVAFYIFTHQGSNFGPIFMTIPAFFAKSAAIYNPVIYIMMNKQFRNCMLTTICCGKNPLGDDE 1574 348
-gi|283855845|gb|GQ290303.1| sp|P08100|OPSD_HUMAN 96.40 111 4 0 1 333 11 121 2e-62  224 sp|P08100|OPSD_HUMAN 570 107 109 0 98.20 1 0 VPFSNKTGVVRSPFEHPQYYLAEPWQFSMLAAYMFLLIVLGFPINFLTLYVTVQHKKLRTPLNYILLNLAVADLFMVFGGFTTTLYTSLHGYFVFGPTGCNLEGFFATLGG VPFSNATGVVRSPFEYPQYYLAEPWQFSMLAAYMFLLIVLGFPINFLTLYVTVQHKKLRTPLNYILLNLAVADLFMVLGGFTSTLYTSLHGYFVFGPTGCNLEGFFATLGG 4301 348
-gi|283855845|gb|GQ290303.1| sp|P08100|OPSD_HUMAN 92.31 65 5 0 3174 3368 248 312 5e-34  129 sp|P08100|OPSD_HUMAN 324 60 64 0 98.46 3 0 KEVTRMVIIMVIAFLICWLPYAGVAFYIFTHQGSNFGPIFMTLPAFFAKSSSIYNPVIYIMMNKQ KEVTRMVIIMVIAFLICWVPYASVAFYIFTHQGSNFGPIFMTIPAFFAKSAAIYNPVIYIMMNKQ 4301 348
-gi|283855845|gb|GQ290303.1| sp|P08100|OPSD_HUMAN 96.43 56 2 0 2855 3022 177 232 2e-31  120 sp|P08100|OPSD_HUMAN 302 54 56 0 100.00 2 0 RYIPEGMQCSCGIDYYTLKPEVNNESFVIYMFVVHFTIPMIVIFFCYGQLVFTVKE RYIPEGLQCSCGIDYYTLKPEVNNESFVIYMFVVHFTIPMIIIFFCYGQLVFTVKE 4301 348
-gi|283855845|gb|GQ290303.1| sp|P08100|OPSD_HUMAN 93.22 59 4 0 1404 1580 119 177 1e-30  118 sp|P08100|OPSD_HUMAN 295 55 56 0 94.92 3 0 LAGEIALWSLVVLAIERYVVVCKPMSNFRFGENHAIMGLALTWVMALACAAPPLVGWSR LGGEIALWSLVVLAIERYVVVCKPMSNFRFGENHAIMGVAFTWVMALACAAPPLAGWSR 4301 348
-gi|283855845|gb|GQ290303.1| sp|P08100|OPSD_HUMAN 92.00 25 2 0 4222 4296 312 336 6e-12 56.2 sp|P08100|OPSD_HUMAN 134 23 24 0 96.00 1 0 QFRNCMLTTLCCGKNPLGDDEASTT QFRNCMLTTICCGKNPLGDDEASAT 4301 348
-gi|283855822|gb|GQ290312.1| sp|P08100|OPSD_HUMAN 95.09 326 16 0 1 978 11 336 2e-180  613 sp|P08100|OPSD_HUMAN 1582 310 322 0 98.77 1 0 VPFSNKTGVVRSPFEYPQYYLAEPWQFSMLAAYMFLLIVLGFPINFLTLYVTVQHKKLRTPLNYILLNLAVANLFMVFGGFTTTLYTSMHGYFVFGATGCNLEGFFATLGGEIALWSLVVLAIERYVVVCKPMSNFRFGENHAIMGLAFTWVMALACAAPPLAGWSRYIPEGMQCSCGIDYYTLKPEVNNESFVIYMFVVHFTIPMIVIFFCYGQLVFTVKEAAAQQQESATTQKAEKEVTRMVIIMVVAFLICWLPYASVAFYIFTHQGSNFGPVFMTIPAFFAKSSSIYNPVIYIMMNKQFRNCMLTTLCCGKNPLGDDEASTT VPFSNATGVVRSPFEYPQYYLAEPWQFSMLAAYMFLLIVLGFPINFLTLYVTVQHKKLRTPLNYILLNLAVADLFMVLGGFTSTLYTSLHGYFVFGPTGCNLEGFFATLGGEIALWSLVVLAIERYVVVCKPMSNFRFGENHAIMGVAFTWVMALACAAPPLAGWSRYIPEGLQCSCGIDYYTLKPEVNNESFVIYMFVVHFTIPMIIIFFCYGQLVFTVKEAAAQQQESATTQKAEKEVTRMVIIMVIAFLICWVPYASVAFYIFTHQGSNFGPIFMTIPAFFAKSAAIYNPVIYIMMNKQFRNCMLTTICCGKNPLGDDEASAT 983 348
-gi|18148870|dbj|AB062417.1| sp|P08100|OPSD_HUMAN 93.39 348 23 0 1 1044 1 348 0.0  641 sp|P08100|OPSD_HUMAN 1654 325 337 0 96.84 1 0 MNGTEGPNFYVPFSNKTGVVRSPFEAPQYYLAEPWQFSMLAAYMFLLIMLGFPINFLTLYVTVQHKKLRTPLNYILLNLAVADLFMVFGGFTTTLYTSLHGYFVFGPTGCNLEGFFATLGGEIALWSLVVLAIERYVVVCKPMSNFRFGENHAIMGVAFTWVMALACAAPPLVGWSRYIPEGMQCSCGIDYYTPHEETNNESFVIYMFVVHFIIPLIVIFFCYGQLVFTVKEAAAQQQESATTQKAEKEVTRMVIIMVIAFLICWLPYAGVAFYIFTHQGSDFGPIFMTIPAFFAKTSAVYNPVIYIMMNKQFRNCMVTTLCCGKNPLGDDEASTTVSKTETSQVAPA MNGTEGPNFYVPFSNATGVVRSPFEYPQYYLAEPWQFSMLAAYMFLLIVLGFPINFLTLYVTVQHKKLRTPLNYILLNLAVADLFMVLGGFTSTLYTSLHGYFVFGPTGCNLEGFFATLGGEIALWSLVVLAIERYVVVCKPMSNFRFGENHAIMGVAFTWVMALACAAPPLAGWSRYIPEGLQCSCGIDYYTLKPEVNNESFVIYMFVVHFTIPMIIIFFCYGQLVFTVKEAAAQQQESATTQKAEKEVTRMVIIMVIAFLICWVPYASVAFYIFTHQGSNFGPIFMTIPAFFAKSAAIYNPVIYIMMNKQFRNCMLTTICCGKNPLGDDEASATVSKTETSQVAPA 1047 348
-gi|12583664|dbj|AB043817.1| sp|P08100|OPSD_HUMAN 81.93 332 60 0 23 1018 1 332 6e-164  559 sp|P08100|OPSD_HUMAN 1440 272 307 0 92.47 2 0 MNGTEGPNFYIPMSNATGVVRSPFEYPQYYLAEPWAFSALSAYMFFLIIAGFPINFLTLYVTIEHKKLRTPLNYILLNLAVADLFMVFGGFTTTMYTSMHGYFVFGPTGCNIEGFFATLGGEIALWCLVVLAIERWMVVCKPVTNFRFGESHAIMGVMVTWTMALACALPPLFGWSRYIPEGLQCSCGIDYYTRAPGINNESFVIYMFTCHFSIPLAVISFCYGRLVCTVKEAAAQQQESETTQRAEREVTRMVVIMVISFLVCWVPYASVAWYIFTHQGSTFGPIFMTIPSFFAKSSALYNPMIYICMNKQFRHCMITTLCCGKNPFEEED MNGTEGPNFYVPFSNATGVVRSPFEYPQYYLAEPWQFSMLAAYMFLLIVLGFPINFLTLYVTVQHKKLRTPLNYILLNLAVADLFMVLGGFTSTLYTSLHGYFVFGPTGCNLEGFFATLGGEIALWSLVVLAIERYVVVCKPMSNFRFGENHAIMGVAFTWVMALACAAPPLAGWSRYIPEGLQCSCGIDYYTLKPEVNNESFVIYMFVVHFTIPMIIIFFCYGQLVFTVKEAAAQQQESATTQKAEKEVTRMVIIMVIAFLICWVPYASVAFYIFTHQGSNFGPIFMTIPAFFAKSAAIYNPVIYIMMNKQFRNCMLTTICCGKNPLGDDE 1344 348
b
diff -r 45ba7c750bc8 -r 6ef523b390e0 test-data/blastx_sample.xml
--- a/test-data/blastx_sample.xml Thu Sep 20 10:12:43 2012 -0400
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
[
b'@@ -1,758 +0,0 @@\n-<?xml version="1.0"?>\n-<!DOCTYPE BlastOutput PUBLIC "-//NCBI//NCBI BlastOutput/EN" "NCBI_BlastOutput.dtd">\n-<BlastOutput>\n-  <BlastOutput_program>blastx</BlastOutput_program>\n-  <BlastOutput_version>BLASTX 2.2.24+</BlastOutput_version>\n-  <BlastOutput_reference>Stephen F. Altschul, Thomas L. Madden, Alejandro A. Sch&amp;auml;ffer, Jinghui Zhang, Zheng Zhang, Webb Miller, and David J. Lipman (1997), &quot;Gapped BLAST and PSI-BLAST: a new generation of protein database search programs&quot;, Nucleic Acids Res. 25:3389-3402.</BlastOutput_reference>\n-  <BlastOutput_db>/share/BlastDB/nr</BlastOutput_db>\n-  <BlastOutput_query-ID>Query_1</BlastOutput_query-ID>\n-  <BlastOutput_query-def>phage_suis</BlastOutput_query-def>\n-  <BlastOutput_query-len>1890</BlastOutput_query-len>\n-  <BlastOutput_param>\n-    <Parameters>\n-      <Parameters_matrix>BLOSUM62</Parameters_matrix>\n-      <Parameters_expect>0.001</Parameters_expect>\n-      <Parameters_gap-open>11</Parameters_gap-open>\n-      <Parameters_gap-extend>1</Parameters_gap-extend>\n-      <Parameters_filter>L;</Parameters_filter>\n-    </Parameters>\n-  </BlastOutput_param>\n-  <BlastOutput_iterations>\n-    <Iteration>\n-      <Iteration_iter-num>1</Iteration_iter-num>\n-      <Iteration_query-ID>Query_1</Iteration_query-ID>\n-      <Iteration_query-def>phage_suis</Iteration_query-def>\n-      <Iteration_query-len>1890</Iteration_query-len>\n-      <Iteration_hits>\n-        <Hit>\n-          <Hit_num>1</Hit_num>\n-          <Hit_id>gi|119953746|ref|YP_950551.1|</Hit_id>\n-          <Hit_def>tail tape measure protein [Streptococcus phage SMP] &gt;gi|118430558|gb|ABK91882.1| tail tape measure protein [Streptococcus phage SMP]</Hit_def>\n-          <Hit_accession>YP_950551</Hit_accession>\n-          <Hit_len>659</Hit_len>\n-          <Hit_hsps>\n-            <Hsp>\n-              <Hsp_num>1</Hsp_num>\n-              <Hsp_bit-score>988.407949172964</Hsp_bit-score>\n-              
<Hsp_score>2554</Hsp_score>\n-              <Hsp_evalue>0</Hsp_evalue>\n-              <Hsp_query-from>336</Hsp_query-from>\n-              <Hsp_query-to>1889</Hsp_query-to>\n-              <Hsp_hit-from>25</Hsp_hit-from>\n-              <Hsp_hit-to>542</Hsp_hit-to>\n-              <Hsp_query-frame>3</Hsp_query-frame>\n-              <Hsp_hit-frame>0</Hsp_hit-frame>\n-              <Hsp_identity>518</Hsp_identity>\n-              <Hsp_positive>518</Hsp_positive>\n-              <Hsp_gaps>0</Hsp_gaps>\n-              <Hsp_align-len>518</Hsp_align-len>\n-              <Hsp_qseq>NWFHLLNSGGSALSVMFAKLVGIIAGISAPIWXXXXXXXXXXXXXXXXYNTNEEFRTKVQAAWEAIKSAISTAVEAVVSFVMDLWGQMVAWWNENQELIRQTAETVWNAIRTVVETVMTALIPIVQTAWDLILAVVTTVLNVIKTVVDTGLKVVLGIIKAVMQMINGDWSGAWETLKGVAGTIWEGIKSLVQVAIDGLVQIFQTGLAFLKSIWDTVWGTIMAVVGPIWDWIKTTVSNAITAVWEIIQNIMTSIQTTWDTVWNAISTVASNIWTAISTTVMSVLTTIWGYIQTYLELIKTVWSAAWEIIKAVFAAILLTIVGLVTGNFDLIKQAISNAWEIIKTKTSEIWNAITTFLSGIWEGIKTAASTAWEWIKTTISNVMTTIKSNIETAWNNIKTSISNALNNIKSAAENAWNNIKSAISTAIENIKSTVSNGWNNLVSTVTNAGPRIVSAVRTGFDNAVNAARNFISNAISVGGDLINGFVEGVKGAAGRLIDAVGGAVSGAIDWAKGLLGIKS</Hsp_qseq>\n-              <Hsp_hseq>NWFHLLNSGGSALSVMFAKLVGIIAGISAPIWAVIGVIAALVAGFVLLYNTNEEFRTKVQAAWEAIKSAISTAVEAVVSFVMDLWGQMVAWWNENQELIRQTAETVWNAIRTVVETVMTALIPIVQTAWDLILAVVTTVLNVIKTVVDTGLKVVLGIIKAVMQMINGDWSGAWETLKGVAGTIWEGIKSLVQVAIDGLVQIFQTGLAFLKSIWDTVWGTIMAVVGPIWDWIKTTVSNAITAVWEIIQNIMTSIQTTWDTVWNAISTVASNIWTAISTTVMSVLTTIWGYIQTYLELIKTVWSAAWEIIKAVFAAILLTIVGLVTGNFDLIKQAISNAWEIIKTKTSEIWNAITTFLSGIWEGIKTAASTAWEWIKTTISNVMTTIKSNIETAWNNIKTSISNALNNIKSAAENAWNNIKSAISTAIENIKSTVSNGWNNLVSTVTNAGPRIVSAVRTGFDNAVNAARNFISNAISVGGDLINGFVEGVKGAAGRLIDAVGGAVSGAIDWAKGLLGIKS</Hsp_hseq>\n-              
<Hsp_midline>NWFHLLNSGGSALSVMFAKLVGIIAGISAPIWAVIGVIAALVAGFVLLYNTNEEFRTKVQAAWEAIKSAISTAVEAVVSFVMDLWGQMVAWWNENQELIRQTAETVWNAIRTVVETVMTALIPIVQTAWDLILAVVTTVLNVIKTVVDTGLKVVLGIIKAVMQMINGDWSGAWETLKGVAGTIWEGIKSLVQVAIDGLVQIFQTGLAFLKSIWDTVWGTIMAVVGPIWDWIKTTVSNAITAVWEIIQNIMTSIQTTWDTVWNAISTVASNIWTAISTTVMSVLTTIWGYIQTYLELIKTVWSAAWEIIKAVFAAILLTIVGLVTGNFDLIKQAISNAWEIIKTKTSEIWNAITTFLSGIWEGIKT'..b'\n-              <Hsp_qseq>AIRTVVETVMTALIPIVQTAWDLILAVVTTVLNVIKTVVDTGLKVVLGIIKAVMQMINGDWSGAWETLKGVAGTIWEGIKSLVQVA------IDGLVQIFQTGLAFLKSIWDTVWGTIMAVVGPIWDWIKTTVSNAITAVWEIIQNIMTSIQTTWDTVWNAISTVASNIWTAISTTVMSVLTTIWGYIQTYLELIKTVWSAAWEIIKAVFAAILLTIVGLVTGNFDLIKQAISNAWEIIKTKTSEIWNAITTFLSGIWEGIKTAASTAWEWIKTTISNVMTTIKSNIETAWNNIKTSISNALNNIKSAAENAWNNIKSAISTAIENIKSTVSNGWNNLVSTVTNAGPRIVSAVRTGFDNAVNAARNFISNAISVGGDLINGFVEGVKGAAGRLIDAVGGAVSGAIDW-AKG</Hsp_qseq>\n-              <Hsp_hseq>AMAEVGGVLAEALAPVLELLAQLLQAVANWFSN-LPGPIQTFIVIMGGLITVVGLLLPGLLA-----LQAAAVAMGTTIGGLVVAAAPIVGTVLGIIAVITLLVVWIQELWQNNEGFRTAVI-EIWNAIYAFISVIIQEISTFIMTIWGTLTTWWTENQALIQAAVETVWNAISTVIQTVMSLIGPYLEAAWANIQLIITTAWEIIKTVVETAITVVLGIIKAIMQAITGDWSGAWETIKGVLQRVWQAIQQIVTTILSAIGQFISNTWNGIKNTFSNILSAISGIVSSIWNTIKSVISSVISSIVSFVSSGWSGIQQTISSILSGISSTVSSVWNGIKNSISNA----INGAKNVVSSAINAIKNLFNFKISWPHIPLPHF--SVSGSANPLDWLKGGLPKISIAWYAKG</Hsp_hseq>\n-              <Hsp_midline>A+  V   +  AL P+++    L+ AV     N +   + T + ++ G+I  V  ++ G  +     L+  A  +   I  LV  A      + G++ +    + +++ +W    G   AV+  IW+ I   +S  I  +   I  I  ++ T W      I      +W AIST + +V++ I  Y++     I+ + + AWEIIK V    +  ++G++      I    S AWE IK     +W AI   ++ I   I    S  W  IK T SN+++ I   + + WN IK+ IS+ +++I S   + W+ I+  IS+ +  I STVS+ WN + ++++NA    ++  +    +A+NA +N  +  IS     +  F   V G+A  L    GG    +I W AKG</Hsp_midline>\n-            </Hsp>\n-            <Hsp>\n-              <Hsp_num>3</Hsp_num>\n-              <Hsp_bit-score>121.708903358919</Hsp_bit-score>\n-              <Hsp_score>304</Hsp_score>\n-              <Hsp_evalue>2.99798279087674e-25</Hsp_evalue>\n-              
<Hsp_query-from>543</Hsp_query-from>\n-              <Hsp_query-to>1673</Hsp_query-to>\n-              <Hsp_hit-from>637</Hsp_hit-from>\n-              <Hsp_hit-to>1004</Hsp_hit-to>\n-              <Hsp_query-frame>3</Hsp_query-frame>\n-              <Hsp_hit-frame>0</Hsp_hit-frame>\n-              <Hsp_identity>89</Hsp_identity>\n-              <Hsp_positive>168</Hsp_positive>\n-              <Hsp_gaps>29</Hsp_gaps>\n-              <Hsp_align-len>387</Hsp_align-len>\n-              <Hsp_qseq>ISTAVEAVVSFVMDLWGQMVAWWNENQELIRQTAETVWNAIRTVVETVMTALIPIVQTAWDLILAVVTTVLNVIKTVVDTGLKVVLGIIKAVMQMINGDWSGAWETLKGVAGTIWEGIKSLVQVAIDGLVQIFQTGLAFLKSIWDTVWGTIMAVVGPIWDWIKTTVSNAITAVWEIIQNIMTSIQTTWDTVWNAISTVASNIWTAISTTVMSVLTTIWGYIQTYLELIKTVWSAAWEIIKAVFAAILLTIVGLVTGNFDLIKQAISNAWEIIKTKTSEIWNAITTFLSG--IWEGIK------TAASTAWEWIKTTISNVMTT--IKSNIETAWNNIKTSISNALNNIKSAAENAWNNIKSAISTAIENIKSTVSNGWNNLVSTVTN</Hsp_qseq>\n-              <Hsp_hseq>IIAVITLLVVWIQELW--------QNNEGFRTAVIEIWNAIYAFISVIIQEISTFIMTIWGTLTTWWTENQALIQAAVETVWNAISTVIQTVMSLIGPYLEAAWANIQLIITTAWEIIKTVVETAITVVLGIIKAIMQAITGDWSGAWETIKGVLQRVWQAIQQIVTTILSAIGQFISNTWNGIKNTFSNILSAISGIVSSIWNTIKSVISSVISSIVSFV-----------SSGWSGIQQTISSILSGISSTVSSVWNGIKNSISNAINGAKNVVSSAINAIKNLFNFKISWPHIPLPHFSVSGSANPLDWLKGGLPKISIAWYAKGGILTKPTAFGMNEKQLMVGGEAGKEAVLPLTKQNLAAIGEGIASTMGTGGNFINVSITD</Hsp_hseq>\n-              <Hsp_midline>I   +  +V ++ +LW        +N E  R     +WNAI   +  ++  +   + T W  +    T    +I+  V+T    +  +I+ VM +I      AW  ++ +  T WE IK++V+ AI  ++ I +  +  +   W   W TI  V+  +W  I+  V+  ++A+ + I N    I+ T+  + +AIS + S+IW  I + + SV+++I  ++           S+ W  I+   ++IL  I   V+  ++ IK +ISNA    K   S   NAI    +    W  I       + ++   +W+K  +  +      K  I T       +    +   ++  E      K  ++   E I ST+  G N +  ++T+</Hsp_midline>\n-            </Hsp>\n-          </Hit_hsps>\n-        </Hit>\n-      </Iteration_hits>\n-      <Iteration_stat>\n-        <Statistics>\n-          <Statistics_db-num>12310662</Statistics_db-num>\n-          
<Statistics_db-len>-87459526</Statistics_db-len>\n-          <Statistics_hsp-len>0</Statistics_hsp-len>\n-          <Statistics_eff-space>1174893963300</Statistics_eff-space>\n-          <Statistics_kappa>0.041</Statistics_kappa>\n-          <Statistics_lambda>0.267</Statistics_lambda>\n-          <Statistics_entropy>0.14</Statistics_entropy>\n-        </Statistics>\n-      </Iteration_stat>\n-    </Iteration>\n-  </BlastOutput_iterations>\n-</BlastOutput>\n'
b
diff -r 45ba7c750bc8 -r 6ef523b390e0 test-data/blastx_sample_converted.tabular
--- a/test-data/blastx_sample_converted.tabular Thu Sep 20 10:12:43 2012 -0400
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
b
@@ -1,33 +0,0 @@
-phage_suis gi|119953746|ref|YP_950551.1| 100.00 518 0 0 336 1889 25 542 0.0 988
-phage_suis gi|289551554|ref|YP_003472458.1| 32.95 516 280 6 342 1889 657 1106 6e-66 256
-phage_suis gi|223044325|ref|ZP_03614360.1| 30.22 546 327 7 393 1889 655 1193 1e-64 252
-phage_suis gi|223044325|ref|ZP_03614360.1| 19.88 508 328 9 384 1796 844 1309 6e-28 130
-phage_suis gi|268611153|ref|ZP_06144880.1| 28.64 639 371 11 78 1847 440 1042 1e-60 239
-phage_suis gi|268611153|ref|ZP_06144880.1| 23.36 441 286 7 543 1856 547 938 4e-31 141
-phage_suis gi|268611153|ref|ZP_06144880.1| 25.27 459 266 11 522 1844 722 1121 8e-31 140
-phage_suis gi|268611153|ref|ZP_06144880.1| 24.63 406 267 8 501 1694 770 1144 3e-23 115
-phage_suis gi|268611153|ref|ZP_06144880.1| 27.80 241 145 3 492 1148 811 1044 6e-16 90.9
-phage_suis gi|268611153|ref|ZP_06144880.1| 19.76 253 168 6 1158 1883 547 775 3e-04 52.0
-phage_suis gi|268610688|ref|ZP_06144415.1| 28.95 639 369 11 78 1847 440 1042 3e-59 234
-phage_suis gi|268610688|ref|ZP_06144415.1| 24.64 491 316 9 501 1856 770 1245 4e-39 167
-phage_suis gi|268610688|ref|ZP_06144415.1| 23.79 517 319 9 492 1832 811 1322 3e-37 161
-phage_suis gi|268610688|ref|ZP_06144415.1| 21.91 493 322 11 510 1859 905 1377 1e-25 123
-phage_suis gi|268610688|ref|ZP_06144415.1| 20.55 292 197 5 486 1343 1138 1400 4e-10 71.6
-phage_suis gi|268610688|ref|ZP_06144415.1| 21.41 341 225 10 894 1883 467 775 8e-05 53.9
-phage_suis gi|153811333|ref|ZP_01964001.1| 28.34 621 364 16 108 1847 493 1073 8e-55 219
-phage_suis gi|153811333|ref|ZP_01964001.1| 29.67 428 250 9 519 1760 709 1099 2e-47 195
-phage_suis gi|153811333|ref|ZP_01964001.1| 29.41 391 226 7 498 1640 746 1096 1e-39 169
-phage_suis gi|153811333|ref|ZP_01964001.1| 26.49 268 174 3 492 1256 854 1111 3e-24 118
-phage_suis gi|153811333|ref|ZP_01964001.1| 27.12 306 198 4 510 1385 816 1110 1e-23 116
-phage_suis gi|262113750|emb|CAR95417.1| 38.46 286 169 1 384 1241 540 818 2e-54 218
-phage_suis gi|262113750|emb|CAR95417.1| 29.68 411 271 7 657 1871 460 858 3e-40 171
-phage_suis gi|77411259|ref|ZP_00787609.1| 37.19 285 172 1 387 1241 628 905 2e-53 215
-phage_suis gi|77411259|ref|ZP_00787609.1| 28.01 407 281 6 660 1871 548 945 1e-40 172
-phage_suis gi|77411259|ref|ZP_00787609.1| 22.82 355 207 7 978 1877 540 882 9e-14 83.6
-phage_suis gi|76786754|ref|YP_329383.1| 36.84 285 173 1 387 1241 628 905 8e-53 213
-phage_suis gi|76786754|ref|YP_329383.1| 27.27 407 284 6 660 1871 548 945 3e-38 164
-phage_suis gi|76786754|ref|YP_329383.1| 24.73 283 194 2 543 1391 637 900 3e-23 115
-phage_suis gi|76786754|ref|YP_329383.1| 22.91 323 204 6 978 1847 540 850 2e-13 82.4
-phage_suis gi|50914476|ref|YP_060448.1| 35.86 290 179 1 372 1241 623 905 4e-51 207
-phage_suis gi|50914476|ref|YP_060448.1| 27.01 411 280 7 660 1871 548 945 2e-35 155
-phage_suis gi|50914476|ref|YP_060448.1| 23.00 387 269 5 543 1673 637 1004 3e-25 121
b
diff -r 45ba7c750bc8 -r 6ef523b390e0 test-data/four_human_proteins.fasta
--- a/test-data/four_human_proteins.fasta Thu Sep 20 10:12:43 2012 -0400
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
b
@@ -1,61 +0,0 @@
->sp|Q9BS26|ERP44_HUMAN Endoplasmic reticulum resident protein 44 OS=Homo sapiens GN=ERP44 PE=1 SV=1
-MHPAVFLSLPDLRCSLLLLVTWVFTPVTTEITSLDTENIDEILNNADVALVNFYADWCRF
-SQMLHPIFEEASDVIKEEFPNENQVVFARVDCDQHSDIAQRYRISKYPTLKLFRNGMMMK
-REYRGQRSVKALADYIRQQKSDPIQEIRDLAEITTLDRSKRNIIGYFEQKDSDNYRVFER
-VANILHDDCAFLSAFGDVSKPERYSGDNIIYKPPGHSAPDMVYLGAMTNFDVTYNWIQDK
-CVPLVREITFENGEELTEEGLPFLILFHMKEDTESLEIFQNEVARQLISEKGTINFLHAD
-CDKFRHPLLHIQKTPADCPVIAIDSFRHMYVFGDFKDVLIPGKLKQFVFDLHSGKLHREF
-HHGPDPTDTAPGEQAQDVASSPPESSFQKLAPSEYRYTLLRDRDEL
->sp|Q9NSY1|BMP2K_HUMAN BMP-2-inducible protein kinase OS=Homo sapiens GN=BMP2K PE=1 SV=2
-MKKFSRMPKSEGGSGGGAAGGGAGGAGAGAGCGSGGSSVGVRVFAVGRHQVTLEESLAEG
-GFSTVFLVRTHGGIRCALKRMYVNNMPDLNVCKREITIMKELSGHKNIVGYLDCAVNSIS
-DNVWEVLILMEYCRAGQVVNQMNKKLQTGFTEPEVLQIFCDTCEAVARLHQCKTPIIHRD
-LKVENILLNDGGNYVLCDFGSATNKFLNPQKDGVNVVEEEIKKYTTLSYRAPEMINLYGG
-KPITTKADIWALGCLLYKLCFFTLPFGESQVAICDGNFTIPDNSRYSRNIHCLIRFMLEP
-DPEHRPDIFQVSYFAFKFAKKDCPVSNINNSSIPSALPEPMTASEAAARKSQIKARITDT
-IGPTETSIAPRQRPKANSATTATPSVLTIQSSATPVKVLAPGEFGNHRPKGALRPGNGPE
-ILLGQGPPQQPPQQHRVLQQLQQGDWRLQQLHLQHRHPHQQQQQQQQQQQQQQQQQQQQQ
-QQQQQQHHHHHHHHLLQDAYMQQYQHATQQQQMLQQQFLMHSVYQPQPSASQYPTMMPQY
-QQAFFQQQMLAQHQPSQQQASPEYLTSPQEFSPALVSYTSSLPAQVGTIMDSSYSANRSV
-ADKEAIANFTNQKNISNPPDMSGWNPFGEDNFSKLTEEELLDREFDLLRSNRLEERASSD
-KNVDSLSAPHNHPPEDPFGSVPFISHSGSPEKKAEHSSINQENGTANPIKNGKTSPASKD
-QRTGKKTSVQGQVQKGNDESESDFESDPPSPKSSEEEEQDDEEVLQGEQGDFNDDDTEPE
-NLGHRPLLMDSEDEEEEEKHSSDSDYEQAKAKYSDMSSVYRDRSGSGPTQDLNTILLTSA
-QLSSDVAVETPKQEFDVFGAVPFFAVRAQQPQQEKNEKNLPQHRFPAAGLEQEEFDVFTK
-APFSKKVNVQECHAVGPEAHTIPGYPKSVDVFGSTPFQPFLTSTSKSESNEDLFGLVPFD
-EITGSQQQKVKQRSLQKLSSRQRRTKQDMSKSNGKRHHGTPTSTKKTLKPTYRTPERARR
-HKKVGRRDSQSSNEFLTISDSKENISVALTDGKDRGNVLQPEESLLDPFGAKPFHSPDLS
-WHPPHQGLSDIRADHNTVLPGRPRQNSLHGSFHSADVLKMDDFGAVPFTELVVQSITPHQ
-SQQSQPVELDPFGAAPFPSKQ
->sp|P06213|INSR_HUMAN Insulin receptor OS=Homo sapiens GN=INSR PE=1 SV=4
-MATGGRRGAAAAPLLVAVAALLLGAAGHLYPGEVCPGMDIRNNLTRLHELENCSVIEGHL
-QILLMFKTRPEDFRDLSFPKLIMITDYLLLFRVYGLESLKDLFPNLTVIRGSRLFFNYAL
-VIFEMVHLKELGLYNLMNITRGSVRIEKNNELCYLATIDWSRILDSVEDNYIVLNKDDNE
-ECGDICPGTAKGKTNCPATVINGQFVERCWTHSHCQKVCPTICKSHGCTAEGLCCHSECL
-GNCSQPDDPTKCVACRNFYLDGRCVETCPPPYYHFQDWRCVNFSFCQDLHHKCKNSRRQG
-CHQYVIHNNKCIPECPSGYTMNSSNLLCTPCLGPCPKVCHLLEGEKTIDSVTSAQELRGC
-TVINGSLIINIRGGNNLAAELEANLGLIEEISGYLKIRRSYALVSLSFFRKLRLIRGETL
-EIGNYSFYALDNQNLRQLWDWSKHNLTITQGKLFFHYNPKLCLSEIHKMEEVSGTKGRQE
-RNDIALKTNGDQASCENELLKFSYIRTSFDKILLRWEPYWPPDFRDLLGFMLFYKEAPYQ
-NVTEFDGQDACGSNSWTVVDIDPPLRSNDPKSQNHPGWLMRGLKPWTQYAIFVKTLVTFS
-DERRTYGAKSDIIYVQTDATNPSVPLDPISVSNSSSQIILKWKPPSDPNGNITHYLVFWE
-RQAEDSELFELDYCLKGLKLPSRTWSPPFESEDSQKHNQSEYEDSAGECCSCPKTDSQIL
-KELEESSFRKTFEDYLHNVVFVPRKTSSGTGAEDPRPSRKRRSLGDVGNVTVAVPTVAAF
-PNTSSTSVPTSPEEHRPFEKVVNKESLVISGLRHFTGYRIELQACNQDTPEERCSVAAYV
-SARTMPEAKADDIVGPVTHEIFENNVVHLMWQEPKEPNGLIVLYEVSYRRYGDEELHLCV
-SRKHFALERGCRLRGLSPGNYSVRIRATSLAGNGSWTEPTYFYVTDYLDVPSNIAKIIIG
-PLIFVFLFSVVIGSIYLFLRKRQPDGPLGPLYASSNPEYLSASDVFPCSVYVPDEWEVSR
-EKITLLRELGQGSFGMVYEGNARDIIKGEAETRVAVKTVNESASLRERIEFLNEASVMKG
-FTCHHVVRLLGVVSKGQPTLVVMELMAHGDLKSYLRSLRPEAENNPGRPPPTLQEMIQMA
-AEIADGMAYLNAKKFVHRDLAARNCMVAHDFTVKIGDFGMTRDIYETDYYRKGGKGLLPV
-RWMAPESLKDGVFTTSSDMWSFGVVLWEITSLAEQPYQGLSNEQVLKFVMDGGYLDQPDN
-CPERVTDLMRMCWQFNPKMRPTFLEIVNLLKDDLHPSFPEVSFFHSEENKAPESEELEME
-FEDMENVPLDRSSHCQREEAGGRDGGSSLGFKRSYEEHIPYTHMNGGKKNGRILTLPRSN
-PS
->sp|P08100|OPSD_HUMAN Rhodopsin OS=Homo sapiens GN=RHO PE=1 SV=1
-MNGTEGPNFYVPFSNATGVVRSPFEYPQYYLAEPWQFSMLAAYMFLLIVLGFPINFLTLY
-VTVQHKKLRTPLNYILLNLAVADLFMVLGGFTSTLYTSLHGYFVFGPTGCNLEGFFATLG
-GEIALWSLVVLAIERYVVVCKPMSNFRFGENHAIMGVAFTWVMALACAAPPLAGWSRYIP
-EGLQCSCGIDYYTLKPEVNNESFVIYMFVVHFTIPMIIIFFCYGQLVFTVKEAAAQQQES
-ATTQKAEKEVTRMVIIMVIAFLICWVPYASVAFYIFTHQGSNFGPIFMTIPAFFAKSAAI
-YNPVIYIMMNKQFRNCMLTTICCGKNPLGDDEASATVSKTETSQVAPA
b
diff -r 45ba7c750bc8 -r 6ef523b390e0 test-data/rhodopsin_nucs.fasta
--- a/test-data/rhodopsin_nucs.fasta Thu Sep 20 10:12:43 2012 -0400
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
b
b'@@ -1,161 +0,0 @@\n->gi|57163782|ref|NM_001009242.1| Felis catus rhodopsin (RHO), mRNA\n-ATGAACGGGACGGAGGGCCCGAACTTCTACGTGCCCTTCTCCAACAAAACGGGTGTGGTACGCAGCCCCT\n-TCGAGTACCCACAGTACTACCTGGCTGAGCCATGGCAGTTCTCCATGCTGGCCGCCTACATGTTCCTGCT\n-CATCGTGCTTGGCTTCCCCATCAACTTCCTCACGCTCTACGTCACGGTCCAGCACAAGAAGCTGCGCACG\n-CCTCTCAACTACATCCTGCTCAACCTGGCCGTGGCTGACCTCTTCATGGTCTTCGGTGGCTTCACCACCA\n-CCCTCTACACCTCTCTGCATGGATACTTTGTCTTTGGGCCCACAGGATGCAATTTGGAGGGCTTCTTTGC\n-CACACTGGGCGGTGAAATTGCCCTGTGGTCTTTGGTGGTCCTGGCCATTGAGCGGTACGTGGTGGTGTGT\n-AAGCCCATGAGCAACTTCCGCTTTGGGGAGAACCATGCCATAATGGGCGTCGCTTTCACCTGGGTCATGG\n-CACTGGCCTGCGCTGCACCCCCCCTCGTTGGTTGGTCCAGGTACATCCCTGAAGGCATGCAGTGTTCATG\n-CGGGATCGACTACTACACACTCAAGCCAGAAGTCAACAACGAGTCCTTTGTCATCTACATGTTCGTGGTC\n-CACTTCACCATCCCCATGATCGTCATCTTCTTTTGCTACGGGCAGCTTGTCTTCACAGTCAAGGAGGCGG\n-CAGCCCAGCAGCAGGAGTCAGCCACCACCCAGAAGGCTGAGAAGGAGGTCACTCGCATGGTCATCATCAT\n-GGTCATTGCTTTCCTGATCTGTTGGGTGCCCTACGCCAGCGTGGCATTCTACATCTTCACCCACCAGGGG\n-TCCAACTTTGGCCCCATCTTCATGACACTCCCGGCGTTCTTCGCAAAGTCCTCCTCCATCTACAACCCTG\n-TCATCTACATCATGATGAACAAGCAGTTCCGGAACTGCATGCTCACTACCCTCTGCTGTGGCAAGAACCC\n-ACTGGGTGATGACGAGGCTTCCACAACCGGTTCCAAGACGGAGACCAGCCAGGTGGCACCGGCCTAA\n-\n->gi|2734705|gb|U59921.1|BBU59921 Bufo bufo rhodopsin mRNA, complete 
cds\n-TCTTTCTAGTTTGGGGGGGGGGACTTTAAAGAGCCGCCAATATGAACGGAACAGAAGGCCCAAACTTTTA\n-CATACCCATGTCCAACAAGACTGGGGTGGTGCGAAGCCCCTTTGAATACCCTCAGTATTACCTGGCAGAG\n-CCATGGCAATATTCCATTCTGTGCGCGTACATGTTCCTGCTCATTCTACTTGGGTTCCCAATCAACTTCA\n-TGACCTTGTACGTCACCATCCAGCACAAGAAGCTCCGGACACCCTTAAACTATATCCTGCTGAATTTGGC\n-CTTTGCCAACCACTTCATGGTCCTGTGTGGATTCACGGTGACAATGTACTCCTCAATGAACGGATACTTC\n-ATCCTCGGAGCCACCGGTTGCTATGTTGAAGGCTTCTTCGCTACCCTTGGTGGTGAAATCGCCCTTTGGT\n-CCCTGGTGGTCTTGGCCATTGAACGATACGTGGTCGTCTGTAAGCCCATGAGCAACTTCCGATTTAGTGA\n-GAACCATGCCGTCATGGGCGTAGCGTTCACCTGGATAATGGCTTTGTCCTGTGCTGTTCCTCCACTCCTT\n-GGATGGTCCAGGTACATCCCCGAGGGCATGCAGTGCTCCTGCGGAGTCGACTACTACACCCTGAAGCCCG\n-AGGTCAACAACGAGTCCTTCGTCATCTACATGTTCGTCGTCCACTTCACCATCCCCCTGATTATCATTTT\n-CTTCTGCTATGGCCGCCTGGTGTGCACTGTGAAAGAGGCTGCAGCTCAACAGCAAGAGTCCGCCACCACC\n-CAGAAGGCCGAGAAAGAGGTGACCAGGATGGTGATCATCATGGTGGTCTTCTTCCTTATCTGTTGGGTCC\n-CCTACGCCTCTGTCGCTTTCTTCATCTTCAGCAATCAGGGCTCTGAGTTCGGCCCCATCTTCATGACCGT\n-CCCAGCTTTCTTTGCCAAGAGTTCTTCCATCTACAACCCCGTCATCTACATCATGCTCAACAAGCAGTTC\n-CGTAACTGCATGATCACCACCCTGTGCTGCGGCAAGAATCCCTTTGGAGAAGACGATGCCTCCTCTGCCG\n-CCACCTCCAAGACAGAGGCTTCTTCTGTTTCTTCCAGCCAGGTGTCTCCTGCATAAGACCTTCCACCAGG\n-CCTGTCTCAGGGTCCGCTGCCTCACACAGCTCCCACCGCCCCAACTCCGTCTCCTGCTCGCTAAGGCGGC\n-GAAGTTCCCCTTCCATTACATAAAACGTATCTGTTCAAGAAAGGCGACGACGAAGGAGAAGAAGAGGAGC\n-CCCCCCGAACCCCTTCGCTGCTGCTGAAAACGACTTGATTGCTTCTGCAACGCAACGGGGCCTTACGGCA\n-GCGAAGGGGTTGTCATCCGGACGCGCCAAGAATTCCTTCGAGACTGTAAATATCTTAAAGGAACCGTCCT\n-GCTAGTTACCGACGCCGCTCCTGTAGCCGCCGTTCCCCCGCACTCCGGCCGGTTCATACCTCTTATTTTT\n-TTGCAATGCAACAGAAAATAATATTTTTGTTCCCACGGCTTTTCCCGGTCAGGTCTGGTAGTGGCGGAGA\n-TTGGCCGACCCCTCGCACCTGTAATAAAGCGCAG\n-\n->gi|283855845|gb|GQ290303.1| Cynopterus brachyotis voucher 20020434 rhodopsin (RHO) gene, exons 1 through 5 and partial 
cds\n-GTGCCCTTCTCCAACAAGACAGGCGTGGTGCGCAGTCCCTTCGAGCATCCACAGTACTACCTGGCCGAGC\n-CATGGCAGTTCTCCATGCTGGCCGCCTACATGTTTCTGCTGATCGTGCTCGGCTTCCCCATCAACTTCCT\n-CACGCTCTATGTCACGGTTCAGCACAAGAAGCTGCGTACGCCTCTCAACTACATCCTGCTCAACCTGGCC\n-GTGGCCGACCTCTTCATGGTCTTCGGAGGCTTCACCACCACCCTCTACACCTCCCTGCATGGATACTTTG\n-TCTTCGGGCCTACGGGATGCAATCTGGAGGGCTTTTTTGCCACCCTGGGAGGTATGAGCTGAGATGCGGG\n-TAAGGAGGAGGCATAGAGGCATCTGGGAACAGTCCCAAGCTTGGGGTGAAGGCTAAGAGGCCTTCTTCCT\n-TGTTCTGTCATTGGCGTCGTCCGAAGCCCTCACTTAATCAACAAACAGTTTGGTGGTGAGGCGCTGAGCT\n-CCATTTGGAGAGGGCAGGTATCGAGCACTGTTTTATCCCCCCTGGAGTGGTGCCATTGCCTTGCTTTACA\n-GCAAAGAAACTGAGGATGAGAGGAGTCGAGGGTCTTGCCAGGTCACATCATGGCAGAGACAGAGCTGAGT\n-TTCAACCCTGCATCTATGTGCAGTTTCCCTTGGAGCAGCTATGTTAGGTCAGACCCACGGTGGGCACTGG\n-GGAGAGAGCTGCACAAGACAGGTCCCTNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN\n-NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNTTCCTGATTGCCA\n-GGAGTGATGTGCAGCGCAAATGTCTGAATTCCATTATTATGTGCTCCTTCTTCCTCTGAGCCAAACATCC\n-ATCTTCATGGCTCCTAGAATTGGGTCCCACCCACATGAGCAGGTCATTTTGTTTCCCTAGAGGGGAGAGG\n-TCACT'..b'CTTCAGAGGGTCAGATTTGGGATGAGAGTGGAGGCTGCGAGGGCCTGAGTG\n-GGAAGGGATTGGAGGCAAATCTCACCAACCATGTCAGTTTGCTACACACACTTTGGGTGGACCCTGACCC\n-TGACTCATGCTTCTTGCCTTCCAGTTCCGGAACTGCATGCTCACTACCCTCTGCTGTGGCAAGAACCCAC\n-TGGGTGACGATGAGGCCTCCACCACTGCCTC\n-\n->gi|283855822|gb|GQ290312.1| Myotis ricketti voucher GQX10 rhodopsin (RHO) mRNA, partial 
cds\n-GTGCCCTTCTCCAACAAGACGGGTGTGGTGCGCAGCCCCTTCGAGTACCCGCAGTACTACCTGGCTGAGC\n-CCTGGCAGTTCTCCATGCTGGCTGCCTACATGTTTCTGCTGATCGTGCTCGGATTCCCCATCAACTTCCT\n-CACGCTCTACGTCACCGTCCAGCACAAGAAGCTGCGCACGCCTCTCAACTACATCCTGCTCAACCTGGCT\n-GTGGCCAACCTCTTCATGGTCTTTGGAGGCTTCACCACCACCCTGTATACCTCTATGCATGGATACTTCG\n-TCTTCGGGGCCACGGGATGCAATCTGGAGGGCTTCTTTGCCACGCTGGGCGGTGAAATCGCCCTGTGGTC\n-CCTGGTGGTCCTGGCCATCGAGCGGTATGTGGTGGTCTGCAAGCCCATGAGCAACTTCCGCTTTGGGGAG\n-AACCACGCCATCATGGGCCTCGCCTTCACGTGGGTCATGGCACTGGCCTGCGCTGCACCCCCACTAGCCG\n-GCTGGTCCAGGTACATCCCAGAGGGCATGCAGTGCTCGTGTGGGATTGACTACTACACGCTCAAACCGGA\n-GGTCAACAACGAGTCCTTCGTCATCTACATGTTCGTGGTCCACTTCACCATCCCCATGATTGTCATTTTC\n-TTCTGCTACGGACAGCTGGTGTTCACAGTGAAGGAGGCGGCTGCCCAGCAGCAGGAGTCAGCCACCACCC\n-AGAAGGCCGAGAAGGAAGTCACGCGCATGGTCATCATCATGGTCGTTGCGTTCCTAATCTGTTGGCTGCC\n-CTACGCCAGCGTGGCATTCTACATCTTTACCCACCAGGGCTCTAACTTTGGCCCTGTCTTCATGACCATC\n-CCGGCATTCTTCGCCAAGTCATCCTCCATCTACAACCCGGTCATCTATATCATGATGAACAAGCAGTTCC\n-GGAACTGCATGCTCACCACCCTCTGCTGTGGCAAGAACCCACTGGGTGATGACGAAGCATCCACCACTGC\n-CTC\n-\n->gi|18148870|dbj|AB062417.1| Synthetic construct Bos taurus gene for rhodopsin, complete 
cds\n-ATGAACGGGACCGAGGGCCCAAACTTCTACGTGCCTTTCTCCAACAAGACGGGCGTCGTACGCAGCCCCT\n-TCGAGGCGCCGCAGTACTACCTGGCTGAGCCATGGCAGTTCAGCATGCTGGCCGCCTACATGTTCCTGCT\n-GATCATGCTTGGCTTCCCCATCAACTTCCTCACGCTGTACGTCACAGTCCAGCACAAGAAGCTGAGGACC\n-CCCCTCAACTACATCCTGCTCAACCTGGCCGTGGCAGATCTCTTCATGGTGTTCGGGGGCTTCACCACCA\n-CCCTGTATACCTCTCTGCACGGGTACTTCGTGTTCGGTCCGACGGGCTGCAACCTCGAGGGCTTCTTTGC\n-CACCTTAGGCGGTGAAATTGCACTGTGGTCCTTGGTGGTGCTAGCCATCGAGCGGTACGTAGTGGTGTGC\n-AAGCCCATGAGCAACTTCCGCTTCGGGGAGAACCACGCCATCATGGGCGTCGCATTCACCTGGGTCATGG\n-CTCTGGCCTGTGCGGCCCCCCCCCTCGTCGGCTGGTCTAGATACATCCCGGAGGGGATGCAGTGCTCGTG\n-CGGGATCGATTACTACACGCCCCACGAGGAGACCAACAATGAGTCGTTCGTCATCTACATGTTCGTTGTA\n-CACTTCATCATCCCCCTGATTGTCATATTCTTCTGCTACGGGCAGCTGGTCTTCACCGTCAAGGAGGCTG\n-CAGCCCAGCAGCAGGAGTCGGCCACCACTCAGAAGGCCGAGAAGGAGGTCACGCGTATGGTCATCATCAT\n-GGTCATCGCTTTCCTCATATGCTGGCTGCCCTACGCAGGTGTGGCGTTCTACATCTTCACCCATCAGGGA\n-TCCGACTTTGGCCCCATCTTCATGACCATCCCGGCTTTCTTTGCCAAGACGTCTGCCGTCTATAACCCCG\n-TCATCTACATCATGATGAACAAGCAGTTCCGGAACTGCATGGTCACCACTCTCTGCTGTGGCAAGAACCC\n-CCTAGGTGACGACGAGGCCTCCACGACCGTGTCCAAGACAGAGACCAGCCAAGTGGCCCCTGCCTAA\n-\n->gi|12583664|dbj|AB043817.1| Conger myriaster conf gene for fresh water form rod opsin, complete 
cds\n-CCGCTACTGACGAACCGCAACCATGAACGGCACTGAGGGACCTAACTTCTACATCCCCATGTCAAACGCC\n-ACTGGTGTAGTGAGGAGTCCATTTGAATACCCGCAGTACTACCTTGCAGAACCATGGGCTTTCTCAGCTC\n-TGTCTGCCTACATGTTCTTCCTGATTATCGCCGGATTCCCCATCAACTTCCTCACCCTGTATGTCACCAT\n-CGAACATAAGAAACTGAGGACCCCACTGAACTACATTCTGCTGAACCTGGCCGTGGCCGACCTCTTCATG\n-GTGTTTGGCGGATTCACCACCACGATGTACACCTCCATGCACGGCTACTTTGTCTTCGGCCCCACCGGCT\n-GCAACATCGAAGGGTTCTTCGCCACCCTCGGCGGCGAGATTGCCCTCTGGTGCCTCGTTGTCCTGGCCAT\n-TGAAAGGTGGATGGTCGTCTGCAAGCCAGTGACCAATTTCCGCTTCGGTGAGAGCCATGCCATCATGGGT\n-GTCATGGTGACCTGGACCATGGCATTGGCCTGTGCCCTCCCCCCTCTCTTCGGCTGGTCTCGGTACATTC\n-CGGAAGGTCTGCAGTGCTCGTGCGGGATCGACTACTATACCCGGGCGCCTGGGATCAACAATGAGTCCTT\n-TGTGATCTACATGTTTACCTGCCACTTCTCCATCCCACTCGCCGTCATCTCTTTCTGCTACGGCCGACTG\n-GTGTGCACCGTCAAAGAGGCCGCTGCCCAGCAACAGGAGTCCGAGACCACCCAGAGGGCTGAGCGGGAGG\n-TCACCCGCATGGTCGTCATCATGGTCATCTCCTTCCTGGTCTGCTGGGTGCCCTATGCCAGTGTGGCCTG\n-GTACATCTTTACCCACCAGGGAAGCACTTTTGGGCCCATCTTCATGACCATTCCATCCTTCTTTGCCAAG\n-AGTTCAGCCCTCTACAACCCCATGATCTACATCTGCATGAACAAGCAGTTCCGCCATTGCATGATCACCA\n-CCCTCTGCTGTGGGAAGAACCCCTTCGAGGAGGAGGATGGAGCGTCCGCCACTAGCTCTAAAACTGAGGC\n-TTCATCCGTGTCCTCCAGCTCTGTCTCCCCGGCATAAACCTTGTTTGACCGAACACCACGCATCAACACA\n-AAGACCAAGAATGCTGACTAAATGCTAACATTTCAGGGAAATCCAAAGACTTTTTACTATTTTTTTACAC\n-AACCATATAGGTTGCAAACAGAGGTTTAGCCCTGTTTACAGGTTGTCATCAATGTGATGTCAGTATGTAC\n-AATATAGTCAACTTGATAGCAAGTTGTTGGCTTATTTCAGATTGTATGGGCAATGTAATCAACCATATGT\n-GAAATAAATTGCAA\n'
b
diff -r 45ba7c750bc8 -r 6ef523b390e0 test-data/rhodopsin_proteins.fasta
--- a/test-data/rhodopsin_proteins.fasta Thu Sep 20 10:12:43 2012 -0400
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
[
@@ -1,43 +0,0 @@
->gi|57163783|ref|NP_001009242.1| rhodopsin [Felis catus]
-MNGTEGPNFYVPFSNKTGVVRSPFEYPQYYLAEPWQFSMLAAYMFLLIVLGFPINFLTLYVTVQHKKLRT
-PLNYILLNLAVADLFMVFGGFTTTLYTSLHGYFVFGPTGCNLEGFFATLGGEIALWSLVVLAIERYVVVC
-KPMSNFRFGENHAIMGVAFTWVMALACAAPPLVGWSRYIPEGMQCSCGIDYYTLKPEVNNESFVIYMFVV
-HFTIPMIVIFFCYGQLVFTVKEAAAQQQESATTQKAEKEVTRMVIIMVIAFLICWVPYASVAFYIFTHQG
-SNFGPIFMTLPAFFAKSSSIYNPVIYIMMNKQFRNCMLTTLCCGKNPLGDDEASTTGSKTETSQVAPA
-
->gi|3024260|sp|P56514.1|OPSD_BUFBU RecName: Full=Rhodopsin
-MNGTEGPNFYIPMSNKTGVVRSPFEYPQYYLAEPWQYSILCAYMFLLILLGFPINFMTLYVTIQHKKLRT
-PLNYILLNLAFANHFMVLCGFTVTMYSSMNGYFILGATGCYVEGFFATLGGEIALWSLVVLAIERYVVVC
-KPMSNFRFSENHAVMGVAFTWIMALSCAVPPLLGWSRYIPEGMQCSCGVDYYTLKPEVNNESFVIYMFVV
-HFTIPLIIIFFCYGRLVCTVKEAAAQQQESATTQKAEKEVTRMVIIMVVFFLICWVPYASVAFFIFSNQG
-SEFGPIFMTVPAFFAKSSSIYNPVIYIMLNKQFRNCMITTLCCGKNPFGEDDASSAATSKTEASSVSSSQ
-VSPA
-
->gi|283855846|gb|ADB45242.1| rhodopsin [Cynopterus brachyotis]
-VPFSNKTGVVRSPFEHPQYYLAEPWQFSMLAAYMFLLIVLGFPINFLTLYVTVQHKKLRTPLNYILLNLA
-VADLFMVFGGFTTTLYTSLHGYFVFGPTGCNLEGFFATLGGEIALWSLVVLAIERYVVVCKPMSNFRFGE
-NHAIMGLALTWVMALACAAPPLVGWSRYIPEGMQCSCGIDYYTLKPEVNNESFVIYMFVVHFTIPMIVIF
-FCYGQLVFTVKEAAAQQQESATTQKAEKEVTRMVIIMVIAFLICWLPYAGVAFYIFTHQGSNFGPIFMTL
-PAFFAKSSSIYNPVIYIMMNKQFRNCMLTTLCCGKNPLGDDEASTTAS
-
->gi|283855823|gb|ADB45229.1| rhodopsin [Myotis pilosus]
-VPFSNKTGVVRSPFEYPQYYLAEPWQFSMLAAYMFLLIVLGFPINFLTLYVTVQHKKLRTPLNYILLNLA
-VANLFMVFGGFTTTLYTSMHGYFVFGATGCNLEGFFATLGGEIALWSLVVLAIERYVVVCKPMSNFRFGE
-NHAIMGLAFTWVMALACAAPPLAGWSRYIPEGMQCSCGIDYYTLKPEVNNESFVIYMFVVHFTIPMIVIF
-FCYGQLVFTVKEAAAQQQESATTQKAEKEVTRMVIIMVVAFLICWLPYASVAFYIFTHQGSNFGPVFMTI
-PAFFAKSSSIYNPVIYIMMNKQFRNCMLTTLCCGKNPLGDDEASTTAS
-
->gi|223523|prf||0811197A rhodopsin [Bos taurus]
-MNGTEGPNFYVPFSNKTGVVRSPFEAPQYYLAEPWQFSMLAAYMFLLIMLGFPINFLTLYVTVQHKKLRT
-PLNYILLNLAVADLFMVFGGFTTTLYTSLHGYFVFGPTGCNLEGFFATLGGEIALWSLVVLAIERYVVVC
-KPMSNFRFGENHAIMGVAFTWVMALACAAPPLVGWSRYIPEGMQCSCGIDYTPHEETNNESFVIYMFVVH
-FIIPLIVIFFCYGQLVFTVKEAAAQQQESATTQKAEKEVTRMVIIMVIAFLICWLPYAGVAFYIFTHQGS
-DFGPIFMTIPAFFAKTSAVYNPVIYIMMNKQFRNCMVTTLCCGKNPLGDDEASTTVSKTETSQVAPA
-
->gi|12583665|dbj|BAB21486.1| fresh water form rod opsin [Conger myriaster]
-MNGTEGPNFYIPMSNATGVVRSPFEYPQYYLAEPWAFSALSAYMFFLIIAGFPINFLTLYVTIEHKKLRT
-PLNYILLNLAVADLFMVFGGFTTTMYTSMHGYFVFGPTGCNIEGFFATLGGEIALWCLVVLAIERWMVVC
-KPVTNFRFGESHAIMGVMVTWTMALACALPPLFGWSRYIPEGLQCSCGIDYYTRAPGINNESFVIYMFTC
-HFSIPLAVISFCYGRLVCTVKEAAAQQQESETTQRAEREVTRMVVIMVISFLVCWVPYASVAWYIFTHQG
-STFGPIFMTIPSFFAKSSALYNPMIYICMNKQFRHCMITTLCCGKNPFEEEDGASATSSKTEASSVSSSS
-VSPA
b
diff -r 45ba7c750bc8 -r 6ef523b390e0 test-data/tblastn_four_human_vs_rhodopsin.html
--- a/test-data/tblastn_four_human_vs_rhodopsin.html Thu Sep 20 10:12:43 2012 -0400
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
b
b'@@ -1,787 +0,0 @@\n-<HTML>\n-<TITLE>BLAST Search Results</TITLE>\n-<BODY BGCOLOR="#FFFFFF" LINK="#0000FF" VLINK="#660099" ALINK="#660099">\n-<PRE>\n-\n-<b>TBLASTN 2.2.25+</b>\n-\n-\n-<b>Query=</b> sp|Q9BS26|ERP44_HUMAN Endoplasmic reticulum resident protein 44\n-OS=Homo sapiens GN=ERP44 PE=1 SV=1\n-\n-Length=406\n-\n-<b>Subject=</b> gi|57163782|ref|NM_001009242.1| Felis catus rhodopsin (RHO), mRNA\n-\n-Length=1047\n-\n-\n-***** No hits found *****\n-\n-\n-\n-Lambda     K      H\n-   0.347    0.182    0.684 \n-\n-Gapped\n-Lambda     K      H\n-   0.299   0.0710    0.270 \n-\n-Effective search space used: 127710\n-\n-\n-<b>Query=</b> sp|Q9BS26|ERP44_HUMAN Endoplasmic reticulum resident protein 44\n-OS=Homo sapiens GN=ERP44 PE=1 SV=1\n-\n-Length=406\n-\n-<b>Subject=</b> gi|2734705|gb|U59921.1|BBU59921 Bufo bufo rhodopsin mRNA, complete\n-cds\n-\n-Length=1574\n-\n-\n-***** No hits found *****\n-\n-\n-\n-Lambda     K      H\n-   0.347    0.182    0.684 \n-\n-Gapped\n-Lambda     K      H\n-   0.299   0.0710    0.270 \n-\n-Effective search space used: 127710\n-\n-\n-<b>Query=</b> sp|Q9BS26|ERP44_HUMAN Endoplasmic reticulum resident protein 44\n-OS=Homo sapiens GN=ERP44 PE=1 SV=1\n-\n-Length=406\n-\n-<b>Subject=</b> gi|283855845|gb|GQ290303.1| Cynopterus brachyotis voucher 20020434\n-rhodopsin (RHO) gene, exons 1 through 5 and partial cds\n-\n-Length=4301\n-\n-\n-***** No hits found *****\n-\n-\n-\n-Lambda     K      H\n-   0.347    0.182    0.684 \n-\n-Gapped\n-Lambda     K      H\n-   0.299   0.0710    0.270 \n-\n-Effective search space used: 127710\n-\n-\n-<b>Query=</b> sp|Q9BS26|ERP44_HUMAN Endoplasmic reticulum resident protein 44\n-OS=Homo sapiens GN=ERP44 PE=1 SV=1\n-\n-Length=406\n-\n-<b>Subject=</b> gi|283855822|gb|GQ290312.1| Myotis ricketti voucher GQX10 rhodopsin\n-(RHO) mRNA, partial cds\n-\n-Length=983\n-\n-\n-***** No hits found *****\n-\n-\n-\n-Lambda     K      H\n-   0.347    0.182    0.684 \n-\n-Gapped\n-Lambda     K      H\n-   0.299   0.0710    0.270 
\n-\n-Effective search space used: 127710\n-\n-\n-<b>Query=</b> sp|Q9BS26|ERP44_HUMAN Endoplasmic reticulum resident protein 44\n-OS=Homo sapiens GN=ERP44 PE=1 SV=1\n-\n-Length=406\n-\n-<b>Subject=</b> gi|18148870|dbj|AB062417.1| Synthetic construct Bos taurus gene for\n-rhodopsin, complete cds\n-\n-Length=1047\n-\n-\n-***** No hits found *****\n-\n-\n-\n-Lambda     K      H\n-   0.347    0.182    0.684 \n-\n-Gapped\n-Lambda     K      H\n-   0.299   0.0710    0.270 \n-\n-Effective search space used: 127710\n-\n-\n-<b>Query=</b> sp|Q9BS26|ERP44_HUMAN Endoplasmic reticulum resident protein 44\n-OS=Homo sapiens GN=ERP44 PE=1 SV=1\n-\n-Length=406\n-\n-<b>Subject=</b> gi|12583664|dbj|AB043817.1| Conger myriaster conf gene for fresh\n-water form rod opsin, complete cds\n-\n-Length=1344\n-\n-\n-***** No hits found *****\n-\n-\n-\n-Lambda     K      H\n-   0.347    0.182    0.684 \n-\n-Gapped\n-Lambda     K      H\n-   0.299   0.0710    0.270 \n-\n-Effective search space used: 127710\n-\n-\n-<b>Query=</b> sp|Q9NSY1|BMP2K_HUMAN BMP-2-inducible protein kinase OS=Homo sapiens\n-GN=BMP2K PE=1 SV=2\n-\n-Length=1161\n-\n-<b>Subject=</b> gi|57163782|ref|NM_001009242.1| Felis catus rhodopsin (RHO), mRNA\n-\n-Length=1047\n-\n-\n-***** No hits found *****\n-\n-\n-\n-Lambda     K      H\n-   0.334    0.170    0.615 \n-\n-Gapped\n-Lambda     K      H\n-   0.299   0.0710    0.270 \n-\n-Effective search space used: 370988\n-\n-\n-<b>Query=</b> sp|Q9NSY1|BMP2K_HUMAN BMP-2-inducible protein kinase OS=Homo sapiens\n-GN=BMP2K PE=1 SV=2\n-\n-Length=1161\n-\n-<b>Subject=</b> gi|2734705|gb|U59921.1|BBU59921 Bufo bufo rhodopsin mRNA, complete\n-cds\n-\n-Length=1574\n-\n-\n-***** No hits found *****\n-\n-\n-\n-Lambda     K      H\n-   0.334    0.170    0.615 \n-\n-Gapped\n-Lambda     K      H\n-   0.299   0.0710    0.270 \n-\n-Effective search space used: 370988\n-\n-\n-<b>Query=</b> sp|Q9NSY1|BMP2K_HUMAN BMP-2-inducible protein kinase OS=Homo sapiens\n-GN=BMP2K PE=1 
SV=2\n-\n-Length=1161\n-\n-<b>Subject=</b> gi|283855845|gb|GQ290303.1| Cynopterus brachyotis voucher 20020434\n-rhodopsin (RHO) gene, exons 1 through 5 and partial cds\n-\n-Length=4301\n-\n-\n-***** No hits found *****\n-\n-\n-\n-Lambda     K      H\n-'..b'<b>Subject=</b> gi|18148870|dbj|AB062417.1| Synthetic construct Bos taurus gene for\n-rhodopsin, complete cds\n-\n-Length=1047\n-\n-<script src="blastResult.js"></script>\n- Score =  711 bits (1640),  Expect = 0.0, Method: Compositional matrix adjust.\n- Identities = 325/348 (94%), Positives = 337/348 (97%), Gaps = 0/348 (0%)\n- Frame = +1\n-\n-Query  1     MNGTEGPNFYVPFSNATGVVRSPFEYPQYYLAEPWQFSMLAAYMFLLIVLGFPINFLTLY  60\n-             MNGTEGPNFYVPFSN TGVVRSPFE PQYYLAEPWQFSMLAAYMFLLI+LGFPINFLTLY\n-Sbjct  1     MNGTEGPNFYVPFSNKTGVVRSPFEAPQYYLAEPWQFSMLAAYMFLLIMLGFPINFLTLY  180\n-\n-Query  61    VTVQHKKLRTPLNYILLNLAVADLFMVLGGFTSTLYTSLHGYFVFGPTGCNLEGFFATLG  120\n-             VTVQHKKLRTPLNYILLNLAVADLFMV GGFT+TLYTSLHGYFVFGPTGCNLEGFFATLG\n-Sbjct  181   VTVQHKKLRTPLNYILLNLAVADLFMVFGGFTTTLYTSLHGYFVFGPTGCNLEGFFATLG  360\n-\n-Query  121   GEIALWSLVVLAIERYVVVCKPMSNFRFGENHAIMGVAFTWVMALACAAPPLAGWSRYIP  180\n-             GEIALWSLVVLAIERYVVVCKPMSNFRFGENHAIMGVAFTWVMALACAAPPL GWSRYIP\n-Sbjct  361   GEIALWSLVVLAIERYVVVCKPMSNFRFGENHAIMGVAFTWVMALACAAPPLVGWSRYIP  540\n-\n-Query  181   EGLQCSCGIDYYTLKPEVNNESFVIYMFVVHFTIPMIIIFFCYGQLVFTVKEAAAQQQES  240\n-             EG+QCSCGIDYYT   E NNESFVIYMFVVHF IP+I+IFFCYGQLVFTVKEAAAQQQES\n-Sbjct  541   EGMQCSCGIDYYTPHEETNNESFVIYMFVVHFIIPLIVIFFCYGQLVFTVKEAAAQQQES  720\n-\n-Query  241   ATTQKAEKEVTRMVIIMVIAFLICWVPYASVAFYIFTHQGSNFGPIFMTIPAFFAKSAAI  300\n-             ATTQKAEKEVTRMVIIMVIAFLICW+PYA VAFYIFTHQGS+FGPIFMTIPAFFAK++A+\n-Sbjct  721   ATTQKAEKEVTRMVIIMVIAFLICWLPYAGVAFYIFTHQGSDFGPIFMTIPAFFAKTSAV  900\n-\n-Query  301   YNPVIYIMMNKQFRNCMLTTICCGKNPLGDDEASATVSKTETSQVAPA  348\n-             YNPVIYIMMNKQFRNCM+TT+CCGKNPLGDDEAS TVSKTETSQVAPA\n-Sbjct  901   
YNPVIYIMMNKQFRNCMVTTLCCGKNPLGDDEASTTVSKTETSQVAPA  1044\n-\n-\n-\n-Lambda     K      H\n-   0.351    0.182    0.707 \n-\n-Gapped\n-Lambda     K      H\n-   0.299   0.0710    0.270 \n-\n-Effective search space used: 109230\n-\n-\n-<b>Query=</b> sp|P08100|OPSD_HUMAN Rhodopsin OS=Homo sapiens GN=RHO PE=1 SV=1\n-\n-Length=348\n-\n-<b>Subject=</b> gi|12583664|dbj|AB043817.1| Conger myriaster conf gene for fresh\n-water form rod opsin, complete cds\n-\n-Length=1344\n-\n-<script src="blastResult.js"></script>\n- Score =  626 bits (1444),  Expect = 0.0, Method: Compositional matrix adjust.\n- Identities = 281/342 (83%), Positives = 311/342 (91%), Gaps = 1/342 (0%)\n- Frame = +2\n-\n-Query  1     MNGTEGPNFYVPFSNATGVVRSPFEYPQYYLAEPWQFSMLAAYMFLLIVLGFPINFLTLY  60\n-             MNGTEGPNFY+P SNATGVVRSPFEYPQYYLAEPW FS L+AYMF LI+ GFPINFLTLY\n-Sbjct  23    MNGTEGPNFYIPMSNATGVVRSPFEYPQYYLAEPWAFSALSAYMFFLIIAGFPINFLTLY  202\n-\n-Query  61    VTVQHKKLRTPLNYILLNLAVADLFMVLGGFTSTLYTSLHGYFVFGPTGCNLEGFFATLG  120\n-             VT++HKKLRTPLNYILLNLAVADLFMV GGFT+T+YTS+HGYFVFGPTGCN+EGFFATLG\n-Sbjct  203   VTIEHKKLRTPLNYILLNLAVADLFMVFGGFTTTMYTSMHGYFVFGPTGCNIEGFFATLG  382\n-\n-Query  121   GEIALWSLVVLAIERYVVVCKPMSNFRFGENHAIMGVAFTWVMALACAAPPLAGWSRYIP  180\n-             GEIALW LVVLAIER++VVCKP++NFRFGE HAIMGV  TW MALACA PPL GWSRYIP\n-Sbjct  383   GEIALWCLVVLAIERWMVVCKPVTNFRFGESHAIMGVMVTWTMALACALPPLFGWSRYIP  562\n-\n-Query  181   EGLQCSCGIDYYTLKPEVNNESFVIYMFVVHFTIPMIIIFFCYGQLVFTVKEAAAQQQES  240\n-             EGLQCSCGIDYYT  P +NNESFVIYMF  HF+IP+ +I FCYG+LV TVKEAAAQQQES\n-Sbjct  563   EGLQCSCGIDYYTRAPGINNESFVIYMFTCHFSIPLAVISFCYGRLVCTVKEAAAQQQES  742\n-\n-Query  241   ATTQKAEKEVTRMVIIMVIAFLICWVPYASVAFYIFTHQGSNFGPIFMTIPAFFAKSAAI  300\n-              TTQ+AE+EVTRMV+IMVI+FL+CWVPYASVA YIFTHQGS FGPIFMTIP+FFAKS+A+\n-Sbjct  743   ETTQRAEREVTRMVVIMVISFLVCWVPYASVAWYIFTHQGSTFGPIFMTIPSFFAKSSAL  922\n-\n-Query  301   YNPVIYIMMNKQFRNCMLTTICCGKNPL-GDDEASATVSKTE  341\n-             YNP+IYI MNKQFR CM+TT+CCGKNP   +D 
ASAT SKTE\n-Sbjct  923   YNPMIYICMNKQFRHCMITTLCCGKNPFEEEDGASATSSKTE  1048\n-\n-\n-\n-Lambda     K      H\n-   0.351    0.182    0.707 \n-\n-Gapped\n-Lambda     K      H\n-   0.299   0.0710    0.270 \n-\n-Effective search space used: 109230\n-\n-\n-\n-\n-Matrix: BLOSUM80\n-Gap Penalties: Existence: 10, Extension: 1\n-Neighboring words threshold: 14\n-Window for multiple hits: 25\n-</PRE>\n-</BODY>\n-</HTML>\n'
b
diff -r 45ba7c750bc8 -r 6ef523b390e0 test-data/tblastn_four_human_vs_rhodopsin.tabular
--- a/test-data/tblastn_four_human_vs_rhodopsin.tabular Thu Sep 20 10:12:43 2012 -0400
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
b
@@ -1,10 +0,0 @@
-sp|P08100|OPSD_HUMAN gi|57163782|ref|NM_001009242.1| 96.55 348 12 0 1 348 1 1044 0.0  732
-sp|P08100|OPSD_HUMAN gi|2734705|gb|U59921.1|BBU59921 84.80 342 51 1 1 341 42 1067 0.0  646
-sp|P08100|OPSD_HUMAN gi|283855845|gb|GQ290303.1| 93.24 74 5 0 239 312 3147 3368 1e-72  151
-sp|P08100|OPSD_HUMAN gi|283855845|gb|GQ290303.1| 91.53 59 5 0 177 235 2855 3031 1e-72  126
-sp|P08100|OPSD_HUMAN gi|283855845|gb|GQ290303.1| 96.40 111 4 0 11 121 1 333 1e-64  229
-sp|P08100|OPSD_HUMAN gi|283855845|gb|GQ290303.1| 93.22 59 4 0 119 177 1404 1580 1e-32  122
-sp|P08100|OPSD_HUMAN gi|283855845|gb|GQ290303.1| 88.46 26 3 0 312 337 4222 4299 6e-13 57.7
-sp|P08100|OPSD_HUMAN gi|283855822|gb|GQ290312.1| 95.09 326 16 0 11 336 1 978 0.0  658
-sp|P08100|OPSD_HUMAN gi|18148870|dbj|AB062417.1| 93.39 348 23 0 1 348 1 1044 0.0  711
-sp|P08100|OPSD_HUMAN gi|12583664|dbj|AB043817.1| 82.16 342 60 1 1 341 23 1048 0.0  626
b
diff -r 45ba7c750bc8 -r 6ef523b390e0 test-data/tblastn_four_human_vs_rhodopsin.xml
--- a/test-data/tblastn_four_human_vs_rhodopsin.xml Thu Sep 20 10:12:43 2012 -0400
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
b
b'@@ -1,722 +0,0 @@\n-<?xml version="1.0"?>\n-<!DOCTYPE BlastOutput PUBLIC "-//NCBI//NCBI BlastOutput/EN" "NCBI_BlastOutput.dtd">\n-<BlastOutput>\n-  <BlastOutput_program>tblastn</BlastOutput_program>\n-  <BlastOutput_version>TBLASTN 2.2.25+</BlastOutput_version>\n-  <BlastOutput_reference>Stephen F. Altschul, Thomas L. Madden, Alejandro A. Sch&amp;auml;ffer, Jinghui Zhang, Zheng Zhang, Webb Miller, and David J. Lipman (1997), &quot;Gapped BLAST and PSI-BLAST: a new generation of protein database search programs&quot;, Nucleic Acids Res. 25:3389-3402.</BlastOutput_reference>\n-  <BlastOutput_db></BlastOutput_db>\n-  <BlastOutput_query-ID>Query_1</BlastOutput_query-ID>\n-  <BlastOutput_query-def>sp|Q9BS26|ERP44_HUMAN Endoplasmic reticulum resident protein 44 OS=Homo sapiens GN=ERP44 PE=1 SV=1</BlastOutput_query-def>\n-  <BlastOutput_query-len>406</BlastOutput_query-len>\n-  <BlastOutput_param>\n-    <Parameters>\n-      <Parameters_matrix>BLOSUM80</Parameters_matrix>\n-      <Parameters_expect>1e-10</Parameters_expect>\n-      <Parameters_gap-open>10</Parameters_gap-open>\n-      <Parameters_gap-extend>1</Parameters_gap-extend>\n-      <Parameters_filter>F</Parameters_filter>\n-    </Parameters>\n-  </BlastOutput_param>\n-  <BlastOutput_iterations>\n-    <Iteration>\n-      <Iteration_iter-num>1</Iteration_iter-num>\n-      <Iteration_query-ID>Query_1</Iteration_query-ID>\n-      <Iteration_query-def>sp|Q9BS26|ERP44_HUMAN Endoplasmic reticulum resident protein 44 OS=Homo sapiens GN=ERP44 PE=1 SV=1</Iteration_query-def>\n-      <Iteration_query-len>406</Iteration_query-len>\n-      <Iteration_hits></Iteration_hits>\n-      <Iteration_stat>\n-        <Statistics>\n-          <Statistics_db-num>0</Statistics_db-num>\n-          <Statistics_db-len>0</Statistics_db-len>\n-          <Statistics_hsp-len>19</Statistics_hsp-len>\n-          <Statistics_eff-space>127710</Statistics_eff-space>\n-          <Statistics_kappa>0.071</Statistics_kappa>\n-          
<Statistics_lambda>0.299</Statistics_lambda>\n-          <Statistics_entropy>0.27</Statistics_entropy>\n-        </Statistics>\n-      </Iteration_stat>\n-      <Iteration_message>No hits found</Iteration_message>\n-    </Iteration>\n-    <Iteration>\n-      <Iteration_iter-num>2</Iteration_iter-num>\n-      <Iteration_query-ID>Query_1</Iteration_query-ID>\n-      <Iteration_query-def>sp|Q9BS26|ERP44_HUMAN Endoplasmic reticulum resident protein 44 OS=Homo sapiens GN=ERP44 PE=1 SV=1</Iteration_query-def>\n-      <Iteration_query-len>406</Iteration_query-len>\n-      <Iteration_hits></Iteration_hits>\n-      <Iteration_stat>\n-        <Statistics>\n-          <Statistics_db-num>0</Statistics_db-num>\n-          <Statistics_db-len>0</Statistics_db-len>\n-          <Statistics_hsp-len>19</Statistics_hsp-len>\n-          <Statistics_eff-space>127710</Statistics_eff-space>\n-          <Statistics_kappa>0.071</Statistics_kappa>\n-          <Statistics_lambda>0.299</Statistics_lambda>\n-          <Statistics_entropy>0.27</Statistics_entropy>\n-        </Statistics>\n-      </Iteration_stat>\n-      <Iteration_message>No hits found</Iteration_message>\n-    </Iteration>\n-    <Iteration>\n-      <Iteration_iter-num>3</Iteration_iter-num>\n-      <Iteration_query-ID>Query_1</Iteration_query-ID>\n-      <Iteration_query-def>sp|Q9BS26|ERP44_HUMAN Endoplasmic reticulum resident protein 44 OS=Homo sapiens GN=ERP44 PE=1 SV=1</Iteration_query-def>\n-      <Iteration_query-len>406</Iteration_query-len>\n-      <Iteration_hits></Iteration_hits>\n-      <Iteration_stat>\n-        <Statistics>\n-          <Statistics_db-num>0</Statistics_db-num>\n-          <Statistics_db-len>0</Statistics_db-len>\n-          <Statistics_hsp-len>19</Statistics_hsp-len>\n-          <Statistics_eff-space>127710</Statistics_eff-space>\n-          <Statistics_kappa>0.071</Statistics_kappa>\n-          <Statistics_lambda>0.299</Statistics_lambda>\n-          <Statistics_entropy>0.27</Statistics_entropy>\n- 
       </Statistics>\n-      </Iteration_stat>\n-      <Iteration_message>No hits found</Iterat'..b'LAAYMFLLI+LGFPINFLTLYVTVQHKKLRTPLNYILLNLAVADLFMV GGFT+TLYTSLHGYFVFGPTGCNLEGFFATLGGEIALWSLVVLAIERYVVVCKPMSNFRFGENHAIMGVAFTWVMALACAAPPL GWSRYIPEG+QCSCGIDYYT   E NNESFVIYMFVVHF IP+I+IFFCYGQLVFTVKEAAAQQQESATTQKAEKEVTRMVIIMVIAFLICW+PYA VAFYIFTHQGS+FGPIFMTIPAFFAK++A+YNPVIYIMMNKQFRNCM+TT+CCGKNPLGDDEAS TVSKTETSQVAPA</Hsp_midline>\n-            </Hsp>\n-          </Hit_hsps>\n-        </Hit>\n-      </Iteration_hits>\n-      <Iteration_stat>\n-        <Statistics>\n-          <Statistics_db-num>0</Statistics_db-num>\n-          <Statistics_db-len>0</Statistics_db-len>\n-          <Statistics_hsp-len>18</Statistics_hsp-len>\n-          <Statistics_eff-space>109230</Statistics_eff-space>\n-          <Statistics_kappa>0.071</Statistics_kappa>\n-          <Statistics_lambda>0.299</Statistics_lambda>\n-          <Statistics_entropy>0.27</Statistics_entropy>\n-        </Statistics>\n-      </Iteration_stat>\n-    </Iteration>\n-    <Iteration>\n-      <Iteration_iter-num>24</Iteration_iter-num>\n-      <Iteration_query-ID>Query_4</Iteration_query-ID>\n-      <Iteration_query-def>sp|P08100|OPSD_HUMAN Rhodopsin OS=Homo sapiens GN=RHO PE=1 SV=1</Iteration_query-def>\n-      <Iteration_query-len>348</Iteration_query-len>\n-      <Iteration_hits>\n-        <Hit>\n-          <Hit_num>1</Hit_num>\n-          <Hit_id>Subject_6</Hit_id>\n-          <Hit_def>gi|12583664|dbj|AB043817.1| Conger myriaster conf gene for fresh water form rod opsin, complete cds</Hit_def>\n-          <Hit_accession>Subject_6</Hit_accession>\n-          <Hit_len>1344</Hit_len>\n-          <Hit_hsps>\n-            <Hsp>\n-              <Hsp_num>1</Hsp_num>\n-              <Hsp_bit-score>626.708277239213</Hsp_bit-score>\n-              <Hsp_score>1444</Hsp_score>\n-              <Hsp_evalue>0</Hsp_evalue>\n-              <Hsp_query-from>1</Hsp_query-from>\n-              <Hsp_query-to>341</Hsp_query-to>\n-            
  <Hsp_hit-from>23</Hsp_hit-from>\n-              <Hsp_hit-to>1048</Hsp_hit-to>\n-              <Hsp_query-frame>0</Hsp_query-frame>\n-              <Hsp_hit-frame>2</Hsp_hit-frame>\n-              <Hsp_identity>281</Hsp_identity>\n-              <Hsp_positive>311</Hsp_positive>\n-              <Hsp_gaps>1</Hsp_gaps>\n-              <Hsp_align-len>342</Hsp_align-len>\n-              <Hsp_qseq>MNGTEGPNFYVPFSNATGVVRSPFEYPQYYLAEPWQFSMLAAYMFLLIVLGFPINFLTLYVTVQHKKLRTPLNYILLNLAVADLFMVLGGFTSTLYTSLHGYFVFGPTGCNLEGFFATLGGEIALWSLVVLAIERYVVVCKPMSNFRFGENHAIMGVAFTWVMALACAAPPLAGWSRYIPEGLQCSCGIDYYTLKPEVNNESFVIYMFVVHFTIPMIIIFFCYGQLVFTVKEAAAQQQESATTQKAEKEVTRMVIIMVIAFLICWVPYASVAFYIFTHQGSNFGPIFMTIPAFFAKSAAIYNPVIYIMMNKQFRNCMLTTICCGKNPL-GDDEASATVSKTE</Hsp_qseq>\n-              <Hsp_hseq>MNGTEGPNFYIPMSNATGVVRSPFEYPQYYLAEPWAFSALSAYMFFLIIAGFPINFLTLYVTIEHKKLRTPLNYILLNLAVADLFMVFGGFTTTMYTSMHGYFVFGPTGCNIEGFFATLGGEIALWCLVVLAIERWMVVCKPVTNFRFGESHAIMGVMVTWTMALACALPPLFGWSRYIPEGLQCSCGIDYYTRAPGINNESFVIYMFTCHFSIPLAVISFCYGRLVCTVKEAAAQQQESETTQRAEREVTRMVVIMVISFLVCWVPYASVAWYIFTHQGSTFGPIFMTIPSFFAKSSALYNPMIYICMNKQFRHCMITTLCCGKNPFEEEDGASATSSKTE</Hsp_hseq>\n-              <Hsp_midline>MNGTEGPNFY+P SNATGVVRSPFEYPQYYLAEPW FS L+AYMF LI+ GFPINFLTLYVT++HKKLRTPLNYILLNLAVADLFMV GGFT+T+YTS+HGYFVFGPTGCN+EGFFATLGGEIALW LVVLAIER++VVCKP++NFRFGE HAIMGV  TW MALACA PPL GWSRYIPEGLQCSCGIDYYT  P +NNESFVIYMF  HF+IP+ +I FCYG+LV TVKEAAAQQQES TTQ+AE+EVTRMV+IMVI+FL+CWVPYASVA YIFTHQGS FGPIFMTIP+FFAKS+A+YNP+IYI MNKQFR CM+TT+CCGKNP   +D ASAT SKTE</Hsp_midline>\n-            </Hsp>\n-          </Hit_hsps>\n-        </Hit>\n-      </Iteration_hits>\n-      <Iteration_stat>\n-        <Statistics>\n-          <Statistics_db-num>0</Statistics_db-num>\n-          <Statistics_db-len>0</Statistics_db-len>\n-          <Statistics_hsp-len>18</Statistics_hsp-len>\n-          <Statistics_eff-space>109230</Statistics_eff-space>\n-          <Statistics_kappa>0.071</Statistics_kappa>\n-          <Statistics_lambda>0.299</Statistics_lambda>\n-        
  <Statistics_entropy>0.27</Statistics_entropy>\n-        </Statistics>\n-      </Iteration_stat>\n-    </Iteration>\n-  </BlastOutput_iterations>\n-</BlastOutput>\n'
b
diff -r 45ba7c750bc8 -r 6ef523b390e0 test-data/tblastn_four_human_vs_rhodopsin_ext.tabular
--- a/test-data/tblastn_four_human_vs_rhodopsin_ext.tabular Thu Sep 20 10:12:43 2012 -0400
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
b
@@ -1,10 +0,0 @@
-sp|P08100|OPSD_HUMAN gi|57163782|ref|NM_001009242.1| 96.55 348 12 0 1 348 1 1044 0.0  732 gi|57163782|ref|NM_001009242.1| 1689 336 343 0 98.56 0 1 MNGTEGPNFYVPFSNATGVVRSPFEYPQYYLAEPWQFSMLAAYMFLLIVLGFPINFLTLYVTVQHKKLRTPLNYILLNLAVADLFMVLGGFTSTLYTSLHGYFVFGPTGCNLEGFFATLGGEIALWSLVVLAIERYVVVCKPMSNFRFGENHAIMGVAFTWVMALACAAPPLAGWSRYIPEGLQCSCGIDYYTLKPEVNNESFVIYMFVVHFTIPMIIIFFCYGQLVFTVKEAAAQQQESATTQKAEKEVTRMVIIMVIAFLICWVPYASVAFYIFTHQGSNFGPIFMTIPAFFAKSAAIYNPVIYIMMNKQFRNCMLTTICCGKNPLGDDEASATVSKTETSQVAPA MNGTEGPNFYVPFSNKTGVVRSPFEYPQYYLAEPWQFSMLAAYMFLLIVLGFPINFLTLYVTVQHKKLRTPLNYILLNLAVADLFMVFGGFTTTLYTSLHGYFVFGPTGCNLEGFFATLGGEIALWSLVVLAIERYVVVCKPMSNFRFGENHAIMGVAFTWVMALACAAPPLVGWSRYIPEGMQCSCGIDYYTLKPEVNNESFVIYMFVVHFTIPMIVIFFCYGQLVFTVKEAAAQQQESATTQKAEKEVTRMVIIMVIAFLICWVPYASVAFYIFTHQGSNFGPIFMTLPAFFAKSSSIYNPVIYIMMNKQFRNCMLTTLCCGKNPLGDDEASTTGSKTETSQVAPA 348 1047
-sp|P08100|OPSD_HUMAN gi|2734705|gb|U59921.1|BBU59921 84.80 342 51 1 1 341 42 1067 0.0  646 gi|2734705|gb|U59921.1|BBU59921 1489 290 320 1 93.57 0 3 MNGTEGPNFYVPFSNATGVVRSPFEYPQYYLAEPWQFSMLAAYMFLLIVLGFPINFLTLYVTVQHKKLRTPLNYILLNLAVADLFMVLGGFTSTLYTSLHGYFVFGPTGCNLEGFFATLGGEIALWSLVVLAIERYVVVCKPMSNFRFGENHAIMGVAFTWVMALACAAPPLAGWSRYIPEGLQCSCGIDYYTLKPEVNNESFVIYMFVVHFTIPMIIIFFCYGQLVFTVKEAAAQQQESATTQKAEKEVTRMVIIMVIAFLICWVPYASVAFYIFTHQGSNFGPIFMTIPAFFAKSAAIYNPVIYIMMNKQFRNCMLTTICCGKNPLGDDEA-SATVSKTE MNGTEGPNFYIPMSNKTGVVRSPFEYPQYYLAEPWQYSILCAYMFLLILLGFPINFMTLYVTIQHKKLRTPLNYILLNLAFANHFMVLCGFTVTMYSSMNGYFILGATGCYVEGFFATLGGEIALWSLVVLAIERYVVVCKPMSNFRFSENHAVMGVAFTWIMALSCAVPPLLGWSRYIPEGMQCSCGVDYYTLKPEVNNESFVIYMFVVHFTIPLIIIFFCYGRLVCTVKEAAAQQQESATTQKAEKEVTRMVIIMVVFFLICWVPYASVAFFIFSNQGSEFGPIFMTVPAFFAKSSSIYNPVIYIMLNKQFRNCMITTLCCGKNPFGEDDASSAATSKTE 348 1574
-sp|P08100|OPSD_HUMAN gi|283855845|gb|GQ290303.1| 93.24 74 5 0 239 312 3147 3368 1e-72  151 gi|283855845|gb|GQ290303.1| 342 69 73 0 98.65 0 3 ESATTQKAEKEVTRMVIIMVIAFLICWVPYASVAFYIFTHQGSNFGPIFMTIPAFFAKSAAIYNPVIYIMMNKQ ESATTQKAEKEVTRMVIIMVIAFLICWLPYAGVAFYIFTHQGSNFGPIFMTLPAFFAKSSSIYNPVIYIMMNKQ 348 4301
-sp|P08100|OPSD_HUMAN gi|283855845|gb|GQ290303.1| 91.53 59 5 0 177 235 2855 3031 1e-72  126 gi|283855845|gb|GQ290303.1| 284 54 57 0 96.61 0 2 RYIPEGLQCSCGIDYYTLKPEVNNESFVIYMFVVHFTIPMIIIFFCYGQLVFTVKEAAA RYIPEGMQCSCGIDYYTLKPEVNNESFVIYMFVVHFTIPMIVIFFCYGQLVFTVKEVRS 348 4301
-sp|P08100|OPSD_HUMAN gi|283855845|gb|GQ290303.1| 96.40 111 4 0 11 121 1 333 1e-64  229 gi|283855845|gb|GQ290303.1| 523 107 109 0 98.20 0 1 VPFSNATGVVRSPFEYPQYYLAEPWQFSMLAAYMFLLIVLGFPINFLTLYVTVQHKKLRTPLNYILLNLAVADLFMVLGGFTSTLYTSLHGYFVFGPTGCNLEGFFATLGG VPFSNKTGVVRSPFEHPQYYLAEPWQFSMLAAYMFLLIVLGFPINFLTLYVTVQHKKLRTPLNYILLNLAVADLFMVFGGFTTTLYTSLHGYFVFGPTGCNLEGFFATLGG 348 4301
-sp|P08100|OPSD_HUMAN gi|283855845|gb|GQ290303.1| 93.22 59 4 0 119 177 1404 1580 1e-32  122 gi|283855845|gb|GQ290303.1| 276 55 56 0 94.92 0 3 LGGEIALWSLVVLAIERYVVVCKPMSNFRFGENHAIMGVAFTWVMALACAAPPLAGWSR LAGEIALWSLVVLAIERYVVVCKPMSNFRFGENHAIMGLALTWVMALACAAPPLVGWSR 348 4301
-sp|P08100|OPSD_HUMAN gi|283855845|gb|GQ290303.1| 88.46 26 3 0 312 337 4222 4299 6e-13 57.7 gi|283855845|gb|GQ290303.1| 125 23 24 0 92.31 0 1 QFRNCMLTTICCGKNPLGDDEASATV QFRNCMLTTLCCGKNPLGDDEASTTA 348 4301
-sp|P08100|OPSD_HUMAN gi|283855822|gb|GQ290312.1| 95.09 326 16 0 11 336 1 978 0.0  658 gi|283855822|gb|GQ290312.1| 1517 310 322 0 98.77 0 1 VPFSNATGVVRSPFEYPQYYLAEPWQFSMLAAYMFLLIVLGFPINFLTLYVTVQHKKLRTPLNYILLNLAVADLFMVLGGFTSTLYTSLHGYFVFGPTGCNLEGFFATLGGEIALWSLVVLAIERYVVVCKPMSNFRFGENHAIMGVAFTWVMALACAAPPLAGWSRYIPEGLQCSCGIDYYTLKPEVNNESFVIYMFVVHFTIPMIIIFFCYGQLVFTVKEAAAQQQESATTQKAEKEVTRMVIIMVIAFLICWVPYASVAFYIFTHQGSNFGPIFMTIPAFFAKSAAIYNPVIYIMMNKQFRNCMLTTICCGKNPLGDDEASAT VPFSNKTGVVRSPFEYPQYYLAEPWQFSMLAAYMFLLIVLGFPINFLTLYVTVQHKKLRTPLNYILLNLAVANLFMVFGGFTTTLYTSMHGYFVFGATGCNLEGFFATLGGEIALWSLVVLAIERYVVVCKPMSNFRFGENHAIMGLAFTWVMALACAAPPLAGWSRYIPEGMQCSCGIDYYTLKPEVNNESFVIYMFVVHFTIPMIVIFFCYGQLVFTVKEAAAQQQESATTQKAEKEVTRMVIIMVVAFLICWLPYASVAFYIFTHQGSNFGPVFMTIPAFFAKSSSIYNPVIYIMMNKQFRNCMLTTLCCGKNPLGDDEASTT 348 983
-sp|P08100|OPSD_HUMAN gi|18148870|dbj|AB062417.1| 93.39 348 23 0 1 348 1 1044 0.0  711 gi|18148870|dbj|AB062417.1| 1640 325 337 0 96.84 0 1 MNGTEGPNFYVPFSNATGVVRSPFEYPQYYLAEPWQFSMLAAYMFLLIVLGFPINFLTLYVTVQHKKLRTPLNYILLNLAVADLFMVLGGFTSTLYTSLHGYFVFGPTGCNLEGFFATLGGEIALWSLVVLAIERYVVVCKPMSNFRFGENHAIMGVAFTWVMALACAAPPLAGWSRYIPEGLQCSCGIDYYTLKPEVNNESFVIYMFVVHFTIPMIIIFFCYGQLVFTVKEAAAQQQESATTQKAEKEVTRMVIIMVIAFLICWVPYASVAFYIFTHQGSNFGPIFMTIPAFFAKSAAIYNPVIYIMMNKQFRNCMLTTICCGKNPLGDDEASATVSKTETSQVAPA MNGTEGPNFYVPFSNKTGVVRSPFEAPQYYLAEPWQFSMLAAYMFLLIMLGFPINFLTLYVTVQHKKLRTPLNYILLNLAVADLFMVFGGFTTTLYTSLHGYFVFGPTGCNLEGFFATLGGEIALWSLVVLAIERYVVVCKPMSNFRFGENHAIMGVAFTWVMALACAAPPLVGWSRYIPEGMQCSCGIDYYTPHEETNNESFVIYMFVVHFIIPLIVIFFCYGQLVFTVKEAAAQQQESATTQKAEKEVTRMVIIMVIAFLICWLPYAGVAFYIFTHQGSDFGPIFMTIPAFFAKTSAVYNPVIYIMMNKQFRNCMVTTLCCGKNPLGDDEASTTVSKTETSQVAPA 348 1047
-sp|P08100|OPSD_HUMAN gi|12583664|dbj|AB043817.1| 82.16 342 60 1 1 341 23 1048 0.0  626 gi|12583664|dbj|AB043817.1| 1444 281 311 1 90.94 0 2 MNGTEGPNFYVPFSNATGVVRSPFEYPQYYLAEPWQFSMLAAYMFLLIVLGFPINFLTLYVTVQHKKLRTPLNYILLNLAVADLFMVLGGFTSTLYTSLHGYFVFGPTGCNLEGFFATLGGEIALWSLVVLAIERYVVVCKPMSNFRFGENHAIMGVAFTWVMALACAAPPLAGWSRYIPEGLQCSCGIDYYTLKPEVNNESFVIYMFVVHFTIPMIIIFFCYGQLVFTVKEAAAQQQESATTQKAEKEVTRMVIIMVIAFLICWVPYASVAFYIFTHQGSNFGPIFMTIPAFFAKSAAIYNPVIYIMMNKQFRNCMLTTICCGKNPL-GDDEASATVSKTE MNGTEGPNFYIPMSNATGVVRSPFEYPQYYLAEPWAFSALSAYMFFLIIAGFPINFLTLYVTIEHKKLRTPLNYILLNLAVADLFMVFGGFTTTMYTSMHGYFVFGPTGCNIEGFFATLGGEIALWCLVVLAIERWMVVCKPVTNFRFGESHAIMGVMVTWTMALACALPPLFGWSRYIPEGLQCSCGIDYYTRAPGINNESFVIYMFTCHFSIPLAVISFCYGRLVCTVKEAAAQQQESETTQRAEREVTRMVVIMVISFLVCWVPYASVAWYIFTHQGSTFGPIFMTIPSFFAKSSALYNPMIYICMNKQFRHCMITTLCCGKNPFEEEDGASATSSKTE 348 1344
b
diff -r 45ba7c750bc8 -r 6ef523b390e0 test-data/tblastn_four_human_vs_rhodopsin_parse_deflines.tabular
--- a/test-data/tblastn_four_human_vs_rhodopsin_parse_deflines.tabular Thu Sep 20 10:12:43 2012 -0400
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
b
@@ -1,10 +0,0 @@
-sp|P08100|OPSD_HUMAN gi|57163782|ref|NM_001009242.1| 96.55 348 12 0 1 348 1 1044 0.0  732
-sp|P08100|OPSD_HUMAN gi|2734705|gb|U59921.1|BBU59921 84.80 342 51 1 1 341 42 1067 0.0  646
-sp|P08100|OPSD_HUMAN gi|283855845|gb|GQ290303.1| 93.24 74 5 0 239 312 3147 3368 1e-72  151
-sp|P08100|OPSD_HUMAN gi|283855845|gb|GQ290303.1| 91.53 59 5 0 177 235 2855 3031 1e-72  126
-sp|P08100|OPSD_HUMAN gi|283855845|gb|GQ290303.1| 96.40 111 4 0 11 121 1 333 1e-64  229
-sp|P08100|OPSD_HUMAN gi|283855845|gb|GQ290303.1| 93.22 59 4 0 119 177 1404 1580 1e-32  122
-sp|P08100|OPSD_HUMAN gi|283855845|gb|GQ290303.1| 88.46 26 3 0 312 337 4222 4299 6e-13 57.7
-sp|P08100|OPSD_HUMAN gi|283855822|gb|GQ290312.1| 95.09 326 16 0 11 336 1 978 0.0  658
-sp|P08100|OPSD_HUMAN gi|18148870|dbj|AB062417.1| 93.39 348 23 0 1 348 1 1044 0.0  711
-sp|P08100|OPSD_HUMAN gi|12583664|dbj|AB043817.1| 82.16 342 60 1 1 341 23 1048 0.0  626
b
diff -r 45ba7c750bc8 -r 6ef523b390e0 tools/ncbi_blast_plus/blastdb.loc.sample
--- a/tools/ncbi_blast_plus/blastdb.loc.sample Thu Sep 20 10:12:43 2012 -0400
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
b
@@ -1,38 +0,0 @@
-#This is a sample file distributed with Galaxy that is used to define a
-#list of nucleotide BLAST databases, using three columns tab separated
-#(longer whitespace are TAB characters):
-#
-#<unique_id> <database_caption> <base_name_path>
-#
-#The captions typically contain spaces and might end with the build date.
-#It is important that the actual database name does not have a space in it,
-#and that the first tab that appears in the line is right before the path.
-#
-#So, for example, if your database is nt and the path to your base name 
-#is /depot/data2/galaxy/blastdb/nt/nt.chunk, then the blastdb.loc entry 
-#would look like this:
-#
-#nt_02_Dec_2009      nt 02 Dec 2009      /depot/data2/galaxy/blastdb/nt/nt.chunk
-#
-#and your /depot/data2/galaxy/blastdb/nt directory would contain all of 
-#your "base names" (e.g.):
-#
-#-rw-r--r--  1 wychung galaxy  23437408 2008-04-09 11:26 nt.chunk.00.nhr
-#-rw-r--r--  1 wychung galaxy   3689920 2008-04-09 11:26 nt.chunk.00.nin
-#-rw-r--r--  1 wychung galaxy 251215198 2008-04-09 11:26 nt.chunk.00.nsq
-#...etc...
-#
-#Your blastdb.loc file should include an entry per line for each "base name" 
-#you have stored.  For example:
-#
-#nt_02_Dec_2009 nt 02 Dec 2009 /depot/data2/galaxy/blastdb/nt/nt.chunk
-#wgs_30_Nov_2009 wgs 30 Nov 2009 /depot/data2/galaxy/blastdb/wgs/wgs.chunk
-#test_20_Sep_2008 test 20 Sep 2008 /depot/data2/galaxy/blastdb/test/test
-#...etc...
-#
-#See also blastdb_p.loc which is for any protein BLAST database.
-#
-#Note that for backwards compatibility with workflows, the unique ID of
-#an entry must be the path that was in the original loc file, because that
-#is the value stored in the workflow for that parameter.
-#
b
diff -r 45ba7c750bc8 -r 6ef523b390e0 tools/ncbi_blast_plus/blastdb_p.loc.sample
--- a/tools/ncbi_blast_plus/blastdb_p.loc.sample Thu Sep 20 10:12:43 2012 -0400
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
b
@@ -1,27 +0,0 @@
-#This is a sample file distributed with Galaxy that is used to define a
-#list of protein BLAST databases, using three columns tab separated
-#(longer whitespace are TAB characters):
-#
-#<unique_id> <database_caption> <base_name_path>
-#
-#The captions typically contain spaces and might end with the build date.
-#It is important that the actual database name does not have a space in it,
-#and that the first tab that appears in the line is right before the path.
-#
-#So, for example, if your database is NR and the path to your base name
-#is /data/blastdb/nr, then the blastdb_p.loc entry would look like this:
-#
-#nr NCBI NR (non redundant) /data/blastdb/nr
-#
-#and your /data/blastdb directory would contain all of the files associated
-#with the database, /data/blastdb/nr.*.
-#
-#Your blastdb_p.loc file should include an entry per line for each "base name"
-#you have stored. For example:
-#
-#nr_05Jun2010 NCBI NR (non redundant) 05 Jun 2010 /data/blastdb/05Jun2010/nr
-#nr_15Aug2010 NCBI NR (non redundant) 15 Aug 2010 /data/blastdb/15Aug2010/nr
-#...etc...
-#
-#See also blastdb.loc which is for any nucleotide BLAST database.
-#
b
diff -r 45ba7c750bc8 -r 6ef523b390e0 tools/ncbi_blast_plus/blastxml_to_tabular.py
--- a/tools/ncbi_blast_plus/blastxml_to_tabular.py Thu Sep 20 10:12:43 2012 -0400
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
[
b'@@ -1,254 +0,0 @@\n-#!/usr/bin/env python\n-"""Convert a BLAST XML file to 12 column tabular output\n-\n-Takes three command line options, input BLAST XML filename, output tabular\n-BLAST filename, output format (std for standard 12 columns, or ext for the\n-extended 24 columns offered in the BLAST+ wrappers).\n-\n-The 12 columns output are \'qseqid sseqid pident length mismatch gapopen qstart\n-qend sstart send evalue bitscore\' or \'std\' at the BLAST+ command line, which\n-mean:\n-   \n-====== ========= ============================================\n-Column NCBI name Description\n------- --------- --------------------------------------------\n-     1 qseqid    Query Seq-id (ID of your sequence)\n-     2 sseqid    Subject Seq-id (ID of the database hit)\n-     3 pident    Percentage of identical matches\n-     4 length    Alignment length\n-     5 mismatch  Number of mismatches\n-     6 gapopen   Number of gap openings\n-     7 qstart    Start of alignment in query\n-     8 qend      End of alignment in query\n-     9 sstart    Start of alignment in subject (database hit)\n-    10 send      End of alignment in subject (database hit)\n-    11 evalue    Expectation value (E-value)\n-    12 bitscore  Bit score\n-====== ========= ============================================\n-\n-The additional columns offered in the Galaxy BLAST+ wrappers are:\n-\n-====== ============= ===========================================\n-Column NCBI name     Description\n------- ------------- -------------------------------------------\n-    13 sallseqid     All subject Seq-id(s), separated by a \';\'\n-    14 score         Raw score\n-    15 nident        Number of identical matches\n-    16 positive      Number of positive-scoring matches\n-    17 gaps          Total number of gaps\n-    18 ppos          Percentage of positive-scoring matches\n-    19 qframe        Query frame\n-    20 sframe        Subject frame\n-    21 qseq          Aligned part of query sequence\n-    22 sseq         
 Aligned part of subject sequence\n-    23 qlen          Query sequence length\n-    24 slen          Subject sequence length\n-====== ============= ===========================================\n-\n-Most of these fields are given explicitly in the XML file, others some like\n-the percentage identity and the number of gap openings must be calculated.\n-\n-Be aware that the sequence in the extended tabular output or XML direct from\n-BLAST+ may or may not use XXXX masking on regions of low complexity. This\n-can throw the off the calculation of percentage identity and gap openings.\n-[In fact, both BLAST 2.2.24+ and 2.2.25+ have a subtle bug in this regard,\n-with these numbers changing depending on whether or not the low complexity\n-filter is used.]\n-\n-This script attempts to produce identical output to what BLAST+ would have done.\n-However, check this with "diff -b ..." since BLAST+ sometimes includes an extra\n-space character (probably a bug).\n-"""\n-import sys\n-import re\n-\n-if sys.version_info[:2] >= ( 2, 5 ):\n-    import xml.etree.cElementTree as ElementTree\n-else:\n-    from galaxy import eggs\n-    import pkg_resources; pkg_resources.require( "elementtree" )\n-    from elementtree import ElementTree\n-\n-def stop_err( msg ):\n-    sys.stderr.write("%s\\n" % msg)\n-    sys.exit(1)\n-\n-#Parse Command Line\n-try:\n-    in_file, out_file, out_fmt = sys.argv[1:]\n-except:\n-    stop_err("Expect 3 arguments: input BLAST XML file, output tabular file, out format (std or ext)")\n-\n-if out_fmt == "std":\n-    extended = False\n-elif out_fmt == "x22":\n-    stop_err("Format argument x22 has been replaced with ext (extended 24 columns)")\n-elif out_fmt == "ext":\n-    extended = True\n-else:\n-    stop_err("Format argument should be std (12 column) or ext (extended 24 columns)")\n-\n-\n-# get an iterable\n-try: \n-    context = ElementTree.iterparse(in_file, events=("start", "end"))\n-except:\n-    stop_err("Invalid data format.")\n-# turn it into an 
iterator\n-context = iter(context)\n-# get the root element\n-try:\n-    event, root = context.next()\n-except:\n-    st'..b'")\n-                xx = sum(1 for q,h in zip(q_seq, h_seq) if q=="X" and h=="X")\n-                if not (expected_mismatch - q_seq.count("X") <= int(mismatch) <= expected_mismatch + xx):\n-                    stop_err("%s vs %s mismatches, expected %i <= %i <= %i" \\\n-                             % (qseqid, sseqid, expected_mismatch - q_seq.count("X"),\n-                                int(mismatch), expected_mismatch))\n-\n-                #TODO - Remove this alternative identity calculation and test\n-                #once satisifed there are no problems\n-                expected_identity = sum(1 for q,h in zip(q_seq, h_seq) if q == h)\n-                if not (expected_identity - xx <= int(nident) <= expected_identity + q_seq.count("X")):\n-                    stop_err("%s vs %s identities, expected %i <= %i <= %i" \\\n-                             % (qseqid, sseqid, expected_identity, int(nident),\n-                                expected_identity + q_seq.count("X")))\n-                \n-\n-                evalue = hsp.findtext("Hsp_evalue")\n-                if evalue == "0":\n-                    evalue = "0.0"\n-                else:\n-                    evalue = "%0.0e" % float(evalue)\n-                \n-                bitscore = float(hsp.findtext("Hsp_bit-score"))\n-                if bitscore < 100:\n-                    #Seems to show one decimal place for lower scores\n-                    bitscore = "%0.1f" % bitscore\n-                else:\n-                    #Note BLAST does not round to nearest int, it truncates\n-                    bitscore = "%i" % bitscore\n-\n-                values = [qseqid,\n-                          sseqid,\n-                          pident,\n-                          length, #hsp.findtext("Hsp_align-len")\n-                          str(mismatch),\n-                          
gapopen,\n-                          hsp.findtext("Hsp_query-from"), #qstart,\n-                          hsp.findtext("Hsp_query-to"), #qend,\n-                          hsp.findtext("Hsp_hit-from"), #sstart,\n-                          hsp.findtext("Hsp_hit-to"), #send,\n-                          evalue, #hsp.findtext("Hsp_evalue") in scientific notation\n-                          bitscore, #hsp.findtext("Hsp_bit-score") rounded\n-                          ]\n-\n-                if extended:\n-                    sallseqid = ";".join(name.split(None,1)[0] for name in hit_def.split(">"))\n-                    #print hit_def, "-->", sallseqid\n-                    positive = hsp.findtext("Hsp_positive")\n-                    ppos = "%0.2f" % (100*float(positive)/float(length))\n-                    qframe = hsp.findtext("Hsp_query-frame")\n-                    sframe = hsp.findtext("Hsp_hit-frame")\n-                    if blast_program == "blastp":\n-                        #Probably a bug in BLASTP that they use 0 or 1 depending on format\n-                        if qframe == "0": qframe = "1"\n-                        if sframe == "0": sframe = "1"\n-                    slen = int(hit.findtext("Hit_len"))\n-                    values.extend([sallseqid,\n-                                   hsp.findtext("Hsp_score"), #score,\n-                                   nident,\n-                                   positive,\n-                                   hsp.findtext("Hsp_gaps"), #gaps,\n-                                   ppos,\n-                                   qframe,\n-                                   sframe,\n-                                   #NOTE - for blastp, XML shows original seq, tabular uses XXX masking\n-                                   q_seq,\n-                                   h_seq,\n-                                   str(qlen),\n-                                   str(slen),\n-                                   ])\n-                
#print "\\t".join(values) \n-                outfile.write("\\t".join(values) + "\\n")\n-        # prevents ElementTree from growing large datastructure\n-        root.clear()\n-        elem.clear()\n-outfile.close()\n'
b
diff -r 45ba7c750bc8 -r 6ef523b390e0 tools/ncbi_blast_plus/blastxml_to_tabular.xml
--- a/tools/ncbi_blast_plus/blastxml_to_tabular.xml Thu Sep 20 10:12:43 2012 -0400
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
b
@@ -1,127 +0,0 @@
-<tool id="blastxml_to_tabular" name="BLAST XML to tabular" version="0.0.8">
-    <description>Convert BLAST XML output to tabular</description>
-    <command interpreter="python">
-      blastxml_to_tabular.py $blastxml_file $tabular_file $out_format
-    </command>
-    <inputs>
-        <param name="blastxml_file" type="data" format="blastxml" label="BLAST results as XML"/> 
-        <param name="out_format" type="select" label="Output format">
-            <option value="std" selected="True">Tabular (standard 12 columns)</option>
-            <option value="ext">Tabular (extended 24 columns)</option>
-        </param>
-    </inputs>
-    <outputs>
-        <data name="tabular_file" format="tabular" label="BLAST results as tabular" />
-    </outputs>
-    <requirements>
-    </requirements>
-    <tests>
-        <test>
-            <param name="blastxml_file" value="blastp_four_human_vs_rhodopsin.xml" ftype="blastxml" />
-            <param name="out_format" value="std" />
-            <!-- Note this has some white space differences from the actual blastp output blastp_four_human_vs_rhodopsin.tabular -->
-            <output name="tabular_file" file="blastp_four_human_vs_rhodopsin_converted.tabular" ftype="tabular" />
-        </test>
-        <test>
-            <param name="blastxml_file" value="blastp_four_human_vs_rhodopsin.xml" ftype="blastxml" />
-            <param name="out_format" value="ext" />
-            <!-- Note this has some white space differences from the actual blastp output blastp_four_human_vs_rhodopsin_ext.tabular -->
-            <output name="tabular_file" file="blastp_four_human_vs_rhodopsin_converted_ext.tabular" ftype="tabular" />
-        </test>
-        <test>
-            <param name="blastxml_file" value="blastp_sample.xml" ftype="blastxml" />
-            <param name="out_format" value="std" />
-            <!-- Note this has some white space differences from the actual blastp output -->
-            <output name="tabular_file" file="blastp_sample_converted.tabular" ftype="tabular" />
-        </test>
-        <test>
-            <param name="blastxml_file" value="blastx_rhodopsin_vs_four_human.xml" ftype="blastxml" />
-            <param name="out_format" value="std" />
-            <!-- Note this has some white space differences from the actual blastx output -->
-            <output name="tabular_file" file="blastx_rhodopsin_vs_four_human_converted.tabular" ftype="tabular" />
-        </test>
-        <test>
-            <param name="blastxml_file" value="blastx_rhodopsin_vs_four_human.xml" ftype="blastxml" />
-            <param name="out_format" value="ext" />
-            <!-- Note this has some white space and XXXX masking differences from the actual blastx output -->
-            <output name="tabular_file" file="blastx_rhodopsin_vs_four_human_converted_ext.tabular" ftype="tabular" />
-        </test>
-        <test>
-            <param name="blastxml_file" value="blastx_sample.xml" ftype="blastxml" />
-            <param name="out_format" value="std" />
-            <!-- Note this has some white space differences from the actual blastx output -->
-            <output name="tabular_file" file="blastx_sample_converted.tabular" ftype="tabular" />
-        </test>
-        <test>
-            <param name="blastxml_file" value="blastp_human_vs_pdb_seg_no.xml" ftype="blastxml" />
-            <param name="out_format" value="std" />
-            <!-- Note this has some white space differences from the actual blastp output -->
-            <output name="tabular_file" file="blastp_human_vs_pdb_seg_no_converted_std.tabular" ftype="tabular" />
-        </test>
-        <test>
-            <param name="blastxml_file" value="blastp_human_vs_pdb_seg_no.xml" ftype="blastxml" />
-            <param name="out_format" value="ext" />
-            <!-- Note this has some white space differences from the actual blastp output -->
-            <output name="tabular_file" file="blastp_human_vs_pdb_seg_no_converted_ext.tabular" ftype="tabular" />
-        </test>
-    </tests>
-    <help>
-    
-**What it does**
-
-NCBI BLAST+ (and the older NCBI 'legacy' BLAST) can output in a range of
-formats including tabular and a more detailed XML format. A complex workflow
-may need both the XML and the tabular output - but running BLAST twice is
-slow and wasteful.
-
-This tool takes the BLAST XML output and by default converts it into the
-standard 12 column tabular equivalent:
-
-====== ========= ============================================
-Column NCBI name Description
------- --------- --------------------------------------------
-     1 qseqid    Query Seq-id (ID of your sequence)
-     2 sseqid    Subject Seq-id (ID of the database hit)
-     3 pident    Percentage of identical matches
-     4 length    Alignment length
-     5 mismatch  Number of mismatches
-     6 gapopen   Number of gap openings
-     7 qstart    Start of alignment in query
-     8 qend      End of alignment in query
-     9 sstart    Start of alignment in subject (database hit)
-    10 send      End of alignment in subject (database hit)
-    11 evalue    Expectation value (E-value)
-    12 bitscore  Bit score
-====== ========= ============================================
-
-The BLAST+ tools can optionally output additional columns of information,
-but this takes longer to calculate. Most (but not all) of these columns are
-included by selecting the extended tabular output. The extra columns are
-included *after* the standard 12 columns. This is so that you can write
-workflow filtering steps that accept either the 12 or 24 column tabular
-BLAST output.
-
-====== ============= ===========================================
-Column NCBI name     Description
------- ------------- -------------------------------------------
-    13 sallseqid     All subject Seq-id(s), separated by a ';'
-    14 score         Raw score
-    15 nident        Number of identical matches
-    16 positive      Number of positive-scoring matches
-    17 gaps          Total number of gaps
-    18 ppos          Percentage of positive-scoring matches
-    19 qframe        Query frame
-    20 sframe        Subject frame
-    21 qseq          Aligned part of query sequence
-    22 sseq          Aligned part of subject sequence
-    23 qlen          Query sequence length
-    24 slen          Subject sequence length
-====== ============= ===========================================
-
-Beware that the XML file (and thus the conversion) and the tabular output
-direct from BLAST+ may differ in the presence of XXXX masking on regions
-of low complexity (columns 21 and 22), and thus also calculated figures like
-the percentage identity (column 3).
-
-    </help>
-</tool>
b
diff -r 45ba7c750bc8 -r 6ef523b390e0 tools/ncbi_blast_plus/ncbi_blast_plus.txt
--- a/tools/ncbi_blast_plus/ncbi_blast_plus.txt Thu Sep 20 10:12:43 2012 -0400
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
b
@@ -1,84 +0,0 @@
-Galaxy wrappers for NCBI BLAST+ suite
-=====================================
-
-These wrappers are copyright 2010-2012 by Peter Cock, The James Hutton Institute
-(formerly SCRI, Scottish Crop Research Institute), UK. All rights reserved.
-See the licence text below.
-
-Currently tested with NCBI BLAST 2.2.26+ (i.e. version 2.2.26 of BLAST+),
-and do not work with the NCBI 'legacy' BLAST suite (e.g. blastall).
-
-Note that these wrappers (and the associated datatypes) were originally
-distributed as part of the main Galaxy repository, but as of August 2012
-moved to the Galaxy Tool Shed as 'ncbi_blast_plus' (and 'blast_datatypes').
-My thanks to Dannon Baker from the Galaxy development team for his assistance
-with this.
-
-
-Manual Installation
-===================
-
-For those not using Galaxy's automated installation from the Tool Shed, put
-the XML and Python files under tools/ncbi_blast_plus and add the XML files
-to your tool_conf.xml as normal.
-
-You must tell Galaxy about any system level BLAST databases using configuration
-files blastdb.loc (nucleotide databases like NT) and blastdb_p.loc (protein
-databases like NR).
-
-You will also need to install 'blast_datatypes' from the Tool Shed. This
-defines the BLAST XML file format ('blastxml').
-
-
-History
-=======
-
-v0.0.11 - Final revision as part of the Galaxy main repository, and the
-          first release via the Tool Shed
-v0.0.12 - Implements genetic code option for translation searches.
-        - Changes <parallelism> to 1000 sequences at a time (to cope with
-          very large sets of queries where BLAST+ can become memory hungry)
-        - Include warning that BLAST+ with subject FASTA gives pairwise
-          e-values
-v0.0.13 - Use the new error handling options in Galaxy (the previously
-          bundled hide_stderr.py script is no longer needed).
-
-
-Developers
-==========
-
-This script and related tools are being developed on the following hg branch:
-http://bitbucket.org/peterjc/galaxy-central/src/tools
-
-For making the "Galaxy Tool Shed" http://community.g2.bx.psu.edu/ tarball I use
-the following command from the Galaxy root folder:
-
-$ ./tools/ncbi_blast_plus/make_ncbi_blast_plus.sh
-
-This simplifies ensuring a consistent set of files is bundled each time,
-including all the relevant test files.
-
-
-Licence (MIT/BSD style)
-=======================
-
-Permission to use, copy, modify, and distribute this software and its
-documentation with or without modifications and for any purpose and
-without fee is hereby granted, provided that any copyright notices
-appear in all copies and that both those copyright notices and this
-permission notice appear in supporting documentation, and that the
-names of the contributors or copyright holders not be used in
-advertising or publicity pertaining to distribution of the software
-without specific prior permission.
-
-THE CONTRIBUTORS AND COPYRIGHT HOLDERS OF THIS SOFTWARE DISCLAIM ALL
-WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING ALL IMPLIED
-WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT SHALL THE
-CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY SPECIAL, INDIRECT
-OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS
-OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
-OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE
-OR PERFORMANCE OF THIS SOFTWARE.
-
-NOTE: This is the licence for the Galaxy Wrapper only. BLAST+ and
-associated data files are available and licenced separately.
b
diff -r 45ba7c750bc8 -r 6ef523b390e0 tools/ncbi_blast_plus/ncbi_blastn_wrapper.xml
--- a/tools/ncbi_blast_plus/ncbi_blastn_wrapper.xml Thu Sep 20 10:12:43 2012 -0400
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
b
b'@@ -1,215 +0,0 @@\n-<tool id="ncbi_blastn_wrapper" name="NCBI BLAST+ blastn" version="0.0.13">\n-    <description>Search nucleotide database with nucleotide query sequence(s)</description>\n-    <!-- If job splitting is enabled, break up the query file into parts -->\n-    <parallelism method="multi" split_inputs="query" split_mode="to_size" split_size="1000" shared_inputs="subject" merge_outputs="output1"></parallelism>\n-    <version_command>blastn -version</version_command>\n-    <command>\n-## The command is a Cheetah template which allows some Python based syntax.\n-## Lines starting hash hash are comments. Galaxy will turn newlines into spaces\n-blastn\n--query "$query"\n-#if $db_opts.db_opts_selector == "db":\n-  -db "${db_opts.database.fields.path}"\n-#else:\n-  -subject "$db_opts.subject"\n-#end if\n--task $blast_type\n--evalue $evalue_cutoff\n--out $output1\n-##Set the extended list here so if/when we add things, saved workflows are not affected\n-#if str($out_format)=="ext":\n-    -outfmt "6 std sallseqid score nident positive gaps ppos qframe sframe qseq sseq qlen slen"\n-#else:\n-    -outfmt $out_format\n-#end if\n--num_threads 8\n-#if $adv_opts.adv_opts_selector=="advanced":\n-$adv_opts.filter_query\n-$adv_opts.strand\n-## Need int(str(...)) because $adv_opts.max_hits is an InputValueWrapper object not a string\n-## Note -max_target_seqs overrides -num_descriptions and -num_alignments\n-#if (str($adv_opts.max_hits) and int(str($adv_opts.max_hits)) > 0):\n--max_target_seqs $adv_opts.max_hits\n-#end if\n-#if (str($adv_opts.word_size) and int(str($adv_opts.word_size)) > 0):\n--word_size $adv_opts.word_size\n-#end if\n-$adv_opts.ungapped\n-$adv_opts.parse_deflines\n-## End of advanced options:\n-#end if\n-    </command>\n-    <stdio>\n-        <exit_code range="1:" />\n-\t<exit_code range="://0" />\n-    </stdio>\n-    <inputs>\n-        <param name="query" type="data" format="fasta" label="Nucleotide query sequence(s)"/> \n-        <conditional 
name="db_opts">\n-            <param name="db_opts_selector" type="select" label="Subject database/sequences">\n-              <option value="db" selected="True">BLAST Database</option>\n-              <option value="file">FASTA file (pairwise e-values)</option>\n-            </param>\n-            <when value="db">\n-                <param name="database" type="select" label="Nucleotide BLAST database">\n-                    <options from_file="blastdb.loc">\n-                      <column name="value" index="0"/>\n-                      <column name="name" index="1"/>\n-                      <column name="path" index="2"/>\n-                    </options>\n-                </param>\n-                <param name="subject" type="hidden" value="" /> \n-            </when>\n-            <when value="file">\n-                <param name="database" type="hidden" value="" /> \n-                <param name="subject" type="data" format="fasta" label="Nucleotide FASTA file to use as database"/> \n-            </when>\n-        </conditional>\n-        <param name="blast_type" type="select" display="radio" label="Type of BLAST">\n-            <option value="megablast">megablast</option>\n-            <option value="blastn">blastn</option>\n-            <option value="blastn-short">blastn-short</option>\n-            <option value="dc-megablast">dc-megablast</option>\n-            <!-- Using BLAST 2.2.24+ this gives an error:\n-            BLAST engine error: Program type \'vecscreen\' not supported\n-            <option value="vecscreen">vecscreen</option>\n-            -->\n-        </param>\n-        <param name="evalue_cutoff" type="float" size="15" value="0.001" label="Set expectation value cutoff" />\n-        <param name="out_format" type="select" label="Output format">\n-            <option value="6" selected="True">Tabular (standard 12 columns)</option>\n-            <option value="ext">Tabular (extended 24 columns)</option>\n-            <option value="5">BLAST 
XML</option>\n-            <option value="0">Pairwise text</option>\n-            <option value="0 -'..b'>\n-                <when input="out_format" value="0 -html" format="html"/>\n-                <when input="out_format" value="2" format="txt"/>\n-                <when input="out_format" value="2 -html" format="html"/>\n-                <when input="out_format" value="4" format="txt"/>\n-                <when input="out_format" value="4 -html" format="html"/>\n-                <when input="out_format" value="5" format="blastxml"/>\n-            </change_format>\n-        </data>\n-    </outputs>\n-    <requirements>\n-        <requirement type="binary">blastn</requirement>\n-    </requirements>\n-    <help>\n-    \n-.. class:: warningmark\n-\n-**Note**. Database searches may take a substantial amount of time.\n-For large input datasets it is advisable to allow overnight processing.  \n-\n------\n-\n-**What it does**\n-\n-Search a *nucleotide database* using a *nucleotide query*,\n-using the NCBI BLAST+ blastn command line tool.\n-Algorithms include blastn, megablast, and discontiguous megablast.\n-\n------\n-\n-**Output format**\n-\n-Because Galaxy focuses on processing tabular data, the default output of this\n-tool is tabular. 
The standard BLAST+ tabular output contains 12 columns:\n-\n-====== ========= ============================================\n-Column NCBI name Description\n------- --------- --------------------------------------------\n-     1 qseqid    Query Seq-id (ID of your sequence)\n-     2 sseqid    Subject Seq-id (ID of the database hit)\n-     3 pident    Percentage of identical matches\n-     4 length    Alignment length\n-     5 mismatch  Number of mismatches\n-     6 gapopen   Number of gap openings\n-     7 qstart    Start of alignment in query\n-     8 qend      End of alignment in query\n-     9 sstart    Start of alignment in subject (database hit)\n-    10 send      End of alignment in subject (database hit)\n-    11 evalue    Expectation value (E-value)\n-    12 bitscore  Bit score\n-====== ========= ============================================\n-\n-The BLAST+ tools can optionally output additional columns of information,\n-but this takes longer to calculate. Most (but not all) of these columns are\n-included by selecting the extended tabular output. The extra columns are\n-included *after* the standard 12 columns. 
This is so that you can write\n-workflow filtering steps that accept either the 12 or 24 column tabular\n-BLAST output.\n-\n-====== ============= ===========================================\n-Column NCBI name     Description\n------- ------------- -------------------------------------------\n-    13 sallseqid     All subject Seq-id(s), separated by a \';\'\n-    14 score         Raw score\n-    15 nident        Number of identical matches\n-    16 positive      Number of positive-scoring matches\n-    17 gaps          Total number of gaps\n-    18 ppos          Percentage of positive-scoring matches\n-    19 qframe        Query frame\n-    20 sframe        Subject frame\n-    21 qseq          Aligned part of query sequence\n-    22 sseq          Aligned part of subject sequence\n-    23 qlen          Query sequence length\n-    24 slen          Subject sequence length\n-====== ============= ===========================================\n-\n-The third option is BLAST XML output, which is designed to be parsed by\n-another program, and is understood by some Galaxy tools.\n-\n-You can also choose several plain text or HTML output formats which are designed to be read by a person (not by another program).\n-The HTML versions use basic webpage formatting and can include links to the hits on the NCBI website.\n-The pairwise output (the default on the NCBI BLAST website) shows each match as a pairwise alignment with the query.\n-The two query anchored outputs show a multiple sequence alignment between the query and all the matches,\n-and differ in how insertions are shown (marked as insertions or with gap characters added to the other sequences).\n-\n--------\n-\n-**References**\n-\n-Zhang et al. A Greedy Algorithm for Aligning DNA Sequences. 2000. JCB: 203-214.\n-\n-    </help>\n-</tool>\n'
b
diff -r 45ba7c750bc8 -r 6ef523b390e0 tools/ncbi_blast_plus/ncbi_blastp_wrapper.xml
--- a/tools/ncbi_blast_plus/ncbi_blastp_wrapper.xml Thu Sep 20 10:12:43 2012 -0400
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
b
b'@@ -1,282 +0,0 @@\n-<tool id="ncbi_blastp_wrapper" name="NCBI BLAST+ blastp" version="0.0.13">\n-    <description>Search protein database with protein query sequence(s)</description>\n-    <!-- If job splitting is enabled, break up the query file into parts -->\n-    <parallelism method="multi" split_inputs="query" split_mode="to_size" split_size="1000" shared_inputs="subject" merge_outputs="output1"></parallelism>\n-    <version_command>blastp -version</version_command>\n-    <command>\n-## The command is a Cheetah template which allows some Python based syntax.\n-## Lines starting hash hash are comments. Galaxy will turn newlines into spaces\n-blastp\n--query "$query"\n-#if $db_opts.db_opts_selector == "db":\n-  -db "${db_opts.database.fields.path}"\n-#else:\n-  -subject "$db_opts.subject"\n-#end if\n--task $blast_type\n--evalue $evalue_cutoff\n--out $output1\n-##Set the extended list here so if/when we add things, saved workflows are not affected\n-#if str($out_format)=="ext":\n-    -outfmt "6 std sallseqid score nident positive gaps ppos qframe sframe qseq sseq qlen slen"\n-#else:\n-    -outfmt $out_format\n-#end if\n--num_threads 8\n-#if $adv_opts.adv_opts_selector=="advanced":\n-$adv_opts.filter_query\n--matrix $adv_opts.matrix\n-## Need int(str(...)) because $adv_opts.max_hits is an InputValueWrapper object not a string\n-## Note -max_target_seqs overrides -num_descriptions and -num_alignments\n-#if (str($adv_opts.max_hits) and int(str($adv_opts.max_hits)) > 0):\n--max_target_seqs $adv_opts.max_hits\n-#end if\n-#if (str($adv_opts.word_size) and int(str($adv_opts.word_size)) > 0):\n--word_size $adv_opts.word_size\n-#end if\n-##Ungapped disabled for now - see comments below\n-##$adv_opts.ungapped\n-$adv_opts.parse_deflines\n-## End of advanced options:\n-#end if\n-    </command>\n-    <stdio>\n-        <exit_code range="1:" />\n-\t<exit_code range="://0" />\n-    </stdio>\n-    <inputs>\n-        <param name="query" type="data" format="fasta" label="Protein 
query sequence(s)"/> \n-        <conditional name="db_opts">\n-            <param name="db_opts_selector" type="select" label="Subject database/sequences">\n-              <option value="db" selected="True">BLAST Database</option>\n-              <option value="file">FASTA file (pairwise e-values)</option>\n-            </param>\n-            <when value="db">\n-                <param name="database" type="select" label="Protein BLAST database">\n-                    <options from_file="blastdb_p.loc">\n-                      <column name="value" index="0"/>\n-                      <column name="name" index="1"/>\n-                      <column name="path" index="2"/>\n-                    </options>\n-                </param>\n-                <param name="subject" type="hidden" value="" /> \n-            </when>\n-            <when value="file">\n-                <param name="database" type="hidden" value="" /> \n-                <param name="subject" type="data" format="fasta" label="Protein FASTA file to use as database"/> \n-            </when>\n-        </conditional>\n-        <param name="blast_type" type="select" display="radio" label="Type of BLAST">\n-            <option value="blastp">blastp</option>\n-            <option value="blastp-short">blastp-short</option>\n-        </param>\n-        <param name="evalue_cutoff" type="float" size="15" value="0.001" label="Set expectation value cutoff" />\n-        <param name="out_format" type="select" label="Output format">\n-            <option value="6" selected="True">Tabular (standard 12 columns)</option>\n-            <option value="ext">Tabular (extended 24 columns)</option>\n-            <option value="5">BLAST XML</option>\n-            <option value="0">Pairwise text</option>\n-            <option value="0 -html">Pairwise HTML</option>\n-            <option value="2">Query-anchored text</option>\n-            <option value="2 -html">Query-anchored HTML</option>\n-            <option value="4">Flat 
query-anchored text</option>\n-            <option value="4 -html">Flat query-anchored HTML'..b'.fasta" ftype="fasta" />\n-            <param name="database" value="" />\n-            <param name="evalue_cutoff" value="1e-8" />\n-            <param name="blast_type" value="blastp" />\n-            <param name="out_format" value="6" />\n-            <param name="adv_opts_selector" value="basic" />\n-            <output name="output1" file="blastp_rhodopsin_vs_four_human.tabular" ftype="tabular" />\n-        </test>\n-    </tests>\n-    <help>\n-    \n-.. class:: warningmark\n-\n-**Note**. Database searches may take a substantial amount of time.\n-For large input datasets it is advisable to allow overnight processing.  \n-\n------\n-\n-**What it does**\n-\n-Search a *protein database* using a *protein query*,\n-using the NCBI BLAST+ blastp command line tool.\n-\n------\n-\n-**Output format**\n-\n-Because Galaxy focuses on processing tabular data, the default output of this\n-tool is tabular. The standard BLAST+ tabular output contains 12 columns:\n-\n-====== ========= ============================================\n-Column NCBI name Description\n------- --------- --------------------------------------------\n-     1 qseqid    Query Seq-id (ID of your sequence)\n-     2 sseqid    Subject Seq-id (ID of the database hit)\n-     3 pident    Percentage of identical matches\n-     4 length    Alignment length\n-     5 mismatch  Number of mismatches\n-     6 gapopen   Number of gap openings\n-     7 qstart    Start of alignment in query\n-     8 qend      End of alignment in query\n-     9 sstart    Start of alignment in subject (database hit)\n-    10 send      End of alignment in subject (database hit)\n-    11 evalue    Expectation value (E-value)\n-    12 bitscore  Bit score\n-====== ========= ============================================\n-\n-The BLAST+ tools can optionally output additional columns of information,\n-but this takes longer to calculate. 
Most (but not all) of these columns are\n-included by selecting the extended tabular output. The extra columns are\n-included *after* the standard 12 columns. This is so that you can write\n-workflow filtering steps that accept either the 12 or 24 column tabular\n-BLAST output.\n-\n-====== ============= ===========================================\n-Column NCBI name     Description\n------- ------------- -------------------------------------------\n-    13 sallseqid     All subject Seq-id(s), separated by a \';\'\n-    14 score         Raw score\n-    15 nident        Number of identical matches\n-    16 positive      Number of positive-scoring matches\n-    17 gaps          Total number of gaps\n-    18 ppos          Percentage of positive-scoring matches\n-    19 qframe        Query frame\n-    20 sframe        Subject frame\n-    21 qseq          Aligned part of query sequence\n-    22 sseq          Aligned part of subject sequence\n-    23 qlen          Query sequence length\n-    24 slen          Subject sequence length\n-====== ============= ===========================================\n-\n-The third option is BLAST XML output, which is designed to be parsed by\n-another program, and is understood by some Galaxy tools.\n-\n-You can also choose several plain text or HTML output formats which are designed to be read by a person (not by another program).\n-The HTML versions use basic webpage formatting and can include links to the hits on the NCBI website.\n-The pairwise output (the default on the NCBI BLAST website) shows each match as a pairwise alignment with the query.\n-The two query anchored outputs show a multiple sequence alignment between the query and all the matches,\n-and differ in how insertions are shown (marked as insertions or with gap characters added to the other sequences).\n-\n--------\n-\n-**References**\n-\n-Altschul et al. Gapped BLAST and PSI-BLAST: a new generation of protein database search programs. 1997. Nucleic Acids Res. 
25:3389-3402.\n-\n-Schaffer et al. Improving the accuracy of PSI-BLAST protein database searches with composition-based statistics and other refinements. 2001. Nucleic Acids Res. 29:2994-3005.\n-\n-    </help>\n-</tool>\n'
b
diff -r 45ba7c750bc8 -r 6ef523b390e0 tools/ncbi_blast_plus/ncbi_blastx_wrapper.xml
--- a/tools/ncbi_blast_plus/ncbi_blastx_wrapper.xml Thu Sep 20 10:12:43 2012 -0400
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
b
b'@@ -1,268 +0,0 @@\n-<tool id="ncbi_blastx_wrapper" name="NCBI BLAST+ blastx" version="0.0.13">\n-    <description>Search protein database with translated nucleotide query sequence(s)</description>\n-    <!-- If job splitting is enabled, break up the query file into parts -->\n-    <parallelism method="multi" split_inputs="query" split_mode="to_size" split_size="1000" shared_inputs="subject" merge_outputs="output1"></parallelism>\n-    <version_command>blastx -version</version_command>\n-    <command>\n-## The command is a Cheetah template which allows some Python based syntax.\n-## Lines starting hash hash are comments. Galaxy will turn newlines into spaces\n-blastx\n--query "$query"\n-#if $db_opts.db_opts_selector == "db":\n-  -db "${db_opts.database.fields.path}"\n-#else:\n-  -subject "$db_opts.subject"\n-#end if\n--query_gencode $query_gencode\n--evalue $evalue_cutoff\n--out $output1\n-##Set the extended list here so if/when we add things, saved workflows are not affected\n-#if str($out_format)=="ext":\n-    -outfmt "6 std sallseqid score nident positive gaps ppos qframe sframe qseq sseq qlen slen"\n-#else:\n-    -outfmt $out_format\n-#end if\n--num_threads 8\n-#if $adv_opts.adv_opts_selector=="advanced":\n-$adv_opts.filter_query\n-$adv_opts.strand\n--matrix $adv_opts.matrix\n-## Need int(str(...)) because $adv_opts.max_hits is an InputValueWrapper object not a string\n-## Note -max_target_seqs overrides -num_descriptions and -num_alignments\n-#if (str($adv_opts.max_hits) and int(str($adv_opts.max_hits)) > 0):\n--max_target_seqs $adv_opts.max_hits\n-#end if\n-#if (str($adv_opts.word_size) and int(str($adv_opts.word_size)) > 0):\n--word_size $adv_opts.word_size\n-#end if\n-$adv_opts.ungapped\n-$adv_opts.parse_deflines\n-## End of advanced options:\n-#end if\n-    </command>\n-    <stdio>\n-        <exit_code range="1:" />\n-        <exit_code range="://0" />\n-    </stdio>\n-    <inputs>\n-        <param name="query" type="data" format="fasta" label="Nucleotide 
query sequence(s)"/> \n-        <conditional name="db_opts">\n-            <param name="db_opts_selector" type="select" label="Subject database/sequences">\n-              <option value="db" selected="True">BLAST Database</option>\n-              <option value="file">FASTA file (pairwise e-values)</option>\n-            </param>\n-            <when value="db">\n-                <param name="database" type="select" label="Protein BLAST database">\n-                    <options from_file="blastdb_p.loc">\n-                      <column name="value" index="0"/>\n-                      <column name="name" index="1"/>\n-                      <column name="path" index="2"/>\n-                    </options>\n-                </param>\n-                <param name="subject" type="hidden" value="" /> \n-            </when>\n-            <when value="file">\n-                <param name="database" type="hidden" value="" /> \n-                <param name="subject" type="data" format="fasta" label="Protein FASTA file to use as database"/> \n-            </when>\n-        </conditional>\n-        <param name="query_gencode" type="select" label="Query genetic code">\n-            <!-- See http://www.ncbi.nlm.nih.gov/Taxonomy/Utils/wprintgc.cgi for details -->\n-            <option value="1" select="True">1. Standard</option>\n-            <option value="2">2. Vertebrate Mitochondrial</option>\n-            <option value="3">3. Yeast Mitochondrial</option>\n-            <option value="4">4. Mold, Protozoan, and Coelenterate Mitochondrial Code and the Mycoplasma/Spiroplasma Code</option>\n-            <option value="5">5. Invertebrate Mitochondrial</option>\n-            <option value="6">6. Ciliate, Dasycladacean and Hexamita Nuclear Code</option>\n-            <option value="9">9. Echinoderm Mitochondrial</option>\n-            <option value="10">10. Euplotid Nuclear</option>\n-            <option value="11">11. Bacteria and Archaea</option>\n-            <option value="12">12. 
Alternative Yeast Nuclear</option>         \n-            <option value="13">13. As'..b'    <test>\n-            <param name="query" value="rhodopsin_nucs.fasta" ftype="fasta" />\n-            <param name="db_opts_selector" value="file" />\n-            <param name="subject" value="four_human_proteins.fasta" ftype="fasta" />\n-            <param name="database" value="" />\n-            <param name="evalue_cutoff" value="1e-10" />\n-            <param name="out_format" value="ext" />\n-            <param name="adv_opts_selector" value="basic" />\n-            <output name="output1" file="blastx_rhodopsin_vs_four_human_ext.tabular" ftype="tabular" />\n-        </test>\n-    </tests>\n-    <help>\n-    \n-.. class:: warningmark\n-\n-**Note**. Database searches may take a substantial amount of time.\n-For large input datasets it is advisable to allow overnight processing.  \n-\n------\n-\n-**What it does**\n-\n-Search a *protein database* using a *translated nucleotide query*,\n-using the NCBI BLAST+ blastx command line tool.\n-\n------\n-\n-**Output format**\n-\n-Because Galaxy focuses on processing tabular data, the default output of this\n-tool is tabular. 
The standard BLAST+ tabular output contains 12 columns:\n-\n-====== ========= ============================================\n-Column NCBI name Description\n------- --------- --------------------------------------------\n-     1 qseqid    Query Seq-id (ID of your sequence)\n-     2 sseqid    Subject Seq-id (ID of the database hit)\n-     3 pident    Percentage of identical matches\n-     4 length    Alignment length\n-     5 mismatch  Number of mismatches\n-     6 gapopen   Number of gap openings\n-     7 qstart    Start of alignment in query\n-     8 qend      End of alignment in query\n-     9 sstart    Start of alignment in subject (database hit)\n-    10 send      End of alignment in subject (database hit)\n-    11 evalue    Expectation value (E-value)\n-    12 bitscore  Bit score\n-====== ========= ============================================\n-\n-The BLAST+ tools can optionally output additional columns of information,\n-but this takes longer to calculate. Most (but not all) of these columns are\n-included by selecting the extended tabular output. The extra columns are\n-included *after* the standard 12 columns. 
This is so that you can write\n-workflow filtering steps that accept either the 12 or 24 column tabular\n-BLAST output.\n-\n-====== ============= ===========================================\n-Column NCBI name     Description\n------- ------------- -------------------------------------------\n-    13 sallseqid     All subject Seq-id(s), separated by a \';\'\n-    14 score         Raw score\n-    15 nident        Number of identical matches\n-    16 positive      Number of positive-scoring matches\n-    17 gaps          Total number of gaps\n-    18 ppos          Percentage of positive-scoring matches\n-    19 qframe        Query frame\n-    20 sframe        Subject frame\n-    21 qseq          Aligned part of query sequence\n-    22 sseq          Aligned part of subject sequence\n-    23 qlen          Query sequence length\n-    24 slen          Subject sequence length \n-====== ============= ===========================================\n-\n-The third option is BLAST XML output, which is designed to be parsed by\n-another program, and is understood by some Galaxy tools.\n-\n-You can also choose several plain text or HTML output formats which are designed to be read by a person (not by another program).\n-The HTML versions use basic webpage formatting and can include links to the hits on the NCBI website.\n-The pairwise output (the default on the NCBI BLAST website) shows each match as a pairwise alignment with the query.\n-The two query anchored outputs show a multiple sequence alignment between the query and all the matches,\n-and differ in how insertions are shown (marked as insertions or with gap characters added to the other sequences).\n-\n--------\n-\n-**References**\n-\n-Altschul et al. Gapped BLAST and PSI-BLAST: a new generation of protein database search programs. 1997. Nucleic Acids Res. 25:3389-3402.\n-\n-    </help>\n-</tool>\n'
b
diff -r 45ba7c750bc8 -r 6ef523b390e0 tools/ncbi_blast_plus/ncbi_tblastn_wrapper.xml
--- a/tools/ncbi_blast_plus/ncbi_tblastn_wrapper.xml Thu Sep 20 10:12:43 2012 -0400
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
b
b'@@ -1,314 +0,0 @@\n-<tool id="ncbi_tblastn_wrapper" name="NCBI BLAST+ tblastn" version="0.0.13">\n-    <description>Search translated nucleotide database with protein query sequence(s)</description>\n-    <!-- If job splitting is enabled, break up the query file into parts -->\n-    <parallelism method="multi" split_inputs="query" split_mode="to_size" split_size="1000" shared_inputs="subject" merge_outputs="output1"></parallelism>\n-    <version_command>tblastn -version</version_command>\n-    <command>\n-## The command is a Cheetah template which allows some Python based syntax.\n-## Lines starting hash hash are comments. Galaxy will turn newlines into spaces\n-tblastn\n--query "$query"\n-#if $db_opts.db_opts_selector == "db":\n-  -db "${db_opts.database.fields.path}"\n-#else:\n-  -subject "$db_opts.subject"\n-#end if\n--evalue $evalue_cutoff\n--out $output1\n-##Set the extended list here so if/when we add things, saved workflows are not affected\n-#if str($out_format)=="ext":\n-    -outfmt "6 std sallseqid score nident positive gaps ppos qframe sframe qseq sseq qlen slen"\n-#else:\n-    -outfmt $out_format\n-#end if\n--num_threads 8\n-#if $adv_opts.adv_opts_selector=="advanced":\n--db_gencode $adv_opts.db_gencode\n-$adv_opts.filter_query\n--matrix $adv_opts.matrix\n-## Need int(str(...)) because $adv_opts.max_hits is an InputValueWrapper object not a string\n-## Note -max_target_seqs overrides -num_descriptions and -num_alignments\n-#if (str($adv_opts.max_hits) and int(str($adv_opts.max_hits)) > 0):\n--max_target_seqs $adv_opts.max_hits\n-#end if\n-#if (str($adv_opts.word_size) and int(str($adv_opts.word_size)) > 0):\n--word_size $adv_opts.word_size\n-#end if\n-##Ungapped disabled for now - see comments below\n-##$adv_opts.ungapped\n-$adv_opts.parse_deflines\n-## End of advanced options:\n-#end if\n-    </command>\n-    <stdio>\n-        <exit_code range="1:" />\n-        <exit_code range="://0" />\n-    </stdio>\n-    <inputs>\n-        <param name="query" 
type="data" format="fasta" label="Protein query sequence(s)"/> \n-        <conditional name="db_opts">\n-            <param name="db_opts_selector" type="select" label="Subject database/sequences">\n-              <option value="db" selected="True">BLAST Database</option>\n-              <option value="file">FASTA file (pairwise e-values)</option>\n-            </param>\n-            <when value="db">\n-                <param name="database" type="select" label="Nucleotide BLAST database">\n-                    <options from_file="blastdb.loc">\n-                      <column name="value" index="0"/>\n-                      <column name="name" index="1"/>\n-                      <column name="path" index="2"/>\n-                    </options>\n-                </param>\n-                <param name="subject" type="hidden" value="" /> \n-            </when>\n-            <when value="file">\n-                <param name="database" type="hidden" value="" /> \n-                <param name="subject" type="data" format="fasta" label="Nucleotide FASTA file to use as database"/> \n-            </when>\n-        </conditional>\n-        <param name="evalue_cutoff" type="float" size="15" value="0.001" label="Set expectation value cutoff" />\n-        <param name="out_format" type="select" label="Output format">\n-            <option value="6" selected="True">Tabular (standard 12 columns)</option>\n-            <option value="ext">Tabular (extended 24 columns)</option>\n-            <option value="5">BLAST XML</option>\n-            <option value="0">Pairwise text</option>\n-            <option value="0 -html">Pairwise HTML</option>\n-            <option value="2">Query-anchored text</option>\n-            <option value="2 -html">Query-anchored HTML</option>\n-            <option value="4">Flat query-anchored text</option>\n-            <option value="4 -html">Flat query-anchored HTML</option>\n-            <!--\n-            <option value="-outfmt 11">BLAST archive format 
(ASN.1)</option>\n-            -->\n-        </param>\n-        <conditional name="adv_opts'..b'ase" value="" />\n-            <param name="evalue_cutoff" value="1e-10" />\n-            <param name="out_format" value="0 -html" />\n-            <param name="adv_opts_selector" value="advanced" />\n-            <param name="filter_query" value="false" />\n-            <param name="matrix" value="BLOSUM80" />\n-            <param name="max_hits" value="0" />\n-            <param name="word_size" value="0" />\n-            <param name="parse_deflines" value="false" />\n-            <output name="output1" file="tblastn_four_human_vs_rhodopsin.html" ftype="html" />\n-        </test>\n-    </tests>\n-    <help>\n-    \n-.. class:: warningmark\n-\n-**Note**. Database searches may take a substantial amount of time.\n-For large input datasets it is advisable to allow overnight processing.  \n-\n------\n-\n-**What it does**\n-\n-Search a *translated nucleotide database* using a *protein query*,\n-using the NCBI BLAST+ tblastn command line tool.\n-\n------\n-\n-**Output format**\n-\n-Because Galaxy focuses on processing tabular data, the default output of this\n-tool is tabular. 
The standard BLAST+ tabular output contains 12 columns:\n-\n-====== ========= ============================================\n-Column NCBI name Description\n------- --------- --------------------------------------------\n-     1 qseqid    Query Seq-id (ID of your sequence)\n-     2 sseqid    Subject Seq-id (ID of the database hit)\n-     3 pident    Percentage of identical matches\n-     4 length    Alignment length\n-     5 mismatch  Number of mismatches\n-     6 gapopen   Number of gap openings\n-     7 qstart    Start of alignment in query\n-     8 qend      End of alignment in query\n-     9 sstart    Start of alignment in subject (database hit)\n-    10 send      End of alignment in subject (database hit)\n-    11 evalue    Expectation value (E-value)\n-    12 bitscore  Bit score\n-====== ========= ============================================\n-\n-The BLAST+ tools can optionally output additional columns of information,\n-but this takes longer to calculate. Most (but not all) of these columns are\n-included by selecting the extended tabular output. The extra columns are\n-included *after* the standard 12 columns. 
This is so that you can write\n-workflow filtering steps that accept either the 12 or 24 column tabular\n-BLAST output.\n-\n-====== ============= ===========================================\n-Column NCBI name     Description\n------- ------------- -------------------------------------------\n-    13 sallseqid     All subject Seq-id(s), separated by a \';\'\n-    14 score         Raw score\n-    15 nident        Number of identical matches\n-    16 positive      Number of positive-scoring matches\n-    17 gaps          Total number of gaps\n-    18 ppos          Percentage of positive-scoring matches\n-    19 qframe        Query frame\n-    20 sframe        Subject frame\n-    21 qseq          Aligned part of query sequence\n-    22 sseq          Aligned part of subject sequence\n-    23 qlen          Query sequence length\n-    24 slen          Subject sequence length\n-====== ============= ===========================================\n-\n-The third option is BLAST XML output, which is designed to be parsed by\n-another program, and is understood by some Galaxy tools.\n-\n-You can also choose several plain text or HTML output formats which are designed to be read by a person (not by another program).\n-The HTML versions use basic webpage formatting and can include links to the hits on the NCBI website.\n-The pairwise output (the default on the NCBI BLAST website) shows each match as a pairwise alignment with the query.\n-The two query anchored outputs show a multiple sequence alignment between the query and all the matches,\n-and differ in how insertions are shown (marked as insertions or with gap characters added to the other sequences).\n-\n--------\n-\n-**References**\n-\n-Altschul et al. Gapped BLAST and PSI-BLAST: a new generation of protein database search programs. 1997. Nucleic Acids Res. 25:3389-3402.\n-\n-    </help>\n-</tool>\n'
b
diff -r 45ba7c750bc8 -r 6ef523b390e0 tools/ncbi_blast_plus/ncbi_tblastx_wrapper.xml
--- a/tools/ncbi_blast_plus/ncbi_tblastx_wrapper.xml Thu Sep 20 10:12:43 2012 -0400
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
b
b'@@ -1,256 +0,0 @@\n-<tool id="ncbi_tblastx_wrapper" name="NCBI BLAST+ tblastx" version="0.0.13">\n-    <description>Search translated nucleotide database with translated nucleotide query sequence(s)</description>\n-    <!-- If job splitting is enabled, break up the query file into parts -->\n-    <parallelism method="multi" split_inputs="query" split_mode="to_size" split_size="1000" shared_inputs="subject" merge_outputs="output1"></parallelism>\n-    <version_command>tblastx -version</version_command>\n-    <command>\n-## The command is a Cheetah template which allows some Python based syntax.\n-## Lines starting hash hash are comments. Galaxy will turn newlines into spaces\n-tblastx\n--query "$query"\n-#if $db_opts.db_opts_selector == "db":\n-  -db "${db_opts.database.fields.path}"\n-#else:\n-  -subject "$db_opts.subject"\n-#end if\n--query_gencode $query_gencode\n--evalue $evalue_cutoff\n--out $output1\n-##Set the extended list here so if/when we add things, saved workflows are not affected\n-#if str($out_format)=="ext":\n-    -outfmt "6 std sallseqid score nident positive gaps ppos qframe sframe qseq sseq qlen slen"\n-#else:\n-    -outfmt $out_format\n-#end if\n--num_threads 8\n-#if $adv_opts.adv_opts_selector=="advanced":\n--db_gencode $adv_opts.db_gencode\n-$adv_opts.filter_query\n-$adv_opts.strand\n--matrix $adv_opts.matrix\n-## Need int(str(...)) because $adv_opts.max_hits is an InputValueWrapper object not a string\n-## Note -max_target_seqs overrides -num_descriptions and -num_alignments\n-#if (str($adv_opts.max_hits) and int(str($adv_opts.max_hits)) > 0):\n--max_target_seqs $adv_opts.max_hits\n-#end if\n-#if (str($adv_opts.word_size) and int(str($adv_opts.word_size)) > 0):\n--word_size $adv_opts.word_size\n-#end if\n-$adv_opts.parse_deflines\n-## End of advanced options:\n-#end if\n-    </command>\n-    <stdio>\n-        <exit_code range="1:" />\n-        <exit_code range="://0" />\n-    </stdio>\n-    <inputs>\n-        <param name="query" type="data" 
format="fasta" label="Nucleotide query sequence(s)"/> \n-        <conditional name="db_opts">\n-            <param name="db_opts_selector" type="select" label="Subject database/sequences">\n-              <option value="db" selected="True">BLAST Database</option>\n-              <option value="file">FASTA file (pairwise e-values)</option>\n-            </param>\n-            <when value="db">\n-                <param name="database" type="select" label="Nucleotide BLAST database">\n-                    <options from_file="blastdb.loc">\n-                      <column name="value" index="0"/>\n-                      <column name="name" index="1"/>\n-                      <column name="path" index="2"/>\n-                    </options>\n-                </param>\n-                <param name="subject" type="hidden" value="" /> \n-            </when>\n-            <when value="file">\n-                <param name="database" type="hidden" value="" /> \n-                <param name="subject" type="data" format="fasta" label="Nucleotide FASTA file to use as database"/> \n-            </when>\n-        </conditional>\n-        <param name="query_gencode" type="select" label="Query genetic code">\n-            <!-- See http://www.ncbi.nlm.nih.gov/Taxonomy/Utils/wprintgc.cgi for details -->\n-            <option value="1" select="True">1. Standard</option>\n-            <option value="2">2. Vertebrate Mitochondrial</option>\n-            <option value="3">3. Yeast Mitochondrial</option>\n-            <option value="4">4. Mold, Protozoan, and Coelenterate Mitochondrial Code and the Mycoplasma/Spiroplasma Code</option>\n-            <option value="5">5. Invertebrate Mitochondrial</option>\n-            <option value="6">6. Ciliate, Dasycladacean and Hexamita Nuclear Code</option>\n-            <option value="9">9. Echinoderm Mitochondrial</option>\n-            <option value="10">10. Euplotid Nuclear</option>\n-            <option value="11">11. 
Bacteria and Archaea</option>\n-            <option value="12">12. Alternative Yeast Nuclear</option>\n-          '..b'/>\n-                <when input="out_format" value="0 -html" format="html"/>\n-                <when input="out_format" value="2" format="txt"/>\n-                <when input="out_format" value="2 -html" format="html"/>\n-                <when input="out_format" value="4" format="txt"/>\n-                <when input="out_format" value="4 -html" format="html"/>\n-                <when input="out_format" value="5" format="blastxml"/>\n-            </change_format>\n-        </data>\n-    </outputs>\n-    <requirements>\n-        <requirement type="binary">tblastx</requirement>\n-    </requirements>\n-    <help>\n-    \n-.. class:: warningmark\n-\n-**Note**. Database searches may take a substantial amount of time.\n-For large input datasets it is advisable to allow overnight processing.  \n-\n------\n-\n-**What it does**\n-\n-Search a *translated nucleotide database* using a *translated nucleotide query*,\n-using the NCBI BLAST+ tblastx command line tool.\n-\n------\n-\n-**Output format**\n-\n-Because Galaxy focuses on processing tabular data, the default output of this\n-tool is tabular. 
The standard BLAST+ tabular output contains 12 columns:\n-\n-====== ========= ============================================\n-Column NCBI name Description\n------- --------- --------------------------------------------\n-     1 qseqid    Query Seq-id (ID of your sequence)\n-     2 sseqid    Subject Seq-id (ID of the database hit)\n-     3 pident    Percentage of identical matches\n-     4 length    Alignment length\n-     5 mismatch  Number of mismatches\n-     6 gapopen   Number of gap openings\n-     7 qstart    Start of alignment in query\n-     8 qend      End of alignment in query\n-     9 sstart    Start of alignment in subject (database hit)\n-    10 send      End of alignment in subject (database hit)\n-    11 evalue    Expectation value (E-value)\n-    12 bitscore  Bit score\n-====== ========= ============================================\n-\n-The BLAST+ tools can optionally output additional columns of information,\n-but this takes longer to calculate. Most (but not all) of these columns are\n-included by selecting the extended tabular output. The extra columns are\n-included *after* the standard 12 columns. 
This is so that you can write\n-workflow filtering steps that accept either the 12 or 24 column tabular\n-BLAST output.\n-\n-====== ============= ===========================================\n-Column NCBI name     Description\n------- ------------- -------------------------------------------\n-    13 sallseqid     All subject Seq-id(s), separated by a \';\'\n-    14 score         Raw score\n-    15 nident        Number of identical matches\n-    16 positive      Number of positive-scoring matches\n-    17 gaps          Total number of gaps\n-    18 ppos          Percentage of positive-scoring matches\n-    19 qframe        Query frame\n-    20 sframe        Subject frame\n-    21 qseq          Aligned part of query sequence\n-    22 sseq          Aligned part of subject sequence\n-    23 qlen          Query sequence length\n-    24 slen          Subject sequence length\n-====== ============= ===========================================\n-\n-The third option is BLAST XML output, which is designed to be parsed by\n-another program, and is understood by some Galaxy tools.\n-\n-You can also choose several plain text or HTML output formats which are designed to be read by a person (not by another program).\n-The HTML versions use basic webpage formatting and can include links to the hits on the NCBI website.\n-The pairwise output (the default on the NCBI BLAST website) shows each match as a pairwise alignment with the query.\n-The two query anchored outputs show a multiple sequence alignment between the query and all the matches,\n-and differ in how insertions are shown (marked as insertions or with gap characters added to the other sequences).\n-\n--------\n-\n-**References**\n-\n-Altschul et al. Gapped BLAST and PSI-BLAST: a new generation of protein database search programs. 1997. Nucleic Acids Res. 25:3389-3402.\n-\n-    </help>\n-</tool>\n'
b
diff -r 45ba7c750bc8 -r 6ef523b390e0 tools/ncbi_blast_plus/tool_dependencies.xml
--- a/tools/ncbi_blast_plus/tool_dependencies.xml Thu Sep 20 10:12:43 2012 -0400
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
b
@@ -1,21 +0,0 @@
-<?xml version="1.0"?>
-<tool_dependency>
-    <package name="blast+" version="2.2.26+">
-        <install version="1.0">
-            <actions>
-                <action type="download_by_url">ftp://ftp.ncbi.nlm.nih.gov/blast/executables/blast+/2.2.26/ncbi-blast-2.2.26+-src.tar.gz</action>
-                <action type="shell_command">cd c++ &amp;&amp; ./configure --prefix=$INSTALL_DIR &amp;&amp; make &amp;&amp; make install</action>
-                <action type="set_environment">
-                    <environment_variable name="PATH" action="prepend_to">$INSTALL_DIR/bin</environment_variable>
-                </action>
-            </actions>
-        </install>
-        <readme>
-These links provide information for building the NCBI Blast+ package in most environments.
-
-System requirements
-http://blast.ncbi.nlm.nih.gov/Blast.cgi?CMD=Web&amp;PAGE_TYPE=BlastDocs&amp;DOC_TYPE=Download
-        </readme>
-    </package>
-</tool_dependency>
-