+
+
diff -r 10dce68b584b -r 45ba7c750bc8 test-data/tblastn_four_human_vs_rhodopsin.tabular
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/tblastn_four_human_vs_rhodopsin.tabular Thu Sep 20 10:12:43 2012 -0400
@@ -0,0 +1,10 @@
+sp|P08100|OPSD_HUMAN gi|57163782|ref|NM_001009242.1| 96.55 348 12 0 1 348 1 1044 0.0 732
+sp|P08100|OPSD_HUMAN gi|2734705|gb|U59921.1|BBU59921 84.80 342 51 1 1 341 42 1067 0.0 646
+sp|P08100|OPSD_HUMAN gi|283855845|gb|GQ290303.1| 93.24 74 5 0 239 312 3147 3368 1e-72 151
+sp|P08100|OPSD_HUMAN gi|283855845|gb|GQ290303.1| 91.53 59 5 0 177 235 2855 3031 1e-72 126
+sp|P08100|OPSD_HUMAN gi|283855845|gb|GQ290303.1| 96.40 111 4 0 11 121 1 333 1e-64 229
+sp|P08100|OPSD_HUMAN gi|283855845|gb|GQ290303.1| 93.22 59 4 0 119 177 1404 1580 1e-32 122
+sp|P08100|OPSD_HUMAN gi|283855845|gb|GQ290303.1| 88.46 26 3 0 312 337 4222 4299 6e-13 57.7
+sp|P08100|OPSD_HUMAN gi|283855822|gb|GQ290312.1| 95.09 326 16 0 11 336 1 978 0.0 658
+sp|P08100|OPSD_HUMAN gi|18148870|dbj|AB062417.1| 93.39 348 23 0 1 348 1 1044 0.0 711
+sp|P08100|OPSD_HUMAN gi|12583664|dbj|AB043817.1| 82.16 342 60 1 1 341 23 1048 0.0 626
diff -r 10dce68b584b -r 45ba7c750bc8 test-data/tblastn_four_human_vs_rhodopsin.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/tblastn_four_human_vs_rhodopsin.xml Thu Sep 20 10:12:43 2012 -0400
@@ -0,0 +1,722 @@
+
+
+
+ tblastn
+ TBLASTN 2.2.25+
+ Stephen F. Altschul, Thomas L. Madden, Alejandro A. Schäffer, Jinghui Zhang, Zheng Zhang, Webb Miller, and David J. Lipman (1997), "Gapped BLAST and PSI-BLAST: a new generation of protein database search programs", Nucleic Acids Res. 25:3389-3402.
+
+ Query_1
+ sp|Q9BS26|ERP44_HUMAN Endoplasmic reticulum resident protein 44 OS=Homo sapiens GN=ERP44 PE=1 SV=1
+ 406
+
+
+ BLOSUM80
+ 1e-10
+ 10
+ 1
+ F
+
+
+
+
+ 1
+ Query_1
+ sp|Q9BS26|ERP44_HUMAN Endoplasmic reticulum resident protein 44 OS=Homo sapiens GN=ERP44 PE=1 SV=1
+ 406
+
+
+
+ 0
+ 0
+ 19
+ 127710
+ 0.071
+ 0.299
+ 0.27
+
+
+ No hits found
+
+
+ 2
+ Query_1
+ sp|Q9BS26|ERP44_HUMAN Endoplasmic reticulum resident protein 44 OS=Homo sapiens GN=ERP44 PE=1 SV=1
+ 406
+
+
+
+ 0
+ 0
+ 19
+ 127710
+ 0.071
+ 0.299
+ 0.27
+
+
+ No hits found
+
+
+ 3
+ Query_1
+ sp|Q9BS26|ERP44_HUMAN Endoplasmic reticulum resident protein 44 OS=Homo sapiens GN=ERP44 PE=1 SV=1
+ 406
+
+
+
+ 0
+ 0
+ 19
+ 127710
+ 0.071
+ 0.299
+ 0.27
+
+
+ No hits found
+
+
+ 4
+ Query_1
+ sp|Q9BS26|ERP44_HUMAN Endoplasmic reticulum resident protein 44 OS=Homo sapiens GN=ERP44 PE=1 SV=1
+ 406
+
+
+
+ 0
+ 0
+ 19
+ 127710
+ 0.071
+ 0.299
+ 0.27
+
+
+ No hits found
+
+
+ 5
+ Query_1
+ sp|Q9BS26|ERP44_HUMAN Endoplasmic reticulum resident protein 44 OS=Homo sapiens GN=ERP44 PE=1 SV=1
+ 406
+
+
+
+ 0
+ 0
+ 19
+ 127710
+ 0.071
+ 0.299
+ 0.27
+
+
+ No hits found
+
+
+ 6
+ Query_1
+ sp|Q9BS26|ERP44_HUMAN Endoplasmic reticulum resident protein 44 OS=Homo sapiens GN=ERP44 PE=1 SV=1
+ 406
+
+
+
+ 0
+ 0
+ 19
+ 127710
+ 0.071
+ 0.299
+ 0.27
+
+
+ No hits found
+
+
+ 7
+ Query_2
+ sp|Q9NSY1|BMP2K_HUMAN BMP-2-inducible protein kinase OS=Homo sapiens GN=BMP2K PE=1 SV=2
+ 1161
+
+
+
+ 0
+ 0
+ 23
+ 370988
+ 0.071
+ 0.299
+ 0.27
+
+
+ No hits found
+
+
+ 8
+ Query_2
+ sp|Q9NSY1|BMP2K_HUMAN BMP-2-inducible protein kinase OS=Homo sapiens GN=BMP2K PE=1 SV=2
+ 1161
+
+
+
+ 0
+ 0
+ 23
+ 370988
+ 0.071
+ 0.299
+ 0.27
+
+
+ No hits found
+
+
+ 9
+ Query_2
+ sp|Q9NSY1|BMP2K_HUMAN BMP-2-inducible protein kinase OS=Homo sapiens GN=BMP2K PE=1 SV=2
+ 1161
+
+
+
+ 0
+ 0
+ 23
+ 370988
+ 0.071
+ 0.299
+ 0.27
+
+
+ No hits found
+
+
+ 10
+ Query_2
+ sp|Q9NSY1|BMP2K_HUMAN BMP-2-inducible protein kinase OS=Homo sapiens GN=BMP2K PE=1 SV=2
+ 1161
+
+
+
+ 0
+ 0
+ 23
+ 370988
+ 0.071
+ 0.299
+ 0.27
+
+
+ No hits found
+
+
+ 11
+ Query_2
+ sp|Q9NSY1|BMP2K_HUMAN BMP-2-inducible protein kinase OS=Homo sapiens GN=BMP2K PE=1 SV=2
+ 1161
+
+
+
+ 0
+ 0
+ 23
+ 370988
+ 0.071
+ 0.299
+ 0.27
+
+
+ No hits found
+
+
+ 12
+ Query_2
+ sp|Q9NSY1|BMP2K_HUMAN BMP-2-inducible protein kinase OS=Homo sapiens GN=BMP2K PE=1 SV=2
+ 1161
+
+
+
+ 0
+ 0
+ 23
+ 370988
+ 0.071
+ 0.299
+ 0.27
+
+
+ No hits found
+
+
+ 13
+ Query_3
+ sp|P06213|INSR_HUMAN Insulin receptor OS=Homo sapiens GN=INSR PE=1 SV=4
+ 1382
+
+
+
+ 0
+ 0
+ 24
+ 441350
+ 0.071
+ 0.299
+ 0.27
+
+
+ No hits found
+
+
+ 14
+ Query_3
+ sp|P06213|INSR_HUMAN Insulin receptor OS=Homo sapiens GN=INSR PE=1 SV=4
+ 1382
+
+
+
+ 0
+ 0
+ 24
+ 441350
+ 0.071
+ 0.299
+ 0.27
+
+
+ No hits found
+
+
+ 15
+ Query_3
+ sp|P06213|INSR_HUMAN Insulin receptor OS=Homo sapiens GN=INSR PE=1 SV=4
+ 1382
+
+
+
+ 0
+ 0
+ 24
+ 441350
+ 0.071
+ 0.299
+ 0.27
+
+
+ No hits found
+
+
+ 16
+ Query_3
+ sp|P06213|INSR_HUMAN Insulin receptor OS=Homo sapiens GN=INSR PE=1 SV=4
+ 1382
+
+
+
+ 0
+ 0
+ 24
+ 441350
+ 0.071
+ 0.299
+ 0.27
+
+
+ No hits found
+
+
+ 17
+ Query_3
+ sp|P06213|INSR_HUMAN Insulin receptor OS=Homo sapiens GN=INSR PE=1 SV=4
+ 1382
+
+
+
+ 0
+ 0
+ 24
+ 441350
+ 0.071
+ 0.299
+ 0.27
+
+
+ No hits found
+
+
+ 18
+ Query_3
+ sp|P06213|INSR_HUMAN Insulin receptor OS=Homo sapiens GN=INSR PE=1 SV=4
+ 1382
+
+
+
+ 0
+ 0
+ 24
+ 441350
+ 0.071
+ 0.299
+ 0.27
+
+
+ No hits found
+
+
+ 19
+ Query_4
+ sp|P08100|OPSD_HUMAN Rhodopsin OS=Homo sapiens GN=RHO PE=1 SV=1
+ 348
+
+
+ 1
+ Subject_1
+ gi|57163782|ref|NM_001009242.1| Felis catus rhodopsin (RHO), mRNA
+ Subject_1
+ 1047
+
+
+ 1
+ 732.392902459534
+ 1689
+ 0
+ 1
+ 348
+ 1
+ 1044
+ 0
+ 1
+ 336
+ 343
+ 0
+ 348
+ MNGTEGPNFYVPFSNATGVVRSPFEYPQYYLAEPWQFSMLAAYMFLLIVLGFPINFLTLYVTVQHKKLRTPLNYILLNLAVADLFMVLGGFTSTLYTSLHGYFVFGPTGCNLEGFFATLGGEIALWSLVVLAIERYVVVCKPMSNFRFGENHAIMGVAFTWVMALACAAPPLAGWSRYIPEGLQCSCGIDYYTLKPEVNNESFVIYMFVVHFTIPMIIIFFCYGQLVFTVKEAAAQQQESATTQKAEKEVTRMVIIMVIAFLICWVPYASVAFYIFTHQGSNFGPIFMTIPAFFAKSAAIYNPVIYIMMNKQFRNCMLTTICCGKNPLGDDEASATVSKTETSQVAPA
+ MNGTEGPNFYVPFSNKTGVVRSPFEYPQYYLAEPWQFSMLAAYMFLLIVLGFPINFLTLYVTVQHKKLRTPLNYILLNLAVADLFMVFGGFTTTLYTSLHGYFVFGPTGCNLEGFFATLGGEIALWSLVVLAIERYVVVCKPMSNFRFGENHAIMGVAFTWVMALACAAPPLVGWSRYIPEGMQCSCGIDYYTLKPEVNNESFVIYMFVVHFTIPMIVIFFCYGQLVFTVKEAAAQQQESATTQKAEKEVTRMVIIMVIAFLICWVPYASVAFYIFTHQGSNFGPIFMTLPAFFAKSSSIYNPVIYIMMNKQFRNCMLTTLCCGKNPLGDDEASTTGSKTETSQVAPA
+ MNGTEGPNFYVPFSN TGVVRSPFEYPQYYLAEPWQFSMLAAYMFLLIVLGFPINFLTLYVTVQHKKLRTPLNYILLNLAVADLFMV GGFT+TLYTSLHGYFVFGPTGCNLEGFFATLGGEIALWSLVVLAIERYVVVCKPMSNFRFGENHAIMGVAFTWVMALACAAPPL GWSRYIPEG+QCSCGIDYYTLKPEVNNESFVIYMFVVHFTIPMI+IFFCYGQLVFTVKEAAAQQQESATTQKAEKEVTRMVIIMVIAFLICWVPYASVAFYIFTHQGSNFGPIFMT+PAFFAKS++IYNPVIYIMMNKQFRNCMLTT+CCGKNPLGDDEAS T SKTETSQVAPA
+
+
+
+
+
+
+ 0
+ 0
+ 18
+ 109230
+ 0.071
+ 0.299
+ 0.27
+
+
+
+
+ 20
+ Query_4
+ sp|P08100|OPSD_HUMAN Rhodopsin OS=Homo sapiens GN=RHO PE=1 SV=1
+ 348
+
+
+ 1
+ Subject_2
+ gi|2734705|gb|U59921.1|BBU59921 Bufo bufo rhodopsin mRNA, complete cds
+ Subject_2
+ 1574
+
+
+ 1
+ 646.119739014374
+ 1489
+ 0
+ 1
+ 341
+ 42
+ 1067
+ 0
+ 3
+ 290
+ 320
+ 1
+ 342
+ MNGTEGPNFYVPFSNATGVVRSPFEYPQYYLAEPWQFSMLAAYMFLLIVLGFPINFLTLYVTVQHKKLRTPLNYILLNLAVADLFMVLGGFTSTLYTSLHGYFVFGPTGCNLEGFFATLGGEIALWSLVVLAIERYVVVCKPMSNFRFGENHAIMGVAFTWVMALACAAPPLAGWSRYIPEGLQCSCGIDYYTLKPEVNNESFVIYMFVVHFTIPMIIIFFCYGQLVFTVKEAAAQQQESATTQKAEKEVTRMVIIMVIAFLICWVPYASVAFYIFTHQGSNFGPIFMTIPAFFAKSAAIYNPVIYIMMNKQFRNCMLTTICCGKNPLGDDEA-SATVSKTE
+ MNGTEGPNFYIPMSNKTGVVRSPFEYPQYYLAEPWQYSILCAYMFLLILLGFPINFMTLYVTIQHKKLRTPLNYILLNLAFANHFMVLCGFTVTMYSSMNGYFILGATGCYVEGFFATLGGEIALWSLVVLAIERYVVVCKPMSNFRFSENHAVMGVAFTWIMALSCAVPPLLGWSRYIPEGMQCSCGVDYYTLKPEVNNESFVIYMFVVHFTIPLIIIFFCYGRLVCTVKEAAAQQQESATTQKAEKEVTRMVIIMVVFFLICWVPYASVAFFIFSNQGSEFGPIFMTVPAFFAKSSSIYNPVIYIMLNKQFRNCMITTLCCGKNPFGEDDASSAATSKTE
+ MNGTEGPNFY+P SN TGVVRSPFEYPQYYLAEPWQ+S+L AYMFLLI+LGFPINF+TLYVT+QHKKLRTPLNYILLNLA A+ FMVL GFT T+Y+S+ GYF+ G TGC +EGFFATLGGEIALWSLVVLAIERYVVVCKPMSNFRF ENHA+MGVAFTW+MAL+CA PPL GWSRYIPEG+QCSCG+DYYTLKPEVNNESFVIYMFVVHFTIP+IIIFFCYG+LV TVKEAAAQQQESATTQKAEKEVTRMVIIMV+ FLICWVPYASVAF+IF+ QGS FGPIFMT+PAFFAKS++IYNPVIYIM+NKQFRNCM+TT+CCGKNP G+D+A SA SKTE
+
+
+
+
+
+
+ 0
+ 0
+ 18
+ 109230
+ 0.071
+ 0.299
+ 0.27
+
+
+
+
+ 21
+ Query_4
+ sp|P08100|OPSD_HUMAN Rhodopsin OS=Homo sapiens GN=RHO PE=1 SV=1
+ 348
+
+
+ 1
+ Subject_3
+ gi|283855845|gb|GQ290303.1| Cynopterus brachyotis voucher 20020434 rhodopsin (RHO) gene, exons 1 through 5 and partial cds
+ Subject_3
+ 4301
+
+
+ 1
+ 151.343146656381
+ 342
+ 1.39566684546685e-72
+ 239
+ 312
+ 3147
+ 3368
+ 0
+ 3
+ 69
+ 73
+ 0
+ 74
+ ESATTQKAEKEVTRMVIIMVIAFLICWVPYASVAFYIFTHQGSNFGPIFMTIPAFFAKSAAIYNPVIYIMMNKQ
+ ESATTQKAEKEVTRMVIIMVIAFLICWLPYAGVAFYIFTHQGSNFGPIFMTLPAFFAKSSSIYNPVIYIMMNKQ
+ ESATTQKAEKEVTRMVIIMVIAFLICW+PYA VAFYIFTHQGSNFGPIFMT+PAFFAKS++IYNPVIYIMMNKQ
+
+
+ 2
+ 126.323929257285
+ 284
+ 1.39566684546685e-72
+ 177
+ 235
+ 2855
+ 3031
+ 0
+ 2
+ 54
+ 57
+ 0
+ 59
+ RYIPEGLQCSCGIDYYTLKPEVNNESFVIYMFVVHFTIPMIIIFFCYGQLVFTVKEAAA
+ RYIPEGMQCSCGIDYYTLKPEVNNESFVIYMFVVHFTIPMIVIFFCYGQLVFTVKEVRS
+ RYIPEG+QCSCGIDYYTLKPEVNNESFVIYMFVVHFTIPMI+IFFCYGQLVFTVKE +
+
+
+ 3
+ 229.420359574251
+ 523
+ 9.84654801241353e-65
+ 11
+ 121
+ 1
+ 333
+ 0
+ 1
+ 107
+ 109
+ 0
+ 111
+ VPFSNATGVVRSPFEYPQYYLAEPWQFSMLAAYMFLLIVLGFPINFLTLYVTVQHKKLRTPLNYILLNLAVADLFMVLGGFTSTLYTSLHGYFVFGPTGCNLEGFFATLGG
+ VPFSNKTGVVRSPFEHPQYYLAEPWQFSMLAAYMFLLIVLGFPINFLTLYVTVQHKKLRTPLNYILLNLAVADLFMVFGGFTTTLYTSLHGYFVFGPTGCNLEGFFATLGG
+ VPFSN TGVVRSPFE+PQYYLAEPWQFSMLAAYMFLLIVLGFPINFLTLYVTVQHKKLRTPLNYILLNLAVADLFMV GGFT+TLYTSLHGYFVFGPTGCNLEGFFATLGG
+
+
+ 4
+ 122.873002719478
+ 276
+ 1.40732096096596e-32
+ 119
+ 177
+ 1404
+ 1580
+ 0
+ 3
+ 55
+ 56
+ 0
+ 59
+ LGGEIALWSLVVLAIERYVVVCKPMSNFRFGENHAIMGVAFTWVMALACAAPPLAGWSR
+ LAGEIALWSLVVLAIERYVVVCKPMSNFRFGENHAIMGLALTWVMALACAAPPLVGWSR
+ L GEIALWSLVVLAIERYVVVCKPMSNFRFGENHAIMG+A TWVMALACAAPPL GWSR
+
+
+ 5
+ 57.7367643183824
+ 125
+ 5.60065526485586e-13
+ 312
+ 337
+ 4222
+ 4299
+ 0
+ 1
+ 23
+ 24
+ 0
+ 26
+ QFRNCMLTTICCGKNPLGDDEASATV
+ QFRNCMLTTLCCGKNPLGDDEASTTA
+ QFRNCMLTT+CCGKNPLGDDEAS T
+
+
+
+
+
+
+ 0
+ 0
+ 18
+ 109230
+ 0.071
+ 0.299
+ 0.27
+
+
+
+
+ 22
+ Query_4
+ sp|P08100|OPSD_HUMAN Rhodopsin OS=Homo sapiens GN=RHO PE=1 SV=1
+ 348
+
+
+ 1
+ Subject_4
+ gi|283855822|gb|GQ290312.1| Myotis ricketti voucher GQX10 rhodopsin (RHO) mRNA, partial cds
+ Subject_4
+ 983
+
+
+ 1
+ 658.197981896696
+ 1517
+ 0
+ 11
+ 336
+ 1
+ 978
+ 0
+ 1
+ 310
+ 322
+ 0
+ 326
+ VPFSNATGVVRSPFEYPQYYLAEPWQFSMLAAYMFLLIVLGFPINFLTLYVTVQHKKLRTPLNYILLNLAVADLFMVLGGFTSTLYTSLHGYFVFGPTGCNLEGFFATLGGEIALWSLVVLAIERYVVVCKPMSNFRFGENHAIMGVAFTWVMALACAAPPLAGWSRYIPEGLQCSCGIDYYTLKPEVNNESFVIYMFVVHFTIPMIIIFFCYGQLVFTVKEAAAQQQESATTQKAEKEVTRMVIIMVIAFLICWVPYASVAFYIFTHQGSNFGPIFMTIPAFFAKSAAIYNPVIYIMMNKQFRNCMLTTICCGKNPLGDDEASAT
+ VPFSNKTGVVRSPFEYPQYYLAEPWQFSMLAAYMFLLIVLGFPINFLTLYVTVQHKKLRTPLNYILLNLAVANLFMVFGGFTTTLYTSMHGYFVFGATGCNLEGFFATLGGEIALWSLVVLAIERYVVVCKPMSNFRFGENHAIMGLAFTWVMALACAAPPLAGWSRYIPEGMQCSCGIDYYTLKPEVNNESFVIYMFVVHFTIPMIVIFFCYGQLVFTVKEAAAQQQESATTQKAEKEVTRMVIIMVVAFLICWLPYASVAFYIFTHQGSNFGPVFMTIPAFFAKSSSIYNPVIYIMMNKQFRNCMLTTLCCGKNPLGDDEASTT
+ VPFSN TGVVRSPFEYPQYYLAEPWQFSMLAAYMFLLIVLGFPINFLTLYVTVQHKKLRTPLNYILLNLAVA+LFMV GGFT+TLYTS+HGYFVFG TGCNLEGFFATLGGEIALWSLVVLAIERYVVVCKPMSNFRFGENHAIMG+AFTWVMALACAAPPLAGWSRYIPEG+QCSCGIDYYTLKPEVNNESFVIYMFVVHFTIPMI+IFFCYGQLVFTVKEAAAQQQESATTQKAEKEVTRMVIIMV+AFLICW+PYASVAFYIFTHQGSNFGP+FMTIPAFFAKS++IYNPVIYIMMNKQFRNCMLTT+CCGKNPLGDDEAS T
+
+
+
+
+
+
+ 0
+ 0
+ 18
+ 109230
+ 0.071
+ 0.299
+ 0.27
+
+
+
+
+ 23
+ Query_4
+ sp|P08100|OPSD_HUMAN Rhodopsin OS=Homo sapiens GN=RHO PE=1 SV=1
+ 348
+
+
+ 1
+ Subject_5
+ gi|18148870|dbj|AB062417.1| Synthetic construct Bos taurus gene for rhodopsin, complete cds
+ Subject_5
+ 1047
+
+
+ 1
+ 711.255977415469
+ 1640
+ 0
+ 1
+ 348
+ 1
+ 1044
+ 0
+ 1
+ 325
+ 337
+ 0
+ 348
+ MNGTEGPNFYVPFSNATGVVRSPFEYPQYYLAEPWQFSMLAAYMFLLIVLGFPINFLTLYVTVQHKKLRTPLNYILLNLAVADLFMVLGGFTSTLYTSLHGYFVFGPTGCNLEGFFATLGGEIALWSLVVLAIERYVVVCKPMSNFRFGENHAIMGVAFTWVMALACAAPPLAGWSRYIPEGLQCSCGIDYYTLKPEVNNESFVIYMFVVHFTIPMIIIFFCYGQLVFTVKEAAAQQQESATTQKAEKEVTRMVIIMVIAFLICWVPYASVAFYIFTHQGSNFGPIFMTIPAFFAKSAAIYNPVIYIMMNKQFRNCMLTTICCGKNPLGDDEASATVSKTETSQVAPA
+ MNGTEGPNFYVPFSNKTGVVRSPFEAPQYYLAEPWQFSMLAAYMFLLIMLGFPINFLTLYVTVQHKKLRTPLNYILLNLAVADLFMVFGGFTTTLYTSLHGYFVFGPTGCNLEGFFATLGGEIALWSLVVLAIERYVVVCKPMSNFRFGENHAIMGVAFTWVMALACAAPPLVGWSRYIPEGMQCSCGIDYYTPHEETNNESFVIYMFVVHFIIPLIVIFFCYGQLVFTVKEAAAQQQESATTQKAEKEVTRMVIIMVIAFLICWLPYAGVAFYIFTHQGSDFGPIFMTIPAFFAKTSAVYNPVIYIMMNKQFRNCMVTTLCCGKNPLGDDEASTTVSKTETSQVAPA
+ MNGTEGPNFYVPFSN TGVVRSPFE PQYYLAEPWQFSMLAAYMFLLI+LGFPINFLTLYVTVQHKKLRTPLNYILLNLAVADLFMV GGFT+TLYTSLHGYFVFGPTGCNLEGFFATLGGEIALWSLVVLAIERYVVVCKPMSNFRFGENHAIMGVAFTWVMALACAAPPL GWSRYIPEG+QCSCGIDYYT E NNESFVIYMFVVHF IP+I+IFFCYGQLVFTVKEAAAQQQESATTQKAEKEVTRMVIIMVIAFLICW+PYA VAFYIFTHQGS+FGPIFMTIPAFFAK++A+YNPVIYIMMNKQFRNCM+TT+CCGKNPLGDDEAS TVSKTETSQVAPA
+
+
+
+
+
+
+ 0
+ 0
+ 18
+ 109230
+ 0.071
+ 0.299
+ 0.27
+
+
+
+
+ 24
+ Query_4
+ sp|P08100|OPSD_HUMAN Rhodopsin OS=Homo sapiens GN=RHO PE=1 SV=1
+ 348
+
+
+ 1
+ Subject_6
+ gi|12583664|dbj|AB043817.1| Conger myriaster conf gene for fresh water form rod opsin, complete cds
+ Subject_6
+ 1344
+
+
+ 1
+ 626.708277239213
+ 1444
+ 0
+ 1
+ 341
+ 23
+ 1048
+ 0
+ 2
+ 281
+ 311
+ 1
+ 342
+ MNGTEGPNFYVPFSNATGVVRSPFEYPQYYLAEPWQFSMLAAYMFLLIVLGFPINFLTLYVTVQHKKLRTPLNYILLNLAVADLFMVLGGFTSTLYTSLHGYFVFGPTGCNLEGFFATLGGEIALWSLVVLAIERYVVVCKPMSNFRFGENHAIMGVAFTWVMALACAAPPLAGWSRYIPEGLQCSCGIDYYTLKPEVNNESFVIYMFVVHFTIPMIIIFFCYGQLVFTVKEAAAQQQESATTQKAEKEVTRMVIIMVIAFLICWVPYASVAFYIFTHQGSNFGPIFMTIPAFFAKSAAIYNPVIYIMMNKQFRNCMLTTICCGKNPL-GDDEASATVSKTE
+ MNGTEGPNFYIPMSNATGVVRSPFEYPQYYLAEPWAFSALSAYMFFLIIAGFPINFLTLYVTIEHKKLRTPLNYILLNLAVADLFMVFGGFTTTMYTSMHGYFVFGPTGCNIEGFFATLGGEIALWCLVVLAIERWMVVCKPVTNFRFGESHAIMGVMVTWTMALACALPPLFGWSRYIPEGLQCSCGIDYYTRAPGINNESFVIYMFTCHFSIPLAVISFCYGRLVCTVKEAAAQQQESETTQRAEREVTRMVVIMVISFLVCWVPYASVAWYIFTHQGSTFGPIFMTIPSFFAKSSALYNPMIYICMNKQFRHCMITTLCCGKNPFEEEDGASATSSKTE
+ MNGTEGPNFY+P SNATGVVRSPFEYPQYYLAEPW FS L+AYMF LI+ GFPINFLTLYVT++HKKLRTPLNYILLNLAVADLFMV GGFT+T+YTS+HGYFVFGPTGCN+EGFFATLGGEIALW LVVLAIER++VVCKP++NFRFGE HAIMGV TW MALACA PPL GWSRYIPEGLQCSCGIDYYT P +NNESFVIYMF HF+IP+ +I FCYG+LV TVKEAAAQQQES TTQ+AE+EVTRMV+IMVI+FL+CWVPYASVA YIFTHQGS FGPIFMTIP+FFAKS+A+YNP+IYI MNKQFR CM+TT+CCGKNP +D ASAT SKTE
+
+
+
+
+
+
+ 0
+ 0
+ 18
+ 109230
+ 0.071
+ 0.299
+ 0.27
+
+
+
+
+
diff -r 10dce68b584b -r 45ba7c750bc8 test-data/tblastn_four_human_vs_rhodopsin_ext.tabular
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/tblastn_four_human_vs_rhodopsin_ext.tabular Thu Sep 20 10:12:43 2012 -0400
@@ -0,0 +1,10 @@
+sp|P08100|OPSD_HUMAN gi|57163782|ref|NM_001009242.1| 96.55 348 12 0 1 348 1 1044 0.0 732 gi|57163782|ref|NM_001009242.1| 1689 336 343 0 98.56 0 1 MNGTEGPNFYVPFSNATGVVRSPFEYPQYYLAEPWQFSMLAAYMFLLIVLGFPINFLTLYVTVQHKKLRTPLNYILLNLAVADLFMVLGGFTSTLYTSLHGYFVFGPTGCNLEGFFATLGGEIALWSLVVLAIERYVVVCKPMSNFRFGENHAIMGVAFTWVMALACAAPPLAGWSRYIPEGLQCSCGIDYYTLKPEVNNESFVIYMFVVHFTIPMIIIFFCYGQLVFTVKEAAAQQQESATTQKAEKEVTRMVIIMVIAFLICWVPYASVAFYIFTHQGSNFGPIFMTIPAFFAKSAAIYNPVIYIMMNKQFRNCMLTTICCGKNPLGDDEASATVSKTETSQVAPA MNGTEGPNFYVPFSNKTGVVRSPFEYPQYYLAEPWQFSMLAAYMFLLIVLGFPINFLTLYVTVQHKKLRTPLNYILLNLAVADLFMVFGGFTTTLYTSLHGYFVFGPTGCNLEGFFATLGGEIALWSLVVLAIERYVVVCKPMSNFRFGENHAIMGVAFTWVMALACAAPPLVGWSRYIPEGMQCSCGIDYYTLKPEVNNESFVIYMFVVHFTIPMIVIFFCYGQLVFTVKEAAAQQQESATTQKAEKEVTRMVIIMVIAFLICWVPYASVAFYIFTHQGSNFGPIFMTLPAFFAKSSSIYNPVIYIMMNKQFRNCMLTTLCCGKNPLGDDEASTTGSKTETSQVAPA 348 1047
+sp|P08100|OPSD_HUMAN gi|2734705|gb|U59921.1|BBU59921 84.80 342 51 1 1 341 42 1067 0.0 646 gi|2734705|gb|U59921.1|BBU59921 1489 290 320 1 93.57 0 3 MNGTEGPNFYVPFSNATGVVRSPFEYPQYYLAEPWQFSMLAAYMFLLIVLGFPINFLTLYVTVQHKKLRTPLNYILLNLAVADLFMVLGGFTSTLYTSLHGYFVFGPTGCNLEGFFATLGGEIALWSLVVLAIERYVVVCKPMSNFRFGENHAIMGVAFTWVMALACAAPPLAGWSRYIPEGLQCSCGIDYYTLKPEVNNESFVIYMFVVHFTIPMIIIFFCYGQLVFTVKEAAAQQQESATTQKAEKEVTRMVIIMVIAFLICWVPYASVAFYIFTHQGSNFGPIFMTIPAFFAKSAAIYNPVIYIMMNKQFRNCMLTTICCGKNPLGDDEA-SATVSKTE MNGTEGPNFYIPMSNKTGVVRSPFEYPQYYLAEPWQYSILCAYMFLLILLGFPINFMTLYVTIQHKKLRTPLNYILLNLAFANHFMVLCGFTVTMYSSMNGYFILGATGCYVEGFFATLGGEIALWSLVVLAIERYVVVCKPMSNFRFSENHAVMGVAFTWIMALSCAVPPLLGWSRYIPEGMQCSCGVDYYTLKPEVNNESFVIYMFVVHFTIPLIIIFFCYGRLVCTVKEAAAQQQESATTQKAEKEVTRMVIIMVVFFLICWVPYASVAFFIFSNQGSEFGPIFMTVPAFFAKSSSIYNPVIYIMLNKQFRNCMITTLCCGKNPFGEDDASSAATSKTE 348 1574
+sp|P08100|OPSD_HUMAN gi|283855845|gb|GQ290303.1| 93.24 74 5 0 239 312 3147 3368 1e-72 151 gi|283855845|gb|GQ290303.1| 342 69 73 0 98.65 0 3 ESATTQKAEKEVTRMVIIMVIAFLICWVPYASVAFYIFTHQGSNFGPIFMTIPAFFAKSAAIYNPVIYIMMNKQ ESATTQKAEKEVTRMVIIMVIAFLICWLPYAGVAFYIFTHQGSNFGPIFMTLPAFFAKSSSIYNPVIYIMMNKQ 348 4301
+sp|P08100|OPSD_HUMAN gi|283855845|gb|GQ290303.1| 91.53 59 5 0 177 235 2855 3031 1e-72 126 gi|283855845|gb|GQ290303.1| 284 54 57 0 96.61 0 2 RYIPEGLQCSCGIDYYTLKPEVNNESFVIYMFVVHFTIPMIIIFFCYGQLVFTVKEAAA RYIPEGMQCSCGIDYYTLKPEVNNESFVIYMFVVHFTIPMIVIFFCYGQLVFTVKEVRS 348 4301
+sp|P08100|OPSD_HUMAN gi|283855845|gb|GQ290303.1| 96.40 111 4 0 11 121 1 333 1e-64 229 gi|283855845|gb|GQ290303.1| 523 107 109 0 98.20 0 1 VPFSNATGVVRSPFEYPQYYLAEPWQFSMLAAYMFLLIVLGFPINFLTLYVTVQHKKLRTPLNYILLNLAVADLFMVLGGFTSTLYTSLHGYFVFGPTGCNLEGFFATLGG VPFSNKTGVVRSPFEHPQYYLAEPWQFSMLAAYMFLLIVLGFPINFLTLYVTVQHKKLRTPLNYILLNLAVADLFMVFGGFTTTLYTSLHGYFVFGPTGCNLEGFFATLGG 348 4301
+sp|P08100|OPSD_HUMAN gi|283855845|gb|GQ290303.1| 93.22 59 4 0 119 177 1404 1580 1e-32 122 gi|283855845|gb|GQ290303.1| 276 55 56 0 94.92 0 3 LGGEIALWSLVVLAIERYVVVCKPMSNFRFGENHAIMGVAFTWVMALACAAPPLAGWSR LAGEIALWSLVVLAIERYVVVCKPMSNFRFGENHAIMGLALTWVMALACAAPPLVGWSR 348 4301
+sp|P08100|OPSD_HUMAN gi|283855845|gb|GQ290303.1| 88.46 26 3 0 312 337 4222 4299 6e-13 57.7 gi|283855845|gb|GQ290303.1| 125 23 24 0 92.31 0 1 QFRNCMLTTICCGKNPLGDDEASATV QFRNCMLTTLCCGKNPLGDDEASTTA 348 4301
+sp|P08100|OPSD_HUMAN gi|283855822|gb|GQ290312.1| 95.09 326 16 0 11 336 1 978 0.0 658 gi|283855822|gb|GQ290312.1| 1517 310 322 0 98.77 0 1 VPFSNATGVVRSPFEYPQYYLAEPWQFSMLAAYMFLLIVLGFPINFLTLYVTVQHKKLRTPLNYILLNLAVADLFMVLGGFTSTLYTSLHGYFVFGPTGCNLEGFFATLGGEIALWSLVVLAIERYVVVCKPMSNFRFGENHAIMGVAFTWVMALACAAPPLAGWSRYIPEGLQCSCGIDYYTLKPEVNNESFVIYMFVVHFTIPMIIIFFCYGQLVFTVKEAAAQQQESATTQKAEKEVTRMVIIMVIAFLICWVPYASVAFYIFTHQGSNFGPIFMTIPAFFAKSAAIYNPVIYIMMNKQFRNCMLTTICCGKNPLGDDEASAT VPFSNKTGVVRSPFEYPQYYLAEPWQFSMLAAYMFLLIVLGFPINFLTLYVTVQHKKLRTPLNYILLNLAVANLFMVFGGFTTTLYTSMHGYFVFGATGCNLEGFFATLGGEIALWSLVVLAIERYVVVCKPMSNFRFGENHAIMGLAFTWVMALACAAPPLAGWSRYIPEGMQCSCGIDYYTLKPEVNNESFVIYMFVVHFTIPMIVIFFCYGQLVFTVKEAAAQQQESATTQKAEKEVTRMVIIMVVAFLICWLPYASVAFYIFTHQGSNFGPVFMTIPAFFAKSSSIYNPVIYIMMNKQFRNCMLTTLCCGKNPLGDDEASTT 348 983
+sp|P08100|OPSD_HUMAN gi|18148870|dbj|AB062417.1| 93.39 348 23 0 1 348 1 1044 0.0 711 gi|18148870|dbj|AB062417.1| 1640 325 337 0 96.84 0 1 MNGTEGPNFYVPFSNATGVVRSPFEYPQYYLAEPWQFSMLAAYMFLLIVLGFPINFLTLYVTVQHKKLRTPLNYILLNLAVADLFMVLGGFTSTLYTSLHGYFVFGPTGCNLEGFFATLGGEIALWSLVVLAIERYVVVCKPMSNFRFGENHAIMGVAFTWVMALACAAPPLAGWSRYIPEGLQCSCGIDYYTLKPEVNNESFVIYMFVVHFTIPMIIIFFCYGQLVFTVKEAAAQQQESATTQKAEKEVTRMVIIMVIAFLICWVPYASVAFYIFTHQGSNFGPIFMTIPAFFAKSAAIYNPVIYIMMNKQFRNCMLTTICCGKNPLGDDEASATVSKTETSQVAPA MNGTEGPNFYVPFSNKTGVVRSPFEAPQYYLAEPWQFSMLAAYMFLLIMLGFPINFLTLYVTVQHKKLRTPLNYILLNLAVADLFMVFGGFTTTLYTSLHGYFVFGPTGCNLEGFFATLGGEIALWSLVVLAIERYVVVCKPMSNFRFGENHAIMGVAFTWVMALACAAPPLVGWSRYIPEGMQCSCGIDYYTPHEETNNESFVIYMFVVHFIIPLIVIFFCYGQLVFTVKEAAAQQQESATTQKAEKEVTRMVIIMVIAFLICWLPYAGVAFYIFTHQGSDFGPIFMTIPAFFAKTSAVYNPVIYIMMNKQFRNCMVTTLCCGKNPLGDDEASTTVSKTETSQVAPA 348 1047
+sp|P08100|OPSD_HUMAN gi|12583664|dbj|AB043817.1| 82.16 342 60 1 1 341 23 1048 0.0 626 gi|12583664|dbj|AB043817.1| 1444 281 311 1 90.94 0 2 MNGTEGPNFYVPFSNATGVVRSPFEYPQYYLAEPWQFSMLAAYMFLLIVLGFPINFLTLYVTVQHKKLRTPLNYILLNLAVADLFMVLGGFTSTLYTSLHGYFVFGPTGCNLEGFFATLGGEIALWSLVVLAIERYVVVCKPMSNFRFGENHAIMGVAFTWVMALACAAPPLAGWSRYIPEGLQCSCGIDYYTLKPEVNNESFVIYMFVVHFTIPMIIIFFCYGQLVFTVKEAAAQQQESATTQKAEKEVTRMVIIMVIAFLICWVPYASVAFYIFTHQGSNFGPIFMTIPAFFAKSAAIYNPVIYIMMNKQFRNCMLTTICCGKNPL-GDDEASATVSKTE MNGTEGPNFYIPMSNATGVVRSPFEYPQYYLAEPWAFSALSAYMFFLIIAGFPINFLTLYVTIEHKKLRTPLNYILLNLAVADLFMVFGGFTTTMYTSMHGYFVFGPTGCNIEGFFATLGGEIALWCLVVLAIERWMVVCKPVTNFRFGESHAIMGVMVTWTMALACALPPLFGWSRYIPEGLQCSCGIDYYTRAPGINNESFVIYMFTCHFSIPLAVISFCYGRLVCTVKEAAAQQQESETTQRAEREVTRMVVIMVISFLVCWVPYASVAWYIFTHQGSTFGPIFMTIPSFFAKSSALYNPMIYICMNKQFRHCMITTLCCGKNPFEEEDGASATSSKTE 348 1344
diff -r 10dce68b584b -r 45ba7c750bc8 test-data/tblastn_four_human_vs_rhodopsin_parse_deflines.tabular
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/tblastn_four_human_vs_rhodopsin_parse_deflines.tabular Thu Sep 20 10:12:43 2012 -0400
@@ -0,0 +1,10 @@
+sp|P08100|OPSD_HUMAN gi|57163782|ref|NM_001009242.1| 96.55 348 12 0 1 348 1 1044 0.0 732
+sp|P08100|OPSD_HUMAN gi|2734705|gb|U59921.1|BBU59921 84.80 342 51 1 1 341 42 1067 0.0 646
+sp|P08100|OPSD_HUMAN gi|283855845|gb|GQ290303.1| 93.24 74 5 0 239 312 3147 3368 1e-72 151
+sp|P08100|OPSD_HUMAN gi|283855845|gb|GQ290303.1| 91.53 59 5 0 177 235 2855 3031 1e-72 126
+sp|P08100|OPSD_HUMAN gi|283855845|gb|GQ290303.1| 96.40 111 4 0 11 121 1 333 1e-64 229
+sp|P08100|OPSD_HUMAN gi|283855845|gb|GQ290303.1| 93.22 59 4 0 119 177 1404 1580 1e-32 122
+sp|P08100|OPSD_HUMAN gi|283855845|gb|GQ290303.1| 88.46 26 3 0 312 337 4222 4299 6e-13 57.7
+sp|P08100|OPSD_HUMAN gi|283855822|gb|GQ290312.1| 95.09 326 16 0 11 336 1 978 0.0 658
+sp|P08100|OPSD_HUMAN gi|18148870|dbj|AB062417.1| 93.39 348 23 0 1 348 1 1044 0.0 711
+sp|P08100|OPSD_HUMAN gi|12583664|dbj|AB043817.1| 82.16 342 60 1 1 341 23 1048 0.0 626
diff -r 10dce68b584b -r 45ba7c750bc8 tools/ncbi_blast_plus/blastdb.loc.sample
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/ncbi_blast_plus/blastdb.loc.sample Thu Sep 20 10:12:43 2012 -0400
@@ -0,0 +1,38 @@
+#This is a sample file distributed with Galaxy that is used to define a
+#list of nucleotide BLAST databases, using three columns tab separated
+#(longer whitespace are TAB characters):
+#
+#<unique_id> <database_caption> <base_name_path>
+#
+#The captions typically contain spaces and might end with the build date.
+#It is important that the actual database name does not have a space in it,
+#and that the first tab that appears in the line is right before the path.
+#
+#So, for example, if your database is nt and the path to your base name
+#is /depot/data2/galaxy/blastdb/nt/nt.chunk, then the blastdb.loc entry
+#would look like this:
+#
+#nt_02_Dec_2009 nt 02 Dec 2009 /depot/data2/galaxy/blastdb/nt/nt.chunk
+#
+#and your /depot/data2/galaxy/blastdb/nt directory would contain all of
+#your "base names" (e.g.):
+#
+#-rw-r--r-- 1 wychung galaxy 23437408 2008-04-09 11:26 nt.chunk.00.nhr
+#-rw-r--r-- 1 wychung galaxy 3689920 2008-04-09 11:26 nt.chunk.00.nin
+#-rw-r--r-- 1 wychung galaxy 251215198 2008-04-09 11:26 nt.chunk.00.nsq
+#...etc...
+#
+#Your blastdb.loc file should include an entry per line for each "base name"
+#you have stored. For example:
+#
+#nt_02_Dec_2009 nt 02 Dec 2009 /depot/data2/galaxy/blastdb/nt/nt.chunk
+#wgs_30_Nov_2009 wgs 30 Nov 2009 /depot/data2/galaxy/blastdb/wgs/wgs.chunk
+#test_20_Sep_2008 test 20 Sep 2008 /depot/data2/galaxy/blastdb/test/test
+#...etc...
+#
+#See also blastdb_p.loc which is for any protein BLAST database.
+#
+#Note that for backwards compatibility with workflows, the unique ID of
+#an entry must be the path that was in the original loc file, because that
+#is the value stored in the workflow for that parameter.
+#
diff -r 10dce68b584b -r 45ba7c750bc8 tools/ncbi_blast_plus/blastdb_p.loc.sample
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/ncbi_blast_plus/blastdb_p.loc.sample Thu Sep 20 10:12:43 2012 -0400
@@ -0,0 +1,27 @@
+#This is a sample file distributed with Galaxy that is used to define a
+#list of protein BLAST databases, using three columns tab separated
+#(longer whitespace are TAB characters):
+#
+#<unique_id> <database_caption> <base_name_path>
+#
+#The captions typically contain spaces and might end with the build date.
+#It is important that the actual database name does not have a space in it,
+#and that the first tab that appears in the line is right before the path.
+#
+#So, for example, if your database is NR and the path to your base name
+#is /data/blastdb/nr, then the blastdb_p.loc entry would look like this:
+#
+#nr NCBI NR (non redundant) /data/blastdb/nr
+#
+#and your /data/blastdb directory would contain all of the files associated
+#with the database, /data/blastdb/nr.*.
+#
+#Your blastdb_p.loc file should include an entry per line for each "base name"
+#you have stored. For example:
+#
+#nr_05Jun2010 NCBI NR (non redundant) 05 Jun 2010 /data/blastdb/05Jun2010/nr
+#nr_15Aug2010 NCBI NR (non redundant) 15 Aug 2010 /data/blastdb/15Aug2010/nr
+#...etc...
+#
+#See also blastdb.loc which is for any nucleotide BLAST database.
+#
diff -r 10dce68b584b -r 45ba7c750bc8 tools/ncbi_blast_plus/blastxml_to_tabular.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/ncbi_blast_plus/blastxml_to_tabular.py Thu Sep 20 10:12:43 2012 -0400
@@ -0,0 +1,254 @@
+#!/usr/bin/env python
+"""Convert a BLAST XML file to 12 column (or extended 24 column) tabular output
+
+Takes three command line options, input BLAST XML filename, output tabular
+BLAST filename, output format (std for standard 12 columns, or ext for the
+extended 24 columns offered in the BLAST+ wrappers).
+
+The 12 columns output are 'qseqid sseqid pident length mismatch gapopen qstart
+qend sstart send evalue bitscore' or 'std' at the BLAST+ command line, which
+mean:
+
+====== ========= ============================================
+Column NCBI name Description
+------ --------- --------------------------------------------
+ 1 qseqid Query Seq-id (ID of your sequence)
+ 2 sseqid Subject Seq-id (ID of the database hit)
+ 3 pident Percentage of identical matches
+ 4 length Alignment length
+ 5 mismatch Number of mismatches
+ 6 gapopen Number of gap openings
+ 7 qstart Start of alignment in query
+ 8 qend End of alignment in query
+ 9 sstart Start of alignment in subject (database hit)
+ 10 send End of alignment in subject (database hit)
+ 11 evalue Expectation value (E-value)
+ 12 bitscore Bit score
+====== ========= ============================================
+
+The additional columns offered in the Galaxy BLAST+ wrappers are:
+
+====== ============= ===========================================
+Column NCBI name Description
+------ ------------- -------------------------------------------
+ 13 sallseqid All subject Seq-id(s), separated by a ';'
+ 14 score Raw score
+ 15 nident Number of identical matches
+ 16 positive Number of positive-scoring matches
+ 17 gaps Total number of gaps
+ 18 ppos Percentage of positive-scoring matches
+ 19 qframe Query frame
+ 20 sframe Subject frame
+ 21 qseq Aligned part of query sequence
+ 22 sseq Aligned part of subject sequence
+ 23 qlen Query sequence length
+ 24 slen Subject sequence length
+====== ============= ===========================================
+
+Most of these fields are given explicitly in the XML file, but some, like
+the percentage identity and the number of gap openings, must be calculated.
+
+Be aware that the sequence in the extended tabular output or XML direct from
+BLAST+ may or may not use XXXX masking on regions of low complexity. This
+can throw off the calculation of percentage identity and gap openings.
+[In fact, both BLAST 2.2.24+ and 2.2.25+ have a subtle bug in this regard,
+with these numbers changing depending on whether or not the low complexity
+filter is used.]
+
+This script attempts to produce identical output to what BLAST+ would have done.
+However, check this with "diff -b ..." since BLAST+ sometimes includes an extra
+space character (probably a bug).
+"""
+import sys
+import re
+
+if sys.version_info[:2] >= ( 2, 5 ):
+ import xml.etree.cElementTree as ElementTree
+else:
+ from galaxy import eggs
+ import pkg_resources; pkg_resources.require( "elementtree" )
+ from elementtree import ElementTree
+
+def stop_err( msg ):
+ sys.stderr.write("%s\n" % msg)
+ sys.exit(1)
+
+#Parse Command Line
+try:
+ in_file, out_file, out_fmt = sys.argv[1:]
+except:
+ stop_err("Expect 3 arguments: input BLAST XML file, output tabular file, out format (std or ext)")
+
+if out_fmt == "std":
+ extended = False
+elif out_fmt == "x22":
+ stop_err("Format argument x22 has been replaced with ext (extended 24 columns)")
+elif out_fmt == "ext":
+ extended = True
+else:
+ stop_err("Format argument should be std (12 column) or ext (extended 24 columns)")
+
+
+# get an iterable
+try:
+ context = ElementTree.iterparse(in_file, events=("start", "end"))
+except:
+ stop_err("Invalid data format.")
+# turn it into an iterator
+context = iter(context)
+# get the root element
+try:
+ event, root = context.next()
+except:
+ stop_err( "Invalid data format." )
+
+
+re_default_query_id = re.compile("^Query_\d+$")
+assert re_default_query_id.match("Query_101")
+assert not re_default_query_id.match("Query_101a")
+assert not re_default_query_id.match("MyQuery_101")
+re_default_subject_id = re.compile("^Subject_\d+$")
+assert re_default_subject_id.match("Subject_1")
+assert not re_default_subject_id.match("Subject_")
+assert not re_default_subject_id.match("Subject_12a")
+assert not re_default_subject_id.match("TheSubject_1")
+
+
+outfile = open(out_file, 'w')
+blast_program = None
+for event, elem in context:
+ if event == "end" and elem.tag == "BlastOutput_program":
+ blast_program = elem.text
+ # for every <Iteration> tag
+ if event == "end" and elem.tag == "Iteration":
+ #Expecting either this, from BLAST 2.2.25+ using FASTA vs FASTA
+ # sp|Q9BS26|ERP44_HUMAN
+ # Endoplasmic reticulum resident protein 44 OS=Homo sapiens GN=ERP44 PE=1 SV=1
+ # 406
+ #
+ #
+ #Or, from BLAST 2.2.24+ run online
+ # Query_1
+ # Sample
+ # 516
+ # ...
+ qseqid = elem.findtext("Iteration_query-ID")
+ if re_default_query_id.match(qseqid):
+ #Place holder ID, take the first word of the query definition
+ qseqid = elem.findtext("Iteration_query-def").split(None,1)[0]
+ qlen = int(elem.findtext("Iteration_query-len"))
+
+ # for every <Hit> within <Iteration>
+ for hit in elem.findall("Iteration_hits/Hit"):
+ #Expecting either this,
+ # gi|3024260|sp|P56514.1|OPSD_BUFBU
+ # RecName: Full=Rhodopsin
+ # P56514
+ #or,
+ # Subject_1
+ # gi|57163783|ref|NP_001009242.1| rhodopsin [Felis catus]
+ # Subject_1
+ #
+ #apparently depending on the parse_deflines switch
+ sseqid = hit.findtext("Hit_id").split(None,1)[0]
+ hit_def = sseqid + " " + hit.findtext("Hit_def")
+ if re_default_subject_id.match(sseqid) \
+ and sseqid == hit.findtext("Hit_accession"):
+ #Place holder ID, take the first word of the subject definition
+ hit_def = hit.findtext("Hit_def")
+ sseqid = hit_def.split(None,1)[0]
+ # for every <Hsp> within <Hit>
+ for hsp in hit.findall("Hit_hsps/Hsp"):
+ nident = hsp.findtext("Hsp_identity")
+ length = hsp.findtext("Hsp_align-len")
+ pident = "%0.2f" % (100*float(nident)/float(length))
+
+ q_seq = hsp.findtext("Hsp_qseq")
+ h_seq = hsp.findtext("Hsp_hseq")
+ m_seq = hsp.findtext("Hsp_midline")
+ assert len(q_seq) == len(h_seq) == len(m_seq) == int(length)
+ gapopen = str(len(q_seq.replace('-', ' ').split())-1 + \
+ len(h_seq.replace('-', ' ').split())-1)
+
+ mismatch = m_seq.count(' ') + m_seq.count('+') \
+ - q_seq.count('-') - h_seq.count('-')
+ #TODO - Remove this alternative mismatch calculation and test
+ #once satisfied there are no problems
+ expected_mismatch = len(q_seq) \
+ - sum(1 for q,h in zip(q_seq, h_seq) \
+ if q == h or q == "-" or h == "-")
+ xx = sum(1 for q,h in zip(q_seq, h_seq) if q=="X" and h=="X")
+ if not (expected_mismatch - q_seq.count("X") <= int(mismatch) <= expected_mismatch + xx):
+ stop_err("%s vs %s mismatches, expected %i <= %i <= %i" \
+ % (qseqid, sseqid, expected_mismatch - q_seq.count("X"),
+ int(mismatch), expected_mismatch))
+
+ #TODO - Remove this alternative identity calculation and test
+ #once satisfied there are no problems
+ expected_identity = sum(1 for q,h in zip(q_seq, h_seq) if q == h)
+ if not (expected_identity - xx <= int(nident) <= expected_identity + q_seq.count("X")):
+ stop_err("%s vs %s identities, expected %i <= %i <= %i" \
+ % (qseqid, sseqid, expected_identity, int(nident),
+ expected_identity + q_seq.count("X")))
+
+
+ evalue = hsp.findtext("Hsp_evalue")
+ if evalue == "0":
+ evalue = "0.0"
+ else:
+ evalue = "%0.0e" % float(evalue)
+
+ bitscore = float(hsp.findtext("Hsp_bit-score"))
+ if bitscore < 100:
+ #Seems to show one decimal place for lower scores
+ bitscore = "%0.1f" % bitscore
+ else:
+ #Note BLAST does not round to nearest int, it truncates
+ bitscore = "%i" % bitscore
+
+ values = [qseqid,
+ sseqid,
+ pident,
+ length, #hsp.findtext("Hsp_align-len")
+ str(mismatch),
+ gapopen,
+ hsp.findtext("Hsp_query-from"), #qstart,
+ hsp.findtext("Hsp_query-to"), #qend,
+ hsp.findtext("Hsp_hit-from"), #sstart,
+ hsp.findtext("Hsp_hit-to"), #send,
+ evalue, #hsp.findtext("Hsp_evalue") in scientific notation
+ bitscore, #hsp.findtext("Hsp_bit-score") rounded
+ ]
+
+ if extended:
+ sallseqid = ";".join(name.split(None,1)[0] for name in hit_def.split(">"))
+ #print hit_def, "-->", sallseqid
+ positive = hsp.findtext("Hsp_positive")
+ ppos = "%0.2f" % (100*float(positive)/float(length))
+ qframe = hsp.findtext("Hsp_query-frame")
+ sframe = hsp.findtext("Hsp_hit-frame")
+ if blast_program == "blastp":
+ #Probably a bug in BLASTP that they use 0 or 1 depending on format
+ if qframe == "0": qframe = "1"
+ if sframe == "0": sframe = "1"
+ slen = int(hit.findtext("Hit_len"))
+ values.extend([sallseqid,
+ hsp.findtext("Hsp_score"), #score,
+ nident,
+ positive,
+ hsp.findtext("Hsp_gaps"), #gaps,
+ ppos,
+ qframe,
+ sframe,
+ #NOTE - for blastp, XML shows original seq, tabular uses XXX masking
+ q_seq,
+ h_seq,
+ str(qlen),
+ str(slen),
+ ])
+ #print "\t".join(values)
+ outfile.write("\t".join(values) + "\n")
+ # prevents ElementTree from growing large datastructure
+ root.clear()
+ elem.clear()
+outfile.close()
diff -r 10dce68b584b -r 45ba7c750bc8 tools/ncbi_blast_plus/blastxml_to_tabular.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/ncbi_blast_plus/blastxml_to_tabular.xml Thu Sep 20 10:12:43 2012 -0400
@@ -0,0 +1,127 @@
+
+ Convert BLAST XML output to tabular
+
+ blastxml_to_tabular.py $blastxml_file $tabular_file $out_format
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+**What it does**
+
+NCBI BLAST+ (and the older NCBI 'legacy' BLAST) can output in a range of
+formats including tabular and a more detailed XML format. A complex workflow
+may need both the XML and the tabular output - but running BLAST twice is
+slow and wasteful.
+
+This tool takes the BLAST XML output and by default converts it into the
+standard 12 column tabular equivalent:
+
+====== ========= ============================================
+Column NCBI name Description
+------ --------- --------------------------------------------
+ 1 qseqid Query Seq-id (ID of your sequence)
+ 2 sseqid Subject Seq-id (ID of the database hit)
+ 3 pident Percentage of identical matches
+ 4 length Alignment length
+ 5 mismatch Number of mismatches
+ 6 gapopen Number of gap openings
+ 7 qstart Start of alignment in query
+ 8 qend End of alignment in query
+ 9 sstart Start of alignment in subject (database hit)
+ 10 send End of alignment in subject (database hit)
+ 11 evalue Expectation value (E-value)
+ 12 bitscore Bit score
+====== ========= ============================================
+
+The BLAST+ tools can optionally output additional columns of information,
+but this takes longer to calculate. Most (but not all) of these columns are
+included by selecting the extended tabular output. The extra columns are
+included *after* the standard 12 columns. This is so that you can write
+workflow filtering steps that accept either the 12 or 24 column tabular
+BLAST output.
+
+====== ============= ===========================================
+Column NCBI name Description
+------ ------------- -------------------------------------------
+ 13 sallseqid All subject Seq-id(s), separated by a ';'
+ 14 score Raw score
+ 15 nident Number of identical matches
+ 16 positive Number of positive-scoring matches
+ 17 gaps Total number of gaps
+ 18 ppos Percentage of positive-scoring matches
+ 19 qframe Query frame
+ 20 sframe Subject frame
+ 21 qseq Aligned part of query sequence
+ 22 sseq Aligned part of subject sequence
+ 23 qlen Query sequence length
+ 24 slen Subject sequence length
+====== ============= ===========================================
+
+Beware that the XML file (and thus the conversion) and the tabular output
+direct from BLAST+ may differ in the presence of XXXX masking on regions
+of low complexity (columns 21 and 22), and thus also calculated figures like
+the percentage identity (column 3).
+
+
+
diff -r 10dce68b584b -r 45ba7c750bc8 tools/ncbi_blast_plus/ncbi_blast_plus.txt
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/ncbi_blast_plus/ncbi_blast_plus.txt Thu Sep 20 10:12:43 2012 -0400
@@ -0,0 +1,84 @@
+Galaxy wrappers for NCBI BLAST+ suite
+=====================================
+
+These wrappers are copyright 2010-2012 by Peter Cock, The James Hutton Institute
+(formerly SCRI, Scottish Crop Research Institute), UK. All rights reserved.
+See the licence text below.
+
+Currently tested with NCBI BLAST 2.2.26+ (i.e. version 2.2.26 of BLAST+),
+and do not work with the NCBI 'legacy' BLAST suite (e.g. blastall).
+
+Note that these wrappers (and the associated datatypes) were originally
+distributed as part of the main Galaxy repository, but as of August 2012
+moved to the Galaxy Tool Shed as 'ncbi_blast_plus' (and 'blast_datatypes').
+My thanks to Dannon Baker from the Galaxy development team for his assistance
+with this.
+
+
+Manual Installation
+===================
+
+For those not using Galaxy's automated installation from the Tool Shed, put
+the XML and Python files under tools/ncbi_blast_plus and add the XML files
+to your tool_conf.xml as normal.
+
+You must tell Galaxy about any system level BLAST databases using configuration
+files blastdb.loc (nucleotide databases like NT) and blastdb_p.loc (protein
+databases like NR).
+
+You will also need to install 'blast_datatypes' from the Tool Shed. This
+defines the BLAST XML file format ('blastxml').
+
+
+History
+=======
+
+v0.0.11 - Final revision as part of the Galaxy main repository, and the
+ first release via the Tool Shed
+v0.0.12 - Implements genetic code option for translation searches.
+ - Changes to 1000 sequences at a time (to cope with
+ very large sets of queries where BLAST+ can become memory hungry)
+ - Include warning that BLAST+ with subject FASTA gives pairwise
+ e-values
+v0.0.13 - Use the new error handling options in Galaxy (the previously
+ bundled hide_stderr.py script is no longer needed).
+
+
+Developers
+==========
+
+This script and related tools are being developed on the following hg branch:
+http://bitbucket.org/peterjc/galaxy-central/src/tools
+
+For making the "Galaxy Tool Shed" http://community.g2.bx.psu.edu/ tarball I use
+the following command from the Galaxy root folder:
+
+$ ./tools/ncbi_blast_plus/make_ncbi_blast_plus.sh
+
+This simplifies ensuring a consistent set of files is bundled each time,
+including all the relevant test files.
+
+
+Licence (MIT/BSD style)
+=======================
+
+Permission to use, copy, modify, and distribute this software and its
+documentation with or without modifications and for any purpose and
+without fee is hereby granted, provided that any copyright notices
+appear in all copies and that both those copyright notices and this
+permission notice appear in supporting documentation, and that the
+names of the contributors or copyright holders not be used in
+advertising or publicity pertaining to distribution of the software
+without specific prior permission.
+
+THE CONTRIBUTORS AND COPYRIGHT HOLDERS OF THIS SOFTWARE DISCLAIM ALL
+WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING ALL IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT SHALL THE
+CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY SPECIAL, INDIRECT
+OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS
+OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
+OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE
+OR PERFORMANCE OF THIS SOFTWARE.
+
+NOTE: This is the licence for the Galaxy Wrapper only. BLAST+ and
+associated data files are available and licenced separately.
diff -r 10dce68b584b -r 45ba7c750bc8 tools/ncbi_blast_plus/ncbi_blastn_wrapper.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/ncbi_blast_plus/ncbi_blastn_wrapper.xml Thu Sep 20 10:12:43 2012 -0400
@@ -0,0 +1,215 @@
+
+ Search nucleotide database with nucleotide query sequence(s)
+
+
+ blastn -version
+
+## The command is a Cheetah template which allows some Python based syntax.
+## Lines starting hash hash are comments. Galaxy will turn newlines into spaces
+blastn
+-query "$query"
+#if $db_opts.db_opts_selector == "db":
+ -db "${db_opts.database.fields.path}"
+#else:
+ -subject "$db_opts.subject"
+#end if
+-task $blast_type
+-evalue $evalue_cutoff
+-out $output1
+##Set the extended list here so if/when we add things, saved workflows are not affected
+#if str($out_format)=="ext":
+ -outfmt "6 std sallseqid score nident positive gaps ppos qframe sframe qseq sseq qlen slen"
+#else:
+ -outfmt $out_format
+#end if
+-num_threads 8
+#if $adv_opts.adv_opts_selector=="advanced":
+$adv_opts.filter_query
+$adv_opts.strand
+## Need int(str(...)) because $adv_opts.max_hits is an InputValueWrapper object not a string
+## Note -max_target_seqs overrides -num_descriptions and -num_alignments
+#if (str($adv_opts.max_hits) and int(str($adv_opts.max_hits)) > 0):
+-max_target_seqs $adv_opts.max_hits
+#end if
+#if (str($adv_opts.word_size) and int(str($adv_opts.word_size)) > 0):
+-word_size $adv_opts.word_size
+#end if
+$adv_opts.ungapped
+$adv_opts.parse_deflines
+## End of advanced options:
+#end if
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ blastn
+
+
+
+.. class:: warningmark
+
+**Note**. Database searches may take a substantial amount of time.
+For large input datasets it is advisable to allow overnight processing.
+
+-----
+
+**What it does**
+
+Search a *nucleotide database* using a *nucleotide query*,
+using the NCBI BLAST+ blastn command line tool.
+Algorithms include blastn, megablast, and discontiguous megablast.
+
+-----
+
+**Output format**
+
+Because Galaxy focuses on processing tabular data, the default output of this
+tool is tabular. The standard BLAST+ tabular output contains 12 columns:
+
+====== ========= ============================================
+Column NCBI name Description
+------ --------- --------------------------------------------
+ 1 qseqid Query Seq-id (ID of your sequence)
+ 2 sseqid Subject Seq-id (ID of the database hit)
+ 3 pident Percentage of identical matches
+ 4 length Alignment length
+ 5 mismatch Number of mismatches
+ 6 gapopen Number of gap openings
+ 7 qstart Start of alignment in query
+ 8 qend End of alignment in query
+ 9 sstart Start of alignment in subject (database hit)
+ 10 send End of alignment in subject (database hit)
+ 11 evalue Expectation value (E-value)
+ 12 bitscore Bit score
+====== ========= ============================================
+
+The BLAST+ tools can optionally output additional columns of information,
+but this takes longer to calculate. Most (but not all) of these columns are
+included by selecting the extended tabular output. The extra columns are
+included *after* the standard 12 columns. This is so that you can write
+workflow filtering steps that accept either the 12 or 24 column tabular
+BLAST output.
+
+====== ============= ===========================================
+Column NCBI name Description
+------ ------------- -------------------------------------------
+ 13 sallseqid All subject Seq-id(s), separated by a ';'
+ 14 score Raw score
+ 15 nident Number of identical matches
+ 16 positive Number of positive-scoring matches
+ 17 gaps Total number of gaps
+ 18 ppos Percentage of positive-scoring matches
+ 19 qframe Query frame
+ 20 sframe Subject frame
+ 21 qseq Aligned part of query sequence
+ 22 sseq Aligned part of subject sequence
+ 23 qlen Query sequence length
+ 24 slen Subject sequence length
+====== ============= ===========================================
+
+The third option is BLAST XML output, which is designed to be parsed by
+another program, and is understood by some Galaxy tools.
+
+You can also choose several plain text or HTML output formats which are designed to be read by a person (not by another program).
+The HTML versions use basic webpage formatting and can include links to the hits on the NCBI website.
+The pairwise output (the default on the NCBI BLAST website) shows each match as a pairwise alignment with the query.
+The two query anchored outputs show a multiple sequence alignment between the query and all the matches,
+and differ in how insertions are shown (marked as insertions or with gap characters added to the other sequences).
+
+-------
+
+**References**
+
+Zhang et al. A Greedy Algorithm for Aligning DNA Sequences. 2000. JCB: 203-214.
+
+
+
diff -r 10dce68b584b -r 45ba7c750bc8 tools/ncbi_blast_plus/ncbi_blastp_wrapper.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/ncbi_blast_plus/ncbi_blastp_wrapper.xml Thu Sep 20 10:12:43 2012 -0400
@@ -0,0 +1,282 @@
+
+ Search protein database with protein query sequence(s)
+
+
+ blastp -version
+
+## The command is a Cheetah template which allows some Python based syntax.
+## Lines starting hash hash are comments. Galaxy will turn newlines into spaces
+blastp
+-query "$query"
+#if $db_opts.db_opts_selector == "db":
+ -db "${db_opts.database.fields.path}"
+#else:
+ -subject "$db_opts.subject"
+#end if
+-task $blast_type
+-evalue $evalue_cutoff
+-out $output1
+##Set the extended list here so if/when we add things, saved workflows are not affected
+#if str($out_format)=="ext":
+ -outfmt "6 std sallseqid score nident positive gaps ppos qframe sframe qseq sseq qlen slen"
+#else:
+ -outfmt $out_format
+#end if
+-num_threads 8
+#if $adv_opts.adv_opts_selector=="advanced":
+$adv_opts.filter_query
+-matrix $adv_opts.matrix
+## Need int(str(...)) because $adv_opts.max_hits is an InputValueWrapper object not a string
+## Note -max_target_seqs overrides -num_descriptions and -num_alignments
+#if (str($adv_opts.max_hits) and int(str($adv_opts.max_hits)) > 0):
+-max_target_seqs $adv_opts.max_hits
+#end if
+#if (str($adv_opts.word_size) and int(str($adv_opts.word_size)) > 0):
+-word_size $adv_opts.word_size
+#end if
+##Ungapped disabled for now - see comments below
+##$adv_opts.ungapped
+$adv_opts.parse_deflines
+## End of advanced options:
+#end if
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ blastp
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+.. class:: warningmark
+
+**Note**. Database searches may take a substantial amount of time.
+For large input datasets it is advisable to allow overnight processing.
+
+-----
+
+**What it does**
+
+Search a *protein database* using a *protein query*,
+using the NCBI BLAST+ blastp command line tool.
+
+-----
+
+**Output format**
+
+Because Galaxy focuses on processing tabular data, the default output of this
+tool is tabular. The standard BLAST+ tabular output contains 12 columns:
+
+====== ========= ============================================
+Column NCBI name Description
+------ --------- --------------------------------------------
+ 1 qseqid Query Seq-id (ID of your sequence)
+ 2 sseqid Subject Seq-id (ID of the database hit)
+ 3 pident Percentage of identical matches
+ 4 length Alignment length
+ 5 mismatch Number of mismatches
+ 6 gapopen Number of gap openings
+ 7 qstart Start of alignment in query
+ 8 qend End of alignment in query
+ 9 sstart Start of alignment in subject (database hit)
+ 10 send End of alignment in subject (database hit)
+ 11 evalue Expectation value (E-value)
+ 12 bitscore Bit score
+====== ========= ============================================
+
+The BLAST+ tools can optionally output additional columns of information,
+but this takes longer to calculate. Most (but not all) of these columns are
+included by selecting the extended tabular output. The extra columns are
+included *after* the standard 12 columns. This is so that you can write
+workflow filtering steps that accept either the 12 or 24 column tabular
+BLAST output.
+
+====== ============= ===========================================
+Column NCBI name Description
+------ ------------- -------------------------------------------
+ 13 sallseqid All subject Seq-id(s), separated by a ';'
+ 14 score Raw score
+ 15 nident Number of identical matches
+ 16 positive Number of positive-scoring matches
+ 17 gaps Total number of gaps
+ 18 ppos Percentage of positive-scoring matches
+ 19 qframe Query frame
+ 20 sframe Subject frame
+ 21 qseq Aligned part of query sequence
+ 22 sseq Aligned part of subject sequence
+ 23 qlen Query sequence length
+ 24 slen Subject sequence length
+====== ============= ===========================================
+
+The third option is BLAST XML output, which is designed to be parsed by
+another program, and is understood by some Galaxy tools.
+
+You can also choose several plain text or HTML output formats which are designed to be read by a person (not by another program).
+The HTML versions use basic webpage formatting and can include links to the hits on the NCBI website.
+The pairwise output (the default on the NCBI BLAST website) shows each match as a pairwise alignment with the query.
+The two query anchored outputs show a multiple sequence alignment between the query and all the matches,
+and differ in how insertions are shown (marked as insertions or with gap characters added to the other sequences).
+
+-------
+
+**References**
+
+Altschul et al. Gapped BLAST and PSI-BLAST: a new generation of protein database search programs. 1997. Nucleic Acids Res. 25:3389-3402.
+
+Schaffer et al. Improving the accuracy of PSI-BLAST protein database searches with composition-based statistics and other refinements. 2001. Nucleic Acids Res. 29:2994-3005.
+
+
+
diff -r 10dce68b584b -r 45ba7c750bc8 tools/ncbi_blast_plus/ncbi_blastx_wrapper.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/ncbi_blast_plus/ncbi_blastx_wrapper.xml Thu Sep 20 10:12:43 2012 -0400
@@ -0,0 +1,268 @@
+
+ Search protein database with translated nucleotide query sequence(s)
+
+
+ blastx -version
+
+## The command is a Cheetah template which allows some Python based syntax.
+## Lines starting hash hash are comments. Galaxy will turn newlines into spaces
+blastx
+-query "$query"
+#if $db_opts.db_opts_selector == "db":
+ -db "${db_opts.database.fields.path}"
+#else:
+ -subject "$db_opts.subject"
+#end if
+-query_gencode $query_gencode
+-evalue $evalue_cutoff
+-out $output1
+##Set the extended list here so if/when we add things, saved workflows are not affected
+#if str($out_format)=="ext":
+ -outfmt "6 std sallseqid score nident positive gaps ppos qframe sframe qseq sseq qlen slen"
+#else:
+ -outfmt $out_format
+#end if
+-num_threads 8
+#if $adv_opts.adv_opts_selector=="advanced":
+$adv_opts.filter_query
+$adv_opts.strand
+-matrix $adv_opts.matrix
+## Need int(str(...)) because $adv_opts.max_hits is an InputValueWrapper object not a string
+## Note -max_target_seqs overrides -num_descriptions and -num_alignments
+#if (str($adv_opts.max_hits) and int(str($adv_opts.max_hits)) > 0):
+-max_target_seqs $adv_opts.max_hits
+#end if
+#if (str($adv_opts.word_size) and int(str($adv_opts.word_size)) > 0):
+-word_size $adv_opts.word_size
+#end if
+$adv_opts.ungapped
+$adv_opts.parse_deflines
+## End of advanced options:
+#end if
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ blastx
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+.. class:: warningmark
+
+**Note**. Database searches may take a substantial amount of time.
+For large input datasets it is advisable to allow overnight processing.
+
+-----
+
+**What it does**
+
+Search a *protein database* using a *translated nucleotide query*,
+using the NCBI BLAST+ blastx command line tool.
+
+-----
+
+**Output format**
+
+Because Galaxy focuses on processing tabular data, the default output of this
+tool is tabular. The standard BLAST+ tabular output contains 12 columns:
+
+====== ========= ============================================
+Column NCBI name Description
+------ --------- --------------------------------------------
+ 1 qseqid Query Seq-id (ID of your sequence)
+ 2 sseqid Subject Seq-id (ID of the database hit)
+ 3 pident Percentage of identical matches
+ 4 length Alignment length
+ 5 mismatch Number of mismatches
+ 6 gapopen Number of gap openings
+ 7 qstart Start of alignment in query
+ 8 qend End of alignment in query
+ 9 sstart Start of alignment in subject (database hit)
+ 10 send End of alignment in subject (database hit)
+ 11 evalue Expectation value (E-value)
+ 12 bitscore Bit score
+====== ========= ============================================
+
+The BLAST+ tools can optionally output additional columns of information,
+but this takes longer to calculate. Most (but not all) of these columns are
+included by selecting the extended tabular output. The extra columns are
+included *after* the standard 12 columns. This is so that you can write
+workflow filtering steps that accept either the 12 or 24 column tabular
+BLAST output.
+
+====== ============= ===========================================
+Column NCBI name Description
+------ ------------- -------------------------------------------
+ 13 sallseqid All subject Seq-id(s), separated by a ';'
+ 14 score Raw score
+ 15 nident Number of identical matches
+ 16 positive Number of positive-scoring matches
+ 17 gaps Total number of gaps
+ 18 ppos Percentage of positive-scoring matches
+ 19 qframe Query frame
+ 20 sframe Subject frame
+ 21 qseq Aligned part of query sequence
+ 22 sseq Aligned part of subject sequence
+ 23 qlen Query sequence length
+ 24 slen Subject sequence length
+====== ============= ===========================================
+
+The third option is BLAST XML output, which is designed to be parsed by
+another program, and is understood by some Galaxy tools.
+
+You can also choose several plain text or HTML output formats which are designed to be read by a person (not by another program).
+The HTML versions use basic webpage formatting and can include links to the hits on the NCBI website.
+The pairwise output (the default on the NCBI BLAST website) shows each match as a pairwise alignment with the query.
+The two query anchored outputs show a multiple sequence alignment between the query and all the matches,
+and differ in how insertions are shown (marked as insertions or with gap characters added to the other sequences).
+
+-------
+
+**References**
+
+Altschul et al. Gapped BLAST and PSI-BLAST: a new generation of protein database search programs. 1997. Nucleic Acids Res. 25:3389-3402.
+
+
+
diff -r 10dce68b584b -r 45ba7c750bc8 tools/ncbi_blast_plus/ncbi_tblastn_wrapper.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/ncbi_blast_plus/ncbi_tblastn_wrapper.xml Thu Sep 20 10:12:43 2012 -0400
@@ -0,0 +1,314 @@
+
+ Search translated nucleotide database with protein query sequence(s)
+
+
+ tblastn -version
+
+## The command is a Cheetah template which allows some Python based syntax.
+## Lines starting hash hash are comments. Galaxy will turn newlines into spaces
+tblastn
+-query "$query"
+#if $db_opts.db_opts_selector == "db":
+ -db "${db_opts.database.fields.path}"
+#else:
+ -subject "$db_opts.subject"
+#end if
+-evalue $evalue_cutoff
+-out $output1
+##Set the extended list here so if/when we add things, saved workflows are not affected
+#if str($out_format)=="ext":
+ -outfmt "6 std sallseqid score nident positive gaps ppos qframe sframe qseq sseq qlen slen"
+#else:
+ -outfmt $out_format
+#end if
+-num_threads 8
+#if $adv_opts.adv_opts_selector=="advanced":
+-db_gencode $adv_opts.db_gencode
+$adv_opts.filter_query
+-matrix $adv_opts.matrix
+## Need int(str(...)) because $adv_opts.max_hits is an InputValueWrapper object not a string
+## Note -max_target_seqs overrides -num_descriptions and -num_alignments
+#if (str($adv_opts.max_hits) and int(str($adv_opts.max_hits)) > 0):
+-max_target_seqs $adv_opts.max_hits
+#end if
+#if (str($adv_opts.word_size) and int(str($adv_opts.word_size)) > 0):
+-word_size $adv_opts.word_size
+#end if
+##Ungapped disabled for now - see comments below
+##$adv_opts.ungapped
+$adv_opts.parse_deflines
+## End of advanced options:
+#end if
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ tblastn
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+.. class:: warningmark
+
+**Note**. Database searches may take a substantial amount of time.
+For large input datasets it is advisable to allow overnight processing.
+
+-----
+
+**What it does**
+
+Search a *translated nucleotide database* using a *protein query*,
+using the NCBI BLAST+ tblastn command line tool.
+
+-----
+
+**Output format**
+
+Because Galaxy focuses on processing tabular data, the default output of this
+tool is tabular. The standard BLAST+ tabular output contains 12 columns:
+
+====== ========= ============================================
+Column NCBI name Description
+------ --------- --------------------------------------------
+ 1 qseqid Query Seq-id (ID of your sequence)
+ 2 sseqid Subject Seq-id (ID of the database hit)
+ 3 pident Percentage of identical matches
+ 4 length Alignment length
+ 5 mismatch Number of mismatches
+ 6 gapopen Number of gap openings
+ 7 qstart Start of alignment in query
+ 8 qend End of alignment in query
+ 9 sstart Start of alignment in subject (database hit)
+ 10 send End of alignment in subject (database hit)
+ 11 evalue Expectation value (E-value)
+ 12 bitscore Bit score
+====== ========= ============================================
+
+The BLAST+ tools can optionally output additional columns of information,
+but this takes longer to calculate. Most (but not all) of these columns are
+included by selecting the extended tabular output. The extra columns are
+included *after* the standard 12 columns. This is so that you can write
+workflow filtering steps that accept either the 12 or 24 column tabular
+BLAST output.
+
+====== ============= ===========================================
+Column NCBI name Description
+------ ------------- -------------------------------------------
+ 13 sallseqid All subject Seq-id(s), separated by a ';'
+ 14 score Raw score
+ 15 nident Number of identical matches
+ 16 positive Number of positive-scoring matches
+ 17 gaps Total number of gaps
+ 18 ppos Percentage of positive-scoring matches
+ 19 qframe Query frame
+ 20 sframe Subject frame
+ 21 qseq Aligned part of query sequence
+ 22 sseq Aligned part of subject sequence
+ 23 qlen Query sequence length
+ 24 slen Subject sequence length
+====== ============= ===========================================
+
+The third option is BLAST XML output, which is designed to be parsed by
+another program, and is understood by some Galaxy tools.
+
+You can also choose several plain text or HTML output formats which are designed to be read by a person (not by another program).
+The HTML versions use basic webpage formatting and can include links to the hits on the NCBI website.
+The pairwise output (the default on the NCBI BLAST website) shows each match as a pairwise alignment with the query.
+The two query anchored outputs show a multiple sequence alignment between the query and all the matches,
+and differ in how insertions are shown (marked as insertions or with gap characters added to the other sequences).
+
+-------
+
+**References**
+
+Altschul et al. Gapped BLAST and PSI-BLAST: a new generation of protein database search programs. 1997. Nucleic Acids Res. 25:3389-3402.
+
+
+
diff -r 10dce68b584b -r 45ba7c750bc8 tools/ncbi_blast_plus/ncbi_tblastx_wrapper.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/ncbi_blast_plus/ncbi_tblastx_wrapper.xml Thu Sep 20 10:12:43 2012 -0400
@@ -0,0 +1,256 @@
+
+ Search translated nucleotide database with translated nucleotide query sequence(s)
+
+
+ tblastx -version
+
+## The command is a Cheetah template which allows some Python based syntax.
+## Lines starting hash hash are comments. Galaxy will turn newlines into spaces
+tblastx
+-query "$query"
+#if $db_opts.db_opts_selector == "db":
+ -db "${db_opts.database.fields.path}"
+#else:
+ -subject "$db_opts.subject"
+#end if
+-query_gencode $query_gencode
+-evalue $evalue_cutoff
+-out $output1
+##Set the extended list here so if/when we add things, saved workflows are not affected
+#if str($out_format)=="ext":
+ -outfmt "6 std sallseqid score nident positive gaps ppos qframe sframe qseq sseq qlen slen"
+#else:
+ -outfmt $out_format
+#end if
+-num_threads 8
+#if $adv_opts.adv_opts_selector=="advanced":
+-db_gencode $adv_opts.db_gencode
+$adv_opts.filter_query
+$adv_opts.strand
+-matrix $adv_opts.matrix
+## Need int(str(...)) because $adv_opts.max_hits is an InputValueWrapper object not a string
+## Note -max_target_seqs overrides -num_descriptions and -num_alignments
+#if (str($adv_opts.max_hits) and int(str($adv_opts.max_hits)) > 0):
+-max_target_seqs $adv_opts.max_hits
+#end if
+#if (str($adv_opts.word_size) and int(str($adv_opts.word_size)) > 0):
+-word_size $adv_opts.word_size
+#end if
+$adv_opts.parse_deflines
+## End of advanced options:
+#end if
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ tblastx
+
+
+
+.. class:: warningmark
+
+**Note**. Database searches may take a substantial amount of time.
+For large input datasets it is advisable to allow overnight processing.
+
+-----
+
+**What it does**
+
+Search a *translated nucleotide database* using a *translated nucleotide query*,
+using the NCBI BLAST+ tblastx command line tool.
+
+-----
+
+**Output format**
+
+Because Galaxy focuses on processing tabular data, the default output of this
+tool is tabular. The standard BLAST+ tabular output contains 12 columns:
+
+====== ========= ============================================
+Column NCBI name Description
+------ --------- --------------------------------------------
+ 1 qseqid Query Seq-id (ID of your sequence)
+ 2 sseqid Subject Seq-id (ID of the database hit)
+ 3 pident Percentage of identical matches
+ 4 length Alignment length
+ 5 mismatch Number of mismatches
+ 6 gapopen Number of gap openings
+ 7 qstart Start of alignment in query
+ 8 qend End of alignment in query
+ 9 sstart Start of alignment in subject (database hit)
+ 10 send End of alignment in subject (database hit)
+ 11 evalue Expectation value (E-value)
+ 12 bitscore Bit score
+====== ========= ============================================
+
+The BLAST+ tools can optionally output additional columns of information,
+but this takes longer to calculate. Most (but not all) of these columns are
+included by selecting the extended tabular output. The extra columns are
+included *after* the standard 12 columns. This is so that you can write
+workflow filtering steps that accept either the 12 or 24 column tabular
+BLAST output.
+
+====== ============= ===========================================
+Column NCBI name Description
+------ ------------- -------------------------------------------
+ 13 sallseqid All subject Seq-id(s), separated by a ';'
+ 14 score Raw score
+ 15 nident Number of identical matches
+ 16 positive Number of positive-scoring matches
+ 17 gaps Total number of gaps
+ 18 ppos Percentage of positive-scoring matches
+ 19 qframe Query frame
+ 20 sframe Subject frame
+ 21 qseq Aligned part of query sequence
+ 22 sseq Aligned part of subject sequence
+ 23 qlen Query sequence length
+ 24 slen Subject sequence length
+====== ============= ===========================================
+
+The third option is BLAST XML output, which is designed to be parsed by
+another program, and is understood by some Galaxy tools.
+
+You can also choose several plain text or HTML output formats which are designed to be read by a person (not by another program).
+The HTML versions use basic webpage formatting and can include links to the hits on the NCBI website.
+The pairwise output (the default on the NCBI BLAST website) shows each match as a pairwise alignment with the query.
+The two query anchored outputs show a multiple sequence alignment between the query and all the matches,
+and differ in how insertions are shown (marked as insertions or with gap characters added to the other sequences).
+
+-------
+
+**References**
+
+Altschul et al. Gapped BLAST and PSI-BLAST: a new generation of protein database search programs. 1997. Nucleic Acids Res. 25:3389-3402.
+
+
+
diff -r 10dce68b584b -r 45ba7c750bc8 tools/ncbi_blast_plus/tool_dependencies.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/ncbi_blast_plus/tool_dependencies.xml Thu Sep 20 10:12:43 2012 -0400
@@ -0,0 +1,21 @@
+
+
+
+
+
+ ftp://ftp.ncbi.nlm.nih.gov/blast/executables/blast+/2.2.26/ncbi-blast-2.2.26+-src.tar.gz
+ cd c++ && ./configure --prefix=$INSTALL_DIR && make && make install
+
+ $INSTALL_DIR/bin
+
+
+
+
+These links provide information for building the NCBI BLAST+ package in most environments.
+
+System requirements
+http://blast.ncbi.nlm.nih.gov/Blast.cgi?CMD=Web&PAGE_TYPE=BlastDocs&DOC_TYPE=Download
+
+
+
+
diff -r 10dce68b584b -r 45ba7c750bc8 xml.py
--- a/xml.py Thu Aug 23 09:33:16 2012 -0400
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,124 +0,0 @@
-"""
-BlastXml class
-"""
-
-from galaxy.datatypes.data import get_file_peek
-from galaxy.datatypes.data import Text
-from galaxy.datatypes.xml import GenericXml
-
-class BlastXml( GenericXml ):
- """NCBI Blast XML Output data"""
- file_ext = "blastxml"
-
- def set_peek( self, dataset, is_multi_byte=False ):
- """Set the peek and blurb text"""
- if not dataset.dataset.purged:
- dataset.peek = get_file_peek( dataset.file_name, is_multi_byte=is_multi_byte )
- dataset.blurb = 'NCBI Blast XML data'
- else:
- dataset.peek = 'file does not exist'
- dataset.blurb = 'file purged from disk'
- def sniff( self, filename ):
- """
- Determines whether the file is blastxml
-
- >>> fname = get_test_fname( 'megablast_xml_parser_test1.blastxml' )
- >>> BlastXml().sniff( fname )
- True
- >>> fname = get_test_fname( 'tblastn_four_human_vs_rhodopsin.xml' )
- >>> BlastXml().sniff( fname )
- True
- >>> fname = get_test_fname( 'interval.interval' )
- >>> BlastXml().sniff( fname )
- False
- """
- #TODO - Use a context manager on Python 2.5+ to close handle
- handle = open(filename)
- line = handle.readline()
- if line.strip() != '':
- handle.close()
- return False
- line = handle.readline()
- if line.strip() not in ['',
- '']:
- handle.close()
- return False
- line = handle.readline()
- if line.strip() != '':
- handle.close()
- return False
- handle.close()
- return True
-
- def merge(split_files, output_file):
- """Merging multiple XML files is non-trivial and must be done in subclasses."""
- if len(split_files) == 1:
- #For one file only, use base class method (move/copy)
- return Text.merge(split_files, output_file)
- out = open(output_file, "w")
- h = None
- for f in split_files:
- h = open(f)
- body = False
- header = h.readline()
- if not header:
- out.close()
- h.close()
- raise ValueError("BLAST XML file %s was empty" % f)
- if header.strip() != '':
- out.write(header) #for diagnosis
- out.close()
- h.close()
- raise ValueError("%s is not an XML file!" % f)
- line = h.readline()
- header += line
- if line.strip() not in ['',
- '']:
- out.write(header) #for diagnosis
- out.close()
- h.close()
- raise ValueError("%s is not a BLAST XML file!" % f)
- while True:
- line = h.readline()
- if not line:
- out.write(header) #for diagnosis
- out.close()
- h.close()
- raise ValueError("BLAST XML file %s ended prematurely" % f)
- header += line
- if "" in line:
- break
- if len(header) > 10000:
- #Something has gone wrong, don't load too much into memory!
- #Write what we have to the merged file for diagnostics
- out.write(header)
- out.close()
- h.close()
- raise ValueError("BLAST XML file %s has too long a header!" % f)
- if "" not in header:
- out.close()
- h.close()
- raise ValueError("%s is not a BLAST XML file:\n%s\n..." % (f, header))
- if f == split_files[0]:
- out.write(header)
- old_header = header
- elif old_header[:300] != header[:300]:
- #Enough to check and match
- out.close()
- h.close()
- raise ValueError("BLAST XML headers don't match for %s and %s - have:\n%s\n...\n\nAnd:\n%s\n...\n" \
- % (split_files[0], f, old_header[:300], header[:300]))
- else:
- out.write(" \n")
- for line in h:
- if "" in line:
- break
- #TODO - Increment and if required automatic query names
- #like Query_3 to be increasing?
- out.write(line)
- h.close()
- out.write(" \n")
- out.write("\n")
- out.close()
- merge = staticmethod(merge)
-