Repository 'megablast_xml_parser'
hg clone https://toolshed.g2.bx.psu.edu/repos/devteam/megablast_xml_parser

Changeset 0:03ca082aeb2e (2014-05-19)
Next changeset 1:fbff957a3655 (2020-09-09)
Commit message:
Imported from capsule None
added:
megablast_xml_parser.py
megablast_xml_parser.xml
repository_dependencies.xml
test-data/megablast_xml_parser_test1.gz
test-data/megablast_xml_parser_test1_out.tabular
b
diff -r 000000000000 -r 03ca082aeb2e megablast_xml_parser.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/megablast_xml_parser.py Mon May 19 12:33:19 2014 -0400
[
@@ -0,0 +1,78 @@
+#!/usr/bin/env python
+    
+import sys, os, re
+
+if sys.version_info[:2] >= ( 2, 5 ):
+    import xml.etree.cElementTree as ElementTree
+else:
+    from galaxy import eggs
+    import pkg_resources; pkg_resources.require( "elementtree" )
+    from elementtree import ElementTree
+
+def stop_err( msg ):
+    """Report a fatal error and terminate the script.
+
+    Writes msg followed by a newline to standard error, then exits with
+    status 1 so the calling process can detect the failure.
+    """
+    sys.stderr.write( "%s\n" % msg )
+    # sys.exit() without an argument exits with status 0, which signals success
+    sys.exit( 1 )
+
+def __main__():
+    """Convert a BLAST XML report into a tab-delimited table.
+
+    Reads the XML file named by sys.argv[1] and writes one line per <Hsp>
+    element to the file named by sys.argv[2]. Each line holds the query
+    definition, the query length, the subject id and the subject length,
+    followed by the text of every tag listed in hspTags, in that order.
+    A value that findtext() cannot locate is written as the string "None".
+
+    Any error while opening or parsing the input is passed to stop_err(),
+    which terminates the script.
+    """
+    source = sys.argv[1]
+    hspTags = (
+        "Hsp_bit-score",
+        "Hsp_evalue",
+        "Hsp_query-from",
+        "Hsp_query-to",
+        "Hsp_hit-from",
+        "Hsp_hit-to",
+        "Hsp_query-frame",
+        "Hsp_hit-frame",
+        "Hsp_identity",
+        "Hsp_align-len",
+        "Hsp_qseq",
+        "Hsp_hseq",
+        "Hsp_midline",
+    )
+
+    try:
+        # get an iterator over the parse events
+        context = iter( ElementTree.iterparse( source, events=( "start", "end" ) ) )
+        # get the root element; next() works on Python 2.6+ and Python 3,
+        # whereas the .next() method exists on Python 2 only
+        event, root = next( context )
+    except Exception:
+        stop_err( "Invalid data format." )
+
+    outfile = open( sys.argv[2], 'w' )
+    try:
+        for event, elem in context:
+            # only a completed <Iteration> element is processed
+            if event != "end" or elem.tag != "Iteration":
+                continue
+            query = elem.findtext( "Iteration_query-def" )
+            qLen = elem.findtext( "Iteration_query-len" )
+            # for every <Hit> within <Iteration>
+            for hit in elem.findall( "Iteration_hits/Hit" ):
+                subject = hit.findtext( "Hit_id" )
+                if subject.startswith( 'gi' ):
+                    # keep the second '|'-separated field; an id that merely
+                    # starts with "gi" and has no '|' is left untouched
+                    # instead of raising IndexError
+                    idParts = subject.split( '|' )
+                    if len( idParts ) > 1:
+                        subject = idParts[1]
+                sLen = hit.findtext( "Hit_len" )
+                # for every <Hsp> within <Hit>
+                for hsp in hit.findall( "Hit_hsps/Hsp" ):
+                    fields = [ query, qLen, subject, sLen ]
+                    fields.extend( [ hsp.findtext( tag ) for tag in hspTags ] )
+                    outfile.write( "\t".join( [ "%s" % field for field in fields ] ) + "\n" )
+            # prevents ElementTree from growing large datastructure
+            root.clear()
+            elem.clear()
+    except Exception:
+        # stop_err() exits via SystemExit; the finally clause still closes the file
+        stop_err( "The input data is malformed, or there is more than one dataset in the input file. Error: %s" % sys.exc_info()[1] )
+    finally:
+        outfile.close()
+
+if __name__ == "__main__":
+    __main__()
b
diff -r 000000000000 -r 03ca082aeb2e megablast_xml_parser.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/megablast_xml_parser.xml Mon May 19 12:33:19 2014 -0400
b
@@ -0,0 +1,60 @@
+<tool id="megablast_xml_parser" name="Parse blast XML output" version="1.0.0">
+<description></description>
+<command interpreter="python">megablast_xml_parser.py $input1 $output1</command>
+<inputs>
+  <param name="input1" type="data" format="blastxml" label="Megablast XML output" />
+</inputs>
+<outputs>
+  <data name="output1" format="tabular"/>
+</outputs>
+<tests>
+  <test>
+    <param name="input1" value="megablast_xml_parser_test1.gz" ftype="blastxml" />
+    <output name="output1" file="megablast_xml_parser_test1_out.tabular" ftype="tabular" />
+  </test>
+</tests>
+<help>
+
+**What it does**
+
+This tool processes the XML output of any NCBI blast tool (if you run your own blast jobs, the XML output can be generated with the **-m 7** option).
+
+-----
+
+**Output fields**
+
+This tool returns tab-delimited output with the following fields::
+
+    Description                               Example
+    ----------------------------------------- ----------------- 
+
+    1. Name of the query sequence             Seq1
+    2. Length of the query sequence           30
+    3. Name of target sequence                gnl|BL_ORD_ID|0
+    4. Length of target sequence              5528445
+    5. Alignment bit score                    59.96
+    6. E-value                                8.38112e-11
+    7. Start of alignment within query        1
+    8. End of alignment within query          30
+    9. Start of alignment within target       5436010
+   10. End of alignment within target         5436039
+   11. Query frame                            1
+   12. Target frame                           1
+   13. Number of identical bases within       29 
+       the alignment
+   14. Alignment length                       30 
+   15. Aligned portion (sequence) of query    CGGACAGCGCCGCCACCAACAAAGCCACCA
+   16. Aligned portion (sequence) of target   CGGACAGCGCCGCCACCAACAAAGCCATCA
+   17. Midline indicating positions of        ||||||||||||||||||||||||||| || 
+       matches within the alignment
+
+------
+       
+.. class:: infomark
+
+Note that this form of output does not contain the alignment identity value. However, it can be computed by dividing the number of identical bases within the alignment (Field 13) by the alignment length (Field 14) using the *Text Manipulation->Compute* tool.
+
+
+
+</help>
+</tool>
b
diff -r 000000000000 -r 03ca082aeb2e repository_dependencies.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/repository_dependencies.xml Mon May 19 12:33:19 2014 -0400
b
@@ -0,0 +1,4 @@
+<?xml version="1.0"?>
+<repositories>
+  <repository changeset_revision="de11e1a921c4" name="blast_datatypes" owner="devteam" toolshed="http://toolshed.g2.bx.psu.edu" />
+</repositories>
b
diff -r 000000000000 -r 03ca082aeb2e test-data/megablast_xml_parser_test1.gz
b
Binary file test-data/megablast_xml_parser_test1.gz has changed
b
diff -r 000000000000 -r 03ca082aeb2e test-data/megablast_xml_parser_test1_out.tabular
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/megablast_xml_parser_test1_out.tabular Mon May 19 12:33:19 2014 -0400
b
b'@@ -0,0 +1,103 @@\n+0_0.666667\t30\tgnl|BL_ORD_ID|0\t5528445\t59.96\t8.38112e-11\t1\t30\t5436010\t5436039\t1\t1\t30\t30\tCGGACAGCGCCGCCACCAACAAAGCCACCA\tCGGACAGCGCCGCCACCAACAAAGCCACCA\t||||||||||||||||||||||||||||||\n+1_0.600000\t30\tgnl|BL_ORD_ID|0\t5528445\t59.96\t8.38112e-11\t1\t30\t696993\t697022\t1\t1\t30\t30\tAAAACACCGGATGCTCCGGCGCTGGCAGAT\tAAAACACCGGATGCTCCGGCGCTGGCAGAT\t||||||||||||||||||||||||||||||\n+2_0.400000\t30\tgnl|BL_ORD_ID|0\t5528445\t59.96\t8.38112e-11\t1\t30\t4100018\t4100047\t1\t1\t30\t30\tTTTGCTTTTAGTACACCGGATTCAGAACCA\tTTTGCTTTTAGTACACCGGATTCAGAACCA\t||||||||||||||||||||||||||||||\n+3_0.566667\t30\tgnl|BL_ORD_ID|0\t5528445\t59.96\t8.38112e-11\t30\t1\t2305844\t2305873\t1\t-1\t30\t30\tCCGTCCAGAAAGGTGTATTCATGGGGACGG\tCCGTCCAGAAAGGTGTATTCATGGGGACGG\t||||||||||||||||||||||||||||||\n+4_0.766667\t30\tgnl|BL_ORD_ID|0\t5528445\t59.96\t8.38112e-11\t1\t30\t3457901\t3457930\t1\t1\t30\t30\tCACGCTACGTGCGCCCCCGCCCAGAAGGCG\tCACGCTACGTGCGCCCCCGCCCAGAAGGCG\t||||||||||||||||||||||||||||||\n+5_0.533333\t30\tgnl|BL_ORD_ID|0\t5528445\t59.96\t8.38112e-11\t30\t1\t1264131\t1264160\t1\t-1\t30\t30\tGCACTTAACCCGCTTCGGCGGGTTTTGTTT\tGCACTTAACCCGCTTCGGCGGGTTTTGTTT\t||||||||||||||||||||||||||||||\n+5_0.533333\t30\tgnl|BL_ORD_ID|0\t5528445\t59.96\t8.38112e-11\t30\t1\t1636485\t1636514\t1\t-1\t30\t30\tGCACTTAACCCGCTTCGGCGGGTTTTGTTT\tGCACTTAACCCGCTTCGGCGGGTTTTGTTT\t||||||||||||||||||||||||||||||\n+5_0.533333\t30\tgnl|BL_ORD_ID|0\t5528445\t59.96\t8.38112e-11\t1\t30\t2318694\t2318723\t1\t1\t30\t30\tAAACAAAACCCGCCGAAGCGGGTTAAGTGC\tAAACAAAACCCGCCGAAGCGGGTTAAGTGC\t||||||||||||||||||||||||||||||\n+5_0.533333\t30\tgnl|BL_ORD_ID|0\t5528445\t59.96\t8.38112e-11\t1\t30\t2777043\t2777072\t1\t1\t30\t30\tAAACAAAACCCGCCGAAGCGGGTTAAGTGC\tAAACAAAACCCGCCGAAGCGGGTTAAGTGC\t||||||||||||||||||||||||||||||\n+6_0.533333\t30\tgnl|BL_ORD_ID|0\t5528445\t59.96\t8.38112e-11\t1\t30\t1185124\t1185153\t1\t1\t30\t30\tTAAGCCGTTACTGGCAGCAAGTGCAGGCAA\tTAAGCCGTTACTGGCAGCAAGTGCAGGCAA\t||||||||||||||||||||||||||||||
\n+7_0.400000\t30\tgnl|BL_ORD_ID|0\t5528445\t59.96\t8.38112e-11\t1\t30\t2458203\t2458232\t1\t1\t30\t30\tTGAATTTACCGTTATCTATCTTGCCTGCCT\tTGAATTTACCGTTATCTATCTTGCCTGCCT\t||||||||||||||||||||||||||||||\n+9_0.400000\t30\tgnl|BL_ORD_ID|0\t5528445\t59.96\t8.38112e-11\t1\t30\t5286666\t5286695\t1\t1\t30\t30\tGCGTTTTGCTAAACTTCTGCCGGAATATAA\tGCGTTTTGCTAAACTTCTGCCGGAATATAA\t||||||||||||||||||||||||||||||\n+10_0.500000\t30\tgnl|BL_ORD_ID|0\t5528445\t59.96\t8.38112e-11\t1\t30\t2787672\t2787701\t1\t1\t30\t30\tAAAGAGGCGAGCAGAGTAAAACGCAGGCAA\tAAAGAGGCGAGCAGAGTAAAACGCAGGCAA\t||||||||||||||||||||||||||||||\n+12_0.700000\t30\tgnl|BL_ORD_ID|0\t5528445\t59.96\t8.38112e-11\t1\t30\t4116145\t4116174\t1\t1\t30\t30\tGCGGACGATCTTCACGGTCGCCACGCGGAC\tGCGGACGATCTTCACGGTCGCCACGCGGAC\t||||||||||||||||||||||||||||||\n+13_0.533333\t30\tgnl|BL_ORD_ID|0\t5528445\t59.96\t8.38112e-11\t1\t30\t5442351\t5442380\t1\t1\t30\t30\tTTCTTGTTGGATGGCATACTCCGGCAGCCA\tTTCTTGTTGGATGGCATACTCCGGCAGCCA\t||||||||||||||||||||||||||||||\n+14_0.666667\t30\tgnl|BL_ORD_ID|0\t5528445\t59.96\t8.38112e-11\t1\t30\t4626492\t4626521\t1\t1\t30\t30\tACCCCGATATCGTCGCAGGCGTTGCCGCAC\tACCCCGATATCGTCGCAGGCGTTGCCGCAC\t||||||||||||||||||||||||||||||\n+15_0.666667\t30\tgnl|BL_ORD_ID|0\t5528445\t59.96\t8.38112e-11\t1\t30\t346897\t346926\t1\t1\t30\t30\tGGCTGCTGTCTCCGCTGGAAGAGGCGCTTC\tGGCTGCTGTCTCCGCTGGAAGAGGCGCTTC\t||||||||||||||||||||||||||||||\n+15_0.666667\t30\tgnl|BL_ORD_ID|0\t5528445\t59.96\t8.38112e-11\t30\t1\t1466536\t1466565\t1\t-1\t30\t30\tGAAGCGCCTCTTCCAGCGGAGACAGCAGCC\tGAAGCGCCTCTTCCAGCGGAGACAGCAGCC\t||||||||||||||||||||||||||||||\n+15_0.666667\t30\tgnl|BL_ORD_ID|0\t5528445\t59.96\t8.38112e-11\t1\t30\t1750132\t1750161\t1\t1\t30\t30\tGGCTGCTGTCTCCGCTGGAAGAGGCGCTTC\tGGCTGCTGTCTCCGCTGGAAGAGGCGCTTC\t||||||||||||||||||||||||||||||\n+15_0.666667\t30\tgnl|BL_ORD_ID|0\t5528445\t59.96\t8.38112e-11\t1\t30\t1908046\t1908075\t1\t1\t30\t30\tGGCTGCTGTCTCCGCTGGAAGAGGCGCTTC\tGGCTGCTGTCTCCGCTGGAAGAGGCGCTTC\t||||||||||||||||||||||||||||||\n+15_0.66666
7\t30\tgnl|BL_ORD_ID|0\t5528445\t59.96\t8.38112e-11\t1\t30\t2279107\t2279136\t1\t1\t30\t30\tGGCTGCTGTCTCCGCTGGAAGAGGCGCTTC\tGGCTGCTGTCTCCGCTGGAAGAGGCGCTTC\t||||||||||||||||||||||||||||||\n+15_0.666667\t30\tgnl|BL_ORD_ID|0\t5528445\t59.96\t8.38112e-11\t1\t30\t2813407\t2813436\t1\t1\t30\t30\tGGCTGCTGTCTCCGCTGGAAGAGGCGCTTC\tGGCTGCTGTCTCCGCTGGAAGAGGCGCTTC\t||||||||||||||||||||'..b'||||\n+70_0.466667\t30\tgnl|BL_ORD_ID|0\t5528445\t59.96\t8.38112e-11\t1\t30\t1317239\t1317268\t1\t1\t30\t30\tTGGTGTTCAGCATCTCAACGGTAATTCGCT\tTGGTGTTCAGCATCTCAACGGTAATTCGCT\t||||||||||||||||||||||||||||||\n+71_0.533333\t30\tgnl|BL_ORD_ID|0\t5528445\t59.96\t8.38112e-11\t1\t30\t5080463\t5080492\t1\t1\t30\t30\tCAGGATGCAAACTGCCGGGAGATCCAGTTA\tCAGGATGCAAACTGCCGGGAGATCCAGTTA\t||||||||||||||||||||||||||||||\n+72_0.533333\t30\tgnl|BL_ORD_ID|0\t5528445\t59.96\t8.38112e-11\t1\t30\t3605409\t3605438\t1\t1\t30\t30\tAACTGGAAGGGCTTGGGATGACACAACAGC\tAACTGGAAGGGCTTGGGATGACACAACAGC\t||||||||||||||||||||||||||||||\n+73_0.500000\t30\tgnl|BL_ORD_ID|0\t5528445\t59.96\t8.38112e-11\t1\t30\t3280815\t3280844\t1\t1\t30\t30\tTTTAAGCGCCAACCAGGCTTCTTTGGTTGC\tTTTAAGCGCCAACCAGGCTTCTTTGGTTGC\t||||||||||||||||||||||||||||||\n+75_0.533333\t30\tgnl|BL_ORD_ID|0\t5528445\t59.96\t8.38112e-11\t1\t30\t2230266\t2230295\t1\t1\t30\t30\tATAACCCTCTGCAACCGCCGCTTCAGCAAA\tATAACCCTCTGCAACCGCCGCTTCAGCAAA\t||||||||||||||||||||||||||||||\n+76_0.600000\t30\tgnl|BL_ORD_ID|0\t5528445\t59.96\t8.38112e-11\t1\t30\t462631\t462660\t1\t1\t30\t30\tTGAAGCCGTACAACGGGCGCTGGAATTCGC\tTGAAGCCGTACAACGGGCGCTGGAATTCGC\t||||||||||||||||||||||||||||||\n+77_0.700000\t30\tgnl|BL_ORD_ID|0\t5528445\t59.96\t8.38112e-11\t1\t30\t2939076\t2939105\t1\t1\t30\t30\tGAGCTGCAACGCGGTCAGCCAGCTGGCGGT\tGAGCTGCAACGCGGTCAGCCAGCTGGCGGT\t||||||||||||||||||||||||||||||\n+78_0.566667\t30\tgnl|BL_ORD_ID|0\t5528445\t59.96\t8.38112e-11\t1\t30\t5422053\t5422082\t1\t1\t30\t30\tCGGAGTATCCGTTCCCCAACGACAAGCATC\tCGGAGTATCCGTTCCCCAACGACAAGCATC\t||||||||||||||||||||||||||||||\n+79_0.500000\t30\tgnl|BL
_ORD_ID|0\t5528445\t59.96\t8.38112e-11\t1\t30\t4387420\t4387449\t1\t1\t30\t30\tAATACCGGGAAGAGACAACGGGGTCTCTTT\tAATACCGGGAAGAGACAACGGGGTCTCTTT\t||||||||||||||||||||||||||||||\n+81_0.433333\t30\tgnl|BL_ORD_ID|0\t5528445\t59.96\t8.38112e-11\t1\t30\t1037150\t1037179\t1\t1\t30\t30\tATTAATGTTGCCGGCACAACATAATAGGGC\tATTAATGTTGCCGGCACAACATAATAGGGC\t||||||||||||||||||||||||||||||\n+82_0.500000\t30\tgnl|BL_ORD_ID|0\t5528445\t59.96\t8.38112e-11\t1\t30\t1182188\t1182217\t1\t1\t30\t30\tACTGGGTTGCTCTGAACAAGAAAGGCGCTA\tACTGGGTTGCTCTGAACAAGAAAGGCGCTA\t||||||||||||||||||||||||||||||\n+83_0.533333\t30\tgnl|BL_ORD_ID|0\t5528445\t59.96\t8.38112e-11\t1\t30\t5430210\t5430239\t1\t1\t30\t30\tCGCCAGGGACGTATCGCGTCGATATCTATT\tCGCCAGGGACGTATCGCGTCGATATCTATT\t||||||||||||||||||||||||||||||\n+84_0.533333\t30\tgnl|BL_ORD_ID|0\t5528445\t59.96\t8.38112e-11\t1\t30\t310995\t311024\t1\t1\t30\t30\tTGCTCGTTCCCGTCGTGATGAAGCTCGAAA\tTGCTCGTTCCCGTCGTGATGAAGCTCGAAA\t||||||||||||||||||||||||||||||\n+85_0.500000\t30\tgnl|BL_ORD_ID|0\t5528445\t59.96\t8.38112e-11\t1\t30\t4174173\t4174202\t1\t1\t30\t30\tAGGAAAGCAAACAACACGACCACCATCAGC\tAGGAAAGCAAACAACACGACCACCATCAGC\t||||||||||||||||||||||||||||||\n+86_0.566667\t30\tgnl|BL_ORD_ID|0\t5528445\t59.96\t8.38112e-11\t1\t30\t163690\t163719\t1\t1\t30\t30\tGGCAACGCAGGCGCATGATTCTGCTTGGAA\tGGCAACGCAGGCGCATGATTCTGCTTGGAA\t||||||||||||||||||||||||||||||\n+88_0.533333\t30\tgnl|BL_ORD_ID|0\t5528445\t59.96\t8.38112e-11\t1\t30\t3071785\t3071814\t1\t1\t30\t30\tCGTACCGGGCTGAAAGTAGAAGAGCGTTTC\tCGTACCGGGCTGAAAGTAGAAGAGCGTTTC\t||||||||||||||||||||||||||||||\n+90_0.466667\t30\tgnl|BL_ORD_ID|0\t5528445\t59.96\t8.38112e-11\t1\t30\t692131\t692160\t1\t1\t30\t30\tATCACCGTTTCGCTAACCGGTACGTTTAAC\tATCACCGTTTCGCTAACCGGTACGTTTAAC\t||||||||||||||||||||||||||||||\n+91_0.566667\t30\tgnl|BL_ORD_ID|0\t5528445\t59.96\t8.38112e-11\t1\t30\t3307050\t3307079\t1\t1\t30\t30\tTTCGCCCGGCAAGCTTACCCAACGCTTATC\tTTCGCCCGGCAAGCTTACCCAACGCTTATC\t||||||||||||||||||||||||||||||\n+94_0.466667\t30\tgnl|BL_ORD_ID|0\t55284
45\t59.96\t8.38112e-11\t1\t30\t1813009\t1813038\t1\t1\t30\t30\tCCACGGTGATATCTGGTGCCATACTGATAA\tCCACGGTGATATCTGGTGCCATACTGATAA\t||||||||||||||||||||||||||||||\n+96_0.533333\t30\tgnl|BL_ORD_ID|0\t5528445\t59.96\t8.38112e-11\t1\t30\t573782\t573811\t1\t1\t30\t30\tTTGCCGGGAAGAGAGATATCAATGGCAGGC\tTTGCCGGGAAGAGAGATATCAATGGCAGGC\t||||||||||||||||||||||||||||||\n+97_0.566667\t30\tgnl|BL_ORD_ID|0\t5528445\t59.96\t8.38112e-11\t1\t30\t3570563\t3570592\t1\t1\t30\t30\tTGCGCCGCCGGATTGTTGCTCAACATGCTT\tTGCGCCGCCGGATTGTTGCTCAACATGCTT\t||||||||||||||||||||||||||||||\n+98_0.366667\t30\tgnl|BL_ORD_ID|0\t5528445\t59.96\t8.38112e-11\t1\t30\t4545136\t4545165\t1\t1\t30\t30\tAACGCGCTAACCGCCAATAATAACAAAATT\tAACGCGCTAACCGCCAATAATAACAAAATT\t||||||||||||||||||||||||||||||\n'