Next changeset 1:fbff957a3655 (2020-09-09) |
Commit message:
Imported from capsule None |
added:
megablast_xml_parser.py megablast_xml_parser.xml repository_dependencies.xml test-data/megablast_xml_parser_test1.gz test-data/megablast_xml_parser_test1_out.tabular |
b |
diff -r 000000000000 -r 03ca082aeb2e megablast_xml_parser.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/megablast_xml_parser.py Mon May 19 12:33:19 2014 -0400 |
[ |
#!/usr/bin/env python
"""Convert NCBI BLAST XML output into a tab-delimited table.

Usage: megablast_xml_parser.py <blast_output.xml> <output.tabular>

One row is written per <Hsp> element.  Each row has 17 columns: query
name, query length, subject name, subject length, followed by the text
of the <Hsp> child elements listed in ``HSP_TAGS`` (in that order).
"""

import os
import re
import sys

try:
    import xml.etree.cElementTree as ElementTree
except ImportError:
    # The C accelerator alias was removed in Python 3.9; the plain module
    # exposes the same iterparse API.
    import xml.etree.ElementTree as ElementTree

# Child elements of <Hsp> written, in order, as columns 5-17 of each row.
HSP_TAGS = (
    "Hsp_bit-score",
    "Hsp_evalue",
    "Hsp_query-from",
    "Hsp_query-to",
    "Hsp_hit-from",
    "Hsp_hit-to",
    "Hsp_query-frame",
    "Hsp_hit-frame",
    "Hsp_identity",
    "Hsp_align-len",
    "Hsp_qseq",
    "Hsp_hseq",
    "Hsp_midline",
)


def stop_err(msg):
    """Write *msg* to stderr and terminate with a non-zero exit status."""
    sys.stderr.write("%s\n" % msg)
    sys.exit(1)


def _subject_name(hit_id):
    """Return the subject name to report for a <Hit_id> value.

    Identifiers starting with "gi" are reduced to their second
    pipe-delimited field (``gi|123|ref|X`` -> ``123``).  An identifier
    that starts with "gi" but contains no ``|`` is returned unchanged
    instead of raising IndexError.

    Raises:
        ValueError: if *hit_id* is None, i.e. the <Hit> has no <Hit_id>.
    """
    if hit_id is None:
        raise ValueError("<Hit> element without <Hit_id>")
    if hit_id.startswith("gi"):
        fields = hit_id.split("|")
        if len(fields) > 1:
            return fields[1]
    return hit_id


def _write_hsp_rows(context, root, outfile):
    """Write one tab-delimited row to *outfile* per <Hsp> in the stream.

    *context* is an iterator of (event, element) pairs from iterparse and
    *root* is the document's root element.
    """
    for event, elem in context:
        # An <Iteration> has all of its children only at its end tag.
        if event != "end" or elem.tag != "Iteration":
            continue
        query = elem.findtext("Iteration_query-def")
        query_len = elem.findtext("Iteration_query-len")
        for hit in elem.findall("Iteration_hits/Hit"):
            subject = _subject_name(hit.findtext("Hit_id"))
            subject_len = hit.findtext("Hit_len")
            for hsp in hit.findall("Hit_hsps/Hsp"):
                row = [query, query_len, subject, subject_len]
                row.extend(hsp.findtext(tag) for tag in HSP_TAGS)
                # "%s" renders an absent element as "None", like the
                # per-field writes this replaces.
                outfile.write("\t".join("%s" % field for field in row) + "\n")
        # Discard what has been processed so the in-memory tree does not
        # keep growing while the file is streamed.
        root.clear()
        elem.clear()


def __main__():
    """Entry point: read the XML named by argv[1], write rows to argv[2].

    Every failure is reported through stop_err(), which exits the process.
    """
    if len(sys.argv) < 3:
        stop_err("Usage: megablast_xml_parser.py <blast_output.xml> <output.tabular>")
    source = sys.argv[1]

    # Open the event stream and pull the first event to get the root element.
    try:
        context = iter(ElementTree.iterparse(source, events=("start", "end")))
        event, root = next(context)
    except Exception:
        stop_err("Invalid data format.")

    with open(sys.argv[2], "w") as outfile:
        try:
            _write_hsp_rows(context, root, outfile)
        except Exception:
            # stop_err raises SystemExit; the with-block still closes outfile.
            stop_err("The input data is malformed, or there is more than one dataset in the input file. Error: %s" % sys.exc_info()[1])


if __name__ == "__main__":
    __main__()
b |
diff -r 000000000000 -r 03ca082aeb2e megablast_xml_parser.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/megablast_xml_parser.xml Mon May 19 12:33:19 2014 -0400 |
b |
@@ -0,0 +1,60 @@ +<tool id="megablast_xml_parser" name="Parse blast XML output" version="1.0.0"> +<description></description> +<command interpreter="python">megablast_xml_parser.py $input1 $output1</command> +<inputs> + <param name="input1" type="data" format="blastxml" label="Megablast XML output" /> +</inputs> +<outputs> + <data name="output1" format="tabular"/> +</outputs> +<tests> + <test> + <param name="input1" value="megablast_xml_parser_test1.gz" ftype="blastxml" /> + <output name="output1" file="megablast_xml_parser_test1_out.tabular" ftype="tabular" /> + </test> +</tests> +<help> + +**What it does** + +This tool processes the XML output of any NCBI blast tool (if you run your own blast jobs, the XML output can be generated with the **-m 7** option). + +----- + +**Output fields** + +This tool returns tab-delimited output with the following fields:: + + Description Example + ----------------------------------------- ----------------- + + 1. Name of the query sequence Seq1 + 2. Length of the query sequence 30 + 3. Name of target sequence gnl|BL_ORD_ID|0 + 4. Length of target sequence 5528445 + 5. Alignment bit score 59.96 + 6. E-value 8.38112e-11 + 7. Start of alignment within query 1 + 8. End of alignment within query 30 + 9. Start of alignment within target 5436010 + 10. End of alignment within target 5436039 + 11. Query frame 1 + 12. Target frame 1 + 13. Number of identical bases within 29 + the alignment + 14. Alignment length 30 + 15. Aligned portion (sequence) of query CGGACAGCGCCGCCACCAACAAAGCCACCA + 16. Aligned portion (sequence) of target CGGACAGCGCCGCCACCAACAAAGCCATCA + 17. Midline indicating positions of ||||||||||||||||||||||||||| || + matches within the alignment + +------ + +.. class:: infomark + +Note that this form of output does not contain an alignment identity value. 
However, it can be computed by dividing the number of identical bases within the alignment (Field 13) by the alignment length (Field 14) using the *Text Manipulation->Compute* tool. + + + +</help> +</tool> |
b |
diff -r 000000000000 -r 03ca082aeb2e repository_dependencies.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/repository_dependencies.xml Mon May 19 12:33:19 2014 -0400 |
b |
@@ -0,0 +1,4 @@ +<?xml version="1.0"?> +<repositories> + <repository changeset_revision="de11e1a921c4" name="blast_datatypes" owner="devteam" toolshed="http://toolshed.g2.bx.psu.edu" /> +</repositories> |
b |
diff -r 000000000000 -r 03ca082aeb2e test-data/megablast_xml_parser_test1.gz |
b |
Binary file test-data/megablast_xml_parser_test1.gz has changed |
b |
diff -r 000000000000 -r 03ca082aeb2e test-data/megablast_xml_parser_test1_out.tabular --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/megablast_xml_parser_test1_out.tabular Mon May 19 12:33:19 2014 -0400 |
b |
b'@@ -0,0 +1,103 @@\n+0_0.666667\t30\tgnl|BL_ORD_ID|0\t5528445\t59.96\t8.38112e-11\t1\t30\t5436010\t5436039\t1\t1\t30\t30\tCGGACAGCGCCGCCACCAACAAAGCCACCA\tCGGACAGCGCCGCCACCAACAAAGCCACCA\t||||||||||||||||||||||||||||||\n+1_0.600000\t30\tgnl|BL_ORD_ID|0\t5528445\t59.96\t8.38112e-11\t1\t30\t696993\t697022\t1\t1\t30\t30\tAAAACACCGGATGCTCCGGCGCTGGCAGAT\tAAAACACCGGATGCTCCGGCGCTGGCAGAT\t||||||||||||||||||||||||||||||\n+2_0.400000\t30\tgnl|BL_ORD_ID|0\t5528445\t59.96\t8.38112e-11\t1\t30\t4100018\t4100047\t1\t1\t30\t30\tTTTGCTTTTAGTACACCGGATTCAGAACCA\tTTTGCTTTTAGTACACCGGATTCAGAACCA\t||||||||||||||||||||||||||||||\n+3_0.566667\t30\tgnl|BL_ORD_ID|0\t5528445\t59.96\t8.38112e-11\t30\t1\t2305844\t2305873\t1\t-1\t30\t30\tCCGTCCAGAAAGGTGTATTCATGGGGACGG\tCCGTCCAGAAAGGTGTATTCATGGGGACGG\t||||||||||||||||||||||||||||||\n+4_0.766667\t30\tgnl|BL_ORD_ID|0\t5528445\t59.96\t8.38112e-11\t1\t30\t3457901\t3457930\t1\t1\t30\t30\tCACGCTACGTGCGCCCCCGCCCAGAAGGCG\tCACGCTACGTGCGCCCCCGCCCAGAAGGCG\t||||||||||||||||||||||||||||||\n+5_0.533333\t30\tgnl|BL_ORD_ID|0\t5528445\t59.96\t8.38112e-11\t30\t1\t1264131\t1264160\t1\t-1\t30\t30\tGCACTTAACCCGCTTCGGCGGGTTTTGTTT\tGCACTTAACCCGCTTCGGCGGGTTTTGTTT\t||||||||||||||||||||||||||||||\n+5_0.533333\t30\tgnl|BL_ORD_ID|0\t5528445\t59.96\t8.38112e-11\t30\t1\t1636485\t1636514\t1\t-1\t30\t30\tGCACTTAACCCGCTTCGGCGGGTTTTGTTT\tGCACTTAACCCGCTTCGGCGGGTTTTGTTT\t||||||||||||||||||||||||||||||\n+5_0.533333\t30\tgnl|BL_ORD_ID|0\t5528445\t59.96\t8.38112e-11\t1\t30\t2318694\t2318723\t1\t1\t30\t30\tAAACAAAACCCGCCGAAGCGGGTTAAGTGC\tAAACAAAACCCGCCGAAGCGGGTTAAGTGC\t||||||||||||||||||||||||||||||\n+5_0.533333\t30\tgnl|BL_ORD_ID|0\t5528445\t59.96\t8.38112e-11\t1\t30\t2777043\t2777072\t1\t1\t30\t30\tAAACAAAACCCGCCGAAGCGGGTTAAGTGC\tAAACAAAACCCGCCGAAGCGGGTTAAGTGC\t||||||||||||||||||||||||||||||\n+6_0.533333\t30\tgnl|BL_ORD_ID|0\t5528445\t59.96\t8.38112e-11\t1\t30\t1185124\t1185153\t1\t1\t30\t30\tTAAGCCGTTACTGGCAGCAAGTGCAGGCAA\tTAAGCCGTTACTGGCAGCAAGTGCAGGCAA\t||||||||||||||||||||||||||||||
\n+7_0.400000\t30\tgnl|BL_ORD_ID|0\t5528445\t59.96\t8.38112e-11\t1\t30\t2458203\t2458232\t1\t1\t30\t30\tTGAATTTACCGTTATCTATCTTGCCTGCCT\tTGAATTTACCGTTATCTATCTTGCCTGCCT\t||||||||||||||||||||||||||||||\n+9_0.400000\t30\tgnl|BL_ORD_ID|0\t5528445\t59.96\t8.38112e-11\t1\t30\t5286666\t5286695\t1\t1\t30\t30\tGCGTTTTGCTAAACTTCTGCCGGAATATAA\tGCGTTTTGCTAAACTTCTGCCGGAATATAA\t||||||||||||||||||||||||||||||\n+10_0.500000\t30\tgnl|BL_ORD_ID|0\t5528445\t59.96\t8.38112e-11\t1\t30\t2787672\t2787701\t1\t1\t30\t30\tAAAGAGGCGAGCAGAGTAAAACGCAGGCAA\tAAAGAGGCGAGCAGAGTAAAACGCAGGCAA\t||||||||||||||||||||||||||||||\n+12_0.700000\t30\tgnl|BL_ORD_ID|0\t5528445\t59.96\t8.38112e-11\t1\t30\t4116145\t4116174\t1\t1\t30\t30\tGCGGACGATCTTCACGGTCGCCACGCGGAC\tGCGGACGATCTTCACGGTCGCCACGCGGAC\t||||||||||||||||||||||||||||||\n+13_0.533333\t30\tgnl|BL_ORD_ID|0\t5528445\t59.96\t8.38112e-11\t1\t30\t5442351\t5442380\t1\t1\t30\t30\tTTCTTGTTGGATGGCATACTCCGGCAGCCA\tTTCTTGTTGGATGGCATACTCCGGCAGCCA\t||||||||||||||||||||||||||||||\n+14_0.666667\t30\tgnl|BL_ORD_ID|0\t5528445\t59.96\t8.38112e-11\t1\t30\t4626492\t4626521\t1\t1\t30\t30\tACCCCGATATCGTCGCAGGCGTTGCCGCAC\tACCCCGATATCGTCGCAGGCGTTGCCGCAC\t||||||||||||||||||||||||||||||\n+15_0.666667\t30\tgnl|BL_ORD_ID|0\t5528445\t59.96\t8.38112e-11\t1\t30\t346897\t346926\t1\t1\t30\t30\tGGCTGCTGTCTCCGCTGGAAGAGGCGCTTC\tGGCTGCTGTCTCCGCTGGAAGAGGCGCTTC\t||||||||||||||||||||||||||||||\n+15_0.666667\t30\tgnl|BL_ORD_ID|0\t5528445\t59.96\t8.38112e-11\t30\t1\t1466536\t1466565\t1\t-1\t30\t30\tGAAGCGCCTCTTCCAGCGGAGACAGCAGCC\tGAAGCGCCTCTTCCAGCGGAGACAGCAGCC\t||||||||||||||||||||||||||||||\n+15_0.666667\t30\tgnl|BL_ORD_ID|0\t5528445\t59.96\t8.38112e-11\t1\t30\t1750132\t1750161\t1\t1\t30\t30\tGGCTGCTGTCTCCGCTGGAAGAGGCGCTTC\tGGCTGCTGTCTCCGCTGGAAGAGGCGCTTC\t||||||||||||||||||||||||||||||\n+15_0.666667\t30\tgnl|BL_ORD_ID|0\t5528445\t59.96\t8.38112e-11\t1\t30\t1908046\t1908075\t1\t1\t30\t30\tGGCTGCTGTCTCCGCTGGAAGAGGCGCTTC\tGGCTGCTGTCTCCGCTGGAAGAGGCGCTTC\t||||||||||||||||||||||||||||||\n+15_0.66666
7\t30\tgnl|BL_ORD_ID|0\t5528445\t59.96\t8.38112e-11\t1\t30\t2279107\t2279136\t1\t1\t30\t30\tGGCTGCTGTCTCCGCTGGAAGAGGCGCTTC\tGGCTGCTGTCTCCGCTGGAAGAGGCGCTTC\t||||||||||||||||||||||||||||||\n+15_0.666667\t30\tgnl|BL_ORD_ID|0\t5528445\t59.96\t8.38112e-11\t1\t30\t2813407\t2813436\t1\t1\t30\t30\tGGCTGCTGTCTCCGCTGGAAGAGGCGCTTC\tGGCTGCTGTCTCCGCTGGAAGAGGCGCTTC\t||||||||||||||||||||'..b'||||\n+70_0.466667\t30\tgnl|BL_ORD_ID|0\t5528445\t59.96\t8.38112e-11\t1\t30\t1317239\t1317268\t1\t1\t30\t30\tTGGTGTTCAGCATCTCAACGGTAATTCGCT\tTGGTGTTCAGCATCTCAACGGTAATTCGCT\t||||||||||||||||||||||||||||||\n+71_0.533333\t30\tgnl|BL_ORD_ID|0\t5528445\t59.96\t8.38112e-11\t1\t30\t5080463\t5080492\t1\t1\t30\t30\tCAGGATGCAAACTGCCGGGAGATCCAGTTA\tCAGGATGCAAACTGCCGGGAGATCCAGTTA\t||||||||||||||||||||||||||||||\n+72_0.533333\t30\tgnl|BL_ORD_ID|0\t5528445\t59.96\t8.38112e-11\t1\t30\t3605409\t3605438\t1\t1\t30\t30\tAACTGGAAGGGCTTGGGATGACACAACAGC\tAACTGGAAGGGCTTGGGATGACACAACAGC\t||||||||||||||||||||||||||||||\n+73_0.500000\t30\tgnl|BL_ORD_ID|0\t5528445\t59.96\t8.38112e-11\t1\t30\t3280815\t3280844\t1\t1\t30\t30\tTTTAAGCGCCAACCAGGCTTCTTTGGTTGC\tTTTAAGCGCCAACCAGGCTTCTTTGGTTGC\t||||||||||||||||||||||||||||||\n+75_0.533333\t30\tgnl|BL_ORD_ID|0\t5528445\t59.96\t8.38112e-11\t1\t30\t2230266\t2230295\t1\t1\t30\t30\tATAACCCTCTGCAACCGCCGCTTCAGCAAA\tATAACCCTCTGCAACCGCCGCTTCAGCAAA\t||||||||||||||||||||||||||||||\n+76_0.600000\t30\tgnl|BL_ORD_ID|0\t5528445\t59.96\t8.38112e-11\t1\t30\t462631\t462660\t1\t1\t30\t30\tTGAAGCCGTACAACGGGCGCTGGAATTCGC\tTGAAGCCGTACAACGGGCGCTGGAATTCGC\t||||||||||||||||||||||||||||||\n+77_0.700000\t30\tgnl|BL_ORD_ID|0\t5528445\t59.96\t8.38112e-11\t1\t30\t2939076\t2939105\t1\t1\t30\t30\tGAGCTGCAACGCGGTCAGCCAGCTGGCGGT\tGAGCTGCAACGCGGTCAGCCAGCTGGCGGT\t||||||||||||||||||||||||||||||\n+78_0.566667\t30\tgnl|BL_ORD_ID|0\t5528445\t59.96\t8.38112e-11\t1\t30\t5422053\t5422082\t1\t1\t30\t30\tCGGAGTATCCGTTCCCCAACGACAAGCATC\tCGGAGTATCCGTTCCCCAACGACAAGCATC\t||||||||||||||||||||||||||||||\n+79_0.500000\t30\tgnl|BL
_ORD_ID|0\t5528445\t59.96\t8.38112e-11\t1\t30\t4387420\t4387449\t1\t1\t30\t30\tAATACCGGGAAGAGACAACGGGGTCTCTTT\tAATACCGGGAAGAGACAACGGGGTCTCTTT\t||||||||||||||||||||||||||||||\n+81_0.433333\t30\tgnl|BL_ORD_ID|0\t5528445\t59.96\t8.38112e-11\t1\t30\t1037150\t1037179\t1\t1\t30\t30\tATTAATGTTGCCGGCACAACATAATAGGGC\tATTAATGTTGCCGGCACAACATAATAGGGC\t||||||||||||||||||||||||||||||\n+82_0.500000\t30\tgnl|BL_ORD_ID|0\t5528445\t59.96\t8.38112e-11\t1\t30\t1182188\t1182217\t1\t1\t30\t30\tACTGGGTTGCTCTGAACAAGAAAGGCGCTA\tACTGGGTTGCTCTGAACAAGAAAGGCGCTA\t||||||||||||||||||||||||||||||\n+83_0.533333\t30\tgnl|BL_ORD_ID|0\t5528445\t59.96\t8.38112e-11\t1\t30\t5430210\t5430239\t1\t1\t30\t30\tCGCCAGGGACGTATCGCGTCGATATCTATT\tCGCCAGGGACGTATCGCGTCGATATCTATT\t||||||||||||||||||||||||||||||\n+84_0.533333\t30\tgnl|BL_ORD_ID|0\t5528445\t59.96\t8.38112e-11\t1\t30\t310995\t311024\t1\t1\t30\t30\tTGCTCGTTCCCGTCGTGATGAAGCTCGAAA\tTGCTCGTTCCCGTCGTGATGAAGCTCGAAA\t||||||||||||||||||||||||||||||\n+85_0.500000\t30\tgnl|BL_ORD_ID|0\t5528445\t59.96\t8.38112e-11\t1\t30\t4174173\t4174202\t1\t1\t30\t30\tAGGAAAGCAAACAACACGACCACCATCAGC\tAGGAAAGCAAACAACACGACCACCATCAGC\t||||||||||||||||||||||||||||||\n+86_0.566667\t30\tgnl|BL_ORD_ID|0\t5528445\t59.96\t8.38112e-11\t1\t30\t163690\t163719\t1\t1\t30\t30\tGGCAACGCAGGCGCATGATTCTGCTTGGAA\tGGCAACGCAGGCGCATGATTCTGCTTGGAA\t||||||||||||||||||||||||||||||\n+88_0.533333\t30\tgnl|BL_ORD_ID|0\t5528445\t59.96\t8.38112e-11\t1\t30\t3071785\t3071814\t1\t1\t30\t30\tCGTACCGGGCTGAAAGTAGAAGAGCGTTTC\tCGTACCGGGCTGAAAGTAGAAGAGCGTTTC\t||||||||||||||||||||||||||||||\n+90_0.466667\t30\tgnl|BL_ORD_ID|0\t5528445\t59.96\t8.38112e-11\t1\t30\t692131\t692160\t1\t1\t30\t30\tATCACCGTTTCGCTAACCGGTACGTTTAAC\tATCACCGTTTCGCTAACCGGTACGTTTAAC\t||||||||||||||||||||||||||||||\n+91_0.566667\t30\tgnl|BL_ORD_ID|0\t5528445\t59.96\t8.38112e-11\t1\t30\t3307050\t3307079\t1\t1\t30\t30\tTTCGCCCGGCAAGCTTACCCAACGCTTATC\tTTCGCCCGGCAAGCTTACCCAACGCTTATC\t||||||||||||||||||||||||||||||\n+94_0.466667\t30\tgnl|BL_ORD_ID|0\t55284
45\t59.96\t8.38112e-11\t1\t30\t1813009\t1813038\t1\t1\t30\t30\tCCACGGTGATATCTGGTGCCATACTGATAA\tCCACGGTGATATCTGGTGCCATACTGATAA\t||||||||||||||||||||||||||||||\n+96_0.533333\t30\tgnl|BL_ORD_ID|0\t5528445\t59.96\t8.38112e-11\t1\t30\t573782\t573811\t1\t1\t30\t30\tTTGCCGGGAAGAGAGATATCAATGGCAGGC\tTTGCCGGGAAGAGAGATATCAATGGCAGGC\t||||||||||||||||||||||||||||||\n+97_0.566667\t30\tgnl|BL_ORD_ID|0\t5528445\t59.96\t8.38112e-11\t1\t30\t3570563\t3570592\t1\t1\t30\t30\tTGCGCCGCCGGATTGTTGCTCAACATGCTT\tTGCGCCGCCGGATTGTTGCTCAACATGCTT\t||||||||||||||||||||||||||||||\n+98_0.366667\t30\tgnl|BL_ORD_ID|0\t5528445\t59.96\t8.38112e-11\t1\t30\t4545136\t4545165\t1\t1\t30\t30\tAACGCGCTAACCGCCAATAATAACAAAATT\tAACGCGCTAACCGCCAATAATAACAAAATT\t||||||||||||||||||||||||||||||\n' |