comparison tools/blast2go/massage_xml_for_blast2go.py @ 8:e23b621eb7bb draft

Uploaded v0.0.9, embed citation, updated README
author peterjc
date Thu, 26 Mar 2015 11:15:22 -0400
parents
children 887adf823bc0
comparison
equal deleted inserted replaced
7:0ac3ef59ea93 8:e23b621eb7bb
1 #!/usr/bin/env python
2 """Script for reformatting Blast XML to suit Blast2GO.
3
4 This script takes exactly two command line arguments:
5 * Input BLAST XML filename
6 * Output BLAST XML filename
7
8 Sadly b2g4pipe (at least v2.3.5 to v2.5.0) cannot cope with current
9 style large BLAST XML files (e.g. from BLAST 2.2.25+), so we reformat
10 these to avoid it crashing with a Java heap space OutOfMemoryError.
11
12 As part of this reformatting, we check for BLASTP or BLASTX output
13 (otherwise raise an error), and print the query count.
14
15 This script is called from my Galaxy wrapper for Blast2GO for pipelines,
16 available from the Galaxy Tool Shed here:
17 http://toolshed.g2.bx.psu.edu/view/peterjc/blast2go
18
19 This script is under version control here:
20 https://github.com/peterjc/galaxy_blast/tree/master/blast2go
21 """
22 import sys
23 import os
24 import subprocess
25
26 def stop_err(msg, error_level=1):
27 """Print error message to stdout and quit with given error level."""
28 sys.stderr.write("%s\n" % msg)
29 sys.exit(error_level)
30
31 def prepare_xml(original_xml, mangled_xml):
32 """Reformat BLAST XML to suit Blast2GO.
33
34 Blast2GO can't cope with 1000s of <Iteration> tags within a
35 single <BlastResult> tag, so instead split this into one
36 full XML record per interation (i.e. per query). This gives
37 a concatenated XML file mimicing old versions of BLAST.
38
39 This also checks for BLASTP or BLASTX output, and outputs
40 the number of queries. Galaxy will show this as "info".
41 """
42 in_handle = open(original_xml)
43 footer = " </BlastOutput_iterations>\n</BlastOutput>\n"
44 header = ""
45 while True:
46 line = in_handle.readline()
47 if not line:
48 #No hits?
49 stop_err("Problem with XML file?")
50 if line.strip() == "<Iteration>":
51 break
52 header += line
53
54 if "<BlastOutput_program>blastx</BlastOutput_program>" in header:
55 print "BLASTX output identified"
56 elif "<BlastOutput_program>blastp</BlastOutput_program>" in header:
57 print "BLASTP output identified"
58 else:
59 in_handle.close()
60 stop_err("Expect BLASTP or BLASTX output")
61
62 out_handle = open(mangled_xml, "w")
63 out_handle.write(header)
64 out_handle.write(line)
65 count = 1
66 while True:
67 line = in_handle.readline()
68 if not line:
69 break
70 elif line.strip() == "<Iteration>":
71 #Insert footer/header
72 out_handle.write(footer)
73 out_handle.write(header)
74 count += 1
75 out_handle.write(line)
76
77 out_handle.close()
78 in_handle.close()
79 print "Input has %i queries" % count
80
81
82 if __name__ == "__main__":
83 # Run the conversion...
84 if len(sys.argv) != 3:
85 stop_err("Require two arguments: XML input filename, XML output filename")
86
87 xml_file, out_xml_file = sys.argv[1:]
88
89 if not os.path.isfile(xml_file):
90 stop_err("Input BLAST XML file not found: %s" % xml_file)
91
92 prepare_xml(xml_file, out_xml_file)