8
|
1 #!/usr/bin/env python
|
|
2 """Script for reformatting Blast XML to suit Blast2GO.
|
|
3
|
|
4 This script takes exactly two command line arguments:
|
|
5 * Input BLAST XML filename
|
|
6 * Output BLAST XML filename
|
|
7
|
|
8 Sadly b2g4pipe (at least v2.3.5 to v2.5.0) cannot cope with current
|
|
9 style large BLAST XML files (e.g. from BLAST 2.2.25+), so we reformat
|
|
10 these to avoid it crashing with a Java heap space OutOfMemoryError.
|
|
11
|
|
12 As part of this reformatting, we check for BLASTP or BLASTX output
|
|
13 (otherwise raise an error), and print the query count.
|
|
14
|
|
15 This script is called from my Galaxy wrapper for Blast2GO for pipelines,
|
|
16 available from the Galaxy Tool Shed here:
|
|
17 http://toolshed.g2.bx.psu.edu/view/peterjc/blast2go
|
|
18
|
|
19 This script is under version control here:
|
|
20 https://github.com/peterjc/galaxy_blast/tree/master/blast2go
|
|
21 """
|
|
22 import sys
|
|
23 import os
|
|
24 import subprocess
|
|
25
|
|
26 def stop_err(msg, error_level=1):
|
|
27 """Print error message to stdout and quit with given error level."""
|
|
28 sys.stderr.write("%s\n" % msg)
|
|
29 sys.exit(error_level)
|
|
30
|
|
31 def prepare_xml(original_xml, mangled_xml):
|
|
32 """Reformat BLAST XML to suit Blast2GO.
|
|
33
|
|
34 Blast2GO can't cope with 1000s of <Iteration> tags within a
|
|
35 single <BlastResult> tag, so instead split this into one
|
|
36 full XML record per interation (i.e. per query). This gives
|
|
37 a concatenated XML file mimicing old versions of BLAST.
|
|
38
|
|
39 This also checks for BLASTP or BLASTX output, and outputs
|
|
40 the number of queries. Galaxy will show this as "info".
|
|
41 """
|
|
42 in_handle = open(original_xml)
|
|
43 footer = " </BlastOutput_iterations>\n</BlastOutput>\n"
|
|
44 header = ""
|
|
45 while True:
|
|
46 line = in_handle.readline()
|
|
47 if not line:
|
|
48 #No hits?
|
|
49 stop_err("Problem with XML file?")
|
|
50 if line.strip() == "<Iteration>":
|
|
51 break
|
|
52 header += line
|
|
53
|
|
54 if "<BlastOutput_program>blastx</BlastOutput_program>" in header:
|
|
55 print "BLASTX output identified"
|
|
56 elif "<BlastOutput_program>blastp</BlastOutput_program>" in header:
|
|
57 print "BLASTP output identified"
|
|
58 else:
|
|
59 in_handle.close()
|
|
60 stop_err("Expect BLASTP or BLASTX output")
|
|
61
|
|
62 out_handle = open(mangled_xml, "w")
|
|
63 out_handle.write(header)
|
|
64 out_handle.write(line)
|
|
65 count = 1
|
|
66 while True:
|
|
67 line = in_handle.readline()
|
|
68 if not line:
|
|
69 break
|
|
70 elif line.strip() == "<Iteration>":
|
|
71 #Insert footer/header
|
|
72 out_handle.write(footer)
|
|
73 out_handle.write(header)
|
|
74 count += 1
|
|
75 out_handle.write(line)
|
|
76
|
|
77 out_handle.close()
|
|
78 in_handle.close()
|
|
79 print "Input has %i queries" % count
|
|
80
|
|
81
|
|
82 if __name__ == "__main__":
|
|
83 # Run the conversion...
|
|
84 if len(sys.argv) != 3:
|
|
85 stop_err("Require two arguments: XML input filename, XML output filename")
|
|
86
|
|
87 xml_file, out_xml_file = sys.argv[1:]
|
|
88
|
|
89 if not os.path.isfile(xml_file):
|
|
90 stop_err("Input BLAST XML file not found: %s" % xml_file)
|
|
91
|
|
92 prepare_xml(xml_file, out_xml_file)
|