Mercurial > repos > peterjc > blast2go
comparison tools/blast2go/massage_xml_for_blast2go.py @ 8:e23b621eb7bb draft
Uploaded v0.0.9, embed citation, updated README
author | peterjc |
---|---|
date | Thu, 26 Mar 2015 11:15:22 -0400 |
parents | |
children | 887adf823bc0 |
comparison
equal
deleted
inserted
replaced
7:0ac3ef59ea93 | 8:e23b621eb7bb |
---|---|
1 #!/usr/bin/env python | |
2 """Script for reformatting Blast XML to suit Blast2GO. | |
3 | |
4 This script takes exactly two command line arguments: | |
5 * Input BLAST XML filename | |
6 * Output BLAST XML filename | |
7 | |
8 Sadly b2g4pipe (at least v2.3.5 to v2.5.0) cannot cope with current | |
9 style large BLAST XML files (e.g. from BLAST 2.2.25+), so we reformat | |
10 these to avoid it crashing with a Java heap space OutOfMemoryError. | |
11 | |
12 As part of this reformatting, we check for BLASTP or BLASTX output | |
13 (otherwise raise an error), and print the query count. | |
14 | |
15 This script is called from my Galaxy wrapper for Blast2GO for pipelines, | |
16 available from the Galaxy Tool Shed here: | |
17 http://toolshed.g2.bx.psu.edu/view/peterjc/blast2go | |
18 | |
19 This script is under version control here: | |
20 https://github.com/peterjc/galaxy_blast/tree/master/blast2go | |
21 """ | |
22 import sys | |
23 import os | |
24 import subprocess | |
25 | |
26 def stop_err(msg, error_level=1): | |
27 """Print error message to stdout and quit with given error level.""" | |
28 sys.stderr.write("%s\n" % msg) | |
29 sys.exit(error_level) | |
30 | |
31 def prepare_xml(original_xml, mangled_xml): | |
32 """Reformat BLAST XML to suit Blast2GO. | |
33 | |
34 Blast2GO can't cope with 1000s of <Iteration> tags within a | |
35 single <BlastResult> tag, so instead split this into one | |
36 full XML record per interation (i.e. per query). This gives | |
37 a concatenated XML file mimicing old versions of BLAST. | |
38 | |
39 This also checks for BLASTP or BLASTX output, and outputs | |
40 the number of queries. Galaxy will show this as "info". | |
41 """ | |
42 in_handle = open(original_xml) | |
43 footer = " </BlastOutput_iterations>\n</BlastOutput>\n" | |
44 header = "" | |
45 while True: | |
46 line = in_handle.readline() | |
47 if not line: | |
48 #No hits? | |
49 stop_err("Problem with XML file?") | |
50 if line.strip() == "<Iteration>": | |
51 break | |
52 header += line | |
53 | |
54 if "<BlastOutput_program>blastx</BlastOutput_program>" in header: | |
55 print "BLASTX output identified" | |
56 elif "<BlastOutput_program>blastp</BlastOutput_program>" in header: | |
57 print "BLASTP output identified" | |
58 else: | |
59 in_handle.close() | |
60 stop_err("Expect BLASTP or BLASTX output") | |
61 | |
62 out_handle = open(mangled_xml, "w") | |
63 out_handle.write(header) | |
64 out_handle.write(line) | |
65 count = 1 | |
66 while True: | |
67 line = in_handle.readline() | |
68 if not line: | |
69 break | |
70 elif line.strip() == "<Iteration>": | |
71 #Insert footer/header | |
72 out_handle.write(footer) | |
73 out_handle.write(header) | |
74 count += 1 | |
75 out_handle.write(line) | |
76 | |
77 out_handle.close() | |
78 in_handle.close() | |
79 print "Input has %i queries" % count | |
80 | |
81 | |
82 if __name__ == "__main__": | |
83 # Run the conversion... | |
84 if len(sys.argv) != 3: | |
85 stop_err("Require two arguments: XML input filename, XML output filename") | |
86 | |
87 xml_file, out_xml_file = sys.argv[1:] | |
88 | |
89 if not os.path.isfile(xml_file): | |
90 stop_err("Input BLAST XML file not found: %s" % xml_file) | |
91 | |
92 prepare_xml(xml_file, out_xml_file) |