annotate megablast_xml_parser.py @ 1:fbff957a3655 draft default tip

"planemo upload for repository https://github.com/galaxyproject/tools-devteam/tree/master/tools/megablast_xml_parser commit 3ccddd4e2032535ead030efa401e690ffb80d145"
author devteam
date Wed, 09 Sep 2020 10:27:39 +0000
parents 03ca082aeb2e
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
03ca082aeb2e Imported from capsule None
devteam
parents:
diff changeset
1 #!/usr/bin/env python
03ca082aeb2e Imported from capsule None
devteam
parents:
diff changeset
2
1
fbff957a3655 "planemo upload for repository https://github.com/galaxyproject/tools-devteam/tree/master/tools/megablast_xml_parser commit 3ccddd4e2032535ead030efa401e690ffb80d145"
devteam
parents: 0
diff changeset
import re
import sys

try:
    # The cElementTree alias was removed in Python 3.9; keep it for older
    # interpreters and fall back to the regular module otherwise.
    import xml.etree.cElementTree as ElementTree
except ImportError:
    import xml.etree.ElementTree as ElementTree
0
03ca082aeb2e Imported from capsule None
devteam
parents:
diff changeset
6
03ca082aeb2e Imported from capsule None
devteam
parents:
diff changeset
7
03ca082aeb2e Imported from capsule None
devteam
parents:
diff changeset
def _subject_id(hit_id):
    """Return the subject identifier to report for a ``<Hit_id>`` value.

    Ids that start with ``gi`` and contain at least one ``|`` (for example
    ``gi|123|ref|NM_1|``) are reduced to their second ``|``-separated field.
    Any other id is returned unchanged, so an id that merely begins with
    the letters "gi" no longer aborts the conversion.

    Raises:
        ValueError: if ``hit_id`` is None, i.e. the ``<Hit>`` element had no
            ``<Hit_id>`` child.
    """
    if hit_id is None:
        raise ValueError("<Hit> element without <Hit_id>")
    if hit_id.startswith("gi"):
        fields = hit_id.split("|")
        if len(fields) > 1:
            return fields[1]
    return hit_id


def __main__():
    """Convert a BLAST XML report into a tab-separated table.

    Command line: ``<script> <input.xml> <output.tsv>``. The input path is
    read from ``sys.argv[1]`` and the output path from ``sys.argv[2]``.

    One output line is written per ``<Hsp>`` element. Its columns are the
    query definition, query length, subject id, subject length, and then
    the text of each HSP child element in the order of ``hspTags`` below.
    An element that is absent is written as the string ``None``.

    The process exits through ``sys.exit`` with a message when the
    arguments are missing, when the input cannot be opened or its first
    XML event cannot be read, or when an error occurs while converting.
    """
    if len(sys.argv) < 3:
        sys.exit("Usage: %s <input.xml> <output.tabular>" % sys.argv[0])
    source = sys.argv[1]
    hspTags = ["Hsp_bit-score",
               "Hsp_evalue",
               "Hsp_query-from",
               "Hsp_query-to",
               "Hsp_hit-from",
               "Hsp_hit-to",
               "Hsp_query-frame",
               "Hsp_hit-frame",
               "Hsp_identity",
               "Hsp_align-len",
               "Hsp_qseq",
               "Hsp_hseq",
               "Hsp_midline"]

    # Parse incrementally; the first event yields the root element, which is
    # kept so it can be emptied after every processed <Iteration>.
    try:
        context = iter(ElementTree.iterparse(source, events=("start", "end")))
        _, root = next(context)
    except Exception:
        sys.exit("Invalid data format.")

    with open(sys.argv[2], 'w') as outfile:
        try:
            for event, elem in context:
                # Only a completed <Iteration> has all of its hits available.
                if event != "end" or elem.tag != "Iteration":
                    continue
                query = elem.findtext("Iteration_query-def")
                qLen = elem.findtext("Iteration_query-len")
                # for every <Hit> within <Iteration>
                for hit in elem.findall("Iteration_hits/Hit"):
                    subject = _subject_id(hit.findtext("Hit_id"))
                    sLen = hit.findtext("Hit_len")
                    # for every <Hsp> within <Hit>
                    for hsp in hit.findall("Hit_hsps/Hsp"):
                        row = [query, qLen, subject, sLen]
                        row.extend(hsp.findtext(tag) for tag in hspTags)
                        # "%s" renders a missing element as "None".
                        outfile.write("\t".join("%s" % value for value in row) + "\n")
                # prevents ElementTree from growing large datastructure
                root.clear()
                elem.clear()
        except Exception as err:
            sys.exit("The input data is malformed, or there is more than one dataset in the input file. Error: %s" % err)
0
03ca082aeb2e Imported from capsule None
devteam
parents:
diff changeset
61
03ca082aeb2e Imported from capsule None
devteam
parents:
diff changeset
62
1
fbff957a3655 "planemo upload for repository https://github.com/galaxyproject/tools-devteam/tree/master/tools/megablast_xml_parser commit 3ccddd4e2032535ead030efa401e690ffb80d145"
devteam
parents: 0
diff changeset
# Run the converter only when this file is executed as a script.
if __name__ == "__main__":
    __main__()