#!/usr/bin/env python
import argparse
import copy
import logging
import re
import sys

from BCBio import GFF

logging.basicConfig(level=logging.INFO)
log = logging.getLogger(name="blastxml2gff3")

__doc__ = """
BlastXML files, when transformed to GFF3, do not normally show gaps in the
blast hits. This tool aims to fill that "gap".
"""
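# Example invocation (assuming this script is saved as blastxml2gff3.py;
# the GFF3 output is written to standard output):
#
#   python blastxml2gff3.py input_blast.xml > output.gff3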


def blastxml2gff3(blastxml, min_gap=3, trim=False, trim_end=False, include_seq=False):
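    """Parse a BLAST XML file and yield one gapped-GFF3 SeqRecord per query.

    Each HSP becomes a ``match`` feature whose ``match_part`` children are
    split wherever ``min_gap`` or more consecutive non-matching columns occur.
    ``trim`` clips the parent match start/end to the query bounds, ``trim_end``
    clips only the end, and ``include_seq`` copies the aligned
    query/subject/match strings into the feature qualifiers.
    """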
    from Bio.Blast import NCBIXML
    from Bio.Seq import Seq
    from Bio.SeqRecord import SeqRecord
    from Bio.SeqFeature import SeqFeature, SimpleLocation

    blast_records = NCBIXML.parse(blastxml)
    for idx_record, record in enumerate(blast_records):
        # http://www.sequenceontology.org/browser/release_2.4/term/SO:0000343
        match_type = {  # Currently we can only handle BLASTN, BLASTP
            "BLASTN": "nucleotide_match",
            "BLASTP": "protein_match",
        }.get(record.application, "match")

        recid = record.query
        if " " in recid:
            recid = recid[0 : recid.index(" ")]

        rec = SeqRecord(Seq("ACTG"), id=recid)
        for idx_hit, hit in enumerate(record.alignments):
            for idx_hsp, hsp in enumerate(hit.hsps):
                qualifiers = {
                    "ID": "b2g.%s.%s.%s" % (idx_record, idx_hit, idx_hsp),
                    "source": "blast",
                    "score": hsp.expect,
                    "accession": hit.accession,
                    "hit_id": hit.hit_id,
                    "length": hit.length,
                    "hit_titles": hit.title.split(" >"),
                }
                if include_seq:
                    qualifiers.update(
                        {
                            "blast_qseq": hsp.query,
                            "blast_sseq": hsp.sbjct,
                            "blast_mseq": hsp.match,
                        }
                    )

                for prop in (
                    "score",
                    "bits",
                    "identities",
                    "positives",
                    "gaps",
                    "align_length",
                    "strand",
                    "frame",
                    "query_start",
                    "query_end",
                    "sbjct_start",
                    "sbjct_end",
                ):
                    qualifiers["blast_" + prop] = getattr(hsp, prop, None)

                desc = hit.title.split(" >")[0]
                qualifiers["description"] = desc[desc.index(" "):]

                # This required a fair bit of sketching out/math to figure out
                # the first time.
                #
                # the match_start location must account for queries and
                # subjects that start at locations other than 1
                parent_match_start = hsp.query_start - hsp.sbjct_start
                # The end is the start + hit.length because the match itself
                # may be longer than the parent feature, so we use the supplied
                # subject/hit length to calculate the real ending of the target
                # protein.
                parent_match_end = hsp.query_start + hit.length + hsp.query.count("-")

                # If we trim the left end, we need to trim without losing information.
                used_parent_match_start = parent_match_start
                if trim:
                    if parent_match_start < 1:
                        used_parent_match_start = 0

                if trim or trim_end:
                    if parent_match_end > hsp.query_end:
                        parent_match_end = hsp.query_end + 1

                # The ``match`` feature will hold one or more ``match_part``s
                top_feature = SeqFeature(
                    SimpleLocation(used_parent_match_start, parent_match_end, strand=0),
                    type=match_type,
                    qualifiers=qualifiers,
                )

                # Unlike the parent feature, ``match_part``s have sources.
                part_qualifiers = {
                    "source": "blast",
                }
                top_feature.sub_features = []
                for idx_part, (start, end, cigar) in enumerate(
                    generate_parts(
                        hsp.query, hsp.match, hsp.sbjct, ignore_under=min_gap
                    )
                ):
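                    # ``cigar`` is the GFF3 ``Gap`` attribute value for this
                    # match_part: a space-separated, CIGAR-like string of
                    # M/I/D operations produced by generate_parts() below.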
                    part_qualifiers["Gap"] = cigar
                    part_qualifiers["ID"] = qualifiers["ID"] + (".%s" % idx_part)

                    # We have to account for the subject start's location
                    match_part_start = parent_match_start + hsp.sbjct_start + start - 1

                    # We used to use hsp.align_length here, but that includes
                    # gaps in the parent sequence
                    #
                    # Furthermore align_length will give calculation errors in weird places
                    # So we just use (end-start) for simplicity
                    match_part_end = match_part_start + (end - start)

                    top_feature.sub_features.append(
                        SeqFeature(
                            SimpleLocation(match_part_start, match_part_end, strand=1),
                            type="match_part",
                            qualifiers=copy.deepcopy(part_qualifiers),
                        )
                    )

                rec.features.append(top_feature)
        rec.annotations = {}
        yield rec


def __remove_query_gaps(query, match, subject):
    """remove positions in all three based on gaps in query

    In order to simplify math and calculations...we remove all of the gaps
    based on gap locations in the query sequence::

        Q:ACTG-ACTGACTG
        S:ACTGAAC---CTG

    will become::

        Q:ACTGACTGACTG
        S:ACTGAC---CTG

    which greatly simplifies the process of identifying the correct location
    for a match_part
    """
    prev = 0
    fq = ""
    fm = ""
    fs = ""
    for position in re.finditer("-", query):
        fq += query[prev : position.start()]
        fm += match[prev : position.start()]
        fs += subject[prev : position.start()]
        prev = position.start() + 1
    fq += query[prev:]
    fm += match[prev:]
    fs += subject[prev:]

    return (fq, fm, fs)


def generate_parts(query, match, subject, ignore_under=3):
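    """Split an HSP alignment into match_part regions.

    Walks the aligned query/match/subject strings (with query gaps removed
    first) and yields ``(start, end, cigar)`` tuples, where ``start``/``end``
    are 0-based offsets into the gap-stripped query and ``cigar`` is the
    GFF3 Gap-style string for that region. A run of ``ignore_under`` or more
    non-matching columns ends the current region.
    """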
    region_q = []
    region_m = []
    region_s = []

    (query, match, subject) = __remove_query_gaps(query, match, subject)

    region_start = -1
    region_end = -1
    mismatch_count = 0
    for i, (q, m, s) in enumerate(zip(query, match, subject)):

        # If we have a match
        if m != " " or m == "+":
            if region_start == -1:
                region_start = i
                # It's a new region, we need to reset or it's pre-seeded with
                # spaces
                region_q = []
                region_m = []
                region_s = []
            region_end = i
            mismatch_count = 0
        else:
            mismatch_count += 1

        region_q.append(q)
        region_m.append(m)
        region_s.append(s)

        if mismatch_count >= ignore_under and region_start != -1 and region_end != -1:
            region_q = region_q[0:-ignore_under]
            region_m = region_m[0:-ignore_under]
            region_s = region_s[0:-ignore_under]
            yield region_start, region_end + 1, cigar_from_string(
                region_q, region_m, region_s, strict_m=True
            )
            region_q = []
            region_m = []
            region_s = []

            region_start = -1
            region_end = -1
            mismatch_count = 0

    yield region_start, region_end + 1, cigar_from_string(
        region_q, region_m, region_s, strict_m=True
    )


def _qms_to_matches(query, match, subject, strict_m=True):
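    """Classify each aligned column as a match/mismatch/indel operation.

    Returns a list with one single-character code per column: "=" for a
    match or positive-scoring column, "X" for a mismatch, "D" where the
    query has a gap, and "I" where the subject has a gap. With ``strict_m``
    (the default), "=" and "X" are both reported as "M".
    """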
    matchline = []

    for (q, m, s) in zip(query, match, subject):
        ret = ""

        if m != " " or m == "+":
            ret = "="
        elif m == " ":
            if q == "-":
                ret = "D"
            elif s == "-":
                ret = "I"
            else:
                ret = "X"
        else:
            log.warning("Bad data: \n\t%s\n\t%s\n\t%s\n" % (query, match, subject))

        if strict_m:
            if ret == "=" or ret == "X":
                ret = "M"

        matchline.append(ret)
    return matchline


def _matchline_to_cigar(matchline):
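    """Run-length encode a list of column codes into a Gap-style string.

    For example, ["M", "M", "M", "D", "D", "M"] becomes "M3 D2 M1".
    """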
    cigar_line = []
    last_char = matchline[0]
    count = 0
    for char in matchline:
        if char == last_char:
            count += 1
        else:
            cigar_line.append("%s%s" % (last_char, count))
            count = 1
        last_char = char
    cigar_line.append("%s%s" % (last_char, count))
    return " ".join(cigar_line)


def cigar_from_string(query, match, subject, strict_m=True):
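    """Build the GFF3 Gap attribute value for one aligned region.

    Returns an empty string when the region has no columns.
    """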
    matchline = _qms_to_matches(query, match, subject, strict_m=strict_m)
    if len(matchline) > 0:
        return _matchline_to_cigar(matchline)
    else:
        return ""


if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="Convert Blast XML to gapped GFF3", epilog=""
    )
    parser.add_argument(
        "blastxml", type=argparse.FileType("r"), help="Blast XML Output"
    )
    parser.add_argument(
        "--min_gap",
        type=int,
        help="Maximum gap size before generating a new match_part",
        default=3,
    )
    parser.add_argument(
        "--trim",
        action="store_true",
        help="Trim blast hits to be only as long as the parent feature",
    )
    parser.add_argument(
        "--trim_end", action="store_true", help="Cut blast results off at end of gene"
    )
    parser.add_argument("--include_seq", action="store_true", help="Include sequence")
    args = parser.parse_args()

    for rec in blastxml2gff3(**vars(args)):
        if len(rec.features):
            GFF.write([rec], sys.stdout)