0
|
1 #!/usr/bin/env python
|
|
2 import argparse
|
|
3 import copy
|
|
4 import logging
|
|
5 import re
|
|
6 import sys
|
|
7
|
|
8 from BCBio import GFF
|
|
9 logging.basicConfig(level=logging.INFO)
|
|
10 log = logging.getLogger(name='blastxml2gff3')
|
|
11
|
|
12 __doc__ = """
|
|
13 BlastXML files, when transformed to GFF3, do not normally show gaps in the
|
|
14 blast hits. This tool aims to fill that "gap".
|
|
15 """
|
|
16
|
|
17
|
|
18 def blastxml2gff3(blastxml, min_gap=3, trim=False, trim_end=False, include_seq=False):
|
|
19 from Bio.Blast import NCBIXML
|
|
20 from Bio.Seq import Seq
|
|
21 from Bio.SeqRecord import SeqRecord
|
|
22 from Bio.SeqFeature import SeqFeature, SimpleLocation
|
|
23
|
|
24 blast_records = NCBIXML.parse(blastxml)
|
|
25 for idx_record, record in enumerate(blast_records):
|
|
26 # http://www.sequenceontology.org/browser/release_2.4/term/SO:0000343
|
|
27 match_type = { # Currently we can only handle BLASTN, BLASTP
|
|
28 'BLASTN': 'nucleotide_match',
|
|
29 'BLASTP': 'protein_match',
|
|
30 }.get(record.application, 'match')
|
|
31
|
|
32 recid = record.query
|
|
33 if ' ' in recid:
|
|
34 recid = recid[0:recid.index(' ')]
|
|
35
|
|
36 rec = SeqRecord(Seq("ACTG"), id=recid)
|
|
37 for idx_hit, hit in enumerate(record.alignments):
|
|
38 for idx_hsp, hsp in enumerate(hit.hsps):
|
|
39 qualifiers = {
|
|
40 "ID": 'b2g.%s.%s.%s' % (idx_record, idx_hit, idx_hsp),
|
|
41 "source": "blast",
|
|
42 "score": hsp.expect,
|
|
43 "accession": hit.accession,
|
|
44 "hit_id": hit.hit_id,
|
|
45 "length": hit.length,
|
|
46 "hit_titles": hit.title.split(' >'),
|
|
47 }
|
|
48 if include_seq:
|
|
49 qualifiers.update({
|
|
50 'blast_qseq': hsp.query,
|
|
51 'blast_sseq': hsp.sbjct,
|
|
52 'blast_mseq': hsp.match,
|
|
53 })
|
|
54
|
|
55 for prop in ('score', 'bits', 'identities', 'positives',
|
|
56 'gaps', 'align_length', 'strand', 'frame',
|
|
57 'query_start', 'query_end', 'sbjct_start',
|
|
58 'sbjct_end'):
|
|
59 qualifiers['blast_' + prop] = getattr(hsp, prop, None)
|
|
60
|
|
61 desc = hit.title.split(' >')[0]
|
|
62 qualifiers['description'] = desc[desc.index(' '):]
|
|
63
|
|
64 # This required a fair bit of sketching out/match to figure out
|
|
65 # the first time.
|
|
66 #
|
|
67 # the match_start location must account for queries and
|
|
68 # subjecst that start at locations other than 1
|
|
69 parent_match_start = hsp.query_start - hsp.sbjct_start
|
|
70 # The end is the start + hit.length because the match itself
|
|
71 # may be longer than the parent feature, so we use the supplied
|
|
72 # subject/hit length to calculate the real ending of the target
|
|
73 # protein.
|
|
74 parent_match_end = hsp.query_start + hit.length + hsp.query.count('-')
|
|
75
|
|
76 # If we trim the left end, we need to trim without losing information.
|
|
77 used_parent_match_start = parent_match_start
|
|
78 if trim:
|
|
79 if parent_match_start < 1:
|
|
80 used_parent_match_start = 0
|
|
81
|
|
82 if trim or trim_end:
|
|
83 if parent_match_end > hsp.query_end:
|
|
84 parent_match_end = hsp.query_end + 1
|
|
85
|
|
86 # The ``match`` feature will hold one or more ``match_part``s
|
|
87 top_feature = SeqFeature(
|
|
88 SimpleLocation(used_parent_match_start, parent_match_end, strand=0),
|
|
89 type=match_type,
|
|
90 qualifiers=qualifiers
|
|
91 )
|
|
92
|
|
93 # Unlike the parent feature, ``match_part``s have sources.
|
|
94 part_qualifiers = {
|
|
95 "source": "blast",
|
|
96 }
|
|
97 top_feature.sub_features = []
|
|
98 for idx_part, (start, end, cigar) in \
|
|
99 enumerate(generate_parts(hsp.query, hsp.match,
|
|
100 hsp.sbjct,
|
|
101 ignore_under=min_gap)):
|
|
102 part_qualifiers['Gap'] = cigar
|
|
103 part_qualifiers['ID'] = qualifiers['ID'] + ('.%s' % idx_part)
|
|
104
|
|
105 # Otherwise, we have to account for the subject start's location
|
|
106 match_part_start = parent_match_start + hsp.sbjct_start + start - 1
|
|
107
|
|
108 # We used to use hsp.align_length here, but that includes
|
|
109 # gaps in the parent sequence
|
|
110 #
|
|
111 # Furthermore align_length will give calculation errors in weird places
|
|
112 # So we just use (end-start) for simplicity
|
|
113 match_part_end = match_part_start + (end - start)
|
|
114
|
|
115 top_feature.sub_features.append(
|
|
116 SeqFeature(
|
|
117 SimpleLocation(match_part_start, match_part_end, strand=1),
|
|
118 type="match_part",
|
|
119 qualifiers=copy.deepcopy(part_qualifiers))
|
|
120 )
|
|
121
|
|
122 rec.features.append(top_feature)
|
|
123 rec.annotations = {}
|
|
124 yield rec
|
|
125
|
|
126
|
|
127 def __remove_query_gaps(query, match, subject):
|
|
128 """remove positions in all three based on gaps in query
|
|
129
|
|
130 In order to simplify math and calculations...we remove all of the gaps
|
|
131 based on gap locations in the query sequence::
|
|
132
|
|
133 Q:ACTG-ACTGACTG
|
|
134 S:ACTGAAC---CTG
|
|
135
|
|
136 will become::
|
|
137
|
|
138 Q:ACTGACTGACTG
|
|
139 S:ACTGAC---CTG
|
|
140
|
|
141 which greatly simplifies the process of identifying the correct location
|
|
142 for a match_part
|
|
143 """
|
|
144 prev = 0
|
|
145 fq = ''
|
|
146 fm = ''
|
|
147 fs = ''
|
|
148 for position in re.finditer('-', query):
|
|
149 fq += query[prev:position.start()]
|
|
150 fm += match[prev:position.start()]
|
|
151 fs += subject[prev:position.start()]
|
|
152 prev = position.start() + 1
|
|
153 fq += query[prev:]
|
|
154 fm += match[prev:]
|
|
155 fs += subject[prev:]
|
|
156
|
|
157 return (fq, fm, fs)
|
|
158
|
|
159
|
|
160 def generate_parts(query, match, subject, ignore_under=3):
|
|
161 region_q = []
|
|
162 region_m = []
|
|
163 region_s = []
|
|
164
|
|
165 (query, match, subject) = __remove_query_gaps(query, match, subject)
|
|
166
|
|
167 region_start = -1
|
|
168 region_end = -1
|
|
169 mismatch_count = 0
|
|
170 for i, (q, m, s) in enumerate(zip(query, match, subject)):
|
|
171
|
|
172 # If we have a match
|
|
173 if m != ' ' or m == '+':
|
|
174 if region_start == -1:
|
|
175 region_start = i
|
|
176 # It's a new region, we need to reset or it's pre-seeded with
|
|
177 # spaces
|
|
178 region_q = []
|
|
179 region_m = []
|
|
180 region_s = []
|
|
181 region_end = i
|
|
182 mismatch_count = 0
|
|
183 else:
|
|
184 mismatch_count += 1
|
|
185
|
|
186 region_q.append(q)
|
|
187 region_m.append(m)
|
|
188 region_s.append(s)
|
|
189
|
|
190 if mismatch_count >= ignore_under and region_start != -1 and region_end != -1:
|
|
191 region_q = region_q[0:-ignore_under]
|
|
192 region_m = region_m[0:-ignore_under]
|
|
193 region_s = region_s[0:-ignore_under]
|
|
194 yield region_start, region_end + 1, \
|
|
195 cigar_from_string(region_q, region_m, region_s, strict_m=True)
|
|
196 region_q = []
|
|
197 region_m = []
|
|
198 region_s = []
|
|
199
|
|
200 region_start = -1
|
|
201 region_end = -1
|
|
202 mismatch_count = 0
|
|
203
|
|
204 yield region_start, region_end + 1, \
|
|
205 cigar_from_string(region_q, region_m, region_s, strict_m=True)
|
|
206
|
|
207
|
|
208 def _qms_to_matches(query, match, subject, strict_m=True):
|
|
209 matchline = []
|
|
210
|
|
211 for (q, m, s) in zip(query, match, subject):
|
|
212 ret = ''
|
|
213
|
|
214 if m != ' ' or m == '+':
|
|
215 ret = '='
|
|
216 elif m == ' ':
|
|
217 if q == '-':
|
|
218 ret = 'D'
|
|
219 elif s == '-':
|
|
220 ret = 'I'
|
|
221 else:
|
|
222 ret = 'X'
|
|
223 else:
|
|
224 log.warn("Bad data: \n\t%s\n\t%s\n\t%s\n" % (query, match, subject))
|
|
225
|
|
226 if strict_m:
|
|
227 if ret == '=' or ret == 'X':
|
|
228 ret = 'M'
|
|
229
|
|
230 matchline.append(ret)
|
|
231 return matchline
|
|
232
|
|
233
|
|
234 def _matchline_to_cigar(matchline):
|
|
235 cigar_line = []
|
|
236 last_char = matchline[0]
|
|
237 count = 0
|
|
238 for char in matchline:
|
|
239 if char == last_char:
|
|
240 count += 1
|
|
241 else:
|
|
242 cigar_line.append("%s%s" % (last_char, count))
|
|
243 count = 1
|
|
244 last_char = char
|
|
245 cigar_line.append("%s%s" % (last_char, count))
|
|
246 return ' '.join(cigar_line)
|
|
247
|
|
248
|
|
249 def cigar_from_string(query, match, subject, strict_m=True):
|
|
250 matchline = _qms_to_matches(query, match, subject, strict_m=strict_m)
|
|
251 if len(matchline) > 0:
|
|
252 return _matchline_to_cigar(matchline)
|
|
253 else:
|
|
254 return ""
|
|
255
|
|
256
|
|
257 if __name__ == '__main__':
|
|
258 parser = argparse.ArgumentParser(description='Convert Blast XML to gapped GFF3', epilog='')
|
|
259 parser.add_argument('blastxml', type=argparse.FileType("r"), help='Blast XML Output')
|
|
260 parser.add_argument('--min_gap', type=int, help='Maximum gap size before generating a new match_part', default=3)
|
|
261 parser.add_argument('--trim', action='store_true', help='Trim blast hits to be only as long as the parent feature')
|
|
262 parser.add_argument('--trim_end', action='store_true', help='Cut blast results off at end of gene')
|
|
263 parser.add_argument('--include_seq', action='store_true', help='Include sequence')
|
|
264 args = parser.parse_args()
|
|
265
|
|
266 for rec in blastxml2gff3(**vars(args)):
|
|
267 if len(rec.features):
|
|
268 GFF.write([rec], sys.stdout)
|