comparison jbrowse2/blastxml_to_gapped_gff3.py @ 0:cd5d63cd0eb5 draft

Uploaded
author fubar
date Wed, 03 Jan 2024 01:36:39 +0000
parents
children 88b9b105c09b
comparison
equal deleted inserted replaced
-1:000000000000 0:cd5d63cd0eb5
1 #!/usr/bin/env python
2 import argparse
3 import copy
4 import logging
5 import re
6 import sys
7
8 from BCBio import GFF
9 logging.basicConfig(level=logging.INFO)
10 log = logging.getLogger(name='blastxml2gff3')
11
12 __doc__ = """
13 BlastXML files, when transformed to GFF3, do not normally show gaps in the
14 blast hits. This tool aims to fill that "gap".
15 """
16
17
18 def blastxml2gff3(blastxml, min_gap=3, trim=False, trim_end=False, include_seq=False):
19 from Bio.Blast import NCBIXML
20 from Bio.Seq import Seq
21 from Bio.SeqRecord import SeqRecord
22 from Bio.SeqFeature import SeqFeature, SimpleLocation
23
24 blast_records = NCBIXML.parse(blastxml)
25 for idx_record, record in enumerate(blast_records):
26 # http://www.sequenceontology.org/browser/release_2.4/term/SO:0000343
27 match_type = { # Currently we can only handle BLASTN, BLASTP
28 'BLASTN': 'nucleotide_match',
29 'BLASTP': 'protein_match',
30 }.get(record.application, 'match')
31
32 recid = record.query
33 if ' ' in recid:
34 recid = recid[0:recid.index(' ')]
35
36 rec = SeqRecord(Seq("ACTG"), id=recid)
37 for idx_hit, hit in enumerate(record.alignments):
38 for idx_hsp, hsp in enumerate(hit.hsps):
39 qualifiers = {
40 "ID": 'b2g.%s.%s.%s' % (idx_record, idx_hit, idx_hsp),
41 "source": "blast",
42 "score": hsp.expect,
43 "accession": hit.accession,
44 "hit_id": hit.hit_id,
45 "length": hit.length,
46 "hit_titles": hit.title.split(' >'),
47 }
48 if include_seq:
49 qualifiers.update({
50 'blast_qseq': hsp.query,
51 'blast_sseq': hsp.sbjct,
52 'blast_mseq': hsp.match,
53 })
54
55 for prop in ('score', 'bits', 'identities', 'positives',
56 'gaps', 'align_length', 'strand', 'frame',
57 'query_start', 'query_end', 'sbjct_start',
58 'sbjct_end'):
59 qualifiers['blast_' + prop] = getattr(hsp, prop, None)
60
61 desc = hit.title.split(' >')[0]
62 qualifiers['description'] = desc[desc.index(' '):]
63
64 # This required a fair bit of sketching out/match to figure out
65 # the first time.
66 #
67 # the match_start location must account for queries and
68 # subjecst that start at locations other than 1
69 parent_match_start = hsp.query_start - hsp.sbjct_start
70 # The end is the start + hit.length because the match itself
71 # may be longer than the parent feature, so we use the supplied
72 # subject/hit length to calculate the real ending of the target
73 # protein.
74 parent_match_end = hsp.query_start + hit.length + hsp.query.count('-')
75
76 # If we trim the left end, we need to trim without losing information.
77 used_parent_match_start = parent_match_start
78 if trim:
79 if parent_match_start < 1:
80 used_parent_match_start = 0
81
82 if trim or trim_end:
83 if parent_match_end > hsp.query_end:
84 parent_match_end = hsp.query_end + 1
85
86 # The ``match`` feature will hold one or more ``match_part``s
87 top_feature = SeqFeature(
88 SimpleLocation(used_parent_match_start, parent_match_end, strand=0),
89 type=match_type,
90 qualifiers=qualifiers
91 )
92
93 # Unlike the parent feature, ``match_part``s have sources.
94 part_qualifiers = {
95 "source": "blast",
96 }
97 top_feature.sub_features = []
98 for idx_part, (start, end, cigar) in \
99 enumerate(generate_parts(hsp.query, hsp.match,
100 hsp.sbjct,
101 ignore_under=min_gap)):
102 part_qualifiers['Gap'] = cigar
103 part_qualifiers['ID'] = qualifiers['ID'] + ('.%s' % idx_part)
104
105 # Otherwise, we have to account for the subject start's location
106 match_part_start = parent_match_start + hsp.sbjct_start + start - 1
107
108 # We used to use hsp.align_length here, but that includes
109 # gaps in the parent sequence
110 #
111 # Furthermore align_length will give calculation errors in weird places
112 # So we just use (end-start) for simplicity
113 match_part_end = match_part_start + (end - start)
114
115 top_feature.sub_features.append(
116 SeqFeature(
117 SimpleLocation(match_part_start, match_part_end, strand=1),
118 type="match_part",
119 qualifiers=copy.deepcopy(part_qualifiers))
120 )
121
122 rec.features.append(top_feature)
123 rec.annotations = {}
124 yield rec
125
126
127 def __remove_query_gaps(query, match, subject):
128 """remove positions in all three based on gaps in query
129
130 In order to simplify math and calculations...we remove all of the gaps
131 based on gap locations in the query sequence::
132
133 Q:ACTG-ACTGACTG
134 S:ACTGAAC---CTG
135
136 will become::
137
138 Q:ACTGACTGACTG
139 S:ACTGAC---CTG
140
141 which greatly simplifies the process of identifying the correct location
142 for a match_part
143 """
144 prev = 0
145 fq = ''
146 fm = ''
147 fs = ''
148 for position in re.finditer('-', query):
149 fq += query[prev:position.start()]
150 fm += match[prev:position.start()]
151 fs += subject[prev:position.start()]
152 prev = position.start() + 1
153 fq += query[prev:]
154 fm += match[prev:]
155 fs += subject[prev:]
156
157 return (fq, fm, fs)
158
159
160 def generate_parts(query, match, subject, ignore_under=3):
161 region_q = []
162 region_m = []
163 region_s = []
164
165 (query, match, subject) = __remove_query_gaps(query, match, subject)
166
167 region_start = -1
168 region_end = -1
169 mismatch_count = 0
170 for i, (q, m, s) in enumerate(zip(query, match, subject)):
171
172 # If we have a match
173 if m != ' ' or m == '+':
174 if region_start == -1:
175 region_start = i
176 # It's a new region, we need to reset or it's pre-seeded with
177 # spaces
178 region_q = []
179 region_m = []
180 region_s = []
181 region_end = i
182 mismatch_count = 0
183 else:
184 mismatch_count += 1
185
186 region_q.append(q)
187 region_m.append(m)
188 region_s.append(s)
189
190 if mismatch_count >= ignore_under and region_start != -1 and region_end != -1:
191 region_q = region_q[0:-ignore_under]
192 region_m = region_m[0:-ignore_under]
193 region_s = region_s[0:-ignore_under]
194 yield region_start, region_end + 1, \
195 cigar_from_string(region_q, region_m, region_s, strict_m=True)
196 region_q = []
197 region_m = []
198 region_s = []
199
200 region_start = -1
201 region_end = -1
202 mismatch_count = 0
203
204 yield region_start, region_end + 1, \
205 cigar_from_string(region_q, region_m, region_s, strict_m=True)
206
207
208 def _qms_to_matches(query, match, subject, strict_m=True):
209 matchline = []
210
211 for (q, m, s) in zip(query, match, subject):
212 ret = ''
213
214 if m != ' ' or m == '+':
215 ret = '='
216 elif m == ' ':
217 if q == '-':
218 ret = 'D'
219 elif s == '-':
220 ret = 'I'
221 else:
222 ret = 'X'
223 else:
224 log.warn("Bad data: \n\t%s\n\t%s\n\t%s\n" % (query, match, subject))
225
226 if strict_m:
227 if ret == '=' or ret == 'X':
228 ret = 'M'
229
230 matchline.append(ret)
231 return matchline
232
233
234 def _matchline_to_cigar(matchline):
235 cigar_line = []
236 last_char = matchline[0]
237 count = 0
238 for char in matchline:
239 if char == last_char:
240 count += 1
241 else:
242 cigar_line.append("%s%s" % (last_char, count))
243 count = 1
244 last_char = char
245 cigar_line.append("%s%s" % (last_char, count))
246 return ' '.join(cigar_line)
247
248
249 def cigar_from_string(query, match, subject, strict_m=True):
250 matchline = _qms_to_matches(query, match, subject, strict_m=strict_m)
251 if len(matchline) > 0:
252 return _matchline_to_cigar(matchline)
253 else:
254 return ""
255
256
257 if __name__ == '__main__':
258 parser = argparse.ArgumentParser(description='Convert Blast XML to gapped GFF3', epilog='')
259 parser.add_argument('blastxml', type=argparse.FileType("r"), help='Blast XML Output')
260 parser.add_argument('--min_gap', type=int, help='Maximum gap size before generating a new match_part', default=3)
261 parser.add_argument('--trim', action='store_true', help='Trim blast hits to be only as long as the parent feature')
262 parser.add_argument('--trim_end', action='store_true', help='Cut blast results off at end of gene')
263 parser.add_argument('--include_seq', action='store_true', help='Include sequence')
264 args = parser.parse_args()
265
266 for rec in blastxml2gff3(**vars(args)):
267 if len(rec.features):
268 GFF.write([rec], sys.stdout)