Mercurial > repos > fubar > jbrowse2dev
comparison jbrowse2/blastxml_to_gapped_gff3.py @ 0:cd5d63cd0eb5 draft
Uploaded
author | fubar |
---|---|
date | Wed, 03 Jan 2024 01:36:39 +0000 |
parents | |
children | 88b9b105c09b |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:cd5d63cd0eb5 |
---|---|
1 #!/usr/bin/env python | |
2 import argparse | |
3 import copy | |
4 import logging | |
5 import re | |
6 import sys | |
7 | |
8 from BCBio import GFF | |
9 logging.basicConfig(level=logging.INFO) | |
10 log = logging.getLogger(name='blastxml2gff3') | |
11 | |
12 __doc__ = """ | |
13 BlastXML files, when transformed to GFF3, do not normally show gaps in the | |
14 blast hits. This tool aims to fill that "gap". | |
15 """ | |
16 | |
17 | |
18 def blastxml2gff3(blastxml, min_gap=3, trim=False, trim_end=False, include_seq=False): | |
19 from Bio.Blast import NCBIXML | |
20 from Bio.Seq import Seq | |
21 from Bio.SeqRecord import SeqRecord | |
22 from Bio.SeqFeature import SeqFeature, SimpleLocation | |
23 | |
24 blast_records = NCBIXML.parse(blastxml) | |
25 for idx_record, record in enumerate(blast_records): | |
26 # http://www.sequenceontology.org/browser/release_2.4/term/SO:0000343 | |
27 match_type = { # Currently we can only handle BLASTN, BLASTP | |
28 'BLASTN': 'nucleotide_match', | |
29 'BLASTP': 'protein_match', | |
30 }.get(record.application, 'match') | |
31 | |
32 recid = record.query | |
33 if ' ' in recid: | |
34 recid = recid[0:recid.index(' ')] | |
35 | |
36 rec = SeqRecord(Seq("ACTG"), id=recid) | |
37 for idx_hit, hit in enumerate(record.alignments): | |
38 for idx_hsp, hsp in enumerate(hit.hsps): | |
39 qualifiers = { | |
40 "ID": 'b2g.%s.%s.%s' % (idx_record, idx_hit, idx_hsp), | |
41 "source": "blast", | |
42 "score": hsp.expect, | |
43 "accession": hit.accession, | |
44 "hit_id": hit.hit_id, | |
45 "length": hit.length, | |
46 "hit_titles": hit.title.split(' >'), | |
47 } | |
48 if include_seq: | |
49 qualifiers.update({ | |
50 'blast_qseq': hsp.query, | |
51 'blast_sseq': hsp.sbjct, | |
52 'blast_mseq': hsp.match, | |
53 }) | |
54 | |
55 for prop in ('score', 'bits', 'identities', 'positives', | |
56 'gaps', 'align_length', 'strand', 'frame', | |
57 'query_start', 'query_end', 'sbjct_start', | |
58 'sbjct_end'): | |
59 qualifiers['blast_' + prop] = getattr(hsp, prop, None) | |
60 | |
61 desc = hit.title.split(' >')[0] | |
62 qualifiers['description'] = desc[desc.index(' '):] | |
63 | |
64 # This required a fair bit of sketching out/match to figure out | |
65 # the first time. | |
66 # | |
67 # the match_start location must account for queries and | |
68 # subjecst that start at locations other than 1 | |
69 parent_match_start = hsp.query_start - hsp.sbjct_start | |
70 # The end is the start + hit.length because the match itself | |
71 # may be longer than the parent feature, so we use the supplied | |
72 # subject/hit length to calculate the real ending of the target | |
73 # protein. | |
74 parent_match_end = hsp.query_start + hit.length + hsp.query.count('-') | |
75 | |
76 # If we trim the left end, we need to trim without losing information. | |
77 used_parent_match_start = parent_match_start | |
78 if trim: | |
79 if parent_match_start < 1: | |
80 used_parent_match_start = 0 | |
81 | |
82 if trim or trim_end: | |
83 if parent_match_end > hsp.query_end: | |
84 parent_match_end = hsp.query_end + 1 | |
85 | |
86 # The ``match`` feature will hold one or more ``match_part``s | |
87 top_feature = SeqFeature( | |
88 SimpleLocation(used_parent_match_start, parent_match_end, strand=0), | |
89 type=match_type, | |
90 qualifiers=qualifiers | |
91 ) | |
92 | |
93 # Unlike the parent feature, ``match_part``s have sources. | |
94 part_qualifiers = { | |
95 "source": "blast", | |
96 } | |
97 top_feature.sub_features = [] | |
98 for idx_part, (start, end, cigar) in \ | |
99 enumerate(generate_parts(hsp.query, hsp.match, | |
100 hsp.sbjct, | |
101 ignore_under=min_gap)): | |
102 part_qualifiers['Gap'] = cigar | |
103 part_qualifiers['ID'] = qualifiers['ID'] + ('.%s' % idx_part) | |
104 | |
105 # Otherwise, we have to account for the subject start's location | |
106 match_part_start = parent_match_start + hsp.sbjct_start + start - 1 | |
107 | |
108 # We used to use hsp.align_length here, but that includes | |
109 # gaps in the parent sequence | |
110 # | |
111 # Furthermore align_length will give calculation errors in weird places | |
112 # So we just use (end-start) for simplicity | |
113 match_part_end = match_part_start + (end - start) | |
114 | |
115 top_feature.sub_features.append( | |
116 SeqFeature( | |
117 SimpleLocation(match_part_start, match_part_end, strand=1), | |
118 type="match_part", | |
119 qualifiers=copy.deepcopy(part_qualifiers)) | |
120 ) | |
121 | |
122 rec.features.append(top_feature) | |
123 rec.annotations = {} | |
124 yield rec | |
125 | |
126 | |
127 def __remove_query_gaps(query, match, subject): | |
128 """remove positions in all three based on gaps in query | |
129 | |
130 In order to simplify math and calculations...we remove all of the gaps | |
131 based on gap locations in the query sequence:: | |
132 | |
133 Q:ACTG-ACTGACTG | |
134 S:ACTGAAC---CTG | |
135 | |
136 will become:: | |
137 | |
138 Q:ACTGACTGACTG | |
139 S:ACTGAC---CTG | |
140 | |
141 which greatly simplifies the process of identifying the correct location | |
142 for a match_part | |
143 """ | |
144 prev = 0 | |
145 fq = '' | |
146 fm = '' | |
147 fs = '' | |
148 for position in re.finditer('-', query): | |
149 fq += query[prev:position.start()] | |
150 fm += match[prev:position.start()] | |
151 fs += subject[prev:position.start()] | |
152 prev = position.start() + 1 | |
153 fq += query[prev:] | |
154 fm += match[prev:] | |
155 fs += subject[prev:] | |
156 | |
157 return (fq, fm, fs) | |
158 | |
159 | |
160 def generate_parts(query, match, subject, ignore_under=3): | |
161 region_q = [] | |
162 region_m = [] | |
163 region_s = [] | |
164 | |
165 (query, match, subject) = __remove_query_gaps(query, match, subject) | |
166 | |
167 region_start = -1 | |
168 region_end = -1 | |
169 mismatch_count = 0 | |
170 for i, (q, m, s) in enumerate(zip(query, match, subject)): | |
171 | |
172 # If we have a match | |
173 if m != ' ' or m == '+': | |
174 if region_start == -1: | |
175 region_start = i | |
176 # It's a new region, we need to reset or it's pre-seeded with | |
177 # spaces | |
178 region_q = [] | |
179 region_m = [] | |
180 region_s = [] | |
181 region_end = i | |
182 mismatch_count = 0 | |
183 else: | |
184 mismatch_count += 1 | |
185 | |
186 region_q.append(q) | |
187 region_m.append(m) | |
188 region_s.append(s) | |
189 | |
190 if mismatch_count >= ignore_under and region_start != -1 and region_end != -1: | |
191 region_q = region_q[0:-ignore_under] | |
192 region_m = region_m[0:-ignore_under] | |
193 region_s = region_s[0:-ignore_under] | |
194 yield region_start, region_end + 1, \ | |
195 cigar_from_string(region_q, region_m, region_s, strict_m=True) | |
196 region_q = [] | |
197 region_m = [] | |
198 region_s = [] | |
199 | |
200 region_start = -1 | |
201 region_end = -1 | |
202 mismatch_count = 0 | |
203 | |
204 yield region_start, region_end + 1, \ | |
205 cigar_from_string(region_q, region_m, region_s, strict_m=True) | |
206 | |
207 | |
208 def _qms_to_matches(query, match, subject, strict_m=True): | |
209 matchline = [] | |
210 | |
211 for (q, m, s) in zip(query, match, subject): | |
212 ret = '' | |
213 | |
214 if m != ' ' or m == '+': | |
215 ret = '=' | |
216 elif m == ' ': | |
217 if q == '-': | |
218 ret = 'D' | |
219 elif s == '-': | |
220 ret = 'I' | |
221 else: | |
222 ret = 'X' | |
223 else: | |
224 log.warn("Bad data: \n\t%s\n\t%s\n\t%s\n" % (query, match, subject)) | |
225 | |
226 if strict_m: | |
227 if ret == '=' or ret == 'X': | |
228 ret = 'M' | |
229 | |
230 matchline.append(ret) | |
231 return matchline | |
232 | |
233 | |
234 def _matchline_to_cigar(matchline): | |
235 cigar_line = [] | |
236 last_char = matchline[0] | |
237 count = 0 | |
238 for char in matchline: | |
239 if char == last_char: | |
240 count += 1 | |
241 else: | |
242 cigar_line.append("%s%s" % (last_char, count)) | |
243 count = 1 | |
244 last_char = char | |
245 cigar_line.append("%s%s" % (last_char, count)) | |
246 return ' '.join(cigar_line) | |
247 | |
248 | |
249 def cigar_from_string(query, match, subject, strict_m=True): | |
250 matchline = _qms_to_matches(query, match, subject, strict_m=strict_m) | |
251 if len(matchline) > 0: | |
252 return _matchline_to_cigar(matchline) | |
253 else: | |
254 return "" | |
255 | |
256 | |
257 if __name__ == '__main__': | |
258 parser = argparse.ArgumentParser(description='Convert Blast XML to gapped GFF3', epilog='') | |
259 parser.add_argument('blastxml', type=argparse.FileType("r"), help='Blast XML Output') | |
260 parser.add_argument('--min_gap', type=int, help='Maximum gap size before generating a new match_part', default=3) | |
261 parser.add_argument('--trim', action='store_true', help='Trim blast hits to be only as long as the parent feature') | |
262 parser.add_argument('--trim_end', action='store_true', help='Cut blast results off at end of gene') | |
263 parser.add_argument('--include_seq', action='store_true', help='Include sequence') | |
264 args = parser.parse_args() | |
265 | |
266 for rec in blastxml2gff3(**vars(args)): | |
267 if len(rec.features): | |
268 GFF.write([rec], sys.stdout) |