comparison blastxml_to_gapped_gff3.py @ 0:d78175596286 draft

planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/jbrowse2 commit cd77dffaad652cfb75b98bde5231beaa6d63cd5b-dirty
author fubar
date Mon, 08 Jan 2024 09:20:33 +0000
parents
children 4c201a3d4755
comparison
equal deleted inserted replaced
-1:000000000000 0:d78175596286
1 #!/usr/bin/env python
2 import argparse
3 import copy
4 import logging
5 import re
6 import sys
7
8 from BCBio import GFF
9
10 logging.basicConfig(level=logging.INFO)
11 log = logging.getLogger(name="blastxml2gff3")
12
13 __doc__ = """
14 BlastXML files, when transformed to GFF3, do not normally show gaps in the
15 blast hits. This tool aims to fill that "gap".
16 """
17
18
19 def blastxml2gff3(blastxml, min_gap=3, trim=False, trim_end=False, include_seq=False):
20 from Bio.Blast import NCBIXML
21 from Bio.Seq import Seq
22 from Bio.SeqRecord import SeqRecord
23 from Bio.SeqFeature import SeqFeature, SimpleLocation
24
25 blast_records = NCBIXML.parse(blastxml)
26 for idx_record, record in enumerate(blast_records):
27 # http://www.sequenceontology.org/browser/release_2.4/term/SO:0000343
28 match_type = { # Currently we can only handle BLASTN, BLASTP
29 "BLASTN": "nucleotide_match",
30 "BLASTP": "protein_match",
31 }.get(record.application, "match")
32
33 recid = record.query
34 if " " in recid:
35 recid = recid[0: recid.index(" ")]
36
37 rec = SeqRecord(Seq("ACTG"), id=recid)
38 for idx_hit, hit in enumerate(record.alignments):
39 for idx_hsp, hsp in enumerate(hit.hsps):
40 qualifiers = {
41 "ID": "b2g.%s.%s.%s" % (idx_record, idx_hit, idx_hsp),
42 "source": "blast",
43 "score": hsp.expect,
44 "accession": hit.accession,
45 "hit_id": hit.hit_id,
46 "length": hit.length,
47 "hit_titles": hit.title.split(" >"),
48 }
49 if include_seq:
50 qualifiers.update(
51 {
52 "blast_qseq": hsp.query,
53 "blast_sseq": hsp.sbjct,
54 "blast_mseq": hsp.match,
55 }
56 )
57
58 for prop in (
59 "score",
60 "bits",
61 "identities",
62 "positives",
63 "gaps",
64 "align_length",
65 "strand",
66 "frame",
67 "query_start",
68 "query_end",
69 "sbjct_start",
70 "sbjct_end",
71 ):
72 qualifiers["blast_" + prop] = getattr(hsp, prop, None)
73
74 desc = hit.title.split(" >")[0]
75 qualifiers["description"] = desc[desc.index(" "):]
76
77 # This required a fair bit of sketching out/match to figure out
78 # the first time.
79 #
80 # the match_start location must account for queries and
81 # subjecst that start at locations other than 1
82 parent_match_start = hsp.query_start - hsp.sbjct_start
83 # The end is the start + hit.length because the match itself
84 # may be longer than the parent feature, so we use the supplied
85 # subject/hit length to calculate the real ending of the target
86 # protein.
87 parent_match_end = hsp.query_start + hit.length + hsp.query.count("-")
88
89 # If we trim the left end, we need to trim without losing information.
90 used_parent_match_start = parent_match_start
91 if trim:
92 if parent_match_start < 1:
93 used_parent_match_start = 0
94
95 if trim or trim_end:
96 if parent_match_end > hsp.query_end:
97 parent_match_end = hsp.query_end + 1
98
99 # The ``match`` feature will hold one or more ``match_part``s
100 top_feature = SeqFeature(
101 SimpleLocation(used_parent_match_start, parent_match_end, strand=0),
102 type=match_type,
103 qualifiers=qualifiers,
104 )
105
106 # Unlike the parent feature, ``match_part``s have sources.
107 part_qualifiers = {
108 "source": "blast",
109 }
110 top_feature.sub_features = []
111 for idx_part, (start, end, cigar) in enumerate(
112 generate_parts(
113 hsp.query, hsp.match, hsp.sbjct, ignore_under=min_gap
114 )
115 ):
116 part_qualifiers["Gap"] = cigar
117 part_qualifiers["ID"] = qualifiers["ID"] + (".%s" % idx_part)
118
119 # Otherwise, we have to account for the subject start's location
120 match_part_start = parent_match_start + hsp.sbjct_start + start - 1
121
122 # We used to use hsp.align_length here, but that includes
123 # gaps in the parent sequence
124 #
125 # Furthermore align_length will give calculation errors in weird places
126 # So we just use (end-start) for simplicity
127 match_part_end = match_part_start + (end - start)
128
129 top_feature.sub_features.append(
130 SeqFeature(
131 SimpleLocation(match_part_start, match_part_end, strand=1),
132 type="match_part",
133 qualifiers=copy.deepcopy(part_qualifiers),
134 )
135 )
136
137 rec.features.append(top_feature)
138 rec.annotations = {}
139 yield rec
140
141
142 def __remove_query_gaps(query, match, subject):
143 """remove positions in all three based on gaps in query
144
145 In order to simplify math and calculations...we remove all of the gaps
146 based on gap locations in the query sequence::
147
148 Q:ACTG-ACTGACTG
149 S:ACTGAAC---CTG
150
151 will become::
152
153 Q:ACTGACTGACTG
154 S:ACTGAC---CTG
155
156 which greatly simplifies the process of identifying the correct location
157 for a match_part
158 """
159 prev = 0
160 fq = ""
161 fm = ""
162 fs = ""
163 for position in re.finditer("-", query):
164 fq += query[prev: position.start()]
165 fm += match[prev: position.start()]
166 fs += subject[prev: position.start()]
167 prev = position.start() + 1
168 fq += query[prev:]
169 fm += match[prev:]
170 fs += subject[prev:]
171
172 return (fq, fm, fs)
173
174
175 def generate_parts(query, match, subject, ignore_under=3):
176 region_q = []
177 region_m = []
178 region_s = []
179
180 (query, match, subject) = __remove_query_gaps(query, match, subject)
181
182 region_start = -1
183 region_end = -1
184 mismatch_count = 0
185 for i, (q, m, s) in enumerate(zip(query, match, subject)):
186
187 # If we have a match
188 if m != " " or m == "+":
189 if region_start == -1:
190 region_start = i
191 # It's a new region, we need to reset or it's pre-seeded with
192 # spaces
193 region_q = []
194 region_m = []
195 region_s = []
196 region_end = i
197 mismatch_count = 0
198 else:
199 mismatch_count += 1
200
201 region_q.append(q)
202 region_m.append(m)
203 region_s.append(s)
204
205 if mismatch_count >= ignore_under and region_start != -1 and region_end != -1:
206 region_q = region_q[0:-ignore_under]
207 region_m = region_m[0:-ignore_under]
208 region_s = region_s[0:-ignore_under]
209 yield region_start, region_end + 1, cigar_from_string(
210 region_q, region_m, region_s, strict_m=True
211 )
212 region_q = []
213 region_m = []
214 region_s = []
215
216 region_start = -1
217 region_end = -1
218 mismatch_count = 0
219
220 yield region_start, region_end + 1, cigar_from_string(
221 region_q, region_m, region_s, strict_m=True
222 )
223
224
225 def _qms_to_matches(query, match, subject, strict_m=True):
226 matchline = []
227
228 for (q, m, s) in zip(query, match, subject):
229 ret = ""
230
231 if m != " " or m == "+":
232 ret = "="
233 elif m == " ":
234 if q == "-":
235 ret = "D"
236 elif s == "-":
237 ret = "I"
238 else:
239 ret = "X"
240 else:
241 log.warn("Bad data: \n\t%s\n\t%s\n\t%s\n" % (query, match, subject))
242
243 if strict_m:
244 if ret == "=" or ret == "X":
245 ret = "M"
246
247 matchline.append(ret)
248 return matchline
249
250
251 def _matchline_to_cigar(matchline):
252 cigar_line = []
253 last_char = matchline[0]
254 count = 0
255 for char in matchline:
256 if char == last_char:
257 count += 1
258 else:
259 cigar_line.append("%s%s" % (last_char, count))
260 count = 1
261 last_char = char
262 cigar_line.append("%s%s" % (last_char, count))
263 return " ".join(cigar_line)
264
265
266 def cigar_from_string(query, match, subject, strict_m=True):
267 matchline = _qms_to_matches(query, match, subject, strict_m=strict_m)
268 if len(matchline) > 0:
269 return _matchline_to_cigar(matchline)
270 else:
271 return ""
272
273
274 if __name__ == "__main__":
275 parser = argparse.ArgumentParser(
276 description="Convert Blast XML to gapped GFF3", epilog=""
277 )
278 parser.add_argument(
279 "blastxml", type=argparse.FileType("r"), help="Blast XML Output"
280 )
281 parser.add_argument(
282 "--min_gap",
283 type=int,
284 help="Maximum gap size before generating a new match_part",
285 default=3,
286 )
287 parser.add_argument(
288 "--trim",
289 action="store_true",
290 help="Trim blast hits to be only as long as the parent feature",
291 )
292 parser.add_argument(
293 "--trim_end", action="store_true", help="Cut blast results off at end of gene"
294 )
295 parser.add_argument("--include_seq", action="store_true", help="Include sequence")
296 args = parser.parse_args()
297
298 for rec in blastxml2gff3(**vars(args)):
299 if len(rec.features):
300 GFF.write([rec], sys.stdout)