comparison blastxml_to_gapped_gff3.py @ 0:bd47051afe98 draft

planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/blastxml_to_gapped_gff3 commit 8f38145c94ecb1e23c3ff6f0243213dc49d2287e
author iuc
date Tue, 20 Dec 2016 09:21:11 -0500
parents
children 877cd0833221
comparison
equal deleted inserted replaced
-1:000000000000 0:bd47051afe98
1 #!/usr/bin/perl
2 import argparse
3 import copy
4 import logging
5 import re
6 import sys
7
8 from BCBio import GFF
9
10 logging.basicConfig(level=logging.INFO)
11 log = logging.getLogger(name='blastxml2gff3')
12
13 __author__ = "Eric Rasche"
14 __version__ = "0.4.0"
15 __maintainer__ = "Eric Rasche"
16 __email__ = "esr@tamu.edu"
17
18 __doc__ = """
19 BlastXML files, when transformed to GFF3, do not normally show gaps in the
20 blast hits. This tool aims to fill that "gap".
21 """
22
23
24 def blastxml2gff3(blastxml, min_gap=3, trim=False, trim_end=False):
25 from Bio.Blast import NCBIXML
26 from Bio.Seq import Seq
27 from Bio.SeqRecord import SeqRecord
28 from Bio.SeqFeature import SeqFeature, FeatureLocation
29
30 blast_records = NCBIXML.parse(blastxml)
31 records = []
32 for record in blast_records:
33 # http://www.sequenceontology.org/browser/release_2.4/term/SO:0000343
34 match_type = { # Currently we can only handle BLASTN, BLASTP
35 'BLASTN': 'nucleotide_match',
36 'BLASTP': 'protein_match',
37 }.get(record.application, 'match')
38
39 rec = SeqRecord(Seq("ACTG"), id=record.query)
40 for hit in record.alignments:
41 for hsp in hit.hsps:
42 qualifiers = {
43 "source": "blast",
44 "score": hsp.expect,
45 "accession": hit.accession,
46 "hit_id": hit.hit_id,
47 "length": hit.length,
48 "hit_titles": hit.title.split(' >')
49 }
50 desc = hit.title.split(' >')[0]
51 qualifiers['description'] = desc[desc.index(' '):]
52
53 # This required a fair bit of sketching out/match to figure out
54 # the first time.
55 #
56 # the match_start location must account for queries and
57 # subjecst that start at locations other than 1
58 parent_match_start = hsp.query_start - hsp.sbjct_start
59 # The end is the start + hit.length because the match itself
60 # may be longer than the parent feature, so we use the supplied
61 # subject/hit length to calculate the real ending of the target
62 # protein.
63 parent_match_end = hsp.query_start + hit.length + hsp.query.count('-')
64
65 # However, if the user requests that we trim the feature, then
66 # we need to cut the ``match`` start to 0 to match the parent feature.
67 # We'll also need to cut the end to match the query's end. It (maybe)
68 # should be the feature end? But we don't have access to that data, so
69 # We settle for this.
70 if trim:
71 if parent_match_start < 1:
72 parent_match_start = 0
73
74 if trim or trim_end:
75 if parent_match_end > hsp.query_end:
76 parent_match_end = hsp.query_end + 1
77
78 # The ``match`` feature will hold one or more ``match_part``s
79 top_feature = SeqFeature(
80 FeatureLocation(parent_match_start, parent_match_end),
81 type=match_type, strand=0,
82 qualifiers=qualifiers
83 )
84
85 # Unlike the parent feature, ``match_part``s have sources.
86 part_qualifiers = {
87 "source": "blast",
88 }
89 top_feature.sub_features = []
90 for start, end, cigar in generate_parts(hsp.query, hsp.match,
91 hsp.sbjct,
92 ignore_under=min_gap):
93 part_qualifiers['Gap'] = cigar
94 part_qualifiers['ID'] = hit.hit_id
95
96 if trim:
97 # If trimming, then we start relative to the
98 # match's start
99 match_part_start = parent_match_start + start
100 else:
101 # Otherwise, we have to account for the subject start's location
102 match_part_start = parent_match_start + hsp.sbjct_start + start - 1
103
104 # We used to use hsp.align_length here, but that includes
105 # gaps in the parent sequence
106 #
107 # Furthermore align_length will give calculation errors in weird places
108 # So we just use (end-start) for simplicity
109 match_part_end = match_part_start + (end - start)
110
111 top_feature.sub_features.append(
112 SeqFeature(
113 FeatureLocation(match_part_start, match_part_end),
114 type="match_part", strand=0,
115 qualifiers=copy.deepcopy(part_qualifiers))
116 )
117
118 rec.features.append(top_feature)
119 rec.annotations = {}
120 records.append(rec)
121 return records
122
123
124 def __remove_query_gaps(query, match, subject):
125 """remove positions in all three based on gaps in query
126
127 In order to simplify math and calculations...we remove all of the gaps
128 based on gap locations in the query sequence::
129
130 Q:ACTG-ACTGACTG
131 S:ACTGAAC---CTG
132
133 will become::
134
135 Q:ACTGACTGACTG
136 S:ACTGAC---CTG
137
138 which greatly simplifies the process of identifying the correct location
139 for a match_part
140 """
141 prev = 0
142 fq = ''
143 fm = ''
144 fs = ''
145 for position in re.finditer('-', query):
146 fq += query[prev:position.start()]
147 fm += match[prev:position.start()]
148 fs += subject[prev:position.start()]
149 prev = position.start() + 1
150 fq += query[prev:]
151 fm += match[prev:]
152 fs += subject[prev:]
153
154 return (fq, fm, fs)
155
156
157 def generate_parts(query, match, subject, ignore_under=3):
158 region_q = []
159 region_m = []
160 region_s = []
161
162 (query, match, subject) = __remove_query_gaps(query, match, subject)
163
164 region_start = -1
165 region_end = -1
166 mismatch_count = 0
167 for i, (q, m, s) in enumerate(zip(query, match, subject)):
168
169 # If we have a match
170 if m != ' ' or m == '+':
171 if region_start == -1:
172 region_start = i
173 # It's a new region, we need to reset or it's pre-seeded with
174 # spaces
175 region_q = []
176 region_m = []
177 region_s = []
178 region_end = i
179 mismatch_count = 0
180 else:
181 mismatch_count += 1
182
183 region_q.append(q)
184 region_m.append(m)
185 region_s.append(s)
186
187 if mismatch_count >= ignore_under and region_start != -1 and region_end != -1:
188 region_q = region_q[0:-ignore_under]
189 region_m = region_m[0:-ignore_under]
190 region_s = region_s[0:-ignore_under]
191 yield region_start, region_end + 1, \
192 cigar_from_string(region_q, region_m, region_s, strict_m=True)
193 region_q = []
194 region_m = []
195 region_s = []
196
197 region_start = -1
198 region_end = -1
199 mismatch_count = 0
200
201 yield region_start, region_end + 1, \
202 cigar_from_string(region_q, region_m, region_s, strict_m=True)
203
204
205 def _qms_to_matches(query, match, subject, strict_m=True):
206 matchline = []
207
208 for (q, m, s) in zip(query, match, subject):
209 ret = ''
210
211 if m != ' ' or m == '+':
212 ret = '='
213 elif m == ' ':
214 if q == '-':
215 ret = 'D'
216 elif s == '-':
217 ret = 'I'
218 else:
219 ret = 'X'
220 else:
221 log.warn("Bad data: \n\t%s\n\t%s\n\t%s\n" % (query, match, subject))
222
223 if strict_m:
224 if ret == '=' or ret == 'X':
225 ret = 'M'
226
227 matchline.append(ret)
228 return matchline
229
230
231 def _matchline_to_cigar(matchline):
232 cigar_line = []
233 last_char = matchline[0]
234 count = 0
235 for char in matchline:
236 if char == last_char:
237 count += 1
238 else:
239 cigar_line.append("%s%s" % (last_char, count))
240 count = 1
241 last_char = char
242 cigar_line.append("%s%s" % (last_char, count))
243 return ' '.join(cigar_line)
244
245
246 def cigar_from_string(query, match, subject, strict_m=True):
247 matchline = _qms_to_matches(query, match, subject, strict_m=strict_m)
248 if len(matchline) > 0:
249 return _matchline_to_cigar(matchline)
250 else:
251 return ""
252
253
254 if __name__ == '__main__':
255 parser = argparse.ArgumentParser(description='Convert Blast XML to gapped GFF3', epilog='')
256 parser.add_argument('blastxml', type=open, help='Blast XML Output')
257 parser.add_argument('--min_gap', type=int, help='Maximum gap size before generating a new match_part', default=3)
258 parser.add_argument('--trim', action='store_true', help='Trim blast hits to be only as long as the parent feature')
259 parser.add_argument('--trim_end', action='store_true', help='Cut blast results off at end of gene')
260 args = parser.parse_args()
261
262 result = blastxml2gff3(**vars(args))
263 GFF.write(result, sys.stdout)