Mercurial > repos > iuc > jbrowse
comparison blastxml_to_gapped_gff3.py @ 1:497c6bb3b717 draft
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/jbrowse commit 0887009a23d176b21536c9fd8a18c4fecc417d4f
author | iuc |
---|---|
date | Thu, 18 Jun 2015 12:10:51 -0400 |
parents | |
children | 7342f467507b |
comparison
equal
deleted
inserted
replaced
0:2c9e5136b416 | 1:497c6bb3b717 |
---|---|
1 #!/usr/bin/perl | |
2 import re | |
3 import sys | |
4 import copy | |
5 import argparse | |
6 from BCBio import GFF | |
7 import logging | |
8 logging.basicConfig(level=logging.INFO) | |
9 log = logging.getLogger(name='blastxml2gff3') | |
10 | |
11 __author__ = "Eric Rasche" | |
12 __version__ = "0.4.0" | |
13 __maintainer__ = "Eric Rasche" | |
14 __email__ = "esr@tamu.edu" | |
15 | |
16 __doc__ = """ | |
17 BlastXML files, when transformed to GFF3, do not normally show gaps in the | |
18 blast hits. This tool aims to fill that "gap". | |
19 """ | |
20 | |
21 | |
22 def blastxml2gff3(blastxml, min_gap=3, trim=False, trim_end=False): | |
23 from Bio.Blast import NCBIXML | |
24 from Bio.Seq import Seq | |
25 from Bio.SeqRecord import SeqRecord | |
26 from Bio.SeqFeature import SeqFeature, FeatureLocation | |
27 | |
28 blast_records = NCBIXML.parse(blastxml) | |
29 records = [] | |
30 for record in blast_records: | |
31 rec = SeqRecord(Seq("ACTG"), id=record.query) | |
32 for hit in record.alignments: | |
33 for hsp in hit.hsps: | |
34 qualifiers = { | |
35 "source": "blast", | |
36 "score": hsp.expect, | |
37 "accession": hit.accession, | |
38 "hit_id": hit.hit_id, | |
39 "length": hit.length, | |
40 "hit_titles": hit.title.split(' >') | |
41 } | |
42 desc = hit.title.split(' >')[0] | |
43 qualifiers['description'] = desc[desc.index(' '):] | |
44 | |
45 # This required a fair bit of sketching out/match to figure out | |
46 # the first time. | |
47 # | |
48 # the match_start location must account for queries and | |
49 # subjecst that start at locations other than 1 | |
50 parent_match_start = hsp.query_start - hsp.sbjct_start | |
51 # The end is the start + hit.length because the match itself | |
52 # may be longer than the parent feature, so we use the supplied | |
53 # subject/hit length to calculate the real ending of the target | |
54 # protein. | |
55 parent_match_end = hsp.query_start + hit.length + hsp.query.count('-') | |
56 | |
57 # However, if the user requests that we trim the feature, then | |
58 # we need to cut the ``match`` start to 0 to match the parent feature. | |
59 # We'll also need to cut the end to match the query's end. It (maybe) | |
60 # should be the feature end? But we don't have access to that data, so | |
61 # We settle for this. | |
62 if trim: | |
63 if parent_match_start < 1: | |
64 parent_match_start = 0 | |
65 | |
66 if trim or trim_end: | |
67 if parent_match_end > hsp.query_end: | |
68 parent_match_end = hsp.query_end + 1 | |
69 | |
70 # The ``protein_match`` feature will hold one or more ``match_part``s | |
71 top_feature = SeqFeature( | |
72 FeatureLocation(parent_match_start, parent_match_end), | |
73 type="protein_match", strand=0, | |
74 qualifiers=qualifiers | |
75 ) | |
76 | |
77 # Unlike the parent feature, ``match_part``s have sources. | |
78 part_qualifiers = { | |
79 "source": "blast", | |
80 } | |
81 top_feature.sub_features = [] | |
82 for start, end, cigar in generate_parts(hsp.query, hsp.match, | |
83 hsp.sbjct, | |
84 ignore_under=min_gap): | |
85 part_qualifiers['Gap'] = cigar | |
86 part_qualifiers['ID'] = hit.hit_id | |
87 | |
88 if trim: | |
89 # If trimming, then we start relative to the | |
90 # protein_match's start | |
91 match_part_start = parent_match_start + start | |
92 else: | |
93 # Otherwise, we have to account for the subject start's location | |
94 match_part_start = parent_match_start + hsp.sbjct_start + start - 1 | |
95 | |
96 # We used to use hsp.align_length here, but that includes | |
97 # gaps in the parent sequence | |
98 # | |
99 # Furthermore align_length will give calculation errors in weird places | |
100 # So we just use (end-start) for simplicity | |
101 match_part_end = match_part_start + (end - start) | |
102 | |
103 top_feature.sub_features.append( | |
104 SeqFeature( | |
105 FeatureLocation(match_part_start, match_part_end), | |
106 type="match_part", strand=0, | |
107 qualifiers=copy.deepcopy(part_qualifiers)) | |
108 ) | |
109 | |
110 rec.features.append(top_feature) | |
111 records.append(rec) | |
112 return records | |
113 | |
114 | |
115 def __remove_query_gaps(query, match, subject): | |
116 """remove positions in all three based on gaps in query | |
117 | |
118 In order to simplify math and calculations...we remove all of the gaps | |
119 based on gap locations in the query sequence:: | |
120 | |
121 Q:ACTG-ACTGACTG | |
122 S:ACTGAAC---CTG | |
123 | |
124 will become:: | |
125 | |
126 Q:ACTGACTGACTG | |
127 S:ACTGAC---CTG | |
128 | |
129 which greatly simplifies the process of identifying the correct location | |
130 for a match_part | |
131 """ | |
132 prev = 0 | |
133 fq = '' | |
134 fm = '' | |
135 fs = '' | |
136 for position in re.finditer('-', query): | |
137 fq += query[prev:position.start()] | |
138 fm += match[prev:position.start()] | |
139 fs += subject[prev:position.start()] | |
140 prev = position.start() + 1 | |
141 fq += query[prev:] | |
142 fm += match[prev:] | |
143 fs += subject[prev:] | |
144 | |
145 return (fq, fm, fs) | |
146 | |
147 | |
148 def generate_parts(query, match, subject, ignore_under=3): | |
149 region_q = [] | |
150 region_m = [] | |
151 region_s = [] | |
152 | |
153 (query, match, subject) = __remove_query_gaps(query, match, subject) | |
154 | |
155 region_start = -1 | |
156 region_end = -1 | |
157 mismatch_count = 0 | |
158 for i, (q, m, s) in enumerate(zip(query, match, subject)): | |
159 | |
160 # If we have a match | |
161 if m != ' ' or m == '+': | |
162 if region_start == -1: | |
163 region_start = i | |
164 # It's a new region, we need to reset or it's pre-seeded with | |
165 # spaces | |
166 region_q = [] | |
167 region_m = [] | |
168 region_s = [] | |
169 region_end = i | |
170 mismatch_count = 0 | |
171 else: | |
172 mismatch_count += 1 | |
173 | |
174 region_q.append(q) | |
175 region_m.append(m) | |
176 region_s.append(s) | |
177 | |
178 if mismatch_count >= ignore_under and region_start != -1 and region_end != -1: | |
179 region_q = region_q[0:-ignore_under] | |
180 region_m = region_m[0:-ignore_under] | |
181 region_s = region_s[0:-ignore_under] | |
182 yield region_start, region_end + 1, \ | |
183 cigar_from_string(region_q, region_m, region_s, strict_m=True) | |
184 region_q = [] | |
185 region_m = [] | |
186 region_s = [] | |
187 | |
188 region_start = -1 | |
189 region_end = -1 | |
190 mismatch_count = 0 | |
191 | |
192 yield region_start, region_end + 1, \ | |
193 cigar_from_string(region_q, region_m, region_s, strict_m=True) | |
194 | |
195 | |
196 def _qms_to_matches(query, match, subject, strict_m=True): | |
197 matchline = [] | |
198 | |
199 for (q, m, s) in zip(query, match, subject): | |
200 ret = '' | |
201 | |
202 if m != ' ' or m == '+': | |
203 ret = '=' | |
204 elif m == ' ': | |
205 if q == '-': | |
206 ret = 'D' | |
207 elif s == '-': | |
208 ret = 'I' | |
209 else: | |
210 ret = 'X' | |
211 else: | |
212 log.warn("Bad data: \n\t%s\n\t%s\n\t%s\n" % (query, match, subject)) | |
213 | |
214 | |
215 if strict_m: | |
216 if ret == '=' or ret == 'X': | |
217 ret = 'M' | |
218 | |
219 matchline.append(ret) | |
220 return matchline | |
221 | |
222 | |
223 def _matchline_to_cigar(matchline): | |
224 cigar_line = [] | |
225 last_char = matchline[0] | |
226 count = 0 | |
227 for char in matchline: | |
228 if char == last_char: | |
229 count += 1 | |
230 else: | |
231 cigar_line.append("%s%s" % (last_char, count)) | |
232 count = 1 | |
233 last_char = char | |
234 cigar_line.append("%s%s" % (last_char, count)) | |
235 return ' '.join(cigar_line) | |
236 | |
237 | |
238 def cigar_from_string(query, match, subject, strict_m=True): | |
239 matchline = _qms_to_matches(query, match, subject, strict_m=strict_m) | |
240 if len(matchline) > 0: | |
241 return _matchline_to_cigar(matchline) | |
242 else: | |
243 return "" | |
244 | |
245 | |
246 if __name__ == '__main__': | |
247 parser = argparse.ArgumentParser(description='Convert Blast XML to gapped GFF3', epilog='') | |
248 parser.add_argument('blastxml', type=file, help='Blast XML Output') | |
249 parser.add_argument('--min_gap', type=int, help='Maximum gap size before generating a new match_part', default=3) | |
250 parser.add_argument('--trim', action='store_true', help='Trim blast hits to be only as long as the parent feature') | |
251 parser.add_argument('--trim_end', action='store_true', help='Cut blast results off at end of gene') | |
252 args = parser.parse_args() | |
253 | |
254 result = blastxml2gff3(**vars(args)) | |
255 | |
256 GFF.write(result, sys.stdout) |