comparison blastxml_to_gapped_gff3.py @ 2:561e827baa5f draft default tip

planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/blastxml_to_gapped_gff3 commit 908f16ea4eb082227437dc93e06e8cb742f5a257
author iuc
date Wed, 15 Nov 2017 15:14:58 -0500
parents 877cd0833221
children
comparison
equal deleted inserted replaced
1:877cd0833221 2:561e827baa5f
4 import logging 4 import logging
5 import re 5 import re
6 import sys 6 import sys
7 7
8 from BCBio import GFF 8 from BCBio import GFF
9
10 logging.basicConfig(level=logging.INFO) 9 logging.basicConfig(level=logging.INFO)
11 log = logging.getLogger(name='blastxml2gff3') 10 log = logging.getLogger(name='blastxml2gff3')
12
13 __author__ = "Eric Rasche"
14 __version__ = "0.4.0"
15 __maintainer__ = "Eric Rasche"
16 __email__ = "esr@tamu.edu"
17 11
18 __doc__ = """ 12 __doc__ = """
19 BlastXML files, when transformed to GFF3, do not normally show gaps in the 13 BlastXML files, when transformed to GFF3, do not normally show gaps in the
20 blast hits. This tool aims to fill that "gap". 14 blast hits. This tool aims to fill that "gap".
21 """ 15 """
22 16
23 17
24 def blastxml2gff3(blastxml, min_gap=3, trim=False, trim_end=False): 18 def blastxml2gff3(blastxml, min_gap=3, trim=False, trim_end=False, include_seq=False):
25 from Bio.Blast import NCBIXML 19 from Bio.Blast import NCBIXML
26 from Bio.Seq import Seq 20 from Bio.Seq import Seq
27 from Bio.SeqRecord import SeqRecord 21 from Bio.SeqRecord import SeqRecord
28 from Bio.SeqFeature import SeqFeature, FeatureLocation 22 from Bio.SeqFeature import SeqFeature, FeatureLocation
29 23
30 blast_records = NCBIXML.parse(blastxml) 24 blast_records = NCBIXML.parse(blastxml)
31 records = [] 25 for idx_record, record in enumerate(blast_records):
32 for record in blast_records:
33 # http://www.sequenceontology.org/browser/release_2.4/term/SO:0000343 26 # http://www.sequenceontology.org/browser/release_2.4/term/SO:0000343
34 match_type = { # Currently we can only handle BLASTN, BLASTP 27 match_type = { # Currently we can only handle BLASTN, BLASTP
35 'BLASTN': 'nucleotide_match', 28 'BLASTN': 'nucleotide_match',
36 'BLASTP': 'protein_match', 29 'BLASTP': 'protein_match',
37 }.get(record.application, 'match') 30 }.get(record.application, 'match')
38 31
39 rec = SeqRecord(Seq("ACTG"), id=record.query) 32 recid = record.query
40 for hit in record.alignments: 33 if ' ' in recid:
41 for hsp in hit.hsps: 34 recid = recid[0:recid.index(' ')]
35
36 rec = SeqRecord(Seq("ACTG"), id=recid)
37 for idx_hit, hit in enumerate(record.alignments):
38 for idx_hsp, hsp in enumerate(hit.hsps):
42 qualifiers = { 39 qualifiers = {
40 "ID": 'b2g.%s.%s.%s' % (idx_record, idx_hit, idx_hsp),
43 "source": "blast", 41 "source": "blast",
44 "score": hsp.expect, 42 "score": hsp.expect,
45 "accession": hit.accession, 43 "accession": hit.accession,
46 "hit_id": hit.hit_id, 44 "hit_id": hit.hit_id,
47 "length": hit.length, 45 "length": hit.length,
48 "hit_titles": hit.title.split(' >') 46 "hit_titles": hit.title.split(' >'),
49 } 47 }
48 if include_seq:
49 qualifiers.update({
50 'blast_qseq': hsp.query,
51 'blast_sseq': hsp.sbjct,
52 'blast_mseq': hsp.match,
53 })
54
55 for prop in ('score', 'bits', 'identities', 'positives',
56 'gaps', 'align_length', 'strand', 'frame',
57 'query_start', 'query_end', 'sbjct_start',
58 'sbjct_end'):
59 qualifiers['blast_' + prop] = getattr(hsp, prop, None)
60
50 desc = hit.title.split(' >')[0] 61 desc = hit.title.split(' >')[0]
51 qualifiers['description'] = desc[desc.index(' '):] 62 qualifiers['description'] = desc[desc.index(' '):]
52 63
53 # This required a fair bit of sketching out/match to figure out 64 # This required a fair bit of sketching out/match to figure out
54 # the first time. 65 # the first time.
60 # may be longer than the parent feature, so we use the supplied 71 # may be longer than the parent feature, so we use the supplied
61 # subject/hit length to calculate the real ending of the target 72 # subject/hit length to calculate the real ending of the target
62 # protein. 73 # protein.
63 parent_match_end = hsp.query_start + hit.length + hsp.query.count('-') 74 parent_match_end = hsp.query_start + hit.length + hsp.query.count('-')
64 75
65 # However, if the user requests that we trim the feature, then 76 # If we trim the left end, we need to trim without losing information.
66 # we need to cut the ``match`` start to 0 to match the parent feature. 77 used_parent_match_start = parent_match_start
67 # We'll also need to cut the end to match the query's end. It (maybe)
68 # should be the feature end? But we don't have access to that data, so
69 # We settle for this.
70 if trim: 78 if trim:
71 if parent_match_start < 1: 79 if parent_match_start < 1:
72 parent_match_start = 0 80 used_parent_match_start = 0
73 81
74 if trim or trim_end: 82 if trim or trim_end:
75 if parent_match_end > hsp.query_end: 83 if parent_match_end > hsp.query_end:
76 parent_match_end = hsp.query_end + 1 84 parent_match_end = hsp.query_end + 1
77 85
78 # The ``match`` feature will hold one or more ``match_part``s 86 # The ``match`` feature will hold one or more ``match_part``s
79 top_feature = SeqFeature( 87 top_feature = SeqFeature(
80 FeatureLocation(parent_match_start, parent_match_end), 88 FeatureLocation(used_parent_match_start, parent_match_end),
81 type=match_type, strand=0, 89 type=match_type, strand=0,
82 qualifiers=qualifiers 90 qualifiers=qualifiers
83 ) 91 )
84 92
85 # Unlike the parent feature, ``match_part``s have sources. 93 # Unlike the parent feature, ``match_part``s have sources.
86 part_qualifiers = { 94 part_qualifiers = {
87 "source": "blast", 95 "source": "blast",
88 } 96 }
89 top_feature.sub_features = [] 97 top_feature.sub_features = []
90 for start, end, cigar in generate_parts(hsp.query, hsp.match, 98 for idx_part, (start, end, cigar) in \
91 hsp.sbjct, 99 enumerate(generate_parts(hsp.query, hsp.match,
92 ignore_under=min_gap): 100 hsp.sbjct,
101 ignore_under=min_gap)):
93 part_qualifiers['Gap'] = cigar 102 part_qualifiers['Gap'] = cigar
94 part_qualifiers['ID'] = hit.hit_id 103 part_qualifiers['ID'] = qualifiers['ID'] + ('.%s' % idx_part)
95 104
96 if trim: 105 # Otherwise, we have to account for the subject start's location
97 # If trimming, then we start relative to the 106 match_part_start = parent_match_start + hsp.sbjct_start + start - 1
98 # match's start
99 match_part_start = parent_match_start + start
100 else:
101 # Otherwise, we have to account for the subject start's location
102 match_part_start = parent_match_start + hsp.sbjct_start + start - 1
103 107
104 # We used to use hsp.align_length here, but that includes 108 # We used to use hsp.align_length here, but that includes
105 # gaps in the parent sequence 109 # gaps in the parent sequence
106 # 110 #
107 # Furthermore align_length will give calculation errors in weird places 111 # Furthermore align_length will give calculation errors in weird places
115 qualifiers=copy.deepcopy(part_qualifiers)) 119 qualifiers=copy.deepcopy(part_qualifiers))
116 ) 120 )
117 121
118 rec.features.append(top_feature) 122 rec.features.append(top_feature)
119 rec.annotations = {} 123 rec.annotations = {}
120 records.append(rec) 124 yield rec
121 return records
122 125
123 126
124 def __remove_query_gaps(query, match, subject): 127 def __remove_query_gaps(query, match, subject):
125 """remove positions in all three based on gaps in query 128 """remove positions in all three based on gaps in query
126 129
251 return "" 254 return ""
252 255
253 256
254 if __name__ == '__main__': 257 if __name__ == '__main__':
255 parser = argparse.ArgumentParser(description='Convert Blast XML to gapped GFF3', epilog='') 258 parser = argparse.ArgumentParser(description='Convert Blast XML to gapped GFF3', epilog='')
256 parser.add_argument('blastxml', type=open, help='Blast XML Output') 259 parser.add_argument('blastxml', type=argparse.FileType("r"), help='Blast XML Output')
257 parser.add_argument('--min_gap', type=int, help='Maximum gap size before generating a new match_part', default=3) 260 parser.add_argument('--min_gap', type=int, help='Maximum gap size before generating a new match_part', default=3)
258 parser.add_argument('--trim', action='store_true', help='Trim blast hits to be only as long as the parent feature') 261 parser.add_argument('--trim', action='store_true', help='Trim blast hits to be only as long as the parent feature')
259 parser.add_argument('--trim_end', action='store_true', help='Cut blast results off at end of gene') 262 parser.add_argument('--trim_end', action='store_true', help='Cut blast results off at end of gene')
263 parser.add_argument('--include_seq', action='store_true', help='Include sequence')
260 args = parser.parse_args() 264 args = parser.parse_args()
261 265
262 result = blastxml2gff3(**vars(args)) 266 for rec in blastxml2gff3(**vars(args)):
263 GFF.write(result, sys.stdout) 267 if len(rec.features):
268 GFF.write([rec], sys.stdout)