comparison gbk_to_gff.py @ 5:6e589f267c14

Uploaded
author devteam
date Tue, 04 Nov 2014 12:15:19 -0500
parents
children
comparison
equal deleted inserted replaced
4:619e0fcd9126 5:6e589f267c14
1 #!/usr/bin/env python
2 """
3 Convert data from Genbank format to GFF.
4
5 Usage:
6 python gbk_to_gff.py in.gbk > out.gff
7
8 Requirements:
9 BioPython:- http://biopython.org/
10 helper.py : https://github.com/vipints/GFFtools-GX/blob/master/helper.py
11
12 Copyright (C)
13 2009-2012 Friedrich Miescher Laboratory of the Max Planck Society, Tubingen, Germany.
14 2012-2014 Memorial Sloan Kettering Cancer Center New York City, USA.
15 """
16
17 import os
18 import re
19 import sys
20 import collections
21 from Bio import SeqIO
22 import helper
23
24 def feature_table(chr_id, source, orient, genes, transcripts, cds, exons, unk):
25 """
26 Write the feature information
27 """
28
29 for gname, ginfo in genes.items():
30 line = [str(chr_id),
31 'gbk_to_gff',
32 ginfo[3],
33 str(ginfo[0]),
34 str(ginfo[1]),
35 '.',
36 ginfo[2],
37 '.',
38 'ID=%s;Name=%s' % (str(gname), str(gname))]
39 print '\t'.join(line)
40 ## construct the transcript line is not defined in the original file
41 t_line = [str(chr_id), 'gbk_to_gff', source, 0, 1, '.', ginfo[2], '.']
42
43 if not transcripts:
44 t_line.append('ID=Transcript:%s;Parent=%s' % (str(gname), str(gname)))
45
46 if exons: ## get the entire transcript region from the defined feature
47 t_line[3] = str(exons[gname][0][0])
48 t_line[4] = str(exons[gname][0][-1])
49 elif cds:
50 t_line[3] = str(cds[gname][0][0])
51 t_line[4] = str(cds[gname][0][-1])
52 print '\t'.join(t_line)
53
54 if exons:
55 exon_line_print(t_line, exons[gname], 'Transcript:'+str(gname), 'exon')
56
57 if cds:
58 exon_line_print(t_line, cds[gname], 'Transcript:'+str(gname), 'CDS')
59 if not exons:
60 exon_line_print(t_line, cds[gname], 'Transcript:'+str(gname), 'exon')
61
62 else: ## transcript is defined
63 for idx in transcripts[gname]:
64 t_line[2] = idx[3]
65 t_line[3] = str(idx[0])
66 t_line[4] = str(idx[1])
67 t_line.append('ID='+str(idx[2])+';Parent='+str(gname))
68 print '\t'.join(t_line)
69
70 ## feature line print call
71 if exons:
72 exon_line_print(t_line, exons[gname], str(idx[2]), 'exon')
73 if cds:
74 exon_line_print(t_line, cds[gname], str(idx[2]), 'CDS')
75 if not exons:
76 exon_line_print(t_line, cds[gname], str(idx[2]), 'exon')
77
78 if len(genes) == 0: ## feature entry with fragment information
79
80 line = [str(chr_id), 'gbk_to_gff', source, 0, 1, '.', orient, '.']
81 fStart = fStop = None
82
83 for eid, ex in cds.items():
84 fStart = ex[0][0]
85 fStop = ex[0][-1]
86
87 for eid, ex in exons.items():
88 fStart = ex[0][0]
89 fStop = ex[0][-1]
90
91 if fStart or fStart:
92
93 line[2] = 'gene'
94 line[3] = str(fStart)
95 line[4] = str(fStop)
96 line.append('ID=Unknown_Gene_' + str(unk) + ';Name=Unknown_Gene_' + str(unk))
97 print "\t".join(line)
98
99 if not cds:
100 line[2] = 'transcript'
101 else:
102 line[2] = 'mRNA'
103
104 line[8] = 'ID=Unknown_Transcript_' + str(unk) + ';Parent=Unknown_Gene_' + str(unk)
105 print "\t".join(line)
106
107 if exons:
108 exon_line_print(line, cds[None], 'Unknown_Transcript_' + str(unk), 'exon')
109
110 if cds:
111 exon_line_print(line, cds[None], 'Unknown_Transcript_' + str(unk), 'CDS')
112 if not exons:
113 exon_line_print(line, cds[None], 'Unknown_Transcript_' + str(unk), 'exon')
114
115 unk +=1
116
117 return unk
118
119 def exon_line_print(temp_line, trx_exons, parent, ftype):
120 """
121 Print the EXON feature line
122 """
123
124 for ex in trx_exons:
125 temp_line[2] = ftype
126 temp_line[3] = str(ex[0])
127 temp_line[4] = str(ex[1])
128 temp_line[8] = 'Parent=%s' % parent
129 print '\t'.join(temp_line)
130
131 def gbk_parse(fname):
132 """
133 Extract genome annotation recods from genbank format
134
135 @args fname: gbk file name
136 @type fname: str
137 """
138
139 fhand = helper.open_file(gbkfname)
140 unk = 1
141
142 for record in SeqIO.parse(fhand, "genbank"):
143
144 gene_tags = dict()
145 tx_tags = collections.defaultdict(list)
146 exon = collections.defaultdict(list)
147 cds = collections.defaultdict(list)
148 mol_type, chr_id = None, None
149
150 for rec in record.features:
151
152 if rec.type == 'source':
153 try:
154 mol_type = rec.qualifiers['mol_type'][0]
155 except:
156 mol_type = '.'
157 pass
158 try:
159 chr_id = rec.qualifiers['chromosome'][0]
160 except:
161 chr_id = record.name
162 continue
163
164 strand='-'
165 strand='+' if rec.strand>0 else strand
166
167 fid = None
168 try:
169 fid = rec.qualifiers['gene'][0]
170 except:
171 pass
172
173 transcript_id = None
174 try:
175 transcript_id = rec.qualifiers['transcript_id'][0]
176 except:
177 pass
178
179 if re.search(r'gene', rec.type):
180 gene_tags[fid] = (rec.location._start.position+1,
181 rec.location._end.position,
182 strand,
183 rec.type
184 )
185 elif rec.type == 'exon':
186 exon[fid].append((rec.location._start.position+1,
187 rec.location._end.position))
188 elif rec.type=='CDS':
189 cds[fid].append((rec.location._start.position+1,
190 rec.location._end.position))
191 else:
192 # get all transcripts
193 if transcript_id:
194 tx_tags[fid].append((rec.location._start.position+1,
195 rec.location._end.position,
196 transcript_id,
197 rec.type))
198 # record extracted, generate feature table
199 unk = feature_table(chr_id, mol_type, strand, gene_tags, tx_tags, cds, exon, unk)
200
201 fhand.close()
202
203
204 if __name__=='__main__':
205
206 try:
207 gbkfname = sys.argv[1]
208 except:
209 print __doc__
210 sys.exit(-1)
211
212 ## extract gbk records
213 gbk_parse(gbkfname)