comparison gbk_to_gff.py @ 10:c42c69aa81f8

fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
author vipints <vipin@cbio.mskcc.org>
date Thu, 23 Apr 2015 18:01:45 -0400
parents
children
comparison
equal deleted inserted replaced
9:7d67331368f3 10:c42c69aa81f8
1 #!/usr/bin/env python
2 """
3 Convert data from Genbank format to GFF.
4
5 Usage:
6 python gbk_to_gff.py in.gbk > out.gff
7
8 Requirements:
9 BioPython:- http://biopython.org/
10 helper.py:- https://github.com/vipints/GFFtools-GX/blob/master/helper.py
11
12 Copyright (C)
13 2009-2012 Friedrich Miescher Laboratory of the Max Planck Society, Tubingen, Germany.
14 2012-2015 Memorial Sloan Kettering Cancer Center New York City, USA.
15 """
16
17 import os
18 import re
19 import sys
20 import helper
21 import collections
22 from Bio import SeqIO
23
24 def feature_table(chr_id, source, orient, genes, transcripts, cds, exons, unk):
25 """
26 Write the feature information
27 """
28 for gname, ginfo in genes.items():
29 line = [str(chr_id),
30 'gbk2gff',
31 ginfo[3],
32 str(ginfo[0]),
33 str(ginfo[1]),
34 '.',
35 ginfo[2],
36 '.',
37 'ID=%s;Name=%s' % (str(gname), str(gname))]
38 sys.stdout.write('\t'.join(line)+"\n")
39 ## construct the transcript line is not defined in the original file
40 t_line = [str(chr_id), 'gbk2gff', source, 0, 1, '.', ginfo[2], '.']
41
42 if not transcripts:
43 t_line.append('ID=Transcript:%s;Parent=%s' % (str(gname), str(gname)))
44
45 if exons: ## get the entire transcript region from the defined feature
46 t_line[3] = str(exons[gname][0][0])
47 t_line[4] = str(exons[gname][0][-1])
48 elif cds:
49 t_line[3] = str(cds[gname][0][0])
50 t_line[4] = str(cds[gname][0][-1])
51
52 if not cds:
53 t_line[2] = 'transcript'
54 else:
55 t_line[2] = 'mRNA'
56 sys.stdout.write('\t'.join(t_line)+"\n")
57
58 if exons:
59 exon_line_print(t_line, exons[gname], 'Transcript:'+str(gname), 'exon')
60
61 if cds:
62 exon_line_print(t_line, cds[gname], 'Transcript:'+str(gname), 'CDS')
63 if not exons:
64 exon_line_print(t_line, cds[gname], 'Transcript:'+str(gname), 'exon')
65
66 else: ## transcript is defined
67 for idx in transcripts[gname]:
68 t_line[2] = idx[3]
69 t_line[3] = str(idx[0])
70 t_line[4] = str(idx[1])
71 t_line.append('ID='+str(idx[2])+';Parent='+str(gname))
72 sys.stdout.write('\t'.join(t_line)+"\n")
73
74 ## feature line print call
75 if exons:
76 exon_line_print(t_line, exons[gname], str(idx[2]), 'exon')
77 if cds:
78 exon_line_print(t_line, cds[gname], str(idx[2]), 'CDS')
79 if not exons:
80 exon_line_print(t_line, cds[gname], str(idx[2]), 'exon')
81
82 if len(genes) == 0: ## feature entry with fragment information
83
84 line = [str(chr_id), 'gbk2gff', source, 0, 1, '.', orient, '.']
85 fStart = fStop = None
86
87 for eid, ex in cds.items():
88 fStart = ex[0][0]
89 fStop = ex[0][-1]
90
91 for eid, ex in exons.items():
92 fStart = ex[0][0]
93 fStop = ex[0][-1]
94
95 if fStart or fStart:
96
97 line[2] = 'gene'
98 line[3] = str(fStart)
99 line[4] = str(fStop)
100 line.append('ID=Unknown_Gene_' + str(unk) + ';Name=Unknown_Gene_' + str(unk))
101 sys.stdout.write('\t'.join(line)+"\n")
102
103 if not cds:
104 line[2] = 'transcript'
105 else:
106 line[2] = 'mRNA'
107
108 line[8] = 'ID=Unknown_Transcript_' + str(unk) + ';Parent=Unknown_Gene_' + str(unk)
109 sys.stdout.write('\t'.join(line)+"\n")
110
111 if exons:
112 exon_line_print(line, cds[None], 'Unknown_Transcript_' + str(unk), 'exon')
113
114 if cds:
115 exon_line_print(line, cds[None], 'Unknown_Transcript_' + str(unk), 'CDS')
116 if not exons:
117 exon_line_print(line, cds[None], 'Unknown_Transcript_' + str(unk), 'exon')
118
119 unk +=1
120
121 return unk
122
123
124 def exon_line_print(temp_line, trx_exons, parent, ftype):
125 """
126 Print the EXON feature line
127 """
128 for ex in trx_exons:
129 temp_line[2] = ftype
130 temp_line[3] = str(ex[0])
131 temp_line[4] = str(ex[1])
132 temp_line[8] = 'Parent=%s' % parent
133 sys.stdout.write('\t'.join(temp_line)+"\n")
134
135
136 def gbk_parse(fname):
137 """
138 Extract genome annotation recods from genbank format
139
140 @args fname: gbk file name
141 @type fname: str
142 """
143 fhand = helper.open_file(gbkfname)
144 unk = 1
145
146 for record in SeqIO.parse(fhand, "genbank"):
147 gene_tags = dict()
148 tx_tags = collections.defaultdict(list)
149 exon = collections.defaultdict(list)
150 cds = collections.defaultdict(list)
151 mol_type, chr_id = None, None
152
153 for rec in record.features:
154
155 if rec.type == 'source':
156 try:
157 mol_type = rec.qualifiers['mol_type'][0]
158 except:
159 mol_type = '.'
160 pass
161 try:
162 chr_id = rec.qualifiers['chromosome'][0]
163 except:
164 chr_id = record.name
165 continue
166
167 strand='-'
168 strand='+' if rec.strand>0 else strand
169
170 fid = None
171 try:
172 fid = rec.qualifiers['gene'][0]
173 except:
174 pass
175
176 transcript_id = None
177 try:
178 transcript_id = rec.qualifiers['transcript_id'][0]
179 except:
180 pass
181
182 if re.search(r'gene', rec.type):
183 gene_tags[fid] = (rec.location._start.position+1,
184 rec.location._end.position,
185 strand,
186 rec.type
187 )
188 elif rec.type == 'exon':
189 exon[fid].append((rec.location._start.position+1,
190 rec.location._end.position))
191 elif rec.type=='CDS':
192 cds[fid].append((rec.location._start.position+1,
193 rec.location._end.position))
194 else:
195 # get all transcripts
196 if transcript_id:
197 tx_tags[fid].append((rec.location._start.position+1,
198 rec.location._end.position,
199 transcript_id,
200 rec.type))
201 # record extracted, generate feature table
202 unk = feature_table(chr_id, mol_type, strand, gene_tags, tx_tags, cds, exon, unk)
203
204 fhand.close()
205
206
207 if __name__=='__main__':
208
209 try:
210 gbkfname = sys.argv[1]
211 except:
212 print __doc__
213 sys.exit(-1)
214
215 ## extract gbk records
216 gbk_parse(gbkfname)