Mercurial > repos > vipints > fml_gff3togtf
comparison gbk_to_gff.py @ 5:6e589f267c14
Uploaded
author | devteam |
---|---|
date | Tue, 04 Nov 2014 12:15:19 -0500 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
4:619e0fcd9126 | 5:6e589f267c14 |
---|---|
1 #!/usr/bin/env python | |
2 """ | |
3 Convert data from Genbank format to GFF. | |
4 | |
5 Usage: | |
6 python gbk_to_gff.py in.gbk > out.gff | |
7 | |
8 Requirements: | |
9 BioPython:- http://biopython.org/ | |
10 helper.py : https://github.com/vipints/GFFtools-GX/blob/master/helper.py | |
11 | |
12 Copyright (C) | |
13 2009-2012 Friedrich Miescher Laboratory of the Max Planck Society, Tubingen, Germany. | |
14 2012-2014 Memorial Sloan Kettering Cancer Center New York City, USA. | |
15 """ | |
16 | |
17 import os | |
18 import re | |
19 import sys | |
20 import collections | |
21 from Bio import SeqIO | |
22 import helper | |
23 | |
24 def feature_table(chr_id, source, orient, genes, transcripts, cds, exons, unk): | |
25 """ | |
26 Write the feature information | |
27 """ | |
28 | |
29 for gname, ginfo in genes.items(): | |
30 line = [str(chr_id), | |
31 'gbk_to_gff', | |
32 ginfo[3], | |
33 str(ginfo[0]), | |
34 str(ginfo[1]), | |
35 '.', | |
36 ginfo[2], | |
37 '.', | |
38 'ID=%s;Name=%s' % (str(gname), str(gname))] | |
39 print '\t'.join(line) | |
40 ## construct the transcript line is not defined in the original file | |
41 t_line = [str(chr_id), 'gbk_to_gff', source, 0, 1, '.', ginfo[2], '.'] | |
42 | |
43 if not transcripts: | |
44 t_line.append('ID=Transcript:%s;Parent=%s' % (str(gname), str(gname))) | |
45 | |
46 if exons: ## get the entire transcript region from the defined feature | |
47 t_line[3] = str(exons[gname][0][0]) | |
48 t_line[4] = str(exons[gname][0][-1]) | |
49 elif cds: | |
50 t_line[3] = str(cds[gname][0][0]) | |
51 t_line[4] = str(cds[gname][0][-1]) | |
52 print '\t'.join(t_line) | |
53 | |
54 if exons: | |
55 exon_line_print(t_line, exons[gname], 'Transcript:'+str(gname), 'exon') | |
56 | |
57 if cds: | |
58 exon_line_print(t_line, cds[gname], 'Transcript:'+str(gname), 'CDS') | |
59 if not exons: | |
60 exon_line_print(t_line, cds[gname], 'Transcript:'+str(gname), 'exon') | |
61 | |
62 else: ## transcript is defined | |
63 for idx in transcripts[gname]: | |
64 t_line[2] = idx[3] | |
65 t_line[3] = str(idx[0]) | |
66 t_line[4] = str(idx[1]) | |
67 t_line.append('ID='+str(idx[2])+';Parent='+str(gname)) | |
68 print '\t'.join(t_line) | |
69 | |
70 ## feature line print call | |
71 if exons: | |
72 exon_line_print(t_line, exons[gname], str(idx[2]), 'exon') | |
73 if cds: | |
74 exon_line_print(t_line, cds[gname], str(idx[2]), 'CDS') | |
75 if not exons: | |
76 exon_line_print(t_line, cds[gname], str(idx[2]), 'exon') | |
77 | |
78 if len(genes) == 0: ## feature entry with fragment information | |
79 | |
80 line = [str(chr_id), 'gbk_to_gff', source, 0, 1, '.', orient, '.'] | |
81 fStart = fStop = None | |
82 | |
83 for eid, ex in cds.items(): | |
84 fStart = ex[0][0] | |
85 fStop = ex[0][-1] | |
86 | |
87 for eid, ex in exons.items(): | |
88 fStart = ex[0][0] | |
89 fStop = ex[0][-1] | |
90 | |
91 if fStart or fStart: | |
92 | |
93 line[2] = 'gene' | |
94 line[3] = str(fStart) | |
95 line[4] = str(fStop) | |
96 line.append('ID=Unknown_Gene_' + str(unk) + ';Name=Unknown_Gene_' + str(unk)) | |
97 print "\t".join(line) | |
98 | |
99 if not cds: | |
100 line[2] = 'transcript' | |
101 else: | |
102 line[2] = 'mRNA' | |
103 | |
104 line[8] = 'ID=Unknown_Transcript_' + str(unk) + ';Parent=Unknown_Gene_' + str(unk) | |
105 print "\t".join(line) | |
106 | |
107 if exons: | |
108 exon_line_print(line, cds[None], 'Unknown_Transcript_' + str(unk), 'exon') | |
109 | |
110 if cds: | |
111 exon_line_print(line, cds[None], 'Unknown_Transcript_' + str(unk), 'CDS') | |
112 if not exons: | |
113 exon_line_print(line, cds[None], 'Unknown_Transcript_' + str(unk), 'exon') | |
114 | |
115 unk +=1 | |
116 | |
117 return unk | |
118 | |
119 def exon_line_print(temp_line, trx_exons, parent, ftype): | |
120 """ | |
121 Print the EXON feature line | |
122 """ | |
123 | |
124 for ex in trx_exons: | |
125 temp_line[2] = ftype | |
126 temp_line[3] = str(ex[0]) | |
127 temp_line[4] = str(ex[1]) | |
128 temp_line[8] = 'Parent=%s' % parent | |
129 print '\t'.join(temp_line) | |
130 | |
131 def gbk_parse(fname): | |
132 """ | |
133 Extract genome annotation recods from genbank format | |
134 | |
135 @args fname: gbk file name | |
136 @type fname: str | |
137 """ | |
138 | |
139 fhand = helper.open_file(gbkfname) | |
140 unk = 1 | |
141 | |
142 for record in SeqIO.parse(fhand, "genbank"): | |
143 | |
144 gene_tags = dict() | |
145 tx_tags = collections.defaultdict(list) | |
146 exon = collections.defaultdict(list) | |
147 cds = collections.defaultdict(list) | |
148 mol_type, chr_id = None, None | |
149 | |
150 for rec in record.features: | |
151 | |
152 if rec.type == 'source': | |
153 try: | |
154 mol_type = rec.qualifiers['mol_type'][0] | |
155 except: | |
156 mol_type = '.' | |
157 pass | |
158 try: | |
159 chr_id = rec.qualifiers['chromosome'][0] | |
160 except: | |
161 chr_id = record.name | |
162 continue | |
163 | |
164 strand='-' | |
165 strand='+' if rec.strand>0 else strand | |
166 | |
167 fid = None | |
168 try: | |
169 fid = rec.qualifiers['gene'][0] | |
170 except: | |
171 pass | |
172 | |
173 transcript_id = None | |
174 try: | |
175 transcript_id = rec.qualifiers['transcript_id'][0] | |
176 except: | |
177 pass | |
178 | |
179 if re.search(r'gene', rec.type): | |
180 gene_tags[fid] = (rec.location._start.position+1, | |
181 rec.location._end.position, | |
182 strand, | |
183 rec.type | |
184 ) | |
185 elif rec.type == 'exon': | |
186 exon[fid].append((rec.location._start.position+1, | |
187 rec.location._end.position)) | |
188 elif rec.type=='CDS': | |
189 cds[fid].append((rec.location._start.position+1, | |
190 rec.location._end.position)) | |
191 else: | |
192 # get all transcripts | |
193 if transcript_id: | |
194 tx_tags[fid].append((rec.location._start.position+1, | |
195 rec.location._end.position, | |
196 transcript_id, | |
197 rec.type)) | |
198 # record extracted, generate feature table | |
199 unk = feature_table(chr_id, mol_type, strand, gene_tags, tx_tags, cds, exon, unk) | |
200 | |
201 fhand.close() | |
202 | |
203 | |
204 if __name__=='__main__': | |
205 | |
206 try: | |
207 gbkfname = sys.argv[1] | |
208 except: | |
209 print __doc__ | |
210 sys.exit(-1) | |
211 | |
212 ## extract gbk records | |
213 gbk_parse(gbkfname) |