5
|
1 #!/usr/bin/env python
|
|
2 """
|
|
3 Extract genome annotation from a GFF (a tab delimited format for storing sequence features and annotations) file.
|
|
4
|
|
5 Requirements:
|
|
6 Numpy :- http://numpy.org/
|
|
7 Scipy :- http://scipy.org/
|
|
8
|
|
9 Copyright (C)
|
|
10
|
|
11 2009-2012 Friedrich Miescher Laboratory of the Max Planck Society, Tubingen, Germany.
|
|
12 2012-2014 Memorial Sloan Kettering Cancer Center, New York City, USA.
|
|
13 """
|
|
14
|
|
15 import re
|
|
16 import os
|
|
17 import sys
|
|
18 import urllib
|
|
19 import numpy as np
|
|
20 import scipy.io as sio
|
|
21 from collections import defaultdict
|
|
22 import helper as utils
|
|
23
|
|
def attribute_tags(col9):
    """
    Split the key-value tags from the attribute column, it takes column number 9 from GTF/GFF file

    Detects whether the column follows GFF3 (`key=value;...`) or GTF
    (`key "value"; ...`) conventions and returns both the detected format
    flag and the parsed tags.

    @args col9: attribute column from GFF file
    @type col9: str
    @return: (is_gff, info) where is_gff is True for GFF3-style attributes
        and info maps each tag name to a list of (percent-decoded) values
    """
    info = defaultdict(list)
    is_gff = False

    if not col9:
        return is_gff, info

    # urllib.unquote moved to urllib.parse.unquote in Python 3;
    # resolve whichever is available so the parser runs on both.
    try:
        unquote = urllib.unquote
    except AttributeError:
        from urllib.parse import unquote

    # trim the line ending semi-colon ucsc may have some white-space
    col9 = col9.rstrip(';| ')
    # attributes from 9th column; try the separator variants from the
    # loosest (" ; ") to the tightest (";")
    atbs = col9.split(" ; ")
    if len(atbs) == 1:
        atbs = col9.split("; ")
    if len(atbs) == 1:
        atbs = col9.split(";")
    # check the GFF3 pattern which has key value pairs like: ID=gene1
    gff3_pat = re.compile(r"\w+=")
    # sometime GTF have: gene_id uc002zkg.1;
    gtf_pat = re.compile(r"\s?\w+\s")

    key_vals = []

    if gff3_pat.match(atbs[0]): # gff3 pattern
        is_gff = True
        key_vals = [at.split('=') for at in atbs]
    elif gtf_pat.match(atbs[0]): # gtf pattern
        for at in atbs:
            key_vals.append(at.strip().split(" ", 1))
    else:
        # to handle attribute column has only single value
        key_vals.append(['ID', atbs[0]])
    # get key, val items
    for item in key_vals:
        key, val = item
        # replace the double qoutes from feature identifier
        val = re.sub('"', '', val)
        # replace the web formating place holders to plain text format
        info[key].extend([unquote(v) for v in val.split(',') if v])

    return is_gff, info
|
|
70
|
|
def spec_features_keywd(gff_parts):
    """
    Specify the feature key word according to the GFF specifications

    Normalises GTF/GFF2-style attribute keys onto the GFF3 vocabulary:
    copies the first available transcript identifier into "Parent", the
    first gene identifier into "GParent", and wires parent/child links
    for "Transcript"/"CDS" flat keys. The input dict is mutated in place.

    @args gff_parts: one parsed record with 'info', 'type' and 'id' keys
    @type gff_parts: dict
    @return: the same dict, updated
    """
    for t_id in ["transcript_id", "transcriptId", "proteinId"]:
        try:
            gff_parts["info"]["Parent"] = gff_parts["info"][t_id]
            break
        except KeyError:
            pass
    for g_id in ["gene_id", "geneid", "geneId", "name", "gene_name", "genename"]:
        try:
            gff_parts["info"]["GParent"] = gff_parts["info"][g_id]
            break
        except KeyError:
            pass
    ## TODO key words
    for flat_name in ["Transcript", "CDS"]:
        # dict.has_key was removed in Python 3; `in` is equivalent on 2 and 3
        if flat_name in gff_parts["info"]:
            # parents
            if gff_parts['type'] in [flat_name] or re.search(r'transcript', gff_parts['type'], re.IGNORECASE):
                if not gff_parts['id']:
                    gff_parts['id'] = gff_parts['info'][flat_name][0]
                    #gff_parts["info"]["ID"] = [gff_parts["id"]]
            # children
            elif gff_parts["type"] in ["intron", "exon", "three_prime_UTR",
                        "coding_exon", "five_prime_UTR", "CDS", "stop_codon",
                        "start_codon"]:
                gff_parts["info"]["Parent"] = gff_parts["info"][flat_name]
            break
    return gff_parts
|
|
105
|
|
def Parse(ga_file):
    """
    Parsing GFF/GTF file based on feature relationship, it takes the input file.

    @args ga_file: input file name
    @type ga_file: str
    @return: gene models as a numpy record array (see format_gene_models)
    """
    child_map = defaultdict(list)
    parent_map = dict()

    ga_handle = utils.open_file(ga_file)

    # ftype is True for GFF3-style attributes; initialised here so a file
    # with no data lines does not leave it unbound below the loop
    ftype = False

    for rec in ga_handle:
        rec = rec.strip('\n\r')

        # skip empty line fasta identifier and commented line
        if not rec or rec[0] in ['#', '>']:
            continue
        # skip the genome sequence
        if not re.search(r'\t', rec):
            continue

        parts = rec.split('\t')
        assert len(parts) >= 8, rec

        # process the attribute column (9th column)
        ftype, tags = attribute_tags(parts[-1])
        if not tags: # skip the line if no attribute column.
            continue

        # extract fields
        if parts[1]:
            tags["source"] = parts[1]
        if parts[7]:
            tags["phase"] = parts[7]

        gff_info = dict()
        gff_info['info'] = dict(tags)
        gff_info["is_gff3"] = ftype
        gff_info['chr'] = parts[0]
        gff_info['score'] = parts[5]

        # only records with concrete start/stop coordinates are kept
        if parts[3] and parts[4]:
            gff_info['location'] = [int(parts[3]),
                                    int(parts[4])]
            gff_info['type'] = parts[2]
            gff_info['id'] = tags.get('ID', [''])[0]
            if parts[6] in ['?', '.']:
                parts[6] = None
            gff_info['strand'] = parts[6]

            # key word according to the GFF spec.
            # is_gff3 flag is false check this condition and get the attribute fields
            if not ftype:
                gff_info = spec_features_keywd(gff_info)

            # link the feature relationships
            # (dict.has_key was removed in Python 3; `in` works on 2 and 3)
            if 'Parent' in gff_info['info']:
                for p in gff_info['info']['Parent']:
                    # a feature that is its own parent is treated as id-less
                    if p == gff_info['id']:
                        gff_info['id'] = ''
                        break
                rec_category = 'child'
            elif gff_info['id']:
                rec_category = 'parent'
            else:
                rec_category = 'record'

            # depends on the record category organize the features
            if rec_category == 'child':
                for p in gff_info['info']['Parent']:
                    # create the data structure based on source and feature id
                    child_map[(gff_info['chr'], gff_info['info']['source'], p)].append(
                        dict(type=gff_info['type'],
                             location=gff_info['location'],
                             strand=gff_info['strand'],
                             score=gff_info['score'],
                             ID=gff_info['id'],
                             gene_id=gff_info['info'].get('GParent', '')
                             ))
            elif rec_category == 'parent':
                parent_map[(gff_info['chr'], gff_info['info']['source'], gff_info['id'])] = dict(
                    type=gff_info['type'],
                    location=gff_info['location'],
                    strand=gff_info['strand'],
                    score=gff_info['score'],
                    name=tags.get('Name', [''])[0])
            elif rec_category == 'record':
                #TODO how to handle plain records?
                c = 1
    ga_handle.close()

    # depends on file type create parent feature
    # NOTE(review): ftype reflects the LAST data line only — assumes the
    # whole file is a single format; confirm for mixed GTF/GFF3 input.
    if not ftype:
        parent_map, child_map = create_missing_feature_type(parent_map, child_map)

    # connecting parent child relations
    # essentially the parent child features are here from any type of GTF/GFF2/GFF3 file
    gene_mat = format_gene_models(parent_map, child_map)

    return gene_mat
|
|
207
|
|
def format_gene_models(parent_nf_map, child_nf_map):
    """
    Genarate GeneObject based on the parsed file contents

    Fills one record per parent (gene) feature, attaching per-transcript
    sub-feature arrays (exons, UTRs, CDS, signal sites) collected from the
    child map, then trims unfilled trailing records.

    @args parent_nf_map: parent features with source and chromosome information
    @type parent_nf_map: collections defaultdict
    @args child_nf_map: transctipt and exon information are encoded
    @type child_nf_map: collections defaultdict
    @return: numpy record array with dtype from utils.init_gene()
    """

    g_cnt = 0
    gene = np.zeros((len(parent_nf_map),), dtype=utils.init_gene())

    for pkey, pdet in parent_nf_map.items():
        # considering only gene features
        #if not re.search(r'gene', pdet.get('type', '')):
        #    continue

        # infer the gene start and stop if not there in the parent record
        if not pdet.get('location', []):
            GNS, GNE = [], []
            # multiple number of transcripts
            for L1 in child_nf_map[pkey]:
                GNS.append(L1.get('location', [])[0])
                GNE.append(L1.get('location', [])[1])
            GNS.sort()
            GNE.sort()
            pdet['location'] = [GNS[0], GNE[-1]]

        orient = pdet.get('strand', '')
        gene[g_cnt]['id'] = g_cnt + 1
        gene[g_cnt]['chr'] = pkey[0]
        gene[g_cnt]['source'] = pkey[1]
        gene[g_cnt]['name'] = pkey[-1]
        gene[g_cnt]['start'] = pdet.get('location', [])[0]
        gene[g_cnt]['stop'] = pdet.get('location', [])[1]
        gene[g_cnt]['strand'] = orient
        gene[g_cnt]['score'] = pdet.get('score', '')

        # default value
        gene[g_cnt]['is_alt_spliced'] = gene[g_cnt]['is_alt'] = 0
        if len(child_nf_map[pkey]) > 1:
            gene[g_cnt]['is_alt_spliced'] = gene[g_cnt]['is_alt'] = 1

        # complete sub-feature for all transcripts
        # (np.object was removed in NumPy 1.24; the builtin `object` is
        # the supported spelling and behaves identically)
        dim = len(child_nf_map[pkey])
        TRS = np.zeros((dim,), dtype=object)
        TR_TYP = np.zeros((dim,), dtype=object)
        EXON = np.zeros((dim,), dtype=object)
        UTR5 = np.zeros((dim,), dtype=object)
        UTR3 = np.zeros((dim,), dtype=object)
        CDS = np.zeros((dim,), dtype=object)
        TISc = np.zeros((dim,), dtype=object)
        TSSc = np.zeros((dim,), dtype=object)
        CLV = np.zeros((dim,), dtype=object)
        CSTOP = np.zeros((dim,), dtype=object)
        TSTAT = np.zeros((dim,), dtype=object)

        # fetching corresponding transcripts
        for xq, Lv1 in enumerate(child_nf_map[pkey]):

            TID = Lv1.get('ID', '')
            TRS[xq] = np.array([TID])

            TYPE = Lv1.get('type', '')
            TR_TYP[xq] = np.array('')
            TR_TYP[xq] = np.array(TYPE) if TYPE else TR_TYP[xq]

            orient = Lv1.get('strand', '')

            # fetching different sub-features
            child_feat = defaultdict(list)
            for Lv2 in child_nf_map[(pkey[0], pkey[1], TID)]:
                E_TYP = Lv2.get('type', '')
                child_feat[E_TYP].append(Lv2.get('location'))

            # make general ascending order of coordinates
            if orient == '-':
                for etype, excod in child_feat.items():
                    if len(excod) > 1:
                        if excod[0][0] > excod[-1][0]:
                            excod.reverse()
                            child_feat[etype] = excod

            # make exon coordinate from cds and utr regions
            if not child_feat.get('exon'):
                if child_feat.get('CDS'):
                    exon_cod = utils.make_Exon_cod(orient,
                                NonetoemptyList(child_feat.get('five_prime_UTR')),
                                NonetoemptyList(child_feat.get('CDS')),
                                NonetoemptyList(child_feat.get('three_prime_UTR')))
                    child_feat['exon'] = exon_cod
                else:
                    # TODO only UTR's
                    # searching through keys to find a pattern describing exon feature
                    ex_key_pattern = [k for k in child_feat if k.endswith("exon")]
                    if ex_key_pattern:
                        child_feat['exon'] = child_feat[ex_key_pattern[0]]

            # stop_codon are seperated from CDS, add the coordinates based on strand
            if child_feat.get('stop_codon'):
                if orient == '+':
                    if child_feat.get('stop_codon')[0][0] - child_feat.get('CDS')[-1][1] == 1:
                        child_feat['CDS'][-1] = [child_feat.get('CDS')[-1][0], child_feat.get('stop_codon')[0][1]]
                    else:
                        child_feat['CDS'].append(child_feat.get('stop_codon')[0])
                elif orient == '-':
                    if child_feat.get('CDS')[0][0] - child_feat.get('stop_codon')[0][1] == 1:
                        child_feat['CDS'][0] = [child_feat.get('stop_codon')[0][0], child_feat.get('CDS')[0][1]]
                    else:
                        child_feat['CDS'].insert(0, child_feat.get('stop_codon')[0])

            # transcript signal sites
            TIS, cdsStop, TSS, cleave = [], [], [], []
            cds_status, exon_status, utr_status = 0, 0, 0

            if child_feat.get('exon'):
                TSS = [child_feat.get('exon')[-1][1]]
                TSS = [child_feat.get('exon')[0][0]] if orient == '+' else TSS
                cleave = [child_feat.get('exon')[0][0]]
                cleave = [child_feat.get('exon')[-1][1]] if orient == '+' else cleave
                exon_status = 1

            if child_feat.get('CDS'):
                if orient == '+':
                    TIS = [child_feat.get('CDS')[0][0]]
                    cdsStop = [child_feat.get('CDS')[-1][1]-3]
                else:
                    TIS = [child_feat.get('CDS')[-1][1]]
                    cdsStop = [child_feat.get('CDS')[0][0]+3]
                cds_status = 1
                # cds phase calculation
                child_feat['CDS'] = utils.add_CDS_phase(orient, child_feat.get('CDS'))

            # sub-feature status
            if child_feat.get('three_prime_UTR') or child_feat.get('five_prime_UTR'):
                utr_status = 1

            if utr_status == cds_status == exon_status == 1:
                t_status = 1
            else:
                t_status = 0

            # add sub-feature # make array for export to different out
            TSTAT[xq] = t_status
            EXON[xq] = np.array(child_feat.get('exon'), np.float64)
            UTR5[xq] = np.array(NonetoemptyList(child_feat.get('five_prime_UTR')))
            UTR3[xq] = np.array(NonetoemptyList(child_feat.get('three_prime_UTR')))
            CDS[xq] = np.array(NonetoemptyList(child_feat.get('CDS')))
            TISc[xq] = np.array(TIS)
            CSTOP[xq] = np.array(cdsStop)
            TSSc[xq] = np.array(TSS)
            CLV[xq] = np.array(cleave)

        # add sub-features to the parent gene feature
        gene[g_cnt]['transcript_status'] = TSTAT
        gene[g_cnt]['transcripts'] = TRS
        gene[g_cnt]['exons'] = EXON
        gene[g_cnt]['utr5_exons'] = UTR5
        gene[g_cnt]['cds_exons'] = CDS
        gene[g_cnt]['utr3_exons'] = UTR3
        gene[g_cnt]['transcript_type'] = TR_TYP
        gene[g_cnt]['tis'] = TISc
        gene[g_cnt]['cdsStop'] = CSTOP
        gene[g_cnt]['tss'] = TSSc
        gene[g_cnt]['cleave'] = CLV

        gene[g_cnt]['gene_info'] = dict(ID=pkey[-1],
                                        Name=pdet.get('name'),
                                        Source=pkey[1])
        # few empty fields // TODO fill this:
        gene[g_cnt]['anno_id'] = []
        gene[g_cnt]['confgenes_id'] = []
        gene[g_cnt]['alias'] = ''
        gene[g_cnt]['name2'] = []
        gene[g_cnt]['chr_num'] = []
        gene[g_cnt]['paralogs'] = []
        gene[g_cnt]['transcript_info'] = []
        gene[g_cnt]['transcript_valid'] = []
        gene[g_cnt]['exons_confirmed'] = []
        gene[g_cnt]['tis_conf'] = []
        gene[g_cnt]['tis_info'] = []
        gene[g_cnt]['cdsStop_conf'] = []
        gene[g_cnt]['cdsStop_info'] = []
        gene[g_cnt]['tss_info'] = []
        gene[g_cnt]['tss_conf'] = []
        gene[g_cnt]['cleave_info'] = []
        gene[g_cnt]['cleave_conf'] = []
        gene[g_cnt]['polya_info'] = []
        gene[g_cnt]['polya_conf'] = []
        gene[g_cnt]['is_valid'] = []
        gene[g_cnt]['transcript_complete'] = []
        gene[g_cnt]['is_complete'] = []
        gene[g_cnt]['is_correctly_gff3_referenced'] = ''
        gene[g_cnt]['splicegraph'] = []
        g_cnt += 1

    ## deleting empty gene records from the main array
    # a record whose first field is still 0 was never filled (ids start at 1
    # — presumably the first dtype field is 'id'; confirm against init_gene)
    XPFLG = 0
    for XP, ens in enumerate(gene):
        if ens[0] == 0:
            XPFLG = 1
            break

    if XPFLG == 1:
        # the original used range(XP, len(gene)+1), which indexes one past
        # the end of the array and raises IndexError on modern NumPy
        XQC = range(XP, len(gene))
        gene = np.delete(gene, XQC)

    return gene
|
|
417
|
|
def NonetoemptyList(XS):
    """
    Return XS unchanged, substituting an empty list when XS is None.

    @args XS: value that may be None
    @type XS: list or None
    """
    if XS is None:
        return []
    return XS
|
|
426
|
|
def create_missing_feature_type(p_feat, c_feat):
    """
    GFF/GTF file defines only child features. This function tries to create
    the parent feature from the information provided in the attribute column.

    example:
    chr21 hg19_knownGene exon 9690071 9690100 0.000000 + . gene_id "uc002zkg.1"; transcript_id "uc002zkg.1";
    chr21 hg19_knownGene exon 9692178 9692207 0.000000 + . gene_id "uc021wgt.1"; transcript_id "uc021wgt.1";
    chr21 hg19_knownGene exon 9711935 9712038 0.000000 + . gene_id "uc011abu.2"; transcript_id "uc011abu.2";

    This function gets the parsed feature annotations.

    @args p_feat: Parent feature map
    @type p_feat: collections defaultdict
    @args c_feat: Child feature map
    @type c_feat: collections defaultdict
    """

    child_n_map = defaultdict(list)
    for fid, det in c_feat.items():
        # summarise the grand-child features: last seen gene id, strand and
        # score, plus the full span and the set of feature types present
        gid = None
        strand_val = None
        score_val = None
        starts = []
        ends = []
        seen_types = dict()
        for gchild in det:
            gid = gchild.get('gene_id', [''])[0]
            starts.append(gchild.get('location', [])[0])
            ends.append(gchild.get('location', [])[1])
            strand_val = gchild.get('strand', '')
            score_val = gchild.get('score', '')
            if gchild.get('type', '') == "gene": ## gencode GTF file has this problem
                continue
            seen_types[gchild.get('type', '')] = 1
        starts.sort()
        ends.sort()

        # infer transcript type: coding if any CDS feature was seen
        if seen_types.get('CDS', '') or seen_types.get('cds', ''):
            transcript_type = 'mRNA'
        else:
            transcript_type = 'transcript'

        # gene id and transcript id are same
        transcript_id = fid[-1]
        if gid == transcript_id:
            transcript_id = 'Transcript:' + str(gid)

        # level -1 feature type
        p_feat[(fid[0], fid[1], gid)] = dict(type='gene',
                                             location=[], ## infer location based on multiple transcripts
                                             strand=strand_val,
                                             name=gid)
        # level -2 feature type
        child_n_map[(fid[0], fid[1], gid)].append(
            dict(type=transcript_type,
                 location=[starts[0], ends[-1]],
                 strand=strand_val,
                 score=score_val,
                 ID=transcript_id,
                 gene_id=''))
        # reorganizing the grand child
        for gchild in det:
            child_n_map[(fid[0], fid[1], transcript_id)].append(
                dict(type=gchild.get('type', ''),
                     location=gchild.get('location'),
                     strand=gchild.get('strand'),
                     ID=gchild.get('ID'),
                     score=gchild.get('score'),
                     gene_id=''))
    return p_feat, child_n_map
|
|
495
|