comparison GFFParser.py @ 5:6e589f267c14

Uploaded
author devteam
date Tue, 04 Nov 2014 12:15:19 -0500
#!/usr/bin/env python
"""
Extract genome annotation from a GFF (a tab-delimited format for storing sequence features and annotations) file.

Requirements:
    Numpy :- http://numpy.org/
    Scipy :- http://scipy.org/

Copyright (C)

2009-2012 Friedrich Miescher Laboratory of the Max Planck Society, Tubingen, Germany.
2012-2014 Memorial Sloan Kettering Cancer Center, New York City, USA.
"""

import re
import os
import sys
import urllib
import numpy as np
import scipy.io as sio
from collections import defaultdict
import helper as utils

def attribute_tags(col9):
    """
    Split the key-value tags from the attribute column; it takes column 9 of a GTF/GFF file.

    @args col9: attribute column from GFF file
    @type col9: str
    """
    info = defaultdict(list)
    is_gff = False

    if not col9:
        return is_gff, info

    # trim the line-ending semi-colon; UCSC files may carry some trailing white-space
    col9 = col9.rstrip(';| ')
    # attributes from 9th column
    atbs = col9.split(" ; ")
    if len(atbs) == 1:
        atbs = col9.split("; ")
    if len(atbs) == 1:
        atbs = col9.split(";")
    # check for the GFF3 pattern, which has key=value pairs like: ID=gene1
    gff3_pat = re.compile(r"\w+=")
    # GTF files instead have space-separated pairs like: gene_id uc002zkg.1;
    gtf_pat = re.compile(r"\s?\w+\s")

    key_vals = []

    if gff3_pat.match(atbs[0]): # gff3 pattern
        is_gff = True
        key_vals = [at.split('=') for at in atbs]
    elif gtf_pat.match(atbs[0]): # gtf pattern
        for at in atbs:
            key_vals.append(at.strip().split(" ", 1))
    else:
        # handle an attribute column that carries only a single value
        key_vals.append(['ID', atbs[0]])
    # get key, val items
    for item in key_vals:
        key, val = item
        # strip the double quotes from the feature identifier
        val = re.sub('"', '', val)
        # decode the URL-encoded placeholders to plain text
        info[key].extend([urllib.unquote(v) for v in val.split(',') if v])

    return is_gff, info

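# Illustrative sketch (not part of the original module): the same helper handles
# both dialects, returning the detected-GFF3 flag and a key -> value-list map.
# The attribute strings below are hypothetical examples.
#
#   attribute_tags('ID=tran1;Parent=gene1')
#   # -> (True,  {'ID': ['tran1'], 'Parent': ['gene1']})
#   attribute_tags('gene_id "uc002zkg.1"; transcript_id "uc002zkg.1";')
#   # -> (False, {'gene_id': ['uc002zkg.1'], 'transcript_id': ['uc002zkg.1']})
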
def spec_features_keywd(gff_parts):
    """
    Specify the feature key words according to the GFF specifications

    @args gff_parts: parsed GFF/GTF record with 'info', 'type' and 'id' fields
    @type gff_parts: dict
    """
    for t_id in ["transcript_id", "transcriptId", "proteinId"]:
        try:
            gff_parts["info"]["Parent"] = gff_parts["info"][t_id]
            break
        except KeyError:
            pass
    for g_id in ["gene_id", "geneid", "geneId", "name", "gene_name", "genename"]:
        try:
            gff_parts["info"]["GParent"] = gff_parts["info"][g_id]
            break
        except KeyError:
            pass
    ## TODO key words
    for flat_name in ["Transcript", "CDS"]:
        if flat_name in gff_parts["info"]:
            # parents
            if gff_parts['type'] in [flat_name] or re.search(r'transcript', gff_parts['type'], re.IGNORECASE):
                if not gff_parts['id']:
                    gff_parts['id'] = gff_parts['info'][flat_name][0]
                    #gff_parts["info"]["ID"] = [gff_parts["id"]]
            # children
            elif gff_parts["type"] in ["intron", "exon", "three_prime_UTR",
                                       "coding_exon", "five_prime_UTR", "CDS", "stop_codon",
                                       "start_codon"]:
                gff_parts["info"]["Parent"] = gff_parts["info"][flat_name]
            break
    return gff_parts

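# Illustrative sketch (hypothetical record): for a GTF-style exon whose attributes
# were parsed as transcript_id/gene_id, the key words are remapped so the record
# can later be linked to its parent transcript and gene.
#
#   rec = {'type': 'exon', 'id': '',
#          'info': {'transcript_id': ['NM_001'], 'gene_id': ['G1']}}
#   rec = spec_features_keywd(rec)
#   # rec['info']['Parent']  -> ['NM_001']
#   # rec['info']['GParent'] -> ['G1']
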
def Parse(ga_file):
    """
    Parse a GFF/GTF file based on the feature relationships; it takes the input file name.

    @args ga_file: input file name
    @type ga_file: str
    """
    child_map = defaultdict(list)
    parent_map = dict()

    ga_handle = utils.open_file(ga_file)

    for rec in ga_handle:
        rec = rec.strip('\n\r')

        # skip empty lines, FASTA identifiers and comment lines
        if not rec or rec[0] in ['#', '>']:
            continue
        # skip the genome sequence
        if not re.search('\t', rec):
            continue

        parts = rec.split('\t')
        assert len(parts) >= 8, rec

        # process the attribute column (9th column)
        ftype, tags = attribute_tags(parts[-1])
        if not tags: # skip the line if there is no attribute column
            continue

        # extract fields
        if parts[1]:
            tags["source"] = parts[1]
        if parts[7]:
            tags["phase"] = parts[7]

        gff_info = dict()
        gff_info['info'] = dict(tags)
        gff_info["is_gff3"] = ftype
        gff_info['chr'] = parts[0]
        gff_info['score'] = parts[5]

        if parts[3] and parts[4]:
            gff_info['location'] = [int(parts[3]),
                                    int(parts[4])]
            gff_info['type'] = parts[2]
            gff_info['id'] = tags.get('ID', [''])[0]
            if parts[6] in ['?', '.']:
                parts[6] = None
            gff_info['strand'] = parts[6]

            # key words according to the GFF spec.
            # if the is_gff3 flag is false, remap the attribute fields to GFF3-style key words
            if not ftype:
                gff_info = spec_features_keywd(gff_info)

            # link the feature relationships
            if 'Parent' in gff_info['info']:
                for p in gff_info['info']['Parent']:
                    if p == gff_info['id']:
                        gff_info['id'] = ''
                        break
                rec_category = 'child'
            elif gff_info['id']:
                rec_category = 'parent'
            else:
                rec_category = 'record'

            # depending on the record category, organize the features
            if rec_category == 'child':
                for p in gff_info['info']['Parent']:
                    # create the data structure based on source and feature id
                    child_map[(gff_info['chr'], gff_info['info']['source'], p)].append(
                                        dict( type = gff_info['type'],
                                              location = gff_info['location'],
                                              strand = gff_info['strand'],
                                              score = gff_info['score'],
                                              ID = gff_info['id'],
                                              gene_id = gff_info['info'].get('GParent', '')
                                              ))
            elif rec_category == 'parent':
                parent_map[(gff_info['chr'], gff_info['info']['source'], gff_info['id'])] = dict(
                                        type = gff_info['type'],
                                        location = gff_info['location'],
                                        strand = gff_info['strand'],
                                        score = gff_info['score'],
                                        name = tags.get('Name', [''])[0])
            elif rec_category == 'record':
                #TODO how to handle plain records?
                pass
    ga_handle.close()

    # depending on the file type (flag taken from the last parsed record),
    # create the missing parent features
    if not ftype:
        parent_map, child_map = create_missing_feature_type(parent_map, child_map)

    # connect the parent-child relations
    # at this point the parent and child features are available from any type of GTF/GFF2/GFF3 file
    gene_mat = format_gene_models(parent_map, child_map)

    return gene_mat

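# Minimal usage sketch (assumes a GFF3/GTF annotation file 'annotation.gff3' on
# disk and the companion helper module providing open_file()/init_gene()):
#
#   gene_models = Parse('annotation.gff3')
#   for g in gene_models:
#       print g['name'], g['chr'], g['start'], g['stop'], g['strand']
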
def format_gene_models(parent_nf_map, child_nf_map):
    """
    Generate GeneObject records based on the parsed file contents

    @args parent_nf_map: parent features with source and chromosome information
    @type parent_nf_map: collections defaultdict
    @args child_nf_map: transcript and exon information are encoded
    @type child_nf_map: collections defaultdict
    """

    g_cnt = 0
    gene = np.zeros((len(parent_nf_map),), dtype = utils.init_gene())

    for pkey, pdet in parent_nf_map.items():
        # considering only gene features
        #if not re.search(r'gene', pdet.get('type', '')):
        #    continue

        # infer the gene start and stop if they are not present in the parent feature
        if not pdet.get('location', []):
            GNS, GNE = [], []
            # multiple number of transcripts
            for L1 in child_nf_map[pkey]:
                GNS.append(L1.get('location', [])[0])
                GNE.append(L1.get('location', [])[1])
            GNS.sort()
            GNE.sort()
            pdet['location'] = [GNS[0], GNE[-1]]

        orient = pdet.get('strand', '')
        gene[g_cnt]['id'] = g_cnt + 1
        gene[g_cnt]['chr'] = pkey[0]
        gene[g_cnt]['source'] = pkey[1]
        gene[g_cnt]['name'] = pkey[-1]
        gene[g_cnt]['start'] = pdet.get('location', [])[0]
        gene[g_cnt]['stop'] = pdet.get('location', [])[1]
        gene[g_cnt]['strand'] = orient
        gene[g_cnt]['score'] = pdet.get('score', '')

        # default value
        gene[g_cnt]['is_alt_spliced'] = gene[g_cnt]['is_alt'] = 0
        if len(child_nf_map[pkey]) > 1:
            gene[g_cnt]['is_alt_spliced'] = gene[g_cnt]['is_alt'] = 1

        # complete sub-features for all transcripts
        dim = len(child_nf_map[pkey])
        TRS = np.zeros((dim,), dtype=np.object)
        TR_TYP = np.zeros((dim,), dtype=np.object)
        EXON = np.zeros((dim,), dtype=np.object)
        UTR5 = np.zeros((dim,), dtype=np.object)
        UTR3 = np.zeros((dim,), dtype=np.object)
        CDS = np.zeros((dim,), dtype=np.object)
        TISc = np.zeros((dim,), dtype=np.object)
        TSSc = np.zeros((dim,), dtype=np.object)
        CLV = np.zeros((dim,), dtype=np.object)
        CSTOP = np.zeros((dim,), dtype=np.object)
        TSTAT = np.zeros((dim,), dtype=np.object)

        # fetching corresponding transcripts
        for xq, Lv1 in enumerate(child_nf_map[pkey]):

            TID = Lv1.get('ID', '')
            TRS[xq] = np.array([TID])

            TYPE = Lv1.get('type', '')
            TR_TYP[xq] = np.array('')
            TR_TYP[xq] = np.array(TYPE) if TYPE else TR_TYP[xq]

            orient = Lv1.get('strand', '')

            # fetching different sub-features
            child_feat = defaultdict(list)
            for Lv2 in child_nf_map[(pkey[0], pkey[1], TID)]:
                E_TYP = Lv2.get('type', '')
                child_feat[E_TYP].append(Lv2.get('location'))

            # make a general ascending order of coordinates
            if orient == '-':
                for etype, excod in child_feat.items():
                    if len(excod) > 1:
                        if excod[0][0] > excod[-1][0]:
                            excod.reverse()
                            child_feat[etype] = excod

            # make exon coordinates from cds and utr regions
            if not child_feat.get('exon'):
                if child_feat.get('CDS'):
                    exon_cod = utils.make_Exon_cod( orient,
                                NonetoemptyList(child_feat.get('five_prime_UTR')),
                                NonetoemptyList(child_feat.get('CDS')),
                                NonetoemptyList(child_feat.get('three_prime_UTR')))
                    child_feat['exon'] = exon_cod
                else:
                    # TODO only UTR's
                    # search the keys for a feature name ending in "exon" (e.g. coding_exon)
                    ex_key_pattern = [k for k in child_feat if k.endswith("exon")]
                    if ex_key_pattern:
                        child_feat['exon'] = child_feat[ex_key_pattern[0]]

            # stop_codon records are separated from the CDS; merge the coordinates based on strand
            if child_feat.get('stop_codon'):
                if orient == '+':
                    if child_feat.get('stop_codon')[0][0] - child_feat.get('CDS')[-1][1] == 1:
                        child_feat['CDS'][-1] = [child_feat.get('CDS')[-1][0], child_feat.get('stop_codon')[0][1]]
                    else:
                        child_feat['CDS'].append(child_feat.get('stop_codon')[0])
                elif orient == '-':
                    if child_feat.get('CDS')[0][0] - child_feat.get('stop_codon')[0][1] == 1:
                        child_feat['CDS'][0] = [child_feat.get('stop_codon')[0][0], child_feat.get('CDS')[0][1]]
                    else:
                        child_feat['CDS'].insert(0, child_feat.get('stop_codon')[0])

            # transcript signal sites
            TIS, cdsStop, TSS, cleave = [], [], [], []
            cds_status, exon_status, utr_status = 0, 0, 0

            if child_feat.get('exon'):
                TSS = [child_feat.get('exon')[-1][1]]
                TSS = [child_feat.get('exon')[0][0]] if orient == '+' else TSS
                cleave = [child_feat.get('exon')[0][0]]
                cleave = [child_feat.get('exon')[-1][1]] if orient == '+' else cleave
                exon_status = 1

            if child_feat.get('CDS'):
                if orient == '+':
                    TIS = [child_feat.get('CDS')[0][0]]
                    cdsStop = [child_feat.get('CDS')[-1][1]-3]
                else:
                    TIS = [child_feat.get('CDS')[-1][1]]
                    cdsStop = [child_feat.get('CDS')[0][0]+3]
                cds_status = 1
                # cds phase calculation
                child_feat['CDS'] = utils.add_CDS_phase(orient, child_feat.get('CDS'))

            # sub-feature status
            if child_feat.get('three_prime_UTR') or child_feat.get('five_prime_UTR'):
                utr_status = 1

            if utr_status == cds_status == exon_status == 1:
                t_status = 1
            else:
                t_status = 0

            # add sub-features; make arrays for export to different output formats
            TSTAT[xq] = t_status
            EXON[xq] = np.array(child_feat.get('exon'), np.float64)
            UTR5[xq] = np.array(NonetoemptyList(child_feat.get('five_prime_UTR')))
            UTR3[xq] = np.array(NonetoemptyList(child_feat.get('three_prime_UTR')))
            CDS[xq] = np.array(NonetoemptyList(child_feat.get('CDS')))
            TISc[xq] = np.array(TIS)
            CSTOP[xq] = np.array(cdsStop)
            TSSc[xq] = np.array(TSS)
            CLV[xq] = np.array(cleave)

        # add sub-features to the parent gene feature
        gene[g_cnt]['transcript_status'] = TSTAT
        gene[g_cnt]['transcripts'] = TRS
        gene[g_cnt]['exons'] = EXON
        gene[g_cnt]['utr5_exons'] = UTR5
        gene[g_cnt]['cds_exons'] = CDS
        gene[g_cnt]['utr3_exons'] = UTR3
        gene[g_cnt]['transcript_type'] = TR_TYP
        gene[g_cnt]['tis'] = TISc
        gene[g_cnt]['cdsStop'] = CSTOP
        gene[g_cnt]['tss'] = TSSc
        gene[g_cnt]['cleave'] = CLV

        gene[g_cnt]['gene_info'] = dict( ID = pkey[-1],
                                         Name = pdet.get('name'),
                                         Source = pkey[1])
        # a few empty fields // TODO fill these:
        gene[g_cnt]['anno_id'] = []
        gene[g_cnt]['confgenes_id'] = []
        gene[g_cnt]['alias'] = ''
        gene[g_cnt]['name2'] = []
        gene[g_cnt]['chr_num'] = []
        gene[g_cnt]['paralogs'] = []
        gene[g_cnt]['transcript_info'] = []
        gene[g_cnt]['transcript_valid'] = []
        gene[g_cnt]['exons_confirmed'] = []
        gene[g_cnt]['tis_conf'] = []
        gene[g_cnt]['tis_info'] = []
        gene[g_cnt]['cdsStop_conf'] = []
        gene[g_cnt]['cdsStop_info'] = []
        gene[g_cnt]['tss_info'] = []
        gene[g_cnt]['tss_conf'] = []
        gene[g_cnt]['cleave_info'] = []
        gene[g_cnt]['cleave_conf'] = []
        gene[g_cnt]['polya_info'] = []
        gene[g_cnt]['polya_conf'] = []
        gene[g_cnt]['is_valid'] = []
        gene[g_cnt]['transcript_complete'] = []
        gene[g_cnt]['is_complete'] = []
        gene[g_cnt]['is_correctly_gff3_referenced'] = ''
        gene[g_cnt]['splicegraph'] = []
        g_cnt += 1

    ## delete empty gene records from the main array
    XPFLG = 0
    for XP, ens in enumerate(gene):
        if ens[0] == 0:
            XPFLG = 1
            break

    if XPFLG == 1:
        XQC = range(XP, len(gene))
        gene = np.delete(gene, XQC)

    return gene

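# Illustrative access pattern for the returned record array (field names come
# from helper.init_gene(); a parsed gene with exon features is assumed):
#
#   genes = format_gene_models(parent_map, child_map)   # maps built by Parse()
#   first = genes[0]
#   first['transcripts'][0]   # identifier array of the first transcript
#   first['exons'][0]         # 2-column array of exon start/stop coordinates
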
def NonetoemptyList(XS):
    """
    Convert a None value to an empty list

    @args XS: a list of coordinates or None
    @type XS: list or None
    """
    return [] if XS is None else XS

def create_missing_feature_type(p_feat, c_feat):
    """
    Some GFF/GTF files define only child features. This function tries to create
    the parent feature from the information provided in the attribute column.

    example:
    chr21   hg19_knownGene  exon    9690071 9690100 0.000000    +   .   gene_id "uc002zkg.1"; transcript_id "uc002zkg.1";
    chr21   hg19_knownGene  exon    9692178 9692207 0.000000    +   .   gene_id "uc021wgt.1"; transcript_id "uc021wgt.1";
    chr21   hg19_knownGene  exon    9711935 9712038 0.000000    +   .   gene_id "uc011abu.2"; transcript_id "uc011abu.2";

    This function gets the parsed feature annotations.

    @args p_feat: Parent feature map
    @type p_feat: collections defaultdict
    @args c_feat: Child feature map
    @type c_feat: collections defaultdict
    """

    child_n_map = defaultdict(list)
    for fid, det in c_feat.items():
        # get the details from the grand child
        GID = STRD = SCR = None
        SPOS, EPOS = [], []
        TYP = dict()
        for gchild in det:
            GID = gchild.get('gene_id', [''])[0]
            SPOS.append(gchild.get('location', [])[0])
            EPOS.append(gchild.get('location', [])[1])
            STRD = gchild.get('strand', '')
            SCR = gchild.get('score', '')
            if gchild.get('type', '') == "gene": ## gencode GTF files have this problem
                continue
            TYP[gchild.get('type', '')] = 1
        SPOS.sort()
        EPOS.sort()

        # infer the transcript type
        transcript_type = 'transcript'
        transcript_type = 'mRNA' if TYP.get('CDS', '') or TYP.get('cds', '') else transcript_type

        # gene id and transcript id are the same
        transcript_id = fid[-1]
        if GID == transcript_id:
            transcript_id = 'Transcript:' + str(GID)

        # level -1 feature type
        p_feat[(fid[0], fid[1], GID)] = dict( type = 'gene',
                                              location = [], ## infer location based on multiple transcripts
                                              strand = STRD,
                                              name = GID )
        # level -2 feature type
        child_n_map[(fid[0], fid[1], GID)].append(
                                              dict( type = transcript_type,
                                                    location = [SPOS[0], EPOS[-1]],
                                                    strand = STRD,
                                                    score = SCR,
                                                    ID = transcript_id,
                                                    gene_id = '' ))
        # reorganizing the grand child
        for gchild in det:
            child_n_map[(fid[0], fid[1], transcript_id)].append(
                                              dict( type = gchild.get('type', ''),
                                                    location = gchild.get('location'),
                                                    strand = gchild.get('strand'),
                                                    ID = gchild.get('ID'),
                                                    score = gchild.get('score'),
                                                    gene_id = '' ))
    return p_feat, child_n_map
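
# Illustrative sketch for the knownGene-style GTF records in the docstring above:
# the exon-only child entry keyed by ('chr21', 'hg19_knownGene', 'uc002zkg.1')
# gains a synthetic 'gene' parent with the same identifier plus an intermediate
# transcript record named 'Transcript:uc002zkg.1' (since gene id and transcript
# id collide) spanning the sorted exon coordinates; the original exons are then
# re-attached under the new transcript key
# ('chr21', 'hg19_knownGene', 'Transcript:uc002zkg.1').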