comparison GFFParser.py @ 10:c42c69aa81f8

fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
author vipints <vipin@cbio.mskcc.org>
date Thu, 23 Apr 2015 18:01:45 -0400
parents
children
comparison
equal deleted inserted replaced
9:7d67331368f3 10:c42c69aa81f8
1 #!/usr/bin/env python
2 """
3 Extract genome annotation from a GFF (a tab delimited format for storing sequence features and annotations) file.
4
5 Requirements:
6 Numpy :- http://numpy.org/
7
8 Copyright (C)
9
10 2009-2012 Friedrich Miescher Laboratory of the Max Planck Society, Tubingen, Germany.
11 2012-2015 Memorial Sloan Kettering Cancer Center, New York City, USA.
12 """
13
14 import re
15 import os
16 import sys
17 import urllib
18 import numpy as np
19 import helper as utils
20 from collections import defaultdict
21
22 def attribute_tags(col9):
23 """
24 Split the key-value tags from the attribute column, it takes column number 9 from GTF/GFF file
25
26 @args col9: attribute column from GFF file
27 @type col9: str
28 """
29 info = defaultdict(list)
30 is_gff = False
31
32 if not col9:
33 return is_gff, info
34
35 # trim the line ending semi-colon ucsc may have some white-space
36 col9 = col9.rstrip(';| ')
37 # attributes from 9th column
38 atbs = col9.split(" ; ")
39 if len(atbs) == 1:
40 atbs = col9.split("; ")
41 if len(atbs) == 1:
42 atbs = col9.split(";")
43 # check the GFF3 pattern which has key value pairs like:
44 gff3_pat = re.compile("\w+=")
45 # sometime GTF have: gene_id uc002zkg.1;
46 gtf_pat = re.compile("\s?\w+\s")
47
48 key_vals = []
49
50 if gff3_pat.match(atbs[0]): # gff3 pattern
51 is_gff = True
52 key_vals = [at.split('=') for at in atbs]
53 elif gtf_pat.match(atbs[0]): # gtf pattern
54 for at in atbs:
55 key_vals.append(at.strip().split(" ",1))
56 else:
57 # to handle attribute column has only single value
58 key_vals.append(['ID', atbs[0]])
59 # get key, val items
60 for item in key_vals:
61 key, val = item
62 # replace the double qoutes from feature identifier
63 val = re.sub('"', '', val)
64 # replace the web formating place holders to plain text format
65 info[key].extend([urllib.unquote(v) for v in val.split(',') if v])
66
67 return is_gff, info
68
69 def spec_features_keywd(gff_parts):
70 """
71 Specify the feature key word according to the GFF specifications
72
73 @args gff_parts: attribute field key
74 @type gff_parts: str
75 """
76 for t_id in ["transcript_id", "transcriptId", "proteinId"]:
77 try:
78 gff_parts["info"]["Parent"] = gff_parts["info"][t_id]
79 break
80 except KeyError:
81 pass
82 for g_id in ["gene_id", "geneid", "geneId", "name", "gene_name", "genename"]:
83 try:
84 gff_parts["info"]["GParent"] = gff_parts["info"][g_id]
85 break
86 except KeyError:
87 pass
88 ## TODO key words
89 for flat_name in ["Transcript", "CDS"]:
90 if gff_parts["info"].has_key(flat_name):
91 # parents
92 if gff_parts['type'] in [flat_name] or re.search(r'transcript', gff_parts['type'], re.IGNORECASE):
93 if not gff_parts['id']:
94 gff_parts['id'] = gff_parts['info'][flat_name][0]
95 #gff_parts["info"]["ID"] = [gff_parts["id"]]
96 # children
97 elif gff_parts["type"] in ["intron", "exon", "three_prime_UTR",
98 "coding_exon", "five_prime_UTR", "CDS", "stop_codon",
99 "start_codon"]:
100 gff_parts["info"]["Parent"] = gff_parts["info"][flat_name]
101 break
102 return gff_parts
103
104 def Parse(ga_file):
105 """
106 Parsing GFF/GTF file based on feature relationship, it takes the input file.
107
108 @args ga_file: input file name
109 @type ga_file: str
110 """
111 child_map = defaultdict(list)
112 parent_map = dict()
113
114 ga_handle = utils.open_file(ga_file)
115
116 for rec in ga_handle:
117 rec = rec.strip('\n\r')
118
119 # skip empty line fasta identifier and commented line
120 if not rec or rec[0] in ['#', '>']:
121 continue
122 # skip the genome sequence
123 if not re.search('\t', rec):
124 continue
125
126 parts = rec.split('\t')
127 assert len(parts) >= 8, rec
128
129 # process the attribute column (9th column)
130 ftype, tags = attribute_tags(parts[-1])
131 if not tags: # skip the line if no attribute column.
132 continue
133
134 # extract fields
135 if parts[1]:
136 tags["source"] = parts[1]
137 if parts[7]:
138 tags["phase"] = parts[7]
139
140 gff_info = dict()
141 gff_info['info'] = dict(tags)
142 gff_info["is_gff3"] = ftype
143 gff_info['chr'] = parts[0]
144 gff_info['score'] = parts[5]
145
146 if parts[3] and parts[4]:
147 gff_info['location'] = [int(parts[3]) ,
148 int(parts[4])]
149 gff_info['type'] = parts[2]
150 gff_info['id'] = tags.get('ID', [''])[0]
151 if parts[6] in ['?', '.']:
152 parts[6] = None
153 gff_info['strand'] = parts[6]
154
155 # key word according to the GFF spec.
156 # is_gff3 flag is false check this condition and get the attribute fields
157 if not ftype:
158 gff_info = spec_features_keywd(gff_info)
159
160 # link the feature relationships
161 if gff_info['info'].has_key('Parent'):
162 for p in gff_info['info']['Parent']:
163 if p == gff_info['id']:
164 gff_info['id'] = ''
165 break
166 rec_category = 'child'
167 elif gff_info['id']:
168 rec_category = 'parent'
169 else:
170 rec_category = 'record'
171
172 # depends on the record category organize the features
173 if rec_category == 'child':
174 for p in gff_info['info']['Parent']:
175 # create the data structure based on source and feature id
176 child_map[(gff_info['chr'], gff_info['info']['source'], p)].append(
177 dict( type = gff_info['type'],
178 location = gff_info['location'],
179 strand = gff_info['strand'],
180 score = gff_info['score'],
181 ID = gff_info['id'],
182 gene_id = gff_info['info'].get('GParent', '')
183 ))
184 elif rec_category == 'parent':
185 parent_map[(gff_info['chr'], gff_info['info']['source'], gff_info['id'])] = dict(
186 type = gff_info['type'],
187 location = gff_info['location'],
188 strand = gff_info['strand'],
189 score = gff_info['score'],
190 name = tags.get('Name', [''])[0])
191 elif rec_category == 'record':
192 #TODO how to handle plain records?
193 c = 1
194 ga_handle.close()
195
196 # depends on file type create parent feature
197 if not ftype:
198 parent_map, child_map = create_missing_feature_type(parent_map, child_map)
199
200 # connecting parent child relations
201 # essentially the parent child features are here from any type of GTF/GFF2/GFF3 file
202 gene_mat = format_gene_models(parent_map, child_map)
203
204 return gene_mat
205
206 def format_gene_models(parent_nf_map, child_nf_map):
207 """
208 Genarate GeneObject based on the parsed file contents
209
210 @args parent_nf_map: parent features with source and chromosome information
211 @type parent_nf_map: collections defaultdict
212 @args child_nf_map: transctipt and exon information are encoded
213 @type child_nf_map: collections defaultdict
214 """
215
216 g_cnt = 0
217 gene = np.zeros((len(parent_nf_map),), dtype = utils.init_gene())
218
219 for pkey, pdet in parent_nf_map.items():
220 # considering only gene features
221 #if not re.search(r'gene', pdet.get('type', '')):
222 # continue
223
224 # infer the gene start and stop if not there in the
225 if not pdet.get('location', []):
226 GNS, GNE = [], []
227 # multiple number of transcripts
228 for L1 in child_nf_map[pkey]:
229 GNS.append(L1.get('location', [])[0])
230 GNE.append(L1.get('location', [])[1])
231 GNS.sort()
232 GNE.sort()
233 pdet['location'] = [GNS[0], GNE[-1]]
234
235 orient = pdet.get('strand', '')
236 gene[g_cnt]['id'] = g_cnt +1
237 gene[g_cnt]['chr'] = pkey[0]
238 gene[g_cnt]['source'] = pkey[1]
239 gene[g_cnt]['name'] = pkey[-1]
240 gene[g_cnt]['start'] = pdet.get('location', [])[0]
241 gene[g_cnt]['stop'] = pdet.get('location', [])[1]
242 gene[g_cnt]['strand'] = orient
243 gene[g_cnt]['score'] = pdet.get('score','')
244
245 # default value
246 gene[g_cnt]['is_alt_spliced'] = gene[g_cnt]['is_alt'] = 0
247 if len(child_nf_map[pkey]) > 1:
248 gene[g_cnt]['is_alt_spliced'] = gene[g_cnt]['is_alt'] = 1
249
250 # complete sub-feature for all transcripts
251 dim = len(child_nf_map[pkey])
252 TRS = np.zeros((dim,), dtype=np.object)
253 TR_TYP = np.zeros((dim,), dtype=np.object)
254 EXON = np.zeros((dim,), dtype=np.object)
255 UTR5 = np.zeros((dim,), dtype=np.object)
256 UTR3 = np.zeros((dim,), dtype=np.object)
257 CDS = np.zeros((dim,), dtype=np.object)
258 TISc = np.zeros((dim,), dtype=np.object)
259 TSSc = np.zeros((dim,), dtype=np.object)
260 CLV = np.zeros((dim,), dtype=np.object)
261 CSTOP = np.zeros((dim,), dtype=np.object)
262 TSTAT = np.zeros((dim,), dtype=np.object)
263 TSCORE = np.zeros((dim,), dtype=np.object)
264
265 # fetching corresponding transcripts
266 for xq, Lv1 in enumerate(child_nf_map[pkey]):
267
268 TID = Lv1.get('ID', '')
269 TRS[xq]= np.array([TID])
270
271 TYPE = Lv1.get('type', '')
272 TR_TYP[xq] = np.array('')
273 TR_TYP[xq] = np.array(TYPE) if TYPE else TR_TYP[xq]
274
275 orient = Lv1.get('strand', '')
276 tr_score = Lv1.get('score', '')
277
278 # fetching different sub-features
279 child_feat = defaultdict(list)
280 for Lv2 in child_nf_map[(pkey[0], pkey[1], TID)]:
281 E_TYP = Lv2.get('type', '')
282 child_feat[E_TYP].append(Lv2.get('location'))
283
284 # make general ascending order of coordinates
285 if orient == '-':
286 for etype, excod in child_feat.items():
287 if len(excod) > 1:
288 if excod[0][0] > excod[-1][0]:
289 excod.reverse()
290 child_feat[etype] = excod
291
292 # make exon coordinate from cds and utr regions
293 if not child_feat.get('exon'):
294 if child_feat.get('CDS'):
295 exon_cod = utils.make_Exon_cod( orient,
296 NonetoemptyList(child_feat.get('five_prime_UTR')),
297 NonetoemptyList(child_feat.get('CDS')),
298 NonetoemptyList(child_feat.get('three_prime_UTR')))
299 child_feat['exon'] = exon_cod
300 else:
301 # TODO only UTR's
302 # searching through keys to find a pattern describing exon feature
303 ex_key_pattern = [k for k in child_feat if k.endswith("exon")]
304 if ex_key_pattern:
305 child_feat['exon'] = child_feat[ex_key_pattern[0]]
306
307 # stop_codon are seperated from CDS, add the coordinates based on strand
308 if child_feat.get('stop_codon'):
309 if orient == '+':
310 if child_feat.get('stop_codon')[0][0] - child_feat.get('CDS')[-1][1] == 1:
311 child_feat['CDS'][-1] = [child_feat.get('CDS')[-1][0], child_feat.get('stop_codon')[0][1]]
312 else:
313 child_feat['CDS'].append(child_feat.get('stop_codon')[0])
314 elif orient == '-':
315 if child_feat.get('CDS')[0][0] - child_feat.get('stop_codon')[0][1] == 1:
316 child_feat['CDS'][0] = [child_feat.get('stop_codon')[0][0], child_feat.get('CDS')[0][1]]
317 else:
318 child_feat['CDS'].insert(0, child_feat.get('stop_codon')[0])
319
320 # transcript signal sites
321 TIS, cdsStop, TSS, cleave = [], [], [], []
322 cds_status, exon_status, utr_status = 0, 0, 0
323
324 if child_feat.get('exon'):
325 TSS = [child_feat.get('exon')[-1][1]]
326 TSS = [child_feat.get('exon')[0][0]] if orient == '+' else TSS
327 cleave = [child_feat.get('exon')[0][0]]
328 cleave = [child_feat.get('exon')[-1][1]] if orient == '+' else cleave
329 exon_status = 1
330
331 if child_feat.get('CDS'):
332 if orient == '+':
333 TIS = [child_feat.get('CDS')[0][0]]
334 cdsStop = [child_feat.get('CDS')[-1][1]-3]
335 else:
336 TIS = [child_feat.get('CDS')[-1][1]]
337 cdsStop = [child_feat.get('CDS')[0][0]+3]
338 cds_status = 1
339 # cds phase calculation
340 child_feat['CDS'] = utils.add_CDS_phase(orient, child_feat.get('CDS'))
341
342 # sub-feature status
343 if child_feat.get('three_prime_UTR') or child_feat.get('five_prime_UTR'):
344 utr_status =1
345
346 if utr_status == cds_status == exon_status == 1:
347 t_status = 1
348 else:
349 t_status = 0
350
351 # add sub-feature # make array for export to different out
352 TSTAT[xq] = t_status
353 EXON[xq] = np.array(child_feat.get('exon'), np.float64)
354 UTR5[xq] = np.array(NonetoemptyList(child_feat.get('five_prime_UTR')))
355 UTR3[xq] = np.array(NonetoemptyList(child_feat.get('three_prime_UTR')))
356 CDS[xq] = np.array(NonetoemptyList(child_feat.get('CDS')))
357 TISc[xq] = np.array(TIS)
358 CSTOP[xq] = np.array(cdsStop)
359 TSSc[xq] = np.array(TSS)
360 CLV[xq] = np.array(cleave)
361 TSCORE[xq] = tr_score
362
363 # add sub-features to the parent gene feature
364 gene[g_cnt]['transcript_status'] = TSTAT
365 gene[g_cnt]['transcripts'] = TRS
366 gene[g_cnt]['exons'] = EXON
367 gene[g_cnt]['utr5_exons'] = UTR5
368 gene[g_cnt]['cds_exons'] = CDS
369 gene[g_cnt]['utr3_exons'] = UTR3
370 gene[g_cnt]['transcript_type'] = TR_TYP
371 gene[g_cnt]['tis'] = TISc
372 gene[g_cnt]['cdsStop'] = CSTOP
373 gene[g_cnt]['tss'] = TSSc
374 gene[g_cnt]['cleave'] = CLV
375 gene[g_cnt]['transcript_score'] = TSCORE
376
377 gene[g_cnt]['gene_info'] = dict( ID = pkey[-1],
378 Name = pdet.get('name'),
379 Source = pkey[1])
380 # few empty fields // TODO fill this:
381 gene[g_cnt]['anno_id'] = []
382 gene[g_cnt]['confgenes_id'] = []
383 gene[g_cnt]['alias'] = ''
384 gene[g_cnt]['name2'] = []
385 gene[g_cnt]['chr_num'] = []
386 gene[g_cnt]['paralogs'] = []
387 gene[g_cnt]['transcript_valid'] = []
388 gene[g_cnt]['exons_confirmed'] = []
389 gene[g_cnt]['tis_conf'] = []
390 gene[g_cnt]['tis_info'] = []
391 gene[g_cnt]['cdsStop_conf'] = []
392 gene[g_cnt]['cdsStop_info'] = []
393 gene[g_cnt]['tss_info'] = []
394 gene[g_cnt]['tss_conf'] = []
395 gene[g_cnt]['cleave_info'] = []
396 gene[g_cnt]['cleave_conf'] = []
397 gene[g_cnt]['polya_info'] = []
398 gene[g_cnt]['polya_conf'] = []
399 gene[g_cnt]['is_valid'] = []
400 gene[g_cnt]['transcript_complete'] = []
401 gene[g_cnt]['is_complete'] = []
402 gene[g_cnt]['is_correctly_gff3_referenced'] = ''
403 gene[g_cnt]['splicegraph'] = []
404 g_cnt += 1
405
406 ## deleting empty gene records from the main array
407 XPFLG=0
408 for XP, ens in enumerate(gene):
409 if ens[0]==0:
410 XPFLG=1
411 break
412
413 if XPFLG==1:
414 XQC = range(XP, len(gene)+1)
415 gene = np.delete(gene, XQC)
416
417 return gene
418
419 def NonetoemptyList(XS):
420 """
421 Convert a None type to empty list
422
423 @args XS: None type
424 @type XS: str
425 """
426 return [] if XS is None else XS
427
428 def create_missing_feature_type(p_feat, c_feat):
429 """
430 GFF/GTF file defines only child features. This function tries to create
431 the parent feature from the information provided in the attribute column.
432
433 example:
434 chr21 hg19_knownGene exon 9690071 9690100 0.000000 + . gene_id "uc002zkg.1"; transcript_id "uc002zkg.1";
435 chr21 hg19_knownGene exon 9692178 9692207 0.000000 + . gene_id "uc021wgt.1"; transcript_id "uc021wgt.1";
436 chr21 hg19_knownGene exon 9711935 9712038 0.000000 + . gene_id "uc011abu.2"; transcript_id "uc011abu.2";
437
438 This function gets the parsed feature annotations.
439
440 @args p_feat: Parent feature map
441 @type p_feat: collections defaultdict
442 @args c_feat: Child feature map
443 @type c_feat: collections defaultdict
444 """
445
446 child_n_map = defaultdict(list)
447 for fid, det in c_feat.items():
448 # get the details from grand child
449 GID = STRD = SCR = None
450 SPOS, EPOS = [], []
451 TYP = dict()
452 for gchild in det:
453 GID = gchild.get('gene_id', [''])[0]
454 SPOS.append(gchild.get('location', [])[0])
455 EPOS.append(gchild.get('location', [])[1])
456 STRD = gchild.get('strand', '')
457 SCR = gchild.get('score', '')
458 if gchild.get('type', '') == "gene": ## gencode GTF file has this problem
459 continue
460 TYP[gchild.get('type', '')] = 1
461 SPOS.sort()
462 EPOS.sort()
463
464 # infer transcript type
465 transcript_type = 'transcript'
466 transcript_type = 'mRNA' if TYP.get('CDS', '') or TYP.get('cds', '') else transcript_type
467
468 # gene id and transcript id are same
469 transcript_id = fid[-1]
470 if GID == transcript_id:
471 transcript_id = 'Transcript:' + str(GID)
472
473 # level -1 feature type
474 p_feat[(fid[0], fid[1], GID)] = dict( type = 'gene',
475 location = [], ## infer location based on multiple transcripts
476 strand = STRD,
477 name = GID )
478 # level -2 feature type
479 child_n_map[(fid[0], fid[1], GID)].append(
480 dict( type = transcript_type,
481 location = [SPOS[0], EPOS[-1]],
482 strand = STRD,
483 score = SCR,
484 ID = transcript_id,
485 gene_id = '' ))
486 # reorganizing the grand child
487 for gchild in det:
488 child_n_map[(fid[0], fid[1], transcript_id)].append(
489 dict( type = gchild.get('type', ''),
490 location = gchild.get('location'),
491 strand = gchild.get('strand'),
492 ID = gchild.get('ID'),
493 score = gchild.get('score'),
494 gene_id = '' ))
495 return p_feat, child_n_map
496