comparison GFFtools-GX/GFFParser.py @ 3:ff2c2e6f4ab3

Uploaded version 2.0.0 of gfftools ready to import to local instance
author vipints
date Wed, 11 Jun 2014 16:29:25 -0400
parents
children
comparison
equal deleted inserted replaced
2:db3c67b03d55 3:ff2c2e6f4ab3
1 #!/usr/bin/env python
2 """
3 Extract genome annotation from a GFF (a tab delimited format for storing sequence features and annotations) file.
4
5 Requirements:
6 Numpy :- http://numpy.org/
7 Scipy :- http://scipy.org/
8
9 Copyright (C)
10
11 2009-2012 Friedrich Miescher Laboratory of the Max Planck Society, Tubingen, Germany.
12 2012-2014 Memorial Sloan Kettering Cancer Center, New York City, USA.
13 """
14
15 import re
16 import os
17 import sys
18 import urllib
19 import numpy as np
20 import scipy.io as sio
21 from collections import defaultdict
22 import helper as utils
23
def attribute_tags(col9):
    """
    Split the key-value tags from the attribute column, it takes column number 9 from GTF/GFF file

    @args col9: attribute column from GFF file
    @type col9: str

    Returns a tuple (is_gff, info) where is_gff is True when the column follows
    the GFF3 key=value convention and info is a defaultdict(list) of tag values.
    """
    info = defaultdict(list)
    is_gff = False

    if not col9:
        return is_gff, info

    # urllib.unquote moved to urllib.parse in Python 3; import whichever exists
    try:
        from urllib import unquote          # Python 2
    except ImportError:
        from urllib.parse import unquote    # Python 3

    # trim the line ending semi-colon; ucsc may have some white-space
    col9 = col9.rstrip(';| ')
    # attributes from 9th column: try the separator variants from most to least specific
    atbs = col9.split(" ; ")
    if len(atbs) == 1:
        atbs = col9.split("; ")
    if len(atbs) == 1:
        atbs = col9.split(";")
    # check the GFF3 pattern which has key=value pairs like: ID=gene0
    gff3_pat = re.compile(r"\w+=")
    # sometime GTF have: gene_id uc002zkg.1;
    gtf_pat = re.compile(r"\s?\w+\s")

    key_vals = []

    if gff3_pat.match(atbs[0]):  # gff3 pattern
        is_gff = True
        key_vals = [at.split('=') for at in atbs]
    elif gtf_pat.match(atbs[0]):  # gtf pattern
        for at in atbs:
            key_vals.append(at.strip().split(" ", 1))
    else:
        # to handle attribute column has only single value
        key_vals.append(['ID', atbs[0]])
    # get key, val items
    for item in key_vals:
        key, val = item
        # remove the double quotes from feature identifier
        val = re.sub('"', '', val)
        # replace the web formating place holders to plain text format
        info[key].extend([unquote(v) for v in val.split(',') if v])

    return is_gff, info
70
def spec_features_keywd(gff_parts):
    """
    Specify the feature key word according to the GFF specifications

    @args gff_parts: parsed record with 'info', 'type' and 'id' fields
    @type gff_parts: dict
    """
    # transcript-level identifier becomes the Parent key
    for t_id in ["transcript_id", "transcriptId", "proteinId"]:
        try:
            gff_parts["info"]["Parent"] = gff_parts["info"][t_id]
            break
        except KeyError:
            pass
    # gene-level identifier becomes the GParent key
    for g_id in ["gene_id", "geneid", "geneId", "name", "gene_name", "genename"]:
        try:
            gff_parts["info"]["GParent"] = gff_parts["info"][g_id]
            break
        except KeyError:
            pass
    ## TODO key words
    for flat_name in ["Transcript", "CDS"]:
        # dict.has_key() was removed in Python 3; `in` works on both versions
        if flat_name in gff_parts["info"]:
            # parents
            if gff_parts['type'] in [flat_name] or re.search(r'transcript', gff_parts['type'], re.IGNORECASE):
                if not gff_parts['id']:
                    gff_parts['id'] = gff_parts['info'][flat_name][0]
                    #gff_parts["info"]["ID"] = [gff_parts["id"]]
            # children
            elif gff_parts["type"] in ["intron", "exon", "three_prime_UTR",
                                       "coding_exon", "five_prime_UTR", "CDS", "stop_codon",
                                       "start_codon"]:
                gff_parts["info"]["Parent"] = gff_parts["info"][flat_name]
            break
    return gff_parts
105
def Parse(ga_file):
    """
    Parsing GFF/GTF file based on feature relationship, it takes the input file.

    @args ga_file: input file name
    @type ga_file: str

    Returns a numpy record array of gene models (see format_gene_models).
    """
    child_map = defaultdict(list)
    parent_map = dict()

    ga_handle = utils.open_file(ga_file)

    # format flag from the most recently parsed record; initialized so an
    # empty (or fully skipped) file no longer raises NameError after the loop
    ftype = False

    for rec in ga_handle:
        rec = rec.strip('\n\r')

        # skip empty line fasta identifier and commented line
        if not rec or rec[0] in ['#', '>']:
            continue
        # skip the genome sequence
        if not re.search('\t', rec):
            continue

        parts = rec.split('\t')
        assert len(parts) >= 8, rec

        # process the attribute column (9th column)
        ftype, tags = attribute_tags(parts[-1])
        if not tags:  # skip the line if no attribute column.
            continue

        # extract fields
        if parts[1]:
            tags["source"] = parts[1]
        if parts[7]:
            tags["phase"] = parts[7]

        gff_info = dict()
        gff_info['info'] = dict(tags)
        #gff_info["is_gff3"] = ftype
        gff_info['chr'] = parts[0]
        gff_info['score'] = parts[5]

        if parts[3] and parts[4]:
            gff_info['location'] = [int(parts[3]),
                                    int(parts[4])]
        gff_info['type'] = parts[2]
        gff_info['id'] = tags.get('ID', [''])[0]
        # '?' and '.' both mean the strand is unknown
        if parts[6] in ['?', '.']:
            parts[6] = None
        gff_info['strand'] = parts[6]

        # key word according to the GFF spec.
        if not ftype:
            gff_info = spec_features_keywd(gff_info)

        # link the feature relationships
        # dict.has_key() was removed in Python 3; `in` works on both versions
        if 'Parent' in gff_info['info']:
            # a self-referencing Parent marks the record as a plain child
            for p in gff_info['info']['Parent']:
                if p == gff_info['id']:
                    gff_info['id'] = ''
                    break
            rec_category = 'child'
        elif gff_info['id']:
            rec_category = 'parent'
        else:
            rec_category = 'record'

        # depends on the record category organize the features
        if rec_category == 'child':
            for p in gff_info['info']['Parent']:
                # create the data structure based on source and feature id
                child_map[(gff_info['chr'], gff_info['info']['source'], p)].append(
                    dict(type=gff_info['type'],
                         location=gff_info['location'],
                         strand=gff_info['strand'],
                         score=gff_info['score'],
                         ID=gff_info['id'],
                         gene_id=gff_info['info'].get('GParent', '')
                         ))
        elif rec_category == 'parent':
            parent_map[(gff_info['chr'], gff_info['info']['source'], gff_info['id'])] = dict(
                type=gff_info['type'],
                location=gff_info['location'],
                strand=gff_info['strand'],
                score=gff_info['score'],
                name=tags.get('Name', [''])[0])
        elif rec_category == 'record':
            # TODO how to handle plain records?
            pass
    ga_handle.close()

    # depends on file type create parent feature
    if not ftype:
        parent_map, child_map = create_missing_feature_type(parent_map, child_map)

    # connecting parent child relations
    # // essentially the parent child features are here from any type of GTF/GFF2/GFF3 file
    gene_mat = format_gene_models(parent_map, child_map)

    return gene_mat
206
def format_gene_models(parent_nf_map, child_nf_map):
    """
    Generate GeneObject based on the parsed file contents

    @args parent_nf_map: parent features with source and chromosome information
    @type parent_nf_map: collections defaultdict
    @args child_nf_map: transcript and exon information are encoded
    @type child_nf_map: collections defaultdict

    Returns a numpy record array (dtype from utils.init_gene()) with one
    filled slot per parent feature whose type matches 'gene'.
    """
    g_cnt = 0
    gene = np.zeros((len(parent_nf_map),), dtype=utils.init_gene())

    for pkey, pdet in parent_nf_map.items():

        # considering only gene features
        if not re.search(r'gene', pdet.get('type', '')):
            continue
        # infer the gene start and stop if not there in the record
        if not pdet.get('location', []):
            GNS, GNE = [], []
            # multiple number of transcripts
            for L1 in child_nf_map[pkey]:
                GNS.append(L1.get('location', [])[0])
                GNE.append(L1.get('location', [])[1])
            GNS.sort()
            GNE.sort()
            pdet['location'] = [GNS[0], GNE[-1]]
        orient = pdet.get('strand', '')

        gene[g_cnt]['id'] = g_cnt + 1
        gene[g_cnt]['chr'] = pkey[0]
        gene[g_cnt]['source'] = pkey[1]
        gene[g_cnt]['name'] = pkey[-1]
        gene[g_cnt]['start'] = pdet.get('location', [])[0]
        gene[g_cnt]['stop'] = pdet.get('location', [])[1]
        gene[g_cnt]['strand'] = orient
        gene[g_cnt]['score'] = pdet.get('score', '')

        # default value
        gene[g_cnt]['is_alt_spliced'] = gene[g_cnt]['is_alt'] = 0
        if len(child_nf_map[pkey]) > 1:
            gene[g_cnt]['is_alt_spliced'] = gene[g_cnt]['is_alt'] = 1

        # complete sub-feature for all transcripts
        # NOTE: builtin `object` dtype -- the np.object alias was removed in numpy 1.24
        dim = len(child_nf_map[pkey])
        TRS = np.zeros((dim,), dtype=object)
        TR_TYP = np.zeros((dim,), dtype=object)
        EXON = np.zeros((dim,), dtype=object)
        UTR5 = np.zeros((dim,), dtype=object)
        UTR3 = np.zeros((dim,), dtype=object)
        CDS = np.zeros((dim,), dtype=object)
        TISc = np.zeros((dim,), dtype=object)
        TSSc = np.zeros((dim,), dtype=object)
        CLV = np.zeros((dim,), dtype=object)
        CSTOP = np.zeros((dim,), dtype=object)
        TSTAT = np.zeros((dim,), dtype=object)

        # fetching corresponding transcripts
        for xq, Lv1 in enumerate(child_nf_map[pkey]):

            TID = Lv1.get('ID', '')
            TRS[xq] = np.array([TID])

            TYPE = Lv1.get('type', '')
            TR_TYP[xq] = np.array('')
            TR_TYP[xq] = np.array(TYPE) if TYPE else TR_TYP[xq]

            orient = Lv1.get('strand', '')

            # fetching different sub-features
            child_feat = defaultdict(list)
            for Lv2 in child_nf_map[(pkey[0], pkey[1], TID)]:
                E_TYP = Lv2.get('type', '')
                child_feat[E_TYP].append(Lv2.get('location'))

            # make general ascending order of coordinates
            if orient == '-':
                for etype, excod in child_feat.items():
                    if len(excod) > 1:
                        if excod[0][0] > excod[-1][0]:
                            excod.reverse()
                            child_feat[etype] = excod

            # make exon coordinate from cds and utr regions
            if not child_feat.get('exon'):
                if child_feat.get('CDS'):
                    exon_cod = utils.make_Exon_cod(orient,
                                                   NonetoemptyList(child_feat.get('five_prime_UTR')),
                                                   NonetoemptyList(child_feat.get('CDS')),
                                                   NonetoemptyList(child_feat.get('three_prime_UTR')))
                    child_feat['exon'] = exon_cod
                else:
                    # TODO only UTR's
                    # searching through keys to find a pattern describing exon feature
                    ex_key_pattern = [k for k in child_feat if k.endswith("exon")]
                    if ex_key_pattern:
                        child_feat['exon'] = child_feat[ex_key_pattern[0]]

            # stop_codon are seperated from CDS, add the coordinates based on strand
            if child_feat.get('stop_codon'):
                if orient == '+':
                    # adjacent stop codon is merged into the last CDS segment
                    if child_feat.get('stop_codon')[0][0] - child_feat.get('CDS')[-1][1] == 1:
                        child_feat['CDS'][-1] = [child_feat.get('CDS')[-1][0], child_feat.get('stop_codon')[0][1]]
                    else:
                        child_feat['CDS'].append(child_feat.get('stop_codon')[0])
                elif orient == '-':
                    if child_feat.get('CDS')[0][0] - child_feat.get('stop_codon')[0][1] == 1:
                        child_feat['CDS'][0] = [child_feat.get('stop_codon')[0][0], child_feat.get('CDS')[0][1]]
                    else:
                        child_feat['CDS'].insert(0, child_feat.get('stop_codon')[0])

            # transcript signal sites
            TIS, cdsStop, TSS, cleave = [], [], [], []
            cds_status, exon_status, utr_status = 0, 0, 0

            if child_feat.get('exon'):
                # TSS/cleave are the outermost exon boundaries, picked by strand
                TSS = [child_feat.get('exon')[-1][1]]
                TSS = [child_feat.get('exon')[0][0]] if orient == '+' else TSS
                cleave = [child_feat.get('exon')[0][0]]
                cleave = [child_feat.get('exon')[-1][1]] if orient == '+' else cleave
                exon_status = 1

            if child_feat.get('CDS'):
                if orient == '+':
                    TIS = [child_feat.get('CDS')[0][0]]
                    cdsStop = [child_feat.get('CDS')[-1][1] - 3]
                else:
                    TIS = [child_feat.get('CDS')[-1][1]]
                    cdsStop = [child_feat.get('CDS')[0][0] + 3]
                cds_status = 1
                # cds phase calculation
                child_feat['CDS'] = utils.add_CDS_phase(orient, child_feat.get('CDS'))

            # sub-feature status
            if child_feat.get('three_prime_UTR') or child_feat.get('five_prime_UTR'):
                utr_status = 1

            # a transcript is complete when exon, CDS and UTR are all present
            if utr_status == cds_status == exon_status == 1:
                t_status = 1
            else:
                t_status = 0

            # add sub-feature # make array for export to different out
            TSTAT[xq] = t_status
            EXON[xq] = np.array(child_feat.get('exon'), np.float64)
            UTR5[xq] = np.array(NonetoemptyList(child_feat.get('five_prime_UTR')))
            UTR3[xq] = np.array(NonetoemptyList(child_feat.get('three_prime_UTR')))
            CDS[xq] = np.array(NonetoemptyList(child_feat.get('CDS')))
            TISc[xq] = np.array(TIS)
            CSTOP[xq] = np.array(cdsStop)
            TSSc[xq] = np.array(TSS)
            CLV[xq] = np.array(cleave)

        # add sub-features to the parent gene feature
        gene[g_cnt]['transcript_status'] = TSTAT
        gene[g_cnt]['transcripts'] = TRS
        gene[g_cnt]['exons'] = EXON
        gene[g_cnt]['utr5_exons'] = UTR5
        gene[g_cnt]['cds_exons'] = CDS
        gene[g_cnt]['utr3_exons'] = UTR3
        gene[g_cnt]['transcript_type'] = TR_TYP
        gene[g_cnt]['tis'] = TISc
        gene[g_cnt]['cdsStop'] = CSTOP
        gene[g_cnt]['tss'] = TSSc
        gene[g_cnt]['cleave'] = CLV

        gene[g_cnt]['gene_info'] = dict(ID=pkey[-1],
                                        Name=pdet.get('name'),
                                        Source=pkey[1])
        # few empty fields // TODO fill this:
        gene[g_cnt]['anno_id'] = []
        gene[g_cnt]['confgenes_id'] = []
        gene[g_cnt]['alias'] = ''
        gene[g_cnt]['name2'] = []
        gene[g_cnt]['chr_num'] = []
        gene[g_cnt]['paralogs'] = []
        gene[g_cnt]['transcript_info'] = []
        gene[g_cnt]['transcript_valid'] = []
        gene[g_cnt]['exons_confirmed'] = []
        gene[g_cnt]['tis_conf'] = []
        gene[g_cnt]['tis_info'] = []
        gene[g_cnt]['cdsStop_conf'] = []
        gene[g_cnt]['cdsStop_info'] = []
        gene[g_cnt]['tss_info'] = []
        gene[g_cnt]['tss_conf'] = []
        gene[g_cnt]['cleave_info'] = []
        gene[g_cnt]['cleave_conf'] = []
        gene[g_cnt]['polya_info'] = []
        gene[g_cnt]['polya_conf'] = []
        gene[g_cnt]['is_valid'] = []
        gene[g_cnt]['transcript_complete'] = []
        gene[g_cnt]['is_complete'] = []
        gene[g_cnt]['is_correctly_gff3_referenced'] = ''
        gene[g_cnt]['splicegraph'] = []
        g_cnt += 1

    ## deleting empty gene records from the main array
    XPFLG = 0
    for XP, ens in enumerate(gene):
        # first field ('id') is still 0 on every unfilled slot
        if ens[0] == 0:
            XPFLG = 1
            break

    if XPFLG == 1:
        # BUGFIX: upper bound was len(gene)+1, which passed an out-of-bounds
        # index to np.delete; valid indices end at len(gene)-1
        XQC = range(XP, len(gene))
        gene = np.delete(gene, XQC)

    return gene
415
def NonetoemptyList(XS):
    """
    Convert a None type to empty list

    @args XS: value that may be None
    @type XS: list or None
    """
    if XS is None:
        return []
    return XS
424
def create_missing_feature_type(p_feat, c_feat):
    """
    GFF/GTF file defines only child features. This function tries to create
    the parent feature from the information provided in the attribute column.

    example:
    chr21 hg19_knownGene exon 9690071 9690100 0.000000 + . gene_id "uc002zkg.1"; transcript_id "uc002zkg.1";
    chr21 hg19_knownGene exon 9692178 9692207 0.000000 + . gene_id "uc021wgt.1"; transcript_id "uc021wgt.1";
    chr21 hg19_knownGene exon 9711935 9712038 0.000000 + . gene_id "uc011abu.2"; transcript_id "uc011abu.2";

    This function gets the parsed feature annotations.

    @args p_feat: Parent feature map
    @type p_feat: collections defaultdict
    @args c_feat: Child feature map
    @type c_feat: collections defaultdict
    """

    child_n_map = defaultdict(list)
    for fid, grand_childs in c_feat.items():
        # summarize the grand child records of this transcript
        gid = strand_info = score_info = None
        starts, ends = [], []
        seen_types = dict()
        for rec in grand_childs:
            gid = rec.get('gene_id', [''])[0]
            starts.append(rec.get('location', [])[0])
            ends.append(rec.get('location', [])[1])
            strand_info = rec.get('strand', '')
            score_info = rec.get('score', '')
            seen_types[rec.get('type', '')] = 1
        starts.sort()
        ends.sort()

        # infer transcript type: the presence of a CDS marks a coding mRNA
        if seen_types.get('CDS', '') or seen_types.get('cds', ''):
            transcript_type = 'mRNA'
        else:
            transcript_type = 'transcript'

        # disambiguate when gene id and transcript id collide
        transcript_id = fid[-1]
        if gid == transcript_id:
            transcript_id = 'Transcript:' + str(gid)

        # level -1 feature type (gene)
        p_feat[(fid[0], fid[1], gid)] = dict(type='gene',
                                             location=[],  ## infer location based on multiple transcripts
                                             strand=strand_info,
                                             name=gid)
        # level -2 feature type (transcript spanning all its grand childs)
        child_n_map[(fid[0], fid[1], gid)].append(
            dict(type=transcript_type,
                 location=[starts[0], ends[-1]],
                 strand=strand_info,
                 score=score_info,
                 ID=transcript_id,
                 gene_id=''))
        # reorganizing the grand child under the (possibly renamed) transcript
        for rec in grand_childs:
            child_n_map[(fid[0], fid[1], transcript_id)].append(
                dict(type=rec.get('type', ''),
                     location=rec.get('location'),
                     strand=rec.get('strand'),
                     ID=rec.get('ID'),
                     score=rec.get('score'),
                     gene_id=''))
    return p_feat, child_n_map
491