comparison rDiff/tools/ParseGFF.py @ 0:0f80a5141704

version 0.3 uploaded
author vipints
date Thu, 14 Feb 2013 23:38:36 -0500
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:0f80a5141704
1 #!/usr/bin/env python
2 """
3 Extract genome annotation from a GFF3 (a tab delimited
4 format for storing sequence features and annotations:
5 http://www.sequenceontology.org/gff3.shtml) file.
6
7 Usage: ParseGFF.py in.gff3 out.mat
8
9 Requirements:
10 Scipy :- http://scipy.org/
11 Numpy :- http://numpy.org/
12
13 Copyright (C) 2009-2012 Friedrich Miescher Laboratory of the Max Planck Society, Tubingen, Germany
14 2012- Memorial Sloan-Kettering Cancer Center, New York City, USA
15 """
16
17 import re, sys
18 import scipy.io as sio
19 import numpy as np
20
21 def addCDSphase(strand, cds):
22 """Add CDS phase to the CDS exons
23 """
24 cds_region, cds_flag = [], 0
25 if strand == '+':
26 for cdspos in cds:
27 if cds_flag == 0:
28 cdspos = (cdspos[0], cdspos[1], 0)
29 diff = (cdspos[1]-(cdspos[0]-1))%3
30 else:
31 xy = 0
32 if diff == 0:
33 cdspos = (cdspos[0], cdspos[1], 0)
34 elif diff == 1:
35 cdspos = (cdspos[0], cdspos[1], 2)
36 xy = 2
37 elif diff == 2:
38 cdspos = (cdspos[0], cdspos[1], 1)
39 xy = 1
40 diff = ((cdspos[1]-(cdspos[0]-1))-xy)%3
41 cds_region.append(cdspos)
42 cds_flag = 1
43 elif strand == '-':
44 cds.reverse()
45 for cdspos in cds:
46 if cds_flag == 0:
47 cdspos = (cdspos[0], cdspos[1], 0)
48 diff = (cdspos[1]-(cdspos[0]-1))%3
49 else:
50 xy = 0
51 if diff == 0:
52 cdspos = (cdspos[0], cdspos[1], 0)
53 elif diff == 1:
54 cdspos = (cdspos[0], cdspos[1], 2)
55 xy = 2
56 elif diff == 2:
57 cdspos = (cdspos[0], cdspos[1], 1)
58 xy = 1
59 diff = ((cdspos[1]-(cdspos[0]-1))-xy)%3
60 cds_region.append(cdspos)
61 cds_flag = 1
62 cds_region.reverse()
63 return cds_region
64
65 def createExon(strand_p, five_p_utr, cds_cod, three_p_utr):
66 """Create exon cordinates from UTR's and CDS region
67 """
68 exon_pos = []
69 if strand_p == '+':
70 utr5_start, utr5_end = 0, 0
71 if five_p_utr != []:
72 utr5_start, utr5_end = five_p_utr[-1][0], five_p_utr[-1][1]
73 cds_5start, cds_5end = cds_cod[0][0], cds_cod[0][1]
74 jun_exon = []
75 if cds_5start-utr5_end == 0 or cds_5start-utr5_end == 1:
76 jun_exon = [utr5_start, cds_5end]
77 if len(cds_cod) == 1:
78 five_prime_flag = 0
79 if jun_exon != []:
80 five_p_utr = five_p_utr[:-1]
81 five_prime_flag = 1
82 for utr5 in five_p_utr:
83 exon_pos.append(utr5)
84 jun_exon = []
85 utr3_start, utr3_end = 0, 0
86 if three_p_utr != []:
87 utr3_start = three_p_utr[0][0]
88 utr3_end = three_p_utr[0][1]
89 if utr3_start-cds_5end == 0 or utr3_start-cds_5end == 1:
90 jun_exon = [cds_5start, utr3_end]
91 three_prime_flag = 0
92 if jun_exon != []:
93 cds_cod = cds_cod[:-1]
94 three_p_utr = three_p_utr[1:]
95 three_prime_flag = 1
96 if five_prime_flag == 1 and three_prime_flag == 1:
97 exon_pos.append([utr5_start, utr3_end])
98 if five_prime_flag == 1 and three_prime_flag == 0:
99 exon_pos.append([utr5_start, cds_5end])
100 cds_cod = cds_cod[:-1]
101 if five_prime_flag == 0 and three_prime_flag == 1:
102 exon_pos.append([cds_5start, utr3_end])
103 for cds in cds_cod:
104 exon_pos.append(cds)
105 for utr3 in three_p_utr:
106 exon_pos.append(utr3)
107 else:
108 if jun_exon != []:
109 five_p_utr = five_p_utr[:-1]
110 cds_cod = cds_cod[1:]
111 for utr5 in five_p_utr:
112 exon_pos.append(utr5)
113 exon_pos.append(jun_exon) if jun_exon != [] else ''
114 jun_exon = []
115 utr3_start, utr3_end = 0, 0
116 if three_p_utr != []:
117 utr3_start = three_p_utr[0][0]
118 utr3_end = three_p_utr[0][1]
119 cds_3start = cds_cod[-1][0]
120 cds_3end = cds_cod[-1][1]
121 if utr3_start-cds_3end == 0 or utr3_start-cds_3end == 1:
122 jun_exon = [cds_3start, utr3_end]
123 if jun_exon != []:
124 cds_cod = cds_cod[:-1]
125 three_p_utr = three_p_utr[1:]
126 for cds in cds_cod:
127 exon_pos.append(cds)
128 exon_pos.append(jun_exon) if jun_exon != [] else ''
129 for utr3 in three_p_utr:
130 exon_pos.append(utr3)
131 elif strand_p == '-':
132 utr3_start, utr3_end = 0, 0
133 if three_p_utr != []:
134 utr3_start = three_p_utr[-1][0]
135 utr3_end = three_p_utr[-1][1]
136 cds_3start = cds_cod[0][0]
137 cds_3end = cds_cod[0][1]
138 jun_exon = []
139 if cds_3start-utr3_end == 0 or cds_3start-utr3_end == 1:
140 jun_exon = [utr3_start, cds_3end]
141 if len(cds_cod) == 1:
142 three_prime_flag = 0
143 if jun_exon != []:
144 three_p_utr = three_p_utr[:-1]
145 three_prime_flag = 1
146 for utr3 in three_p_utr:
147 exon_pos.append(utr3)
148 jun_exon = []
149 (utr5_start, utr5_end) = (0, 0)
150 if five_p_utr != []:
151 utr5_start = five_p_utr[0][0]
152 utr5_end = five_p_utr[0][1]
153 if utr5_start-cds_3end == 0 or utr5_start-cds_3end == 1:
154 jun_exon = [cds_3start, utr5_end]
155 five_prime_flag = 0
156 if jun_exon != []:
157 cds_cod = cds_cod[:-1]
158 five_p_utr = five_p_utr[1:]
159 five_prime_flag = 1
160 if three_prime_flag == 1 and five_prime_flag == 1:
161 exon_pos.append([utr3_start, utr5_end])
162 if three_prime_flag == 1 and five_prime_flag == 0:
163 exon_pos.append([utr3_start, cds_3end])
164 cds_cod = cds_cod[:-1]
165 if three_prime_flag == 0 and five_prime_flag == 1:
166 exon_pos.append([cds_3start, utr5_end])
167 for cds in cds_cod:
168 exon_pos.append(cds)
169 for utr5 in five_p_utr:
170 exon_pos.append(utr5)
171 else:
172 if jun_exon != []:
173 three_p_utr = three_p_utr[:-1]
174 cds_cod = cds_cod[1:]
175 for utr3 in three_p_utr:
176 exon_pos.append(utr3)
177 if jun_exon != []:
178 exon_pos.append(jun_exon)
179 jun_exon = []
180 (utr5_start, utr5_end) = (0, 0)
181 if five_p_utr != []:
182 utr5_start = five_p_utr[0][0]
183 utr5_end = five_p_utr[0][1]
184 cds_5start = cds_cod[-1][0]
185 cds_5end = cds_cod[-1][1]
186 if utr5_start-cds_5end == 0 or utr5_start-cds_5end == 1:
187 jun_exon = [cds_5start, utr5_end]
188 if jun_exon != []:
189 cds_cod = cds_cod[:-1]
190 five_p_utr = five_p_utr[1:]
191 for cds in cds_cod:
192 exon_pos.append(cds)
193 if jun_exon != []:
194 exon_pos.append(jun_exon)
195 for utr5 in five_p_utr:
196 exon_pos.append(utr5)
197 return exon_pos
198
199 def init_gene():
200 """Initializing the gene structure
201 """
202 gene_details=dict(
203 id = '',
204 anno_id = [],
205 confgenes_id = [],
206 name = '',
207 source = '',
208 gene_info = {},
209 alias = '',
210 name2 = [],
211 strand = '',
212 chr = '',
213 chr_num = [],
214 paralogs = [],
215 start = '',
216 stop = '',
217 transcripts = [],
218 transcript_info = [],
219 transcript_status = [],
220 transcript_valid = [],
221 exons = [],
222 exons_confirmed = [],
223 cds_exons = [],
224 utr5_exons = [],
225 utr3_exons = [],
226 tis = [],
227 tis_conf = [],
228 tis_info = [],
229 cdsStop = [],
230 cdsStop_conf = [],
231 cdsStop_info = [],
232 tss = [],
233 tss_info = [],
234 tss_conf = [],
235 cleave = [],
236 cleave_info = [],
237 cleave_conf = [],
238 polya = [],
239 polya_info = [],
240 polya_conf = [],
241 is_alt = [],
242 is_alt_spliced = 0,
243 is_valid = [],
244 transcript_complete = [],
245 is_complete = [],
246 is_correctly_gff3_referenced = '',
247 splicegraph = []
248 )
249 return gene_details
250
251 def FeatureValueFormat(singlegene):
252 """Make feature value compactable to write in a .mat format
253 """
254 comp_exon = np.zeros((len(singlegene['exons']),), dtype=np.object)
255 for i in range(len(singlegene['exons'])):
256 comp_exon[i]= np.array(singlegene['exons'][i])
257 singlegene['exons'] = comp_exon
258 comp_cds = np.zeros((len(singlegene['cds_exons']),), dtype=np.object)
259 for i in range(len(singlegene['cds_exons'])):
260 comp_cds[i]= np.array(singlegene['cds_exons'][i])
261 singlegene['cds_exons'] = comp_cds
262 comp_utr3 = np.zeros((len(singlegene['utr3_exons']),), dtype=np.object)
263 for i in range(len(singlegene['utr3_exons'])):
264 comp_utr3[i]= np.array(singlegene['utr3_exons'][i])
265 singlegene['utr3_exons'] = comp_utr3
266 comp_utr5 = np.zeros((len(singlegene['utr5_exons']),), dtype=np.object)
267 for i in range(len(singlegene['utr5_exons'])):
268 comp_utr5[i]= np.array(singlegene['utr5_exons'][i])
269 singlegene['utr5_exons'] = comp_utr5
270 comp_transcripts = np.zeros((len(singlegene['transcripts']),), dtype=np.object)
271 for i in range(len(singlegene['transcripts'])):
272 comp_transcripts[i]= np.array(singlegene['transcripts'][i])
273 singlegene['transcripts'] = comp_transcripts
274 comp_tss = np.zeros((len(singlegene['tss']),), dtype=np.object)
275 for i in range(len(singlegene['tss'])):
276 comp_tss[i]= np.array(singlegene['tss'][i])
277 singlegene['tss'] = comp_tss
278 comp_tis = np.zeros((len(singlegene['tis']),), dtype=np.object)
279 for i in range(len(singlegene['tis'])):
280 comp_tis[i]= np.array(singlegene['tis'][i])
281 singlegene['tis'] = comp_tis
282 comp_cleave = np.zeros((len(singlegene['cleave']),), dtype=np.object)
283 for i in range(len(singlegene['cleave'])):
284 comp_cleave[i]= np.array(singlegene['cleave'][i])
285 singlegene['cleave'] = comp_cleave
286 comp_cdsStop = np.zeros((len(singlegene['cdsStop']),), dtype=np.object)
287 for i in range(len(singlegene['cdsStop'])):
288 comp_cdsStop[i]= np.array(singlegene['cdsStop'][i])
289 singlegene['cdsStop'] = comp_cdsStop
290
291 return singlegene
292
293 def CreateGeneModels(genes_cmpt, transcripts_cmpt, exons_cmpt, utr3_cmpt, utr5_cmpt, cds_cmpt):
294 """Creating Coding/Non-coding gene models from parsed GFF objects.
295 """
296 gene_counter, gene_models = 1, []
297 for gene_entry in genes_cmpt: ## Figure out the genes and transcripts associated feature
298 if gene_entry in transcripts_cmpt:
299 gene=init_gene()
300 gene['id']=gene_counter
301 gene['name']=gene_entry[1]
302 gene['chr']=genes_cmpt[gene_entry]['chr']
303 gene['source']=genes_cmpt[gene_entry]['source']
304 gene['start']=genes_cmpt[gene_entry]['start']
305 gene['stop']=genes_cmpt[gene_entry]['stop']
306 gene['strand']=genes_cmpt[gene_entry]['strand']
307 if not gene['strand'] in ['+', '-']:
308 gene['strand']='.' # Strand info not known replaced with a dot symbol instead of None, ?, . etc.
309 if len(transcripts_cmpt[gene_entry])>1:
310 gene['is_alt_spliced'] = 1
311 gene['is_alt'] = 1
312 gtype=[]
313 for tids in transcripts_cmpt[gene_entry]: ## transcript section related tags
314 gene['transcripts'].append(tids['ID'])
315 gtype.append(tids['type'])
316 exon_cod, utr5_cod, utr3_cod, cds_cod = [], [], [], []
317 if (gene['chr'], tids['ID']) in exons_cmpt:
318 exon_cod = [[feat_exon['start'], feat_exon['stop']] for feat_exon in exons_cmpt[(gene['chr'], tids['ID'])]]
319 if (gene['chr'], tids['ID']) in utr5_cmpt:
320 utr5_cod = [[feat_utr5['start'], feat_utr5['stop']] for feat_utr5 in utr5_cmpt[(gene['chr'], tids['ID'])]]
321 if (gene['chr'], tids['ID']) in utr3_cmpt:
322 utr3_cod = [[feat_utr3['start'], feat_utr3['stop']] for feat_utr3 in utr3_cmpt[(gene['chr'], tids['ID'])]]
323 if (gene['chr'], tids['ID']) in cds_cmpt:
324 cds_cod = [[feat_cds['start'], feat_cds['stop']] for feat_cds in cds_cmpt[(gene['chr'], tids['ID'])]]
325 if len(exon_cod) == 0: ## build exon coordinates from UTR3, UTR5 and CDS
326 if cds_cod != []:
327 exon_cod=createExon(gene['strand'], utr5_cod, cds_cod, utr3_cod)
328
329 if gene['strand']=='-': ## general order to coordinates
330 if len(exon_cod) >1:
331 if exon_cod[0][0] > exon_cod[-1][0]:
332 exon_cod.reverse()
333 if len(cds_cod) >1:
334 if cds_cod[0][0] > cds_cod[-1][0]:
335 cds_cod.reverse()
336 if len(utr3_cod) >1:
337 if utr3_cod[0][0] > utr3_cod[-1][0]:
338 utr3_cod.reverse()
339 if len(utr5_cod) >1:
340 if utr5_cod[0][0] > utr5_cod[-1][0]:
341 utr5_cod.reverse()
342
343 tis, cdsStop, tss, cleave = [], [], [], [] ## speacial sited in the gene region
344 if cds_cod != []:
345 if gene['strand'] == '+':
346 tis = [cds_cod[0][0]]
347 cdsStop = [cds_cod[-1][1]-3]
348 elif gene['strand'] == '-':
349 tis = [cds_cod[-1][1]]
350 cdsStop = [cds_cod[0][0]+3]
351 if utr5_cod != []:
352 if gene['strand'] == '+':
353 tss = [utr5_cod[0][0]]
354 elif gene['strand'] == '-':
355 tss = [utr5_cod[-1][1]]
356 if utr3_cod != []:
357 if gene['strand'] == '+':
358 cleave = [utr3_cod[-1][1]]
359 elif gene['strand'] == '-':
360 cleave = [utr3_cod[0][0]]
361
362 cds_status, exon_status, utr_status = 0, 0, 0 ## status of the complete elements of the gene
363 if cds_cod != []: ## adding phase to the CDS region
364 cds_cod_phase = addCDSphase(gene['strand'], cds_cod)
365 cds_status = 1
366 gene['cds_exons'].append(cds_cod_phase)
367
368 if exon_cod != []:
369 exon_status = 1
370 if utr5_cod != [] or utr3_cod != []:
371 utr_status = 1
372 if cds_status != 0 and exon_status != 0 and utr_status != 0:
373 gene['transcript_status'].append(1)
374 else:
375 gene['transcript_status'].append(0)
376
377 if exon_cod: ## final check point for a valid gene model
378 gene['exons'].append(exon_cod)
379 gene['utr3_exons'].append(utr3_cod)
380 gene['utr5_exons'].append(utr5_cod)
381 gene['tis'].append(tis)
382 gene['cdsStop'].append(cdsStop)
383 gene['tss'].append(tss)
384 gene['cleave'].append(cleave)
385
386 gtype=list(set(gtype)) ## different types
387 gene['gene_info']=dict(ID=gene_entry[1],
388 Source=genes_cmpt[gene_entry]['source'],
389 Type=gtype)
390 gene=FeatureValueFormat(gene) ## get prepare for MAT writing
391 gene_counter+=1
392 gene_models.append(gene)
393 return gene_models
394
395 def GFFParse(gff_file):
396 """Parsing GFF file based on feature relationship.
397 """
398 genes, utr5, exons=dict(), dict(), dict()
399 transcripts, utr3, cds=dict(), dict(), dict()
400 # TODO Include growing key words of different non-coding/coding transcripts
401 features=['mrna', 'transcript', 'ncRNA', 'mirna', 'pseudogenic_transcript', 'rrna', 'snorna', 'snrna', 'trna', 'scrna']
402 gff_handle=open(gff_file, "rU")
403 for gff_line in gff_handle:
404 gff_line=gff_line.strip('\n\r').split('\t')
405 if re.match(r'#|>', gff_line[0]): # skip commented line or fasta identifier line
406 continue
407 if len(gff_line)==1: # skip fasta sequence/empty line if present
408 continue
409 assert len(gff_line)==9, '\t'.join(gff_line) # not found 9 tab-delimited fields in this line
410 if '' in gff_line: # skip this line if there any field with an empty value
411 print 'Skipping..', '\t'.join(gff_line)
412 continue
413 if gff_line[-1][-1]==';': # trim the last ';' character
414 gff_line[-1]=gff_line[-1].strip(';')
415 if gff_line[2].lower() in ['gene', 'pseudogene']:
416 gid, gene_info=None, dict()
417 gene_info['start']=int(gff_line[3])
418 gene_info['stop']=int(gff_line[4])
419 gene_info['chr']=gff_line[0]
420 gene_info['source']=gff_line[1]
421 gene_info['strand']=gff_line[6]
422 for attb in gff_line[-1].split(';'):
423 attb=attb.split('=') # gff attributes are separated by key=value pair
424 if attb[0]=='ID':
425 gid=attb[1]
426 break
427 genes[(gff_line[0], gid)]=gene_info # store gene information based on the chromosome and gene symbol.
428 elif gff_line[2].lower() in features:
429 gid, mrna_info=None, dict()
430 mrna_info['start']=int(gff_line[3])
431 mrna_info['stop']=int(gff_line[4])
432 mrna_info['chr']=gff_line[0]
433 mrna_info['strand']=gff_line[6]
434 mrna_info['type'] = gff_line[2]
435 for attb in gff_line[-1].split(';'):
436 attb=attb.split('=')
437 if attb[0]=='Parent':
438 gid=attb[1]
439 elif attb[0]=='ID':
440 mrna_info[attb[0]]=attb[1]
441 for fid in gid.split(','): # child may be mapped to multiple parents ex: Parent=AT01,AT01-1-Protein
442 if (gff_line[0], fid) in transcripts:
443 transcripts[(gff_line[0], fid)].append(mrna_info)
444 else:
445 transcripts[(gff_line[0], fid)]=[mrna_info]
446 elif gff_line[2].lower() in ['exon']:
447 tids, exon_info=None, dict()
448 exon_info['start']=int(gff_line[3])
449 exon_info['stop']=int(gff_line[4])
450 exon_info['chr']=gff_line[0]
451 exon_info['strand']=gff_line[6]
452 for attb in gff_line[-1].split(';'):
453 attb=attb.split('=')
454 if attb[0]=='Parent':
455 tids=attb[1]
456 break
457 for tid in tids.split(','):
458 if (gff_line[0], tid) in exons:
459 exons[(gff_line[0], tid)].append(exon_info)
460 else:
461 exons[(gff_line[0], tid)]=[exon_info]
462 elif gff_line[2].lower() in ['five_prime_utr']:
463 utr5_info, tids=dict(), None
464 utr5_info['start']=int(gff_line[3])
465 utr5_info['stop']=int(gff_line[4])
466 utr5_info['chr']=gff_line[0]
467 utr5_info['strand']=gff_line[6]
468 for attb in gff_line[-1].split(';'):
469 attb=attb.split('=')
470 if attb[0]=='Parent':
471 tids=attb[1]
472 break
473 for tid in tids.split(','):
474 if (gff_line[0], tid) in utr5:
475 utr5[(gff_line[0], tid)].append(utr5_info)
476 else:
477 utr5[(gff_line[0], tid)]=[utr5_info]
478 elif gff_line[2].lower() in ['cds']:
479 cds_info, tids=dict(), None
480 cds_info['start']=int(gff_line[3])
481 cds_info['stop']=int(gff_line[4])
482 cds_info['chr']=gff_line[0]
483 cds_info['strand']=gff_line[6]
484 for attb in gff_line[-1].split(';'):
485 attb=attb.split('=')
486 if attb[0]=='Parent':
487 tids=attb[1]
488 break
489 for tid in tids.split(','):
490 if (gff_line[0], tid) in cds:
491 cds[(gff_line[0], tid)].append(cds_info)
492 else:
493 cds[(gff_line[0], tid)]=[cds_info]
494 elif gff_line[2].lower() in ['three_prime_utr']:
495 utr3_info, tids=dict(), None
496 utr3_info['start']=int(gff_line[3])
497 utr3_info['stop']=int(gff_line[4])
498 utr3_info['chr']=gff_line[0]
499 utr3_info['strand']=gff_line[6]
500 for attb in gff_line[-1].split(';'):
501 attb=attb.split('=')
502 if attb[0]=='Parent':
503 tids=attb[1]
504 break
505 for tid in tids.split(','):
506 if (gff_line[0], tid) in utr3:
507 utr3[(gff_line[0], tid)].append(utr3_info)
508 else:
509 utr3[(gff_line[0], tid)]=[utr3_info]
510 gff_handle.close()
511 return genes, transcripts, exons, utr3, utr5, cds
512
513 def __main__():
514 """extract genome feature information
515 """
516 try:
517 gff_file = sys.argv[1]
518 mat_file = sys.argv[2]
519 except:
520 print __doc__
521 sys.exit(-1)
522
523 genes, transcripts, exons, utr3, utr5, cds = GFFParse(gff_file)
524 gene_models = CreateGeneModels(genes, transcripts, exons, utr3, utr5, cds)
525 # TODO Write to matlab/octave struct instead of cell arrays.
526 sio.savemat(mat_file,
527 mdict=dict(genes=gene_models),
528 format='5',
529 oned_as='row')
530
531 if __name__=='__main__':
532 __main__()