5
|
1 #!/usr/bin/env python
|
|
"""
Convert Gene Transfer Format [GTF] to Generic Feature Format Version 3 [GFF3].

Usage: python gtf_to_gff.py in.gtf > out.gff3

Requirement:
    GFFParser.py: https://github.com/vipints/GFFtools-GX/blob/master/GFFParser.py
    helper.py : https://github.com/vipints/GFFtools-GX/blob/master/helper.py

Copyright (C)
    2009-2012 Friedrich Miescher Laboratory of the Max Planck Society, Tubingen, Germany.
    2012-2014 Memorial Sloan Kettering Cancer Center New York City, USA.
"""
|
|
15
|
|
16 import re
|
|
17 import sys
|
|
18 import GFFParser
|
|
19 import helper
|
|
20
|
|
def GFFWriter(gtf_content):
    """
    Write the feature information to standard output in GFF3 format.

    For every entry a ``gene`` line is printed, followed, for each of its
    transcripts, by the transcript line and its five_prime_UTR, CDS,
    three_prime_UTR and exon child lines (in that order).

    @args gtf_content: Parsed object from gtf file
    @type gtf_content: numpy array (iterable of per-gene records indexed by
        field name; presumably the object returned by GFFParser.Parse --
        confirm against that module)
    """

    # print() with one pre-formatted string gives identical output under the
    # Python 2 print statement and the Python 3 print function.
    print('##gff-version 3')

    for ent1 in gtf_content:

        chr_name = ent1['chr']
        strand = ent1['strand']
        start = ent1['start']
        stop = ent1['stop']
        source = ent1['source']
        ID = ent1['name']
        Name = ent1['gene_info']['Name']

        # fall back to the gene ID when no display name is available
        Name = ID if not Name else Name

        print('%s\t%s\tgene\t%d\t%d\t.\t%s\t.\tID=%s;Name=%s' % (chr_name, source, start, stop, strand, ID, Name))

        # Only GFF3 records may reach stdout here: anything else printed in
        # this loop would end up inside the generated file.
        for idx, tid in enumerate(ent1['transcripts']):

            exons = ent1['exons'][idx]
            cds_exons = ent1['cds_exons'][idx]
            # the transcript identifier is the first element of each entry
            t_id = tid[0]

            # transcript span: first coordinate of the first exon up to the
            # last coordinate of the last exon (assumes exons are stored in
            # ascending order -- TODO confirm against the parser)
            t_start = exons[0][0]
            t_stop = exons[-1][-1]
            t_type = ent1['transcript_type'][idx]

            # UTRs are derived only when both the exon and the CDS tables
            # report content via .any(); otherwise none are written
            utr5_exons, utr3_exons = [], []
            if exons.any() and cds_exons.any():
                utr5_exons, utr3_exons = helper.buildUTR(cds_exons, exons, strand)

            print('%s\t%s\t%s\t%d\t%d\t.\t%s\t.\tID=%s;Parent=%s' % (chr_name, source, t_type, t_start, t_stop, strand, t_id, ID))

            for ex_cod in utr5_exons:
                print('%s\t%s\tfive_prime_UTR\t%d\t%d\t.\t%s\t.\tParent=%s' % (chr_name, source, ex_cod[0], ex_cod[1], strand, t_id))

            # the third element of each CDS record fills the phase column
            for ex_cod in cds_exons:
                print('%s\t%s\tCDS\t%d\t%d\t.\t%s\t%d\tParent=%s' % (chr_name, source, ex_cod[0], ex_cod[1], strand, ex_cod[2], t_id))

            for ex_cod in utr3_exons:
                print('%s\t%s\tthree_prime_UTR\t%d\t%d\t.\t%s\t.\tParent=%s' % (chr_name, source, ex_cod[0], ex_cod[1], strand, t_id))

            for ex_cod in exons:
                print('%s\t%s\texon\t%d\t%d\t.\t%s\t.\tParent=%s' % (chr_name, source, ex_cod[0], ex_cod[1], strand, t_id))
|
|
70
|
|
71
|
|
def __main__():
    """
    Command-line entry point.

    Reads the GTF file name from the first command-line argument, parses it
    with GFFParser.Parse and hands the result to GFFWriter.

    Prints the module docstring (usage text) and exits with status -1 when
    no file name is supplied.
    """

    try:
        gtf_fname = sys.argv[1]
    except IndexError:
        # Only a missing argument is expected here; a bare except would also
        # trap KeyboardInterrupt/SystemExit.
        print(__doc__)
        sys.exit(-1)

    gtf_file_content = GFFParser.Parse(gtf_fname)

    GFFWriter(gtf_file_content)
|
|
83
|
|
# Run the conversion only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    __main__()
|