comparison gff_to_bed.py @ 5:6e589f267c14

Uploaded
author devteam
date Tue, 04 Nov 2014 12:15:19 -0500
parents
children
comparison
equal deleted inserted replaced
4:619e0fcd9126 5:6e589f267c14
1 #!/usr/bin/env python
2 """
3 Convert genome annotation data in GFF/GTF to a 12 column BED format.
4 BED format typically represents the transcript models.
5
6 Usage: python gff_to_bed.py in.gff > out.bed
7
8 Requirement:
9 GFFParser.py: https://github.com/vipints/GFFtools-GX/blob/master/GFFParser.py
10
11 Copyright (C)
12 2009-2012 Friedrich Miescher Laboratory of the Max Planck Society, Tubingen, Germany.
13 2012-2014 Memorial Sloan Kettering Cancer Center New York City, USA.
14 """
15
16 import re
17 import sys
18 import GFFParser
19
def _bed_score(entry):
    """
    Return the BED score column for a gene entry as a string.

    @args entry: gene record providing a 'score' field
    @type entry: mapping-like record
    """
    # Same truthiness test as the historical code ('0' when no score is
    # present); str() keeps numeric scores usable with '\t'.join().
    return str(entry['score'][0]) if entry['score'] else '0'


def writeBED(tinfo):
    """
    Write gene models to stdout as 12 column BED lines.

    One line is printed per transcript that has at least one exon. A gene
    without any transcript is printed as a single-block feature built from
    its own 'start'/'stop'/'name' fields.

    Input coordinates are treated as GFF style (1-based, inclusive) and are
    converted to BED style (0-based start, exclusive end): the start is
    shifted by -1 and the end is kept, so that
    chromStart + last blockStart + last blockSize == chromEnd.

    Fields read from each entry: 'transcripts', 'exons', 'score', 'chr',
    'strand', 'name', 'start', 'stop'.

    @args tinfo: list of genes
    @type tinfo: numpy object
    """

    for ent1 in tinfo:
        child_flag = False

        for idx, tid in enumerate(ent1['transcripts']):
            child_flag = True
            exons = ent1['exons'][idx]
            if len(exons) == 0:
                # transcript without exons: nothing to report
                continue

            # NOTE(review): assumes the exons of a transcript arrive ordered
            # by ascending start, as the first one defines chromStart and the
            # last one chromEnd - confirm against GFFParser.
            first_start = int(exons[0][0])
            chrom_start = first_start - 1  # 1-based GFF -> 0-based BED
            chrom_end = int(exons[-1][1])

            # BED blockSizes / blockStarts are comma-terminated lists
            block_sizes = ','.join('%d' % (ex[1] - ex[0] + 1) for ex in exons) + ','
            block_starts = ','.join('%d' % (ex[0] - first_start) for ex in exons) + ','

            out_print = [ent1['chr'],
                         str(chrom_start),
                         str(chrom_end),
                         tid[0],
                         _bed_score(ent1),
                         ent1['strand'],
                         str(chrom_start),  # thickStart
                         str(chrom_end),  # thickEnd
                         '0',  # itemRgb
                         str(len(exons)),
                         block_sizes,
                         block_starts]
            print('\t'.join(out_print))

        if not child_flag:  # file just contains only a single parent type i.e, gff3 defines only one feature type
            feat_start = int(ent1['start'])
            feat_stop = int(ent1['stop'])

            out_print = [ent1['chr'],
                         '%d' % (feat_start - 1),  # 1-based GFF -> 0-based BED
                         '%d' % feat_stop,
                         ent1['name'],
                         _bed_score(ent1),
                         ent1['strand'],
                         '%d' % (feat_start - 1),
                         '%d' % feat_stop,
                         '0',
                         '1',
                         '%d,' % (feat_stop - feat_start + 1),
                         '0,']

            print('\t'.join(out_print))
84
def __main__():
    """
    Command line entry point.

    Reads the input file name from the first command line argument, parses
    it with GFFParser.Parse and hands the result to writeBED. When no
    argument is given, the module docstring is printed as usage text and the
    process exits with status -1.
    """
    try:
        query_file = sys.argv[1]
    except IndexError:
        # Only a missing argument is expected here; a bare except would also
        # swallow KeyboardInterrupt / SystemExit.
        print(__doc__)
        sys.exit(-1)

    Transcriptdb = GFFParser.Parse(query_file)
    writeBED(Transcriptdb)


# Run the conversion only when executed as a script, not when imported.
if __name__ == "__main__":
    __main__()