annotate gff_to_bed.py @ 5:6e589f267c14

Uploaded
author devteam
date Tue, 04 Nov 2014 12:15:19 -0500
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
5
6e589f267c14 Uploaded
devteam
parents:
diff changeset
1 #!/usr/bin/env python
6e589f267c14 Uploaded
devteam
parents:
diff changeset
2 """
6e589f267c14 Uploaded
devteam
parents:
diff changeset
3 Convert genome annotation data in GFF/GTF to a 12 column BED format.
6e589f267c14 Uploaded
devteam
parents:
diff changeset
4 BED format typically represents the transcript models.
6e589f267c14 Uploaded
devteam
parents:
diff changeset
5
6e589f267c14 Uploaded
devteam
parents:
diff changeset
6 Usage: python gff_to_bed.py in.gff > out.bed
6e589f267c14 Uploaded
devteam
parents:
diff changeset
7
6e589f267c14 Uploaded
devteam
parents:
diff changeset
8 Requirement:
6e589f267c14 Uploaded
devteam
parents:
diff changeset
9 GFFParser.py: https://github.com/vipints/GFFtools-GX/blob/master/GFFParser.py
6e589f267c14 Uploaded
devteam
parents:
diff changeset
10
6e589f267c14 Uploaded
devteam
parents:
diff changeset
11 Copyright (C)
6e589f267c14 Uploaded
devteam
parents:
diff changeset
12 2009-2012 Friedrich Miescher Laboratory of the Max Planck Society, Tubingen, Germany.
6e589f267c14 Uploaded
devteam
parents:
diff changeset
13 2012-2014 Memorial Sloan Kettering Cancer Center New York City, USA.
6e589f267c14 Uploaded
devteam
parents:
diff changeset
14 """
6e589f267c14 Uploaded
devteam
parents:
diff changeset
15
6e589f267c14 Uploaded
devteam
parents:
diff changeset
16 import re
6e589f267c14 Uploaded
devteam
parents:
diff changeset
17 import sys
6e589f267c14 Uploaded
devteam
parents:
diff changeset
18 import GFFParser
6e589f267c14 Uploaded
devteam
parents:
diff changeset
19
6e589f267c14 Uploaded
devteam
parents:
diff changeset
def writeBED(tinfo):
    """
    Write gene models to standard output as 12-column BED records.

    One line is printed for every transcript that has at least one exon.
    A gene without any transcript is printed as a single-block record
    spanning the gene itself.

    GFF/GTF coordinates are 1-based and inclusive, whereas BED uses a
    0-based, half-open interval. Every start position is therefore shifted
    down by one; end positions and block sizes stay as they are. This keeps
    the BED invariant "last blockStart + last blockSize == chromEnd -
    chromStart".

    @args tinfo: gene records; each one is indexed by the keys 'chr',
        'strand', 'score', 'name', 'start', 'stop', 'transcripts' and
        'exons' (presumably the result of GFFParser.Parse - verify)
    @type tinfo: iterable of dict-like records
    """

    for gene in tinfo:
        has_transcript = False

        for idx, tid in enumerate(gene['transcripts']):
            has_transcript = True
            exons = gene['exons'][idx]
            if len(exons) == 0:
                # nothing to report for a transcript without exons
                continue

            # NOTE(review): exons are assumed to be sorted by ascending
            # start, so the first one opens and the last one closes the
            # transcript - confirm against the parser.
            bed_start = int(exons[0][0]) - 1  # 1-based GFF -> 0-based BED
            bed_stop = int(exons[-1][1])

            # comma-terminated block sizes and block starts (relative to
            # the 0-based transcript start)
            block_sizes = ''.join('%d,' % (ex[1] - ex[0] + 1) for ex in exons)
            block_starts = ''.join('%d,' % (ex[0] - 1 - bed_start) for ex in exons)

            # a missing score is reported as '0'
            score = gene['score'][0] if gene['score'] else '0'

            out_print = [gene['chr'],
                         str(bed_start),
                         str(bed_stop),
                         tid[0],
                         score,
                         gene['strand'],
                         str(bed_start),  # thickStart
                         str(bed_stop),   # thickEnd
                         '0',             # itemRgb
                         str(len(exons)),
                         block_sizes,
                         block_starts]
            print('\t'.join(out_print))

        if not has_transcript:
            # the file defines only a single parent feature type, so the
            # gene itself is written as a one-block record
            score = gene['score'][0] if gene['score'] else '0'
            bed_start = int(gene['start']) - 1  # 1-based GFF -> 0-based BED
            bed_stop = int(gene['stop'])

            out_print = [gene['chr'],
                         '%d' % bed_start,
                         '%d' % bed_stop,
                         gene['name'],
                         score,
                         gene['strand'],
                         '%d' % bed_start,  # thickStart
                         '%d' % bed_stop,   # thickEnd
                         '0',               # itemRgb
                         '1',
                         '%d,' % (bed_stop - bed_start),
                         '0,']
            print('\t'.join(out_print))
6e589f267c14 Uploaded
devteam
parents:
diff changeset
83
6e589f267c14 Uploaded
devteam
parents:
diff changeset
84
6e589f267c14 Uploaded
devteam
parents:
diff changeset
def __main__():
    """
    Command-line entry point.

    Reads the input file name from the first command-line argument, parses
    it with GFFParser.Parse and hands the result to writeBED, which prints
    the BED records to standard output.

    When no input file is given, the module usage text is printed and the
    process exits with status -1.
    """
    try:
        query_file = sys.argv[1]
    except IndexError:
        # no input file on the command line: show usage and stop
        print(__doc__)
        sys.exit(-1)

    Transcriptdb = GFFParser.Parse(query_file)
    writeBED(Transcriptdb)


# run the conversion only when executed as a script, not on import
if __name__ == "__main__":
    __main__()