5
|
1 #!/usr/bin/env python
|
|
2 """
|
|
3 Convert genome annotation data in a 12 column BED format to GFF3.
|
|
4
|
|
5 Usage: python bed_to_gff.py in.bed > out.gff
|
|
6
|
|
7 Requirement:
|
|
8 helper.py : https://github.com/vipints/GFFtools-GX/blob/master/helper.py
|
|
9
|
|
10 Copyright (C)
|
|
11 2009-2012 Friedrich Miescher Laboratory of the Max Planck Society, Tubingen, Germany.
|
|
12 2012-2014 Memorial Sloan Kettering Cancer Center New York City, USA.
|
|
13 """
|
|
14
|
|
15 import re
|
|
16 import sys
|
|
17 import helper
|
|
18
|
|
19 def __main__():
|
|
20 """
|
|
21 main function
|
|
22 """
|
|
23
|
|
24 try:
|
|
25 bed_fname = sys.argv[1]
|
|
26 except:
|
|
27 print __doc__
|
|
28 sys.exit(-1)
|
|
29
|
|
30 bed_fh = helper.open_file(bed_fname)
|
|
31
|
|
32 for line in bed_fh:
|
|
33 line = line.strip( '\n\r' )
|
|
34
|
|
35 if not line or line[0] in ['#']:
|
|
36 continue
|
|
37
|
|
38 parts = line.split('\t')
|
|
39 assert len(parts) >= 12, line
|
|
40
|
|
41 rstarts = parts[-1].split(',')
|
|
42 rstarts.pop() if rstarts[-1] == '' else rstarts
|
|
43
|
|
44 exon_lens = parts[-2].split(',')
|
|
45 exon_lens.pop() if exon_lens[-1] == '' else exon_lens
|
|
46
|
|
47 if len(rstarts) != len(exon_lens):
|
|
48 continue # checking the consistency col 11 and col 12
|
|
49
|
|
50 if len(rstarts) != int(parts[-3]):
|
|
51 continue # checking the number of exons and block count are same
|
|
52
|
|
53 if not parts[5] in ['+', '-']:
|
|
54 parts[5] = '.' # replace the unknown strand with '.'
|
|
55
|
|
56 # bed2gff result line
|
|
57 print '%s\tbed2gff\tgene\t%d\t%s\t%s\t%s\t.\tID=Gene:%s;Name=Gene:%s' % (parts[0], int(parts[1])+1, parts[2], parts[4], parts[5], parts[3], parts[3])
|
|
58 print '%s\tbed2gff\ttranscript\t%d\t%s\t%s\t%s\t.\tID=%s;Name=%s;Parent=Gene:%s' % (parts[0], int(parts[1])+1, parts[2], parts[4], parts[5], parts[3], parts[3], parts[3])
|
|
59
|
|
60 st = int(parts[1])
|
|
61 for ex_cnt in range(int(parts[-3])):
|
|
62 start = st + int(rstarts[ex_cnt]) + 1
|
|
63 stop = start + int(exon_lens[ex_cnt]) - 1
|
|
64 print '%s\tbed2gff\texon\t%d\t%d\t%s\t%s\t.\tParent=%s' % (parts[0], start, stop, parts[4], parts[5], parts[3])
|
|
65
|
|
66 bed_fh.close()
|
|
67
|
|
68
|
|
69 if __name__ == "__main__":
|
|
70 __main__()
|