comparison bed_to_gff.py @ 10:c42c69aa81f8

Manually fixed the upload of version 2.1.0: removed files that had been accidentally added to the repo
author vipints <vipin@cbio.mskcc.org>
date Thu, 23 Apr 2015 18:01:45 -0400
parents
children
comparison
equal deleted inserted replaced
9:7d67331368f3 10:c42c69aa81f8
1 #!/usr/bin/env python
2 """
3 Convert genome annotation data in a 12 column BED format to GFF3.
4
5 Usage:
6 python bed_to_gff.py in.bed > out.gff
7
8 Requirement:
9 helper.py : https://github.com/vipints/GFFtools-GX/blob/master/helper.py
10
11 Copyright (C)
12 2009-2012 Friedrich Miescher Laboratory of the Max Planck Society, Tubingen, Germany.
13 2012-2015 Memorial Sloan Kettering Cancer Center New York City, USA.
14 """
15
16 import re
17 import sys
18 import helper
19
def _split_block_list(field):
    """
    Split a comma-separated BED block field into its items.

    A single trailing empty item (produced by a trailing comma such as
    "10,20,") is dropped; all other items are returned unchanged as strings.
    """
    items = field.split(',')
    if items[-1] == '':
        items.pop()
    return items


def __main__():
    """
    Convert a 12 column BED file to GFF3 and write it to standard output.

    The input file name is read from sys.argv[1]; when it is missing the
    module usage text is printed and the process exits with status -1.

    For every accepted BED record one 'gene' line, one 'transcript' line and
    one 'exon' line per block are written. Blank lines and lines starting
    with '#' are ignored. Records whose block starts / block sizes / block
    count columns disagree with each other are silently skipped.

    Raises:
        AssertionError: a data line has fewer than 12 tab-separated columns
            (the offending line is the exception argument).
        ValueError: a coordinate, block count, block size or block start
            is not an integer.
    """

    try:
        bed_fname = sys.argv[1]
    except IndexError:
        # only a missing argument should trigger the usage message
        print(__doc__)
        sys.exit(-1)

    # NOTE(review): helper.open_file is defined outside this file; it is only
    # assumed here to return an object that can be iterated line by line and
    # closed -- confirm against helper.py.
    bed_fh = helper.open_file(bed_fname)

    write = sys.stdout.write

    try:
        for line in bed_fh:
            line = line.strip('\n\r')

            if not line or line[0] == '#':
                continue

            fields = line.split('\t')
            if len(fields) < 12:
                # explicit raise instead of `assert` so the check is not
                # removed when Python runs with -O
                raise AssertionError(line)

            # Fixed column positions (10: block count, 11: block sizes,
            # 12: block starts) so that rows carrying extra trailing columns
            # are still read from the right fields.
            block_starts = _split_block_list(fields[11])
            block_sizes = _split_block_list(fields[10])

            if len(block_starts) != len(block_sizes):
                continue  # checking the consistency col 11 and col 12

            if len(block_starts) != int(fields[9]):
                continue  # checking the number of exons and block count are same

            chrom = fields[0]
            name = fields[3]
            score = fields[4]
            # replace the unknown strand with '.'
            strand = fields[5] if fields[5] in ('+', '-') else '.'

            # BED start is shifted by one to get the GFF start coordinate
            chrom_start = int(fields[1])
            gff_start = chrom_start + 1

            # bed2gff result line
            write('%s\tbed2gff\tgene\t%d\t%s\t%s\t%s\t.\tID=Gene:%s;Name=Gene:%s\n' % (chrom, gff_start, fields[2], score, strand, name, name))
            write('%s\tbed2gff\ttranscript\t%d\t%s\t%s\t%s\t.\tID=%s;Name=%s;Parent=Gene:%s\n' % (chrom, gff_start, fields[2], score, strand, name, name, name))

            # block starts are relative to the record start
            for rel_start, size in zip(block_starts, block_sizes):
                start = chrom_start + int(rel_start) + 1
                stop = start + int(size) - 1
                write('%s\tbed2gff\texon\t%d\t%d\t%s\t%s\t.\tParent=%s\n' % (chrom, start, stop, score, strand, name))
    finally:
        # release the input handle even when a malformed row raises
        bed_fh.close()
68
69
# Run the conversion only when executed as a script, not when imported.
if __name__ == '__main__':
    __main__()