0
|
1 #Adapted from gbk_to_gff
|
|
2 # Brigida Rusconi, UTSA
|
|
3 # October 8 2015
|
|
4 # Finds all CDS that match the key words provided in the comma separated argument -k in the product description
|
|
5 # adds a given number of bases on each side of the transcript for the exclusion set
|
|
6
|
|
7
|
|
8
|
|
9 #!/usr/bin/env python
|
|
10 """
|
|
11 Convert data from Genbank format to GFF.
|
|
12
|
|
13 Usage:
|
|
14 python gbk_to_gff.py in.gbk > out.gff
|
|
15
|
|
16 Requirements:
|
|
17 BioPython:- http://biopython.org/
|
|
18 helper.py:- https://github.com/vipints/GFFtools-GX/blob/master/helper.py
|
|
19
|
|
20 Copyright (C)
|
|
21 2009-2012 Friedrich Miescher Laboratory of the Max Planck Society, Tubingen, Germany.
|
|
22 2012-2015 Memorial Sloan Kettering Cancer Center New York City, USA.
|
|
23 """
|
|
24
|
|
25 import os
|
|
26 import re
|
|
27 import sys
|
|
28 import helper
|
|
29 import collections
|
|
30 import pdb
|
|
31 import argparse
|
|
32 from Bio import SeqIO
|
|
33
|
|
34 def feature_table(chr_id, source, orient, genes, transcripts, cds, exons, unk, product, keys, n):
|
|
35 """
|
|
36 Write the feature information
|
|
37 """
|
|
38 for gname, ginfo in genes.items():
|
|
39 line = [str(chr_id),
|
|
40 ginfo[3],
|
|
41 str(ginfo[0]-int(n)),
|
|
42 str(ginfo[1]+int(n)),
|
|
43 ginfo[2],
|
|
44 str(product)]
|
|
45
|
|
46 # to key on specfic products only
|
|
47 for k in keys.split(","):
|
|
48 if k in str(product):
|
|
49
|
|
50 sys.stdout.write('\t'.join(line)+"\n")
|
|
51 break
|
|
52
|
|
53
|
|
54 unk +=1
|
|
55 return unk
|
|
56
|
|
57
|
|
58
|
|
59
|
|
60 def gbk_parse(fname,keys,n):
|
|
61 """
|
|
62 Extract genome annotation recods from genbank format
|
|
63
|
|
64 @args fname: gbk file name
|
|
65 @type fname: str
|
|
66 """
|
|
67 fhand = helper.open_file(gbkfname)
|
|
68 unk = 1
|
|
69
|
|
70 for record in SeqIO.parse(fhand, "genbank"):
|
|
71 gene_tags = dict()
|
|
72 tx_tags = collections.defaultdict(list)
|
|
73 exon = collections.defaultdict(list)
|
|
74 cds = collections.defaultdict(list)
|
|
75 mol_type, chr_id = None, None
|
|
76
|
|
77 for rec in record.features:
|
|
78
|
|
79
|
|
80 product = "NA"
|
|
81
|
|
82 if rec.qualifiers.has_key('product'):
|
|
83
|
|
84 product = rec.qualifiers['product'][0]
|
|
85
|
|
86
|
|
87 if rec.type == 'source':
|
|
88 try:
|
|
89 mol_type = rec.qualifiers['mol_type'][0]
|
|
90 except:
|
|
91 mol_type = '.'
|
|
92 pass
|
|
93 try:
|
|
94 chr_id = rec.qualifiers['chromosome'][0]
|
|
95 except:
|
|
96 chr_id = record.name
|
|
97 continue
|
|
98
|
|
99 strand='-'
|
|
100 strand='+' if rec.strand>0 else strand
|
|
101
|
|
102 fid = None
|
|
103 try:
|
|
104 fid = rec.qualifiers['gene'][0]
|
|
105 except:
|
|
106 pass
|
|
107
|
|
108 transcript_id = None
|
|
109 try:
|
|
110 transcript_id = rec.qualifiers['transcript_id'][0]
|
|
111 except:
|
|
112 pass
|
|
113
|
|
114 if re.search(r'gene', rec.type):
|
|
115 gene_tags[fid] = (rec.location._start.position+1,
|
|
116 rec.location._end.position,
|
|
117 strand,
|
|
118 rec.type
|
|
119 )
|
|
120 elif rec.type == 'exon':
|
|
121 exon[fid].append((rec.location._start.position+1,
|
|
122 rec.location._end.position))
|
|
123 elif rec.type=='CDS':
|
|
124 cds[fid].append((rec.location._start.position+1,
|
|
125 rec.location._end.position))
|
|
126 else:
|
|
127 # get all transcripts
|
|
128 if transcript_id:
|
|
129 tx_tags[fid].append((rec.location._start.position+1,
|
|
130 rec.location._end.position,
|
|
131 transcript_id,
|
|
132 rec.type))
|
|
133 # record extracted, generate feature table
|
|
134 unk = feature_table(chr_id, mol_type, strand, gene_tags, tx_tags, cds, exon, unk, product,keys,n)
|
|
135
|
|
136 fhand.close()
|
|
137
|
|
138
|
|
139 if __name__=='__main__':
|
|
140
|
|
141 try:
|
|
142 parser = argparse.ArgumentParser()
|
|
143 parser.add_argument('-k', '--keys', help="list of products to key on in a comma delimited list")
|
|
144 parser.add_argument('-g', '--gbk_file', help="genbank file of reference")
|
|
145 parser.add_argument('-n', '--number', help="flanking region to include")
|
|
146
|
|
147 args = parser.parse_args()
|
|
148 gbkfname = args.gbk_file
|
|
149 n=args.number
|
|
150 keys=args.keys
|
|
151
|
|
152 except:
|
|
153 print __doc__
|
|
154 sys.exit(-1)
|
|
155
|
|
156 ## extract gbk records
|
|
157 gbk_parse(gbkfname,keys,n)
|
|
158
|
|
159
|
|
160
|
|
161
|