0
|
1 #Adapted from gbk_to_gff
|
|
2 # Brigida Rusconi, UTSA
|
|
3 # October 8 2015
|
|
4 # Finds all CDS that match the key words provided in the comma separated argument -k in the product description
|
|
5 # adds a given number of bases on each side of the transcript for the exclusion set
|
|
6
|
|
7
|
|
8
|
|
9 #!/usr/bin/env python
|
|
10
|
|
11
|
|
12 import os
|
|
13 import re
|
|
14 import sys
|
|
15 import collections
|
|
16 import pdb
|
|
17 import argparse
|
|
18 from Bio import SeqIO
|
|
19
|
8
|
20 def open_file(fname):
|
|
21 """
|
|
22 Open the file (supports .gz .bz2) and returns the handler
|
|
23
|
|
24 @args fname: input file name for reading
|
|
25 @type fname: str
|
|
26 """
|
|
27
|
|
28 try:
|
|
29 if os.path.splitext(fname)[1] == ".gz":
|
|
30 FH = gzip.open(fname, 'rb')
|
|
31 elif os.path.splitext(fname)[1] == ".bz2":
|
|
32 FH = bz2.BZ2File(fname, 'rb')
|
|
33 else:
|
|
34 FH = open(fname, 'rU')
|
|
35 except Exception as error:
|
|
36 sys.exit(error)
|
|
37
|
|
38 return FH
|
|
39
|
|
40
|
|
41 def feature_table(chr_id, source, orient, genes, transcripts, cds, exons, product, keys_1, n):
|
0
|
42 """
|
|
43 Write the feature information
|
|
44 """
|
8
|
45
|
|
46 lines = []
|
|
47
|
0
|
48 for gname, ginfo in genes.items():
|
|
49 line = [str(chr_id),
|
|
50 ginfo[3],
|
8
|
51 str(ginfo[0]-n),
|
|
52 str(ginfo[1]+n),
|
0
|
53 ginfo[2],
|
|
54 str(product)]
|
|
55
|
|
56 # to key on specfic products only
|
8
|
57 for k in keys_1.split(","):
|
0
|
58 if k in str(product):
|
|
59
|
8
|
60 lines.append('\t'.join(line)+"\n")
|
0
|
61 break
|
|
62
|
8
|
63 return lines
|
0
|
64
|
|
65
|
|
66
|
|
67
|
8
|
68 def gbk_parse(fname,keys_1,n,outfile=""):
|
0
|
69 """
|
|
70 Extract genome annotation recods from genbank format
|
|
71
|
|
72 @args fname: gbk file name
|
|
73 @type fname: str
|
|
74 """
|
8
|
75 fhand = open_file(gbkfname)
|
|
76 lines = []
|
0
|
77
|
|
78 for record in SeqIO.parse(fhand, "genbank"):
|
|
79 gene_tags = dict()
|
|
80 tx_tags = collections.defaultdict(list)
|
|
81 exon = collections.defaultdict(list)
|
|
82 cds = collections.defaultdict(list)
|
|
83 mol_type, chr_id = None, None
|
|
84
|
|
85 for rec in record.features:
|
|
86
|
|
87
|
|
88 product = "NA"
|
|
89
|
|
90 if rec.qualifiers.has_key('product'):
|
|
91
|
|
92 product = rec.qualifiers['product'][0]
|
|
93
|
|
94
|
|
95 if rec.type == 'source':
|
|
96 try:
|
|
97 mol_type = rec.qualifiers['mol_type'][0]
|
|
98 except:
|
|
99 mol_type = '.'
|
|
100 pass
|
|
101 try:
|
|
102 chr_id = rec.qualifiers['chromosome'][0]
|
|
103 except:
|
|
104 chr_id = record.name
|
|
105 continue
|
|
106
|
|
107 strand='-'
|
|
108 strand='+' if rec.strand>0 else strand
|
|
109
|
|
110 fid = None
|
|
111 try:
|
|
112 fid = rec.qualifiers['gene'][0]
|
|
113 except:
|
|
114 pass
|
|
115
|
|
116 transcript_id = None
|
|
117 try:
|
|
118 transcript_id = rec.qualifiers['transcript_id'][0]
|
|
119 except:
|
|
120 pass
|
|
121
|
|
122 if re.search(r'gene', rec.type):
|
|
123 gene_tags[fid] = (rec.location._start.position+1,
|
|
124 rec.location._end.position,
|
|
125 strand,
|
|
126 rec.type
|
|
127 )
|
|
128 elif rec.type == 'exon':
|
|
129 exon[fid].append((rec.location._start.position+1,
|
|
130 rec.location._end.position))
|
|
131 elif rec.type=='CDS':
|
|
132 cds[fid].append((rec.location._start.position+1,
|
|
133 rec.location._end.position))
|
|
134 else:
|
|
135 # get all transcripts
|
|
136 if transcript_id:
|
|
137 tx_tags[fid].append((rec.location._start.position+1,
|
|
138 rec.location._end.position,
|
|
139 transcript_id,
|
|
140 rec.type))
|
|
141 # record extracted, generate feature table
|
8
|
142 lines += feature_table(chr_id, mol_type, strand, gene_tags, tx_tags, cds, exon, product,keys_1,n)
|
0
|
143
|
|
144 fhand.close()
|
|
145
|
8
|
146 return lines
|
0
|
147
|
|
148 if __name__=='__main__':
|
|
149
|
|
150 try:
|
|
151 parser = argparse.ArgumentParser()
|
8
|
152 parser.add_argument('-k', '--keys_1', type=str, help="list of products to key on in a comma delimited list")
|
0
|
153 parser.add_argument('-g', '--gbk_file', help="genbank file of reference")
|
8
|
154 parser.add_argument('-n', '--number', type=int, help="flanking region to include")
|
|
155 parser.add_argument('-o', '--output', help="output file")
|
0
|
156
|
|
157 args = parser.parse_args()
|
|
158 gbkfname = args.gbk_file
|
|
159 n=args.number
|
8
|
160 keys_1=args.keys_1
|
|
161 output=args.output
|
0
|
162
|
8
|
163 except Exception,e:
|
|
164 print "Error: %s" % e
|
0
|
165 sys.exit(-1)
|
|
166
|
8
|
167 ## extract gbk records
|
|
168
|
|
169 print "Filtering on keys: %s" % keys_1
|
|
170 lines = gbk_parse(gbkfname,keys_1,n)
|
|
171
|
|
172 with open(output, 'w') as of:
|
|
173 for l in lines:
|
|
174 of.write(l)
|
0
|
175
|
|
176
|
|
177
|
|
178
|