0
|
1 #!/usr/bin/env python
|
|
2 import BIO_FIX_TOPO # NOQA
|
|
3 import argparse
|
|
4 import sys # noqa
|
|
5 from Bio import SeqIO
|
|
6
|
|
7 import logging
|
|
8
|
|
9 logging.basicConfig(level=logging.INFO)
|
|
10 log = logging.getLogger()
|
|
11
|
|
12 # gene and RBS features are also included in the tagged features list, but are dealt with specifically elsewhere.
|
|
13 # This is used to filter out just valid "in gene" features
|
|
14 TAGGED_FEATURES = ["CDS", "tRNA", "intron", "mat_peptide"]
|
|
15
|
|
16
|
|
17 def renumber_genes(
|
|
18 gbk_files,
|
|
19 tag_to_update="locus_tag",
|
|
20 string_prefix="display_id",
|
|
21 leading_zeros=3,
|
|
22 forceTagMatch=False,
|
|
23 change_table=None,
|
|
24 ):
|
|
25
|
|
26 for gbk_file in gbk_files:
|
|
27 for record in SeqIO.parse(gbk_file, "genbank"):
|
|
28 if string_prefix == "display_id":
|
|
29 format_string = record.id + "_%0" + str(leading_zeros) + "d"
|
|
30 else:
|
|
31 format_string = string_prefix + "%0" + str(leading_zeros) + "d"
|
|
32
|
|
33 # f_cds = [f for f in record.features if f.type == 'CDS']
|
|
34 # f_rbs = [f for f in record.features if f.type == 'RBS']
|
|
35 # f_gene = [f for f in record.features if f.type == 'gene']
|
|
36 # f_intron = [f for f in record.features if f.type == 'intron']
|
|
37 # f_trna = [f for f in record.features if f.type == 'tRNA']
|
|
38 # f_pep = [f for f in record.features if f.type == 'mat_peptide']
|
|
39 # f_oth = [f for f in record.features if f.type not in ['CDS', 'RBS',
|
|
40 # 'gene', 'intron',
|
|
41 # 'tRNA', 'mat_peptide']]
|
|
42 # Apparently we're numbering tRNAs now, thanks for telling me.
|
|
43 # f_oth2 = []
|
|
44 # for q in sorted(f_oth, key=lambda x: x.location.start):
|
|
45 # if q.type == 'tRNA':
|
|
46 # q.qualifiers['locus_tag'] = format_string_t % tRNA_count
|
|
47 # tRNA_count += 1
|
|
48 # f_oth2.append(q)
|
|
49 # else:
|
|
50 # f_oth2.append(q)
|
|
51 # f_oth = f_oth2
|
|
52
|
|
53 # f_care_about = []
|
|
54
|
|
55 # Make sure we've hit every RBS and gene
|
|
56 # for cds in f_cds:
|
|
57 # If there's an associated gene feature, it will share a stop codon
|
|
58 # if cds.location.strand > 0:
|
|
59 # associated_genes = [f for f in f_gene if f.location.end ==
|
|
60 # cds.location.end]
|
|
61 # else:
|
|
62 # associated_genes = [f for f in f_gene if f.location.start ==
|
|
63 # cds.location.start]
|
|
64
|
|
65 # # If there's an RBS it'll be upstream a bit.
|
|
66 # if cds.location.strand > 0:
|
|
67 # associated_rbss = [f for f in f_rbs if f.location.end <
|
|
68 # cds.location.start and f.location.end >
|
|
69 # cds.location.start - 24]
|
|
70 # else:
|
|
71 # associated_rbss = [f for f in f_rbs if f.location.start >
|
|
72 # cds.location.end and f.location.start <
|
|
73 # cds.location.end + 24]
|
|
74 # tmp_result = [cds]
|
|
75 # if len(associated_genes) > 0:
|
|
76 # tmp_result.append(associated_genes[0])
|
|
77
|
|
78 # if len(associated_rbss) == 1:
|
|
79 # tmp_result.append(associated_rbss[0])
|
|
80 # else:
|
|
81 # log.warning("%s RBSs found for %s", len(associated_rbss), cds.location)
|
|
82 # We choose to append to f_other as that has all features not
|
|
83 # already accessed. It may mean that some gene/RBS features are
|
|
84 # missed if they aren't detected here, which we'll need to handle.
|
|
85 # f_care_about.append(tmp_result)
|
|
86
|
|
87 #####-----------------------------------------------------------------------------------------------------------#####
|
|
88 # Build list of genes, then iterate over non-gene features and sort into containing genes.
|
|
89 # tags are assigned based on genes, so start the lists with the gene features
|
|
90 f_gene = sorted(
|
|
91 [f for f in record.features if f.type == "gene"],
|
|
92 key=lambda x: x.location.start,
|
|
93 )
|
|
94 oldNames = []
|
|
95 for x in f_gene:
|
|
96 if tag_to_update in x.qualifiers.keys():
|
|
97 oldNames.append(x.qualifiers[tag_to_update])
|
|
98
|
|
99 f_rbs = sorted(
|
|
100 [f for f in record.features if f.type == "RBS"],
|
|
101 key=lambda x: x.location.start,
|
|
102 )
|
|
103 f_tag = list()
|
|
104 f_sorted = sorted(
|
|
105 [f for f in record.features if f.type in TAGGED_FEATURES],
|
|
106 key=lambda x: x.location.start,
|
|
107 )
|
|
108 # genes not in the TAGGED_FEATURES list are exluded from the processing and assumed to already be clean
|
|
109 clean_features = sorted(
|
|
110 [
|
|
111 f
|
|
112 for f in record.features
|
|
113 if f.type not in TAGGED_FEATURES and f.type not in ["gene", "RBS"]
|
|
114 ],
|
|
115 key=lambda x: x.location.start,
|
|
116 )
|
|
117
|
|
118 f_processed = []
|
|
119 for gene in f_gene:
|
|
120 tag = [gene]
|
|
121 if gene.location.strand >= 0: # Be strict on where to find starting RBS
|
|
122 geneComp = gene.location.start
|
|
123 else:
|
|
124 geneComp = gene.location.end
|
|
125 # find the gene's RBS feature
|
|
126 for rbs in [f for f in f_rbs if f not in f_processed]:
|
|
127 if is_within(rbs, gene) and (
|
|
128 rbs.location.start == geneComp or rbs.location.end == geneComp
|
|
129 ):
|
|
130 if (tag_to_update not in rbs.qualifiers.keys()):
|
|
131 tag.append(rbs)
|
|
132 f_processed.append(rbs)
|
|
133 break
|
|
134 elif (tag_to_update not in gene.qualifiers.keys()): # This will gurantee qual is in gene and RBS for next check
|
|
135 tag.append(rbs)
|
|
136 f_processed.append(rbs)
|
|
137 break
|
|
138 elif (not forceTagMatch) or (rbs.qualifiers[tag_to_update] == gene.qualifiers[tag_to_update]):
|
|
139 tag.append(rbs)
|
|
140 f_processed.append(rbs)
|
|
141 break
|
|
142
|
|
143 # find all other non-RBS features
|
|
144 for feature in [f for f in f_sorted if f not in f_processed]:
|
|
145 # If the feature is within the gene boundaries (genes are the first entry in tag list),
|
|
146 # add it to the same locus tag group, does not process RBS
|
|
147 if is_within(feature, gene):
|
|
148 if tag_to_update not in feature.qualifiers.keys():
|
|
149 # catches genes and CDS feature that are intron-contained.
|
|
150 if feature.type == "CDS":
|
|
151 if (
|
|
152 feature.location.start == gene.location.start
|
|
153 or feature.location.end == gene.location.end
|
|
154 ):
|
|
155
|
|
156 tag.append(feature)
|
|
157 f_processed.append(feature)
|
|
158 else:
|
|
159 tag.append(feature)
|
|
160 f_processed.append(feature)
|
|
161 elif (not forceTagMatch) or (tag_to_update in gene.qualifiers.keys() and feature.qualifiers[tag_to_update] == gene.qualifiers[tag_to_update]):
|
|
162 tag.append(feature)
|
|
163 f_processed.append(feature)
|
|
164 elif feature.location.start > gene.location.end:
|
|
165 # because the features are sorted by coordinates,
|
|
166 # no features further down on the list will be in this gene
|
|
167 break
|
|
168 f_tag.append(tag)
|
|
169
|
|
170 # Process for frameshifts and mat_peptides (inteins)
|
|
171
|
|
172 # check for overlapped genes
|
|
173 # at this point, relevant features are put into tag buckets along with the containing gene
|
|
174 # matin the form of [gene, feature1, feature2, ...]
|
|
175 for rbs in [f for f in f_rbs if f not in f_processed]:
|
|
176 dupeRBS = False
|
|
177 for x in f_processed:
|
|
178 if x.type == "RBS" and (tag_to_update in rbs.qualifiers.keys() and tag_to_update in x.qualifiers.keys() and rbs.qualifiers[tag_to_update] == x.qualifiers[tag_to_update]):
|
|
179 dupeRBS = True
|
|
180 if dupeRBS:
|
|
181 change_table.write(
|
|
182 record.id
|
|
183 + "\t"
|
|
184 + rbs.type
|
|
185 + ":"
|
|
186 + (rbs.qualifiers[tag_to_update][0])
|
|
187 + "\t[Removed: Parent gene already had an RBS]\n"
|
|
188 )
|
|
189 else:
|
|
190 change_table.write(
|
|
191 record.id
|
|
192 + "\t"
|
|
193 + rbs.type
|
|
194 + ":"
|
|
195 + (rbs.qualifiers[tag_to_update][0])
|
|
196 + "\t[Removed: RBS did not both fall within boundary of gene and share a boundary with a gene]\n"
|
|
197 )
|
|
198
|
|
199
|
|
200 tag_index = 1
|
|
201 delta = []
|
|
202 for tag in f_tag: # each tag list is one 'bucket'
|
|
203 new_tag_value = format_string % tag_index
|
|
204 for feature in tag:
|
|
205 original_tag_value = delta_old(feature, tag_to_update)
|
|
206 feature.qualifiers[tag_to_update] = [new_tag_value]
|
|
207 # Once the tag is renumbered, it's added to the clean list for later output
|
|
208 clean_features.append(feature)
|
|
209 delta.append(
|
|
210 "\t".join((record.id, original_tag_value, new_tag_value))
|
|
211 )
|
|
212 tag_index += 1
|
|
213
|
|
214 # Why must these people start at 1
|
|
215 # Because we have to modify the array we work on a separate one
|
|
216 # clean_features = f_oth
|
|
217 # delta = []
|
|
218 # for index, feature_list in enumerate(sorted(f_care_about, key=lambda x: x[0].location.start)):
|
|
219 # for f in feature_list:
|
|
220 # original_tag_value = delta_old(f, tag_to_update)
|
|
221 # # Add 1 to index for 1-indexed counting for happy scientists
|
|
222 # new_tag_value = format_string % (index+1)
|
|
223 # f.qualifiers[tag_to_update] = [new_tag_value]
|
|
224 # clean_features.append(f)
|
|
225 # delta.append('\t'.join((record.id, original_tag_value, new_tag_value)))
|
|
226
|
|
227 # Update all features
|
|
228 record.features = sorted(clean_features, key=lambda x: x.location.start)
|
|
229
|
|
230 for feature in [f for f in f_sorted if f not in f_processed]:
|
|
231 if feature.type == "CDS":
|
|
232 if tag_to_update in feature.qualifiers.keys() and forceTagMatch:
|
|
233 failNameCheck = True
|
|
234 for x in oldNames:
|
|
235 for tag in feature.qualifiers[tag_to_update]:
|
|
236 if tag in x:
|
|
237 failNameCheck = False
|
|
238 if not failNameCheck:
|
|
239 break
|
|
240 if failNameCheck:
|
|
241 change_table.write(
|
|
242 record.id
|
|
243 + "\t"
|
|
244 + feature.type
|
|
245 + ":"
|
|
246 + (feature.qualifiers[tag_to_update][0])
|
|
247 + "\t[Removed: (Tag check enabled) CDS did not both share a start/end with and fall within a gene with the same " + tag_to_update + " value]\n"
|
|
248 )
|
|
249 else:
|
|
250 change_table.write(
|
|
251 record.id
|
|
252 + "\t"
|
|
253 + feature.type
|
|
254 + ":"
|
|
255 + (feature.qualifiers[tag_to_update][0])
|
|
256 + "\t[Removed: CDS did not both fall within boundary of gene and share a boundary with a gene]\n"
|
|
257 )
|
|
258 elif tag_to_update in feature.qualifiers.keys():
|
|
259 change_table.write(
|
|
260 record.id
|
|
261 + "\t"
|
|
262 + feature.type
|
|
263 + ":"
|
|
264 + (feature.qualifiers[tag_to_update][0])
|
|
265 + "\t[Removed: CDS did not both fall within boundary of gene and share a boundary with a gene]\n"
|
|
266 )
|
|
267 else:
|
|
268 change_table.write(
|
|
269 record.id
|
|
270 + "\t"
|
|
271 + feature.type
|
|
272 + ": No "
|
|
273 + tag_to_update
|
|
274 + "\t[Removed: CDS at (" + str(feature.location.start) + "," + str(feature.location.end) + ") did not both fall within boundary of gene and share a boundary with a gene]\n"
|
|
275 )
|
|
276 else:
|
|
277 if tag_to_update in feature.qualifiers.keys() and forceTagMatch:
|
|
278 failNameCheck = True
|
|
279 for x in oldNames:
|
|
280 for tag in feature.qualifiers[tag_to_update]:
|
|
281 if tag in x:
|
|
282 failNameCheck = False
|
|
283 if not failNameCheck:
|
|
284 break
|
|
285 if failNameCheck:
|
|
286 change_table.write(
|
|
287 record.id
|
|
288 + "\t"
|
|
289 + feature.type
|
|
290 + ":"
|
|
291 + (feature.qualifiers[tag_to_update][0])
|
|
292 + "\t[Removed: (Tag check enabled) Feature did not fall within a gene it shared a " + tag_to_update + " value with]\n"
|
|
293 )
|
|
294 else:
|
|
295 change_table.write(
|
|
296 record.id
|
|
297 + "\t"
|
|
298 + feature.type
|
|
299 + ":"
|
|
300 + (feature.qualifiers[tag_to_update][0])
|
|
301 + "\t[Removed: Feature not within boundary of a gene]\n"
|
|
302 )
|
|
303 elif tag_to_update in feature.qualifiers.keys():
|
|
304 change_table.write(
|
|
305 record.id
|
|
306 + "\t"
|
|
307 + feature.type
|
|
308 + ":"
|
|
309 + (feature.qualifiers[tag_to_update][0])
|
|
310 + "\t[Removed: Feature not within boundary of a gene]\n"
|
|
311 )
|
|
312 else:
|
|
313 change_table.write(
|
|
314 record.id
|
|
315 + "\t"
|
|
316 + feature.type
|
|
317 + ": (has no "
|
|
318 + tag_to_update
|
|
319 + ")\t[Removed: Feature not within boundary of a gene]\n"
|
|
320 )
|
|
321 change_table.write("\n".join(delta) + "\n")
|
|
322
|
|
323 # Output
|
|
324 yield record
|
|
325
|
|
326
|
|
327 def delta_old(feature, tag_to_update):
|
|
328 # First part of delta entry, old name
|
|
329 if tag_to_update in feature.qualifiers:
|
|
330 return feature.qualifiers[tag_to_update][0]
|
|
331 else:
|
|
332 return "%s %s %s" % (
|
|
333 feature.location.start,
|
|
334 feature.location.end,
|
|
335 feature.location.strand,
|
|
336 )
|
|
337
|
|
338
|
|
339 def is_within(query, feature):
|
|
340 # checks if the query item is within the bounds of the given feature
|
|
341 sortedList = sorted(query.location.parts, key=lambda x: x.start)
|
|
342 for x in sortedList:
|
|
343 if (
|
|
344 feature.location.start <= x.start
|
|
345 and feature.location.end >= x.end
|
|
346 ):
|
|
347 if x.strand < 0 and x == sortedList[-1]:
|
|
348 return True
|
|
349 elif x.strand >= 0 and x == sortedList[0]:
|
|
350 return True
|
|
351 #else:
|
|
352 return False
|
|
353
|
|
354
|
|
355 # def fix_frameshift(a, b):
|
|
356 # #checks if gene a and gene b are a frameshifted gene (either shares a start or an end and an RBS)
|
|
357 # if a[0].location.start == b[0].location.start or a[0].location.end == b[0].location.end:
|
|
358 # # It is likely a frameshift. Treat is as such. Find shared RBS, determine which CDS is which
|
|
359 # big_gene = a if (a[0].location.end - a[0].location.start) > (b[0].location.end - b[0].location.start) else b
|
|
360 # small_gene = a if big_gene==b else b
|
|
361 # rbs = [f for f in a if f.type == 'RBS']
|
|
362 # # In the way that the tag lists are generated, the larger gene should contain both CDS features.
|
|
363 # # Retrieve and dermine big/small CDS
|
|
364 # cdss = [f for f in big_gene if f.type == 'CDS']
|
|
365 # big_cds = cdss[0] if (cdss[0].location.end - cdss[0].location.start) > (cdss[1].location.end - cdss[1].location.start) else cdss[1]
|
|
366 # small_cds = cdss[0] if big_cds==cdss[1] else cdss[1]
|
|
367
|
|
368
|
|
369 if __name__ == "__main__":
|
|
370 parser = argparse.ArgumentParser(description="Renumber genbank files")
|
|
371 parser.add_argument(
|
|
372 "gbk_files", type=argparse.FileType("r"), nargs="+", help="Genbank files"
|
|
373 )
|
|
374 parser.add_argument(
|
|
375 "--tag_to_update", type=str, help="Tag to update", default="locus_tag"
|
|
376 )
|
|
377 parser.add_argument(
|
|
378 "--string_prefix", type=str, help="Prefix string", default="display_id"
|
|
379 )
|
|
380 parser.add_argument(
|
|
381 "--leading_zeros", type=int, help="# of leading zeroes", default=3
|
|
382 )
|
|
383
|
|
384 parser.add_argument(
|
|
385 "--forceTagMatch", action="store_true", help="Make non-CDS features match tag initially"
|
|
386 )
|
|
387
|
|
388 parser.add_argument(
|
|
389 "--change_table",
|
|
390 type=argparse.FileType("w"),
|
|
391 help="Location to store change table in",
|
|
392 default="renumber.tsv",
|
|
393 )
|
|
394
|
|
395 args = parser.parse_args()
|
|
396 for record in renumber_genes(**vars(args)):
|
|
397 SeqIO.write(record, sys.stdout, "genbank")
|