Mercurial > repos > cpt > cpt_gbk_renumber
comparison cpt_renumber_gbk/renumber.py @ 0:8cac332dbc77 draft default tip
Uploaded
author | cpt |
---|---|
date | Fri, 17 Jun 2022 13:13:47 +0000 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:8cac332dbc77 |
---|---|
1 #!/usr/bin/env python | |
2 import BIO_FIX_TOPO # NOQA | |
3 import argparse | |
4 import sys # noqa | |
5 from Bio import SeqIO | |
6 | |
7 import logging | |
8 | |
9 logging.basicConfig(level=logging.INFO) | |
10 log = logging.getLogger() | |
11 | |
12 # gene and RBS features are also included in the tagged features list, but are dealt with specifically elsewhere. | |
13 # This is used to filter out just valid "in gene" features | |
14 TAGGED_FEATURES = ["CDS", "tRNA", "intron", "mat_peptide"] | |
15 | |
16 | |
17 def renumber_genes( | |
18 gbk_files, | |
19 tag_to_update="locus_tag", | |
20 string_prefix="display_id", | |
21 leading_zeros=3, | |
22 forceTagMatch=False, | |
23 change_table=None, | |
24 ): | |
25 | |
26 for gbk_file in gbk_files: | |
27 for record in SeqIO.parse(gbk_file, "genbank"): | |
28 if string_prefix == "display_id": | |
29 format_string = record.id + "_%0" + str(leading_zeros) + "d" | |
30 else: | |
31 format_string = string_prefix + "%0" + str(leading_zeros) + "d" | |
32 | |
33 # f_cds = [f for f in record.features if f.type == 'CDS'] | |
34 # f_rbs = [f for f in record.features if f.type == 'RBS'] | |
35 # f_gene = [f for f in record.features if f.type == 'gene'] | |
36 # f_intron = [f for f in record.features if f.type == 'intron'] | |
37 # f_trna = [f for f in record.features if f.type == 'tRNA'] | |
38 # f_pep = [f for f in record.features if f.type == 'mat_peptide'] | |
39 # f_oth = [f for f in record.features if f.type not in ['CDS', 'RBS', | |
40 # 'gene', 'intron', | |
41 # 'tRNA', 'mat_peptide']] | |
42 # Apparently we're numbering tRNAs now, thanks for telling me. | |
43 # f_oth2 = [] | |
44 # for q in sorted(f_oth, key=lambda x: x.location.start): | |
45 # if q.type == 'tRNA': | |
46 # q.qualifiers['locus_tag'] = format_string_t % tRNA_count | |
47 # tRNA_count += 1 | |
48 # f_oth2.append(q) | |
49 # else: | |
50 # f_oth2.append(q) | |
51 # f_oth = f_oth2 | |
52 | |
53 # f_care_about = [] | |
54 | |
55 # Make sure we've hit every RBS and gene | |
56 # for cds in f_cds: | |
57 # If there's an associated gene feature, it will share a stop codon | |
58 # if cds.location.strand > 0: | |
59 # associated_genes = [f for f in f_gene if f.location.end == | |
60 # cds.location.end] | |
61 # else: | |
62 # associated_genes = [f for f in f_gene if f.location.start == | |
63 # cds.location.start] | |
64 | |
65 # # If there's an RBS it'll be upstream a bit. | |
66 # if cds.location.strand > 0: | |
67 # associated_rbss = [f for f in f_rbs if f.location.end < | |
68 # cds.location.start and f.location.end > | |
69 # cds.location.start - 24] | |
70 # else: | |
71 # associated_rbss = [f for f in f_rbs if f.location.start > | |
72 # cds.location.end and f.location.start < | |
73 # cds.location.end + 24] | |
74 # tmp_result = [cds] | |
75 # if len(associated_genes) > 0: | |
76 # tmp_result.append(associated_genes[0]) | |
77 | |
78 # if len(associated_rbss) == 1: | |
79 # tmp_result.append(associated_rbss[0]) | |
80 # else: | |
81 # log.warning("%s RBSs found for %s", len(associated_rbss), cds.location) | |
82 # We choose to append to f_other as that has all features not | |
83 # already accessed. It may mean that some gene/RBS features are | |
84 # missed if they aren't detected here, which we'll need to handle. | |
85 # f_care_about.append(tmp_result) | |
86 | |
87 #####-----------------------------------------------------------------------------------------------------------##### | |
88 # Build list of genes, then iterate over non-gene features and sort into containing genes. | |
89 # tags are assigned based on genes, so start the lists with the gene features | |
90 f_gene = sorted( | |
91 [f for f in record.features if f.type == "gene"], | |
92 key=lambda x: x.location.start, | |
93 ) | |
94 oldNames = [] | |
95 for x in f_gene: | |
96 if tag_to_update in x.qualifiers.keys(): | |
97 oldNames.append(x.qualifiers[tag_to_update]) | |
98 | |
99 f_rbs = sorted( | |
100 [f for f in record.features if f.type == "RBS"], | |
101 key=lambda x: x.location.start, | |
102 ) | |
103 f_tag = list() | |
104 f_sorted = sorted( | |
105 [f for f in record.features if f.type in TAGGED_FEATURES], | |
106 key=lambda x: x.location.start, | |
107 ) | |
108 # genes not in the TAGGED_FEATURES list are exluded from the processing and assumed to already be clean | |
109 clean_features = sorted( | |
110 [ | |
111 f | |
112 for f in record.features | |
113 if f.type not in TAGGED_FEATURES and f.type not in ["gene", "RBS"] | |
114 ], | |
115 key=lambda x: x.location.start, | |
116 ) | |
117 | |
118 f_processed = [] | |
119 for gene in f_gene: | |
120 tag = [gene] | |
121 if gene.location.strand >= 0: # Be strict on where to find starting RBS | |
122 geneComp = gene.location.start | |
123 else: | |
124 geneComp = gene.location.end | |
125 # find the gene's RBS feature | |
126 for rbs in [f for f in f_rbs if f not in f_processed]: | |
127 if is_within(rbs, gene) and ( | |
128 rbs.location.start == geneComp or rbs.location.end == geneComp | |
129 ): | |
130 if (tag_to_update not in rbs.qualifiers.keys()): | |
131 tag.append(rbs) | |
132 f_processed.append(rbs) | |
133 break | |
134 elif (tag_to_update not in gene.qualifiers.keys()): # This will gurantee qual is in gene and RBS for next check | |
135 tag.append(rbs) | |
136 f_processed.append(rbs) | |
137 break | |
138 elif (not forceTagMatch) or (rbs.qualifiers[tag_to_update] == gene.qualifiers[tag_to_update]): | |
139 tag.append(rbs) | |
140 f_processed.append(rbs) | |
141 break | |
142 | |
143 # find all other non-RBS features | |
144 for feature in [f for f in f_sorted if f not in f_processed]: | |
145 # If the feature is within the gene boundaries (genes are the first entry in tag list), | |
146 # add it to the same locus tag group, does not process RBS | |
147 if is_within(feature, gene): | |
148 if tag_to_update not in feature.qualifiers.keys(): | |
149 # catches genes and CDS feature that are intron-contained. | |
150 if feature.type == "CDS": | |
151 if ( | |
152 feature.location.start == gene.location.start | |
153 or feature.location.end == gene.location.end | |
154 ): | |
155 | |
156 tag.append(feature) | |
157 f_processed.append(feature) | |
158 else: | |
159 tag.append(feature) | |
160 f_processed.append(feature) | |
161 elif (not forceTagMatch) or (tag_to_update in gene.qualifiers.keys() and feature.qualifiers[tag_to_update] == gene.qualifiers[tag_to_update]): | |
162 tag.append(feature) | |
163 f_processed.append(feature) | |
164 elif feature.location.start > gene.location.end: | |
165 # because the features are sorted by coordinates, | |
166 # no features further down on the list will be in this gene | |
167 break | |
168 f_tag.append(tag) | |
169 | |
170 # Process for frameshifts and mat_peptides (inteins) | |
171 | |
172 # check for overlapped genes | |
173 # at this point, relevant features are put into tag buckets along with the containing gene | |
174 # matin the form of [gene, feature1, feature2, ...] | |
175 for rbs in [f for f in f_rbs if f not in f_processed]: | |
176 dupeRBS = False | |
177 for x in f_processed: | |
178 if x.type == "RBS" and (tag_to_update in rbs.qualifiers.keys() and tag_to_update in x.qualifiers.keys() and rbs.qualifiers[tag_to_update] == x.qualifiers[tag_to_update]): | |
179 dupeRBS = True | |
180 if dupeRBS: | |
181 change_table.write( | |
182 record.id | |
183 + "\t" | |
184 + rbs.type | |
185 + ":" | |
186 + (rbs.qualifiers[tag_to_update][0]) | |
187 + "\t[Removed: Parent gene already had an RBS]\n" | |
188 ) | |
189 else: | |
190 change_table.write( | |
191 record.id | |
192 + "\t" | |
193 + rbs.type | |
194 + ":" | |
195 + (rbs.qualifiers[tag_to_update][0]) | |
196 + "\t[Removed: RBS did not both fall within boundary of gene and share a boundary with a gene]\n" | |
197 ) | |
198 | |
199 | |
200 tag_index = 1 | |
201 delta = [] | |
202 for tag in f_tag: # each tag list is one 'bucket' | |
203 new_tag_value = format_string % tag_index | |
204 for feature in tag: | |
205 original_tag_value = delta_old(feature, tag_to_update) | |
206 feature.qualifiers[tag_to_update] = [new_tag_value] | |
207 # Once the tag is renumbered, it's added to the clean list for later output | |
208 clean_features.append(feature) | |
209 delta.append( | |
210 "\t".join((record.id, original_tag_value, new_tag_value)) | |
211 ) | |
212 tag_index += 1 | |
213 | |
214 # Why must these people start at 1 | |
215 # Because we have to modify the array we work on a separate one | |
216 # clean_features = f_oth | |
217 # delta = [] | |
218 # for index, feature_list in enumerate(sorted(f_care_about, key=lambda x: x[0].location.start)): | |
219 # for f in feature_list: | |
220 # original_tag_value = delta_old(f, tag_to_update) | |
221 # # Add 1 to index for 1-indexed counting for happy scientists | |
222 # new_tag_value = format_string % (index+1) | |
223 # f.qualifiers[tag_to_update] = [new_tag_value] | |
224 # clean_features.append(f) | |
225 # delta.append('\t'.join((record.id, original_tag_value, new_tag_value))) | |
226 | |
227 # Update all features | |
228 record.features = sorted(clean_features, key=lambda x: x.location.start) | |
229 | |
230 for feature in [f for f in f_sorted if f not in f_processed]: | |
231 if feature.type == "CDS": | |
232 if tag_to_update in feature.qualifiers.keys() and forceTagMatch: | |
233 failNameCheck = True | |
234 for x in oldNames: | |
235 for tag in feature.qualifiers[tag_to_update]: | |
236 if tag in x: | |
237 failNameCheck = False | |
238 if not failNameCheck: | |
239 break | |
240 if failNameCheck: | |
241 change_table.write( | |
242 record.id | |
243 + "\t" | |
244 + feature.type | |
245 + ":" | |
246 + (feature.qualifiers[tag_to_update][0]) | |
247 + "\t[Removed: (Tag check enabled) CDS did not both share a start/end with and fall within a gene with the same " + tag_to_update + " value]\n" | |
248 ) | |
249 else: | |
250 change_table.write( | |
251 record.id | |
252 + "\t" | |
253 + feature.type | |
254 + ":" | |
255 + (feature.qualifiers[tag_to_update][0]) | |
256 + "\t[Removed: CDS did not both fall within boundary of gene and share a boundary with a gene]\n" | |
257 ) | |
258 elif tag_to_update in feature.qualifiers.keys(): | |
259 change_table.write( | |
260 record.id | |
261 + "\t" | |
262 + feature.type | |
263 + ":" | |
264 + (feature.qualifiers[tag_to_update][0]) | |
265 + "\t[Removed: CDS did not both fall within boundary of gene and share a boundary with a gene]\n" | |
266 ) | |
267 else: | |
268 change_table.write( | |
269 record.id | |
270 + "\t" | |
271 + feature.type | |
272 + ": No " | |
273 + tag_to_update | |
274 + "\t[Removed: CDS at (" + str(feature.location.start) + "," + str(feature.location.end) + ") did not both fall within boundary of gene and share a boundary with a gene]\n" | |
275 ) | |
276 else: | |
277 if tag_to_update in feature.qualifiers.keys() and forceTagMatch: | |
278 failNameCheck = True | |
279 for x in oldNames: | |
280 for tag in feature.qualifiers[tag_to_update]: | |
281 if tag in x: | |
282 failNameCheck = False | |
283 if not failNameCheck: | |
284 break | |
285 if failNameCheck: | |
286 change_table.write( | |
287 record.id | |
288 + "\t" | |
289 + feature.type | |
290 + ":" | |
291 + (feature.qualifiers[tag_to_update][0]) | |
292 + "\t[Removed: (Tag check enabled) Feature did not fall within a gene it shared a " + tag_to_update + " value with]\n" | |
293 ) | |
294 else: | |
295 change_table.write( | |
296 record.id | |
297 + "\t" | |
298 + feature.type | |
299 + ":" | |
300 + (feature.qualifiers[tag_to_update][0]) | |
301 + "\t[Removed: Feature not within boundary of a gene]\n" | |
302 ) | |
303 elif tag_to_update in feature.qualifiers.keys(): | |
304 change_table.write( | |
305 record.id | |
306 + "\t" | |
307 + feature.type | |
308 + ":" | |
309 + (feature.qualifiers[tag_to_update][0]) | |
310 + "\t[Removed: Feature not within boundary of a gene]\n" | |
311 ) | |
312 else: | |
313 change_table.write( | |
314 record.id | |
315 + "\t" | |
316 + feature.type | |
317 + ": (has no " | |
318 + tag_to_update | |
319 + ")\t[Removed: Feature not within boundary of a gene]\n" | |
320 ) | |
321 change_table.write("\n".join(delta) + "\n") | |
322 | |
323 # Output | |
324 yield record | |
325 | |
326 | |
327 def delta_old(feature, tag_to_update): | |
328 # First part of delta entry, old name | |
329 if tag_to_update in feature.qualifiers: | |
330 return feature.qualifiers[tag_to_update][0] | |
331 else: | |
332 return "%s %s %s" % ( | |
333 feature.location.start, | |
334 feature.location.end, | |
335 feature.location.strand, | |
336 ) | |
337 | |
338 | |
339 def is_within(query, feature): | |
340 # checks if the query item is within the bounds of the given feature | |
341 sortedList = sorted(query.location.parts, key=lambda x: x.start) | |
342 for x in sortedList: | |
343 if ( | |
344 feature.location.start <= x.start | |
345 and feature.location.end >= x.end | |
346 ): | |
347 if x.strand < 0 and x == sortedList[-1]: | |
348 return True | |
349 elif x.strand >= 0 and x == sortedList[0]: | |
350 return True | |
351 #else: | |
352 return False | |
353 | |
354 | |
355 # def fix_frameshift(a, b): | |
356 # #checks if gene a and gene b are a frameshifted gene (either shares a start or an end and an RBS) | |
357 # if a[0].location.start == b[0].location.start or a[0].location.end == b[0].location.end: | |
358 # # It is likely a frameshift. Treat is as such. Find shared RBS, determine which CDS is which | |
359 # big_gene = a if (a[0].location.end - a[0].location.start) > (b[0].location.end - b[0].location.start) else b | |
360 # small_gene = a if big_gene==b else b | |
361 # rbs = [f for f in a if f.type == 'RBS'] | |
362 # # In the way that the tag lists are generated, the larger gene should contain both CDS features. | |
363 # # Retrieve and dermine big/small CDS | |
364 # cdss = [f for f in big_gene if f.type == 'CDS'] | |
365 # big_cds = cdss[0] if (cdss[0].location.end - cdss[0].location.start) > (cdss[1].location.end - cdss[1].location.start) else cdss[1] | |
366 # small_cds = cdss[0] if big_cds==cdss[1] else cdss[1] | |
367 | |
368 | |
369 if __name__ == "__main__": | |
370 parser = argparse.ArgumentParser(description="Renumber genbank files") | |
371 parser.add_argument( | |
372 "gbk_files", type=argparse.FileType("r"), nargs="+", help="Genbank files" | |
373 ) | |
374 parser.add_argument( | |
375 "--tag_to_update", type=str, help="Tag to update", default="locus_tag" | |
376 ) | |
377 parser.add_argument( | |
378 "--string_prefix", type=str, help="Prefix string", default="display_id" | |
379 ) | |
380 parser.add_argument( | |
381 "--leading_zeros", type=int, help="# of leading zeroes", default=3 | |
382 ) | |
383 | |
384 parser.add_argument( | |
385 "--forceTagMatch", action="store_true", help="Make non-CDS features match tag initially" | |
386 ) | |
387 | |
388 parser.add_argument( | |
389 "--change_table", | |
390 type=argparse.FileType("w"), | |
391 help="Location to store change table in", | |
392 default="renumber.tsv", | |
393 ) | |
394 | |
395 args = parser.parse_args() | |
396 for record in renumber_genes(**vars(args)): | |
397 SeqIO.write(record, sys.stdout, "genbank") |