Mercurial > repos > cpt > cpt_gbk_to_gff
comparison gbk_to_gff3.py @ 1:bb6332a85aa6 draft
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
author | cpt |
---|---|
date | Mon, 05 Jun 2023 02:43:04 +0000 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
0:a68f32350196 | 1:bb6332a85aa6 |
---|---|
1 #!/usr/bin/env python | |
2 | |
3 import argparse | |
4 import sys | |
5 | |
6 from Bio import SeqIO | |
7 from Bio.SeqRecord import SeqRecord | |
8 from Bio.SeqFeature import FeatureLocation | |
9 from CPT_GFFParser import gffSeqFeature, gffWrite | |
10 | |
11 bottomFeatTypes = ["exon", "RBS", "CDS"] | |
12 | |
13 | |
14 def makeGffFeat(inFeat, num, recName, identifier): | |
15 if inFeat.type == "RBS" or ( | |
16 inFeat.type == "regulatory" | |
17 and "regulatory_class" in inFeat.qualifiers.keys() | |
18 and inFeat.qualifiers["regulatory_class"][0] == "ribosome_binding_site" | |
19 ): | |
20 inFeat.type = "Shine_Dalgarno_sequence" | |
21 if "codon_start" in inFeat.qualifiers.keys(): | |
22 shift = int(inFeat.qualifiers["codon_start"][0]) - 1 | |
23 else: | |
24 shift = "." | |
25 if identifier in inFeat.qualifiers.keys(): | |
26 name = inFeat.qualifiers[identifier][0] + "." + inFeat.type | |
27 if num > 0: | |
28 name += "." + str(num) | |
29 else: | |
30 name = recName + "." + inFeat.type + "." + str(num) | |
31 | |
32 outFeat = gffSeqFeature( | |
33 inFeat.location, | |
34 inFeat.type, | |
35 "", | |
36 inFeat.strand, | |
37 name, | |
38 inFeat.qualifiers, | |
39 None, | |
40 None, | |
41 None, | |
42 shift, | |
43 0, | |
44 "GbkToGff", | |
45 ) | |
46 outFeat.qualifiers["ID"] = [name] | |
47 return outFeat | |
48 | |
49 | |
50 def main(inFile, makeMRNA, makeGene, identifier, fastaFile, outFile): | |
51 | |
52 ofh = sys.stdout | |
53 if outFile: | |
54 ofh = outFile | |
55 | |
56 outRec = [] | |
57 failed = 0 | |
58 for rec in SeqIO.parse(inFile, "genbank"): | |
59 recID = rec.name | |
60 | |
61 if len(str(rec.seq)) > 0: | |
62 seqs_pending_writes = True | |
63 outSeq = str(rec.seq) | |
64 seqLen = len(outSeq) | |
65 | |
66 locBucket = {} | |
67 outFeats = [] | |
68 topTypeDict = {} | |
69 seekingParent = [] | |
70 geneNum = 0 | |
71 autoGeneNum = 0 | |
72 for feat in rec.features: | |
73 if ( | |
74 identifier not in feat.qualifiers.keys() | |
75 ): # Allow metadata features and other features with no ID (Output warning?) - AJC | |
76 if feat.type in bottomFeatTypes: | |
77 seekingParent.append( | |
78 [feat, [], []] | |
79 ) # [Feature, all parent candidates, strongest parent candidates] | |
80 continue | |
81 elif feat.type not in topTypeDict.keys(): | |
82 topTypeDict[feat.type] = 1 | |
83 else: | |
84 topTypeDict[feat.type] += 1 | |
85 outFeats.append( | |
86 makeGffFeat(feat, topTypeDict[feat.type], recID, identifier) | |
87 ) | |
88 continue | |
89 elif feat.qualifiers[identifier][0] not in locBucket.keys(): | |
90 locBucket[feat.qualifiers[identifier][0]] = [] | |
91 locBucket[feat.qualifiers[identifier][0]].append(feat) | |
92 | |
93 for locus in locBucket.keys(): | |
94 minLoc = locBucket[locus][0].location.start | |
95 maxLoc = locBucket[locus][0].location.end | |
96 for feat in locBucket[locus]: | |
97 minLoc = min(minLoc, feat.location.start) | |
98 maxLoc = max(maxLoc, feat.location.end) | |
99 for x in seekingParent: | |
100 if x[0].location.start >= minLoc and x[0].location.end <= maxLoc: | |
101 x[1].append(locus) | |
102 if x[0].location.start == minLoc or x[0].location.end == maxLoc: | |
103 x[2].append(locus) | |
104 | |
105 for x in seekingParent: # Reformat to [Feature, Locus, Unused/Free] | |
106 if len(x[2]) == 1: | |
107 finList = "" | |
108 if len(x[1]) > 1: | |
109 for loc in x[1]: | |
110 if loc != x[2][0]: | |
111 finList += loc + ", " | |
112 finList = ( | |
113 str(x[0].type) | |
114 + " had no locus tag set in .gbk file, automatically derived. Other, weaker candidate(s) were " | |
115 + finList[0:-2] | |
116 + "." | |
117 ) | |
118 else: | |
119 finList = ( | |
120 str(x[0].type) | |
121 + " had no locus tag set in .gbk file, automatically derived." | |
122 ) | |
123 if "Notes" not in x[0].qualifiers.keys(): | |
124 x[0].qualifiers["Notes"] = [] | |
125 x[0].qualifiers["Notes"].append(finList) | |
126 x[1] = x[2][0] | |
127 elif len(x[2]) > 1: | |
128 candidate = x[2][0] # Arbitrarily choose first one | |
129 finList = "" | |
130 strongList = "" | |
131 for loc in x[2]: | |
132 if loc != candidate: | |
133 finList += loc + ", " | |
134 strongList += loc + ", " | |
135 for loc in x[1]: | |
136 if loc not in x[2]: | |
137 finList += loc + ", " | |
138 finList = ( | |
139 str(x[0].type) | |
140 + " had no locus tag set in .gbk file, automatically derived. Other candidate(s) were " | |
141 + finList[0:-2] | |
142 + " (Equally strong candidate(s): " | |
143 + strongList[0:-2] | |
144 + ")." | |
145 ) | |
146 if "Notes" not in x[0].qualifiers.keys(): | |
147 x[0].qualifiers["Notes"] = [] | |
148 x[0].qualifiers["Notes"].append(finList) | |
149 x[1] = candidate | |
150 elif len(x[1]) == 1: | |
151 x[1] = x[1][0] | |
152 if "Notes" not in x[0].qualifiers.keys(): | |
153 x[0].qualifiers["Notes"] = [] | |
154 finList = ( | |
155 str(x[0].type) | |
156 + " had no locus tag set in .gbk file, automatically derived." | |
157 ) | |
158 x[0].qualifiers["Notes"].append(finList) | |
159 elif len(x[1]) > 1: | |
160 candidate = x[1][0] # Arbitrarily choose first one | |
161 finList = "" | |
162 for loc in x[1]: | |
163 if loc != candidate: | |
164 finList += loc + ", " | |
165 finList = ( | |
166 str(x[0].type) | |
167 + " had no locus tag set in .gbk file, automatically derived. Other candidates were " | |
168 + finList[0:-2] | |
169 + "." | |
170 ) | |
171 if "Notes" not in x[0].qualifiers.keys(): | |
172 x[0].qualifiers["Notes"] = [] | |
173 x[0].qualifiers["Notes"].append(finList) | |
174 x[1] = candidate | |
175 else: | |
176 if makeGene: | |
177 sys.stderr.write( | |
178 "Warning: Unable to find potential parent for feature with no " | |
179 + identifier | |
180 + " of type " | |
181 + str(x[0].type) | |
182 + " at location [" | |
183 + str(x[0].location.start + 1) | |
184 + ", " | |
185 + str(x[0].location.end) | |
186 + "], creating standalone gene.\n" | |
187 ) | |
188 autoGeneNum += 1 | |
189 x[0].source = "GbkToGff" | |
190 x[0].score = 0 | |
191 x[0].shift = 0 | |
192 if "ID" not in x[0].qualifiers.keys(): | |
193 x[0].qualifiers["ID"] = [ | |
194 recID + ".standalone_" + x[0].type + "." + str(autoGeneNum) | |
195 ] | |
196 tempName = recID + ".derived_Gene." + str(autoGeneNum) | |
197 tempQuals = { | |
198 "ID": [tempName], | |
199 "Notes": [ | |
200 "Gene feature automatically generated by Gbk to GFF conversion" | |
201 ], | |
202 } | |
203 tempGene = gffSeqFeature( | |
204 FeatureLocation( | |
205 x[0].location.start, x[0].location.end, x[0].location.strand | |
206 ), | |
207 "gene", | |
208 "", | |
209 x[0].strand, | |
210 tempName, | |
211 tempQuals, | |
212 None, | |
213 None, | |
214 None, | |
215 ".", | |
216 0, | |
217 "GbkToGff", | |
218 ) | |
219 if makeMRNA: | |
220 tempName = recID + ".derived_mRNA." + str(autoGeneNum) | |
221 tempQuals = { | |
222 "ID": [tempName], | |
223 "Notes": [ | |
224 "mRNA feature automatically generated by Gbk to GFF conversion" | |
225 ], | |
226 } | |
227 tempGene.sub_features.append( | |
228 gffSeqFeature( | |
229 FeatureLocation( | |
230 x[0].location.start, | |
231 x[0].location.end, | |
232 x[0].location.strand, | |
233 ), | |
234 "mRNA", | |
235 "", | |
236 x[0].strand, | |
237 tempName, | |
238 tempQuals, | |
239 None, | |
240 None, | |
241 None, | |
242 ".", | |
243 0, | |
244 "GbkToGff", | |
245 ) | |
246 ) | |
247 tempGene.sub_features[-1].sub_features.append(x[0]) | |
248 else: | |
249 tempGene.sub_features.append(x[0]) | |
250 | |
251 outFeats.append(tempGene) | |
252 else: | |
253 sys.stderr.write( | |
254 "Warning: Unable to find potential parent for feature with no " | |
255 + identifier | |
256 + " of type " | |
257 + str(x[0].type) | |
258 + " at location [" | |
259 + str(x[0].location.start + 1) | |
260 + ", " | |
261 + str(x[0].location.end) | |
262 + "].\n" | |
263 ) | |
264 if x[0].type not in topTypeDict.keys(): | |
265 topTypeDict[x[0].type] = 1 | |
266 else: | |
267 topTypeDict[x[0].type] += 1 | |
268 outFeats.append( | |
269 makeGffFeat(x[0], topTypeDict[x[0].type], recID, identifier) | |
270 ) | |
271 | |
272 for locus in locBucket.keys(): | |
273 if len(locBucket[locus]) == 1: # No heirarchy to be made | |
274 outFeats.append(makeGffFeat(locBucket[locus][0], 0, recID, identifier)) | |
275 continue | |
276 topFeat = None | |
277 midFeat = None | |
278 bottomFeats = [] | |
279 typeDict = {} | |
280 minLoc = locBucket[locus][0].location.start | |
281 maxLoc = locBucket[locus][0].location.end | |
282 geneNum += 1 | |
283 for feat in locBucket[locus]: | |
284 # If we want to make our own top-level feat? | |
285 minLoc = min(minLoc, feat.location.start) | |
286 maxLoc = max(maxLoc, feat.location.end) | |
287 | |
288 # Gene->mRNA->CDS included as example, to add other feature-heirarchys in the appropriate slot | |
289 if feat.type in ["gene"]: | |
290 if not topFeat: | |
291 topFeat = feat | |
292 # Else handle multiple top features | |
293 elif feat.type in ["mRNA", "tRNA", "rRNA"]: | |
294 if not midFeat: | |
295 midFeat = feat | |
296 # Else handle multiple mid feats (May need another elif type-in-list statement if we actually expect a list of mid feats) | |
297 else: | |
298 if feat.type not in typeDict.keys(): | |
299 typeDict[feat.type] = 1 | |
300 else: | |
301 typeDict[feat.type] += 1 | |
302 bottomFeats.append(feat) | |
303 | |
304 for x in seekingParent: | |
305 if type(x[1]) != "list" and locus == x[1]: | |
306 x[0].qualifiers[identifier] = [locus] | |
307 bottomFeats.append(x[0]) | |
308 if x[0].type not in typeDict.keys(): | |
309 typeDict[x[0].type] = 1 | |
310 else: | |
311 typeDict[x[0].type] += 1 | |
312 | |
313 # if not topFeat: # Make our own top-level feature based off minLoc, maxLoc bounds | |
314 | |
315 for ( | |
316 x | |
317 ) in ( | |
318 typeDict.keys() | |
319 ): # If only 1, set it to 0 so we don't append a number to the name | |
320 if ( | |
321 typeDict[x] == 1 | |
322 ): # Else, set to 1 so that we count up as we encounter the features | |
323 typeDict[x] = 0 | |
324 else: | |
325 typeDict[x] = 1 | |
326 | |
327 if not topFeat: | |
328 if makeGene: | |
329 if midFeat: | |
330 possibleStrand = midFeat.strand | |
331 else: | |
332 possibleStrand = bottomFeats[0].strand | |
333 tempName = recID + ".gene." + str(geneNum) | |
334 tempQuals = { | |
335 identifier: [locus], | |
336 "ID": [tempName], | |
337 "Notes": [ | |
338 "Gene feature automatically generated by Gbk to GFF conversion" | |
339 ], | |
340 } | |
341 topFeat = gffSeqFeature( | |
342 FeatureLocation(minLoc, maxLoc, possibleStrand), | |
343 "gene", | |
344 "", | |
345 possibleStrand, | |
346 tempName, | |
347 tempQuals, | |
348 None, | |
349 None, | |
350 None, | |
351 ".", | |
352 0, | |
353 "GbkToGff", | |
354 ) | |
355 else: | |
356 sys.stderr.write( | |
357 "Unable to create a feature heirarchy at location [%d, %d] with features: \n" | |
358 % (minLoc, maxLoc) | |
359 ) | |
360 for x in locBucket[locus]: | |
361 sys.stderr.write(str(x)) | |
362 sys.stderr.write("\n") | |
363 failed = 1 | |
364 continue | |
365 | |
366 outFeats.append(makeGffFeat(topFeat, 0, recID, identifier)) | |
367 if not midFeat and topFeat.type == "gene" and makeMRNA: | |
368 if identifier in topFeat.qualifiers.keys(): | |
369 tempName = topFeat.qualifiers[identifier][0] + ".mRNA" | |
370 tempQuals = { | |
371 identifier: topFeat.qualifiers[identifier], | |
372 "ID": [tempName], | |
373 "Notes": [ | |
374 "mRNA feature automatically generated by Gbk to GFF conversion" | |
375 ], | |
376 } | |
377 else: | |
378 tempName = outFeats[-1].ID + ".mRNA" | |
379 tempQuals = { | |
380 identifier: topFeat.qualifiers[identifier], | |
381 "ID": [tempName], | |
382 "Notes": [ | |
383 "mRNA feature automatically generated by Gbk to GFF conversion" | |
384 ], | |
385 } | |
386 midFeat = gffSeqFeature( | |
387 FeatureLocation(minLoc, maxLoc, topFeat.strand), | |
388 "mRNA", | |
389 "", | |
390 topFeat.strand, | |
391 tempName, | |
392 tempQuals, | |
393 None, | |
394 None, | |
395 None, | |
396 ".", | |
397 0, | |
398 "GbkToGff", | |
399 ) | |
400 | |
401 if ( | |
402 midFeat | |
403 ): # Again, need a new if statement if we want to handle multiple mid-tier features | |
404 outFeats[-1].sub_features.append( | |
405 makeGffFeat(midFeat, 0, recID, identifier) | |
406 ) | |
407 outFeats[-1].sub_features[-1].qualifiers["Parent"] = [outFeats[-1].id] | |
408 for x in bottomFeats: | |
409 typeDict[x.type] += 1 | |
410 outFeats[-1].sub_features[-1].sub_features.append( | |
411 makeGffFeat(x, typeDict[x.type], recID, identifier) | |
412 ) | |
413 outFeats[-1].sub_features[-1].sub_features[-1].qualifiers[ | |
414 "Parent" | |
415 ] = [outFeats[-1].sub_features[-1].id] | |
416 else: # No midFeat, append bottom feats directly to top feats | |
417 for x in bottomFeats: | |
418 typeDict[x.type] += 1 | |
419 outFeats[-1].sub_features.append( | |
420 makeGffFeat(x, typeDict[x.type], recID, identifier) | |
421 ) | |
422 outFeats[-1].sub_features[-1].qualifiers["Parent"] = [ | |
423 outFeats[-1].id | |
424 ] | |
425 | |
426 outRec.append( | |
427 SeqRecord( | |
428 rec.seq, | |
429 recID, | |
430 rec.name, | |
431 rec.description, | |
432 rec.dbxrefs, | |
433 sorted(outFeats, key=lambda x: x.location.start), | |
434 rec.annotations, | |
435 rec.letter_annotations, | |
436 ) | |
437 ) | |
438 SeqIO.write([outRec[-1]], fastaFile, "fasta") | |
439 gffWrite(outRec, ofh) | |
440 exit(failed) # 0 if all features handled, 1 if unable to handle some | |
441 | |
442 | |
443 if __name__ == "__main__": | |
444 parser = argparse.ArgumentParser( | |
445 description="Biopython solution to Gbk to GFF conversion" | |
446 ) | |
447 | |
448 parser.add_argument( | |
449 "inFile", type=argparse.FileType("r"), help="Path to an input GBK file" | |
450 ) | |
451 parser.add_argument( | |
452 "--makeMRNA", | |
453 action="store_true", | |
454 required=False, | |
455 help="Automatically create mRNA features", | |
456 ) | |
457 parser.add_argument( | |
458 "--makeGene", | |
459 action="store_true", | |
460 required=False, | |
461 help="Automatically create missing Gene features", | |
462 ) | |
463 parser.add_argument( | |
464 "--identifier", | |
465 type=str, | |
466 default="locus_tag", | |
467 required=False, | |
468 help="Qualifier to derive ID property from", | |
469 ) | |
470 parser.add_argument( | |
471 "--fastaFile", type=argparse.FileType("w"), help="Fasta output for sequences" | |
472 ) | |
473 parser.add_argument( | |
474 "--outFile", type=argparse.FileType("w"), help="GFF feature output" | |
475 ) | |
476 args = parser.parse_args() | |
477 main(**vars(args)) |