comparison gbk_to_gff3.py @ 1:bb6332a85aa6 draft

planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
author cpt
date Mon, 05 Jun 2023 02:43:04 +0000
parents
children
comparison
equal deleted inserted replaced
0:a68f32350196 1:bb6332a85aa6
1 #!/usr/bin/env python
2
3 import argparse
4 import sys
5
6 from Bio import SeqIO
7 from Bio.SeqRecord import SeqRecord
8 from Bio.SeqFeature import FeatureLocation
9 from CPT_GFFParser import gffSeqFeature, gffWrite
10
11 bottomFeatTypes = ["exon", "RBS", "CDS"]
12
13
14 def makeGffFeat(inFeat, num, recName, identifier):
15 if inFeat.type == "RBS" or (
16 inFeat.type == "regulatory"
17 and "regulatory_class" in inFeat.qualifiers.keys()
18 and inFeat.qualifiers["regulatory_class"][0] == "ribosome_binding_site"
19 ):
20 inFeat.type = "Shine_Dalgarno_sequence"
21 if "codon_start" in inFeat.qualifiers.keys():
22 shift = int(inFeat.qualifiers["codon_start"][0]) - 1
23 else:
24 shift = "."
25 if identifier in inFeat.qualifiers.keys():
26 name = inFeat.qualifiers[identifier][0] + "." + inFeat.type
27 if num > 0:
28 name += "." + str(num)
29 else:
30 name = recName + "." + inFeat.type + "." + str(num)
31
32 outFeat = gffSeqFeature(
33 inFeat.location,
34 inFeat.type,
35 "",
36 inFeat.strand,
37 name,
38 inFeat.qualifiers,
39 None,
40 None,
41 None,
42 shift,
43 0,
44 "GbkToGff",
45 )
46 outFeat.qualifiers["ID"] = [name]
47 return outFeat
48
49
50 def main(inFile, makeMRNA, makeGene, identifier, fastaFile, outFile):
51
52 ofh = sys.stdout
53 if outFile:
54 ofh = outFile
55
56 outRec = []
57 failed = 0
58 for rec in SeqIO.parse(inFile, "genbank"):
59 recID = rec.name
60
61 if len(str(rec.seq)) > 0:
62 seqs_pending_writes = True
63 outSeq = str(rec.seq)
64 seqLen = len(outSeq)
65
66 locBucket = {}
67 outFeats = []
68 topTypeDict = {}
69 seekingParent = []
70 geneNum = 0
71 autoGeneNum = 0
72 for feat in rec.features:
73 if (
74 identifier not in feat.qualifiers.keys()
75 ): # Allow metadata features and other features with no ID (Output warning?) - AJC
76 if feat.type in bottomFeatTypes:
77 seekingParent.append(
78 [feat, [], []]
79 ) # [Feature, all parent candidates, strongest parent candidates]
80 continue
81 elif feat.type not in topTypeDict.keys():
82 topTypeDict[feat.type] = 1
83 else:
84 topTypeDict[feat.type] += 1
85 outFeats.append(
86 makeGffFeat(feat, topTypeDict[feat.type], recID, identifier)
87 )
88 continue
89 elif feat.qualifiers[identifier][0] not in locBucket.keys():
90 locBucket[feat.qualifiers[identifier][0]] = []
91 locBucket[feat.qualifiers[identifier][0]].append(feat)
92
93 for locus in locBucket.keys():
94 minLoc = locBucket[locus][0].location.start
95 maxLoc = locBucket[locus][0].location.end
96 for feat in locBucket[locus]:
97 minLoc = min(minLoc, feat.location.start)
98 maxLoc = max(maxLoc, feat.location.end)
99 for x in seekingParent:
100 if x[0].location.start >= minLoc and x[0].location.end <= maxLoc:
101 x[1].append(locus)
102 if x[0].location.start == minLoc or x[0].location.end == maxLoc:
103 x[2].append(locus)
104
105 for x in seekingParent: # Reformat to [Feature, Locus, Unused/Free]
106 if len(x[2]) == 1:
107 finList = ""
108 if len(x[1]) > 1:
109 for loc in x[1]:
110 if loc != x[2][0]:
111 finList += loc + ", "
112 finList = (
113 str(x[0].type)
114 + " had no locus tag set in .gbk file, automatically derived. Other, weaker candidate(s) were "
115 + finList[0:-2]
116 + "."
117 )
118 else:
119 finList = (
120 str(x[0].type)
121 + " had no locus tag set in .gbk file, automatically derived."
122 )
123 if "Notes" not in x[0].qualifiers.keys():
124 x[0].qualifiers["Notes"] = []
125 x[0].qualifiers["Notes"].append(finList)
126 x[1] = x[2][0]
127 elif len(x[2]) > 1:
128 candidate = x[2][0] # Arbitrarily choose first one
129 finList = ""
130 strongList = ""
131 for loc in x[2]:
132 if loc != candidate:
133 finList += loc + ", "
134 strongList += loc + ", "
135 for loc in x[1]:
136 if loc not in x[2]:
137 finList += loc + ", "
138 finList = (
139 str(x[0].type)
140 + " had no locus tag set in .gbk file, automatically derived. Other candidate(s) were "
141 + finList[0:-2]
142 + " (Equally strong candidate(s): "
143 + strongList[0:-2]
144 + ")."
145 )
146 if "Notes" not in x[0].qualifiers.keys():
147 x[0].qualifiers["Notes"] = []
148 x[0].qualifiers["Notes"].append(finList)
149 x[1] = candidate
150 elif len(x[1]) == 1:
151 x[1] = x[1][0]
152 if "Notes" not in x[0].qualifiers.keys():
153 x[0].qualifiers["Notes"] = []
154 finList = (
155 str(x[0].type)
156 + " had no locus tag set in .gbk file, automatically derived."
157 )
158 x[0].qualifiers["Notes"].append(finList)
159 elif len(x[1]) > 1:
160 candidate = x[1][0] # Arbitrarily choose first one
161 finList = ""
162 for loc in x[1]:
163 if loc != candidate:
164 finList += loc + ", "
165 finList = (
166 str(x[0].type)
167 + " had no locus tag set in .gbk file, automatically derived. Other candidates were "
168 + finList[0:-2]
169 + "."
170 )
171 if "Notes" not in x[0].qualifiers.keys():
172 x[0].qualifiers["Notes"] = []
173 x[0].qualifiers["Notes"].append(finList)
174 x[1] = candidate
175 else:
176 if makeGene:
177 sys.stderr.write(
178 "Warning: Unable to find potential parent for feature with no "
179 + identifier
180 + " of type "
181 + str(x[0].type)
182 + " at location ["
183 + str(x[0].location.start + 1)
184 + ", "
185 + str(x[0].location.end)
186 + "], creating standalone gene.\n"
187 )
188 autoGeneNum += 1
189 x[0].source = "GbkToGff"
190 x[0].score = 0
191 x[0].shift = 0
192 if "ID" not in x[0].qualifiers.keys():
193 x[0].qualifiers["ID"] = [
194 recID + ".standalone_" + x[0].type + "." + str(autoGeneNum)
195 ]
196 tempName = recID + ".derived_Gene." + str(autoGeneNum)
197 tempQuals = {
198 "ID": [tempName],
199 "Notes": [
200 "Gene feature automatically generated by Gbk to GFF conversion"
201 ],
202 }
203 tempGene = gffSeqFeature(
204 FeatureLocation(
205 x[0].location.start, x[0].location.end, x[0].location.strand
206 ),
207 "gene",
208 "",
209 x[0].strand,
210 tempName,
211 tempQuals,
212 None,
213 None,
214 None,
215 ".",
216 0,
217 "GbkToGff",
218 )
219 if makeMRNA:
220 tempName = recID + ".derived_mRNA." + str(autoGeneNum)
221 tempQuals = {
222 "ID": [tempName],
223 "Notes": [
224 "mRNA feature automatically generated by Gbk to GFF conversion"
225 ],
226 }
227 tempGene.sub_features.append(
228 gffSeqFeature(
229 FeatureLocation(
230 x[0].location.start,
231 x[0].location.end,
232 x[0].location.strand,
233 ),
234 "mRNA",
235 "",
236 x[0].strand,
237 tempName,
238 tempQuals,
239 None,
240 None,
241 None,
242 ".",
243 0,
244 "GbkToGff",
245 )
246 )
247 tempGene.sub_features[-1].sub_features.append(x[0])
248 else:
249 tempGene.sub_features.append(x[0])
250
251 outFeats.append(tempGene)
252 else:
253 sys.stderr.write(
254 "Warning: Unable to find potential parent for feature with no "
255 + identifier
256 + " of type "
257 + str(x[0].type)
258 + " at location ["
259 + str(x[0].location.start + 1)
260 + ", "
261 + str(x[0].location.end)
262 + "].\n"
263 )
264 if x[0].type not in topTypeDict.keys():
265 topTypeDict[x[0].type] = 1
266 else:
267 topTypeDict[x[0].type] += 1
268 outFeats.append(
269 makeGffFeat(x[0], topTypeDict[x[0].type], recID, identifier)
270 )
271
272 for locus in locBucket.keys():
273 if len(locBucket[locus]) == 1: # No heirarchy to be made
274 outFeats.append(makeGffFeat(locBucket[locus][0], 0, recID, identifier))
275 continue
276 topFeat = None
277 midFeat = None
278 bottomFeats = []
279 typeDict = {}
280 minLoc = locBucket[locus][0].location.start
281 maxLoc = locBucket[locus][0].location.end
282 geneNum += 1
283 for feat in locBucket[locus]:
284 # If we want to make our own top-level feat?
285 minLoc = min(minLoc, feat.location.start)
286 maxLoc = max(maxLoc, feat.location.end)
287
288 # Gene->mRNA->CDS included as example, to add other feature-heirarchys in the appropriate slot
289 if feat.type in ["gene"]:
290 if not topFeat:
291 topFeat = feat
292 # Else handle multiple top features
293 elif feat.type in ["mRNA", "tRNA", "rRNA"]:
294 if not midFeat:
295 midFeat = feat
296 # Else handle multiple mid feats (May need another elif type-in-list statement if we actually expect a list of mid feats)
297 else:
298 if feat.type not in typeDict.keys():
299 typeDict[feat.type] = 1
300 else:
301 typeDict[feat.type] += 1
302 bottomFeats.append(feat)
303
304 for x in seekingParent:
305 if type(x[1]) != "list" and locus == x[1]:
306 x[0].qualifiers[identifier] = [locus]
307 bottomFeats.append(x[0])
308 if x[0].type not in typeDict.keys():
309 typeDict[x[0].type] = 1
310 else:
311 typeDict[x[0].type] += 1
312
313 # if not topFeat: # Make our own top-level feature based off minLoc, maxLoc bounds
314
315 for (
316 x
317 ) in (
318 typeDict.keys()
319 ): # If only 1, set it to 0 so we don't append a number to the name
320 if (
321 typeDict[x] == 1
322 ): # Else, set to 1 so that we count up as we encounter the features
323 typeDict[x] = 0
324 else:
325 typeDict[x] = 1
326
327 if not topFeat:
328 if makeGene:
329 if midFeat:
330 possibleStrand = midFeat.strand
331 else:
332 possibleStrand = bottomFeats[0].strand
333 tempName = recID + ".gene." + str(geneNum)
334 tempQuals = {
335 identifier: [locus],
336 "ID": [tempName],
337 "Notes": [
338 "Gene feature automatically generated by Gbk to GFF conversion"
339 ],
340 }
341 topFeat = gffSeqFeature(
342 FeatureLocation(minLoc, maxLoc, possibleStrand),
343 "gene",
344 "",
345 possibleStrand,
346 tempName,
347 tempQuals,
348 None,
349 None,
350 None,
351 ".",
352 0,
353 "GbkToGff",
354 )
355 else:
356 sys.stderr.write(
357 "Unable to create a feature heirarchy at location [%d, %d] with features: \n"
358 % (minLoc, maxLoc)
359 )
360 for x in locBucket[locus]:
361 sys.stderr.write(str(x))
362 sys.stderr.write("\n")
363 failed = 1
364 continue
365
366 outFeats.append(makeGffFeat(topFeat, 0, recID, identifier))
367 if not midFeat and topFeat.type == "gene" and makeMRNA:
368 if identifier in topFeat.qualifiers.keys():
369 tempName = topFeat.qualifiers[identifier][0] + ".mRNA"
370 tempQuals = {
371 identifier: topFeat.qualifiers[identifier],
372 "ID": [tempName],
373 "Notes": [
374 "mRNA feature automatically generated by Gbk to GFF conversion"
375 ],
376 }
377 else:
378 tempName = outFeats[-1].ID + ".mRNA"
379 tempQuals = {
380 identifier: topFeat.qualifiers[identifier],
381 "ID": [tempName],
382 "Notes": [
383 "mRNA feature automatically generated by Gbk to GFF conversion"
384 ],
385 }
386 midFeat = gffSeqFeature(
387 FeatureLocation(minLoc, maxLoc, topFeat.strand),
388 "mRNA",
389 "",
390 topFeat.strand,
391 tempName,
392 tempQuals,
393 None,
394 None,
395 None,
396 ".",
397 0,
398 "GbkToGff",
399 )
400
401 if (
402 midFeat
403 ): # Again, need a new if statement if we want to handle multiple mid-tier features
404 outFeats[-1].sub_features.append(
405 makeGffFeat(midFeat, 0, recID, identifier)
406 )
407 outFeats[-1].sub_features[-1].qualifiers["Parent"] = [outFeats[-1].id]
408 for x in bottomFeats:
409 typeDict[x.type] += 1
410 outFeats[-1].sub_features[-1].sub_features.append(
411 makeGffFeat(x, typeDict[x.type], recID, identifier)
412 )
413 outFeats[-1].sub_features[-1].sub_features[-1].qualifiers[
414 "Parent"
415 ] = [outFeats[-1].sub_features[-1].id]
416 else: # No midFeat, append bottom feats directly to top feats
417 for x in bottomFeats:
418 typeDict[x.type] += 1
419 outFeats[-1].sub_features.append(
420 makeGffFeat(x, typeDict[x.type], recID, identifier)
421 )
422 outFeats[-1].sub_features[-1].qualifiers["Parent"] = [
423 outFeats[-1].id
424 ]
425
426 outRec.append(
427 SeqRecord(
428 rec.seq,
429 recID,
430 rec.name,
431 rec.description,
432 rec.dbxrefs,
433 sorted(outFeats, key=lambda x: x.location.start),
434 rec.annotations,
435 rec.letter_annotations,
436 )
437 )
438 SeqIO.write([outRec[-1]], fastaFile, "fasta")
439 gffWrite(outRec, ofh)
440 exit(failed) # 0 if all features handled, 1 if unable to handle some
441
442
443 if __name__ == "__main__":
444 parser = argparse.ArgumentParser(
445 description="Biopython solution to Gbk to GFF conversion"
446 )
447
448 parser.add_argument(
449 "inFile", type=argparse.FileType("r"), help="Path to an input GBK file"
450 )
451 parser.add_argument(
452 "--makeMRNA",
453 action="store_true",
454 required=False,
455 help="Automatically create mRNA features",
456 )
457 parser.add_argument(
458 "--makeGene",
459 action="store_true",
460 required=False,
461 help="Automatically create missing Gene features",
462 )
463 parser.add_argument(
464 "--identifier",
465 type=str,
466 default="locus_tag",
467 required=False,
468 help="Qualifier to derive ID property from",
469 )
470 parser.add_argument(
471 "--fastaFile", type=argparse.FileType("w"), help="Fasta output for sequences"
472 )
473 parser.add_argument(
474 "--outFile", type=argparse.FileType("w"), help="GFF feature output"
475 )
476 args = parser.parse_args()
477 main(**vars(args))