Mercurial > repos > cpt > cpt_gbk_to_gff
view gbk_to_gff3.py @ 1:bb6332a85aa6 draft
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
author | cpt |
---|---|
date | Mon, 05 Jun 2023 02:43:04 +0000 |
parents | |
children |
line wrap: on
line source
#!/usr/bin/env python import argparse import sys from Bio import SeqIO from Bio.SeqRecord import SeqRecord from Bio.SeqFeature import FeatureLocation from CPT_GFFParser import gffSeqFeature, gffWrite bottomFeatTypes = ["exon", "RBS", "CDS"] def makeGffFeat(inFeat, num, recName, identifier): if inFeat.type == "RBS" or ( inFeat.type == "regulatory" and "regulatory_class" in inFeat.qualifiers.keys() and inFeat.qualifiers["regulatory_class"][0] == "ribosome_binding_site" ): inFeat.type = "Shine_Dalgarno_sequence" if "codon_start" in inFeat.qualifiers.keys(): shift = int(inFeat.qualifiers["codon_start"][0]) - 1 else: shift = "." if identifier in inFeat.qualifiers.keys(): name = inFeat.qualifiers[identifier][0] + "." + inFeat.type if num > 0: name += "." + str(num) else: name = recName + "." + inFeat.type + "." + str(num) outFeat = gffSeqFeature( inFeat.location, inFeat.type, "", inFeat.strand, name, inFeat.qualifiers, None, None, None, shift, 0, "GbkToGff", ) outFeat.qualifiers["ID"] = [name] return outFeat def main(inFile, makeMRNA, makeGene, identifier, fastaFile, outFile): ofh = sys.stdout if outFile: ofh = outFile outRec = [] failed = 0 for rec in SeqIO.parse(inFile, "genbank"): recID = rec.name if len(str(rec.seq)) > 0: seqs_pending_writes = True outSeq = str(rec.seq) seqLen = len(outSeq) locBucket = {} outFeats = [] topTypeDict = {} seekingParent = [] geneNum = 0 autoGeneNum = 0 for feat in rec.features: if ( identifier not in feat.qualifiers.keys() ): # Allow metadata features and other features with no ID (Output warning?) - AJC if feat.type in bottomFeatTypes: seekingParent.append( [feat, [], []] ) # [Feature, all parent candidates, strongest parent candidates] continue elif feat.type not in topTypeDict.keys(): topTypeDict[feat.type] = 1 else: topTypeDict[feat.type] += 1 outFeats.append( makeGffFeat(feat, topTypeDict[feat.type], recID, identifier) ) continue elif feat.qualifiers[identifier][0] not in locBucket.keys(): locBucket[feat.qualifiers[identifier][0]] = [] locBucket[feat.qualifiers[identifier][0]].append(feat) for locus in locBucket.keys(): minLoc = locBucket[locus][0].location.start maxLoc = locBucket[locus][0].location.end for feat in locBucket[locus]: minLoc = min(minLoc, feat.location.start) maxLoc = max(maxLoc, feat.location.end) for x in seekingParent: if x[0].location.start >= minLoc and x[0].location.end <= maxLoc: x[1].append(locus) if x[0].location.start == minLoc or x[0].location.end == maxLoc: x[2].append(locus) for x in seekingParent: # Reformat to [Feature, Locus, Unused/Free] if len(x[2]) == 1: finList = "" if len(x[1]) > 1: for loc in x[1]: if loc != x[2][0]: finList += loc + ", " finList = ( str(x[0].type) + " had no locus tag set in .gbk file, automatically derived. Other, weaker candidate(s) were " + finList[0:-2] + "." ) else: finList = ( str(x[0].type) + " had no locus tag set in .gbk file, automatically derived." ) if "Notes" not in x[0].qualifiers.keys(): x[0].qualifiers["Notes"] = [] x[0].qualifiers["Notes"].append(finList) x[1] = x[2][0] elif len(x[2]) > 1: candidate = x[2][0] # Arbitrarily choose first one finList = "" strongList = "" for loc in x[2]: if loc != candidate: finList += loc + ", " strongList += loc + ", " for loc in x[1]: if loc not in x[2]: finList += loc + ", " finList = ( str(x[0].type) + " had no locus tag set in .gbk file, automatically derived. Other candidate(s) were " + finList[0:-2] + " (Equally strong candidate(s): " + strongList[0:-2] + ")." ) if "Notes" not in x[0].qualifiers.keys(): x[0].qualifiers["Notes"] = [] x[0].qualifiers["Notes"].append(finList) x[1] = candidate elif len(x[1]) == 1: x[1] = x[1][0] if "Notes" not in x[0].qualifiers.keys(): x[0].qualifiers["Notes"] = [] finList = ( str(x[0].type) + " had no locus tag set in .gbk file, automatically derived." ) x[0].qualifiers["Notes"].append(finList) elif len(x[1]) > 1: candidate = x[1][0] # Arbitrarily choose first one finList = "" for loc in x[1]: if loc != candidate: finList += loc + ", " finList = ( str(x[0].type) + " had no locus tag set in .gbk file, automatically derived. Other candidates were " + finList[0:-2] + "." ) if "Notes" not in x[0].qualifiers.keys(): x[0].qualifiers["Notes"] = [] x[0].qualifiers["Notes"].append(finList) x[1] = candidate else: if makeGene: sys.stderr.write( "Warning: Unable to find potential parent for feature with no " + identifier + " of type " + str(x[0].type) + " at location [" + str(x[0].location.start + 1) + ", " + str(x[0].location.end) + "], creating standalone gene.\n" ) autoGeneNum += 1 x[0].source = "GbkToGff" x[0].score = 0 x[0].shift = 0 if "ID" not in x[0].qualifiers.keys(): x[0].qualifiers["ID"] = [ recID + ".standalone_" + x[0].type + "." + str(autoGeneNum) ] tempName = recID + ".derived_Gene." + str(autoGeneNum) tempQuals = { "ID": [tempName], "Notes": [ "Gene feature automatically generated by Gbk to GFF conversion" ], } tempGene = gffSeqFeature( FeatureLocation( x[0].location.start, x[0].location.end, x[0].location.strand ), "gene", "", x[0].strand, tempName, tempQuals, None, None, None, ".", 0, "GbkToGff", ) if makeMRNA: tempName = recID + ".derived_mRNA." + str(autoGeneNum) tempQuals = { "ID": [tempName], "Notes": [ "mRNA feature automatically generated by Gbk to GFF conversion" ], } tempGene.sub_features.append( gffSeqFeature( FeatureLocation( x[0].location.start, x[0].location.end, x[0].location.strand, ), "mRNA", "", x[0].strand, tempName, tempQuals, None, None, None, ".", 0, "GbkToGff", ) ) tempGene.sub_features[-1].sub_features.append(x[0]) else: tempGene.sub_features.append(x[0]) outFeats.append(tempGene) else: sys.stderr.write( "Warning: Unable to find potential parent for feature with no " + identifier + " of type " + str(x[0].type) + " at location [" + str(x[0].location.start + 1) + ", " + str(x[0].location.end) + "].\n" ) if x[0].type not in topTypeDict.keys(): topTypeDict[x[0].type] = 1 else: topTypeDict[x[0].type] += 1 outFeats.append( makeGffFeat(x[0], topTypeDict[x[0].type], recID, identifier) ) for locus in locBucket.keys(): if len(locBucket[locus]) == 1: # No heirarchy to be made outFeats.append(makeGffFeat(locBucket[locus][0], 0, recID, identifier)) continue topFeat = None midFeat = None bottomFeats = [] typeDict = {} minLoc = locBucket[locus][0].location.start maxLoc = locBucket[locus][0].location.end geneNum += 1 for feat in locBucket[locus]: # If we want to make our own top-level feat? minLoc = min(minLoc, feat.location.start) maxLoc = max(maxLoc, feat.location.end) # Gene->mRNA->CDS included as example, to add other feature-heirarchys in the appropriate slot if feat.type in ["gene"]: if not topFeat: topFeat = feat # Else handle multiple top features elif feat.type in ["mRNA", "tRNA", "rRNA"]: if not midFeat: midFeat = feat # Else handle multiple mid feats (May need another elif type-in-list statement if we actually expect a list of mid feats) else: if feat.type not in typeDict.keys(): typeDict[feat.type] = 1 else: typeDict[feat.type] += 1 bottomFeats.append(feat) for x in seekingParent: if type(x[1]) != "list" and locus == x[1]: x[0].qualifiers[identifier] = [locus] bottomFeats.append(x[0]) if x[0].type not in typeDict.keys(): typeDict[x[0].type] = 1 else: typeDict[x[0].type] += 1 # if not topFeat: # Make our own top-level feature based off minLoc, maxLoc bounds for ( x ) in ( typeDict.keys() ): # If only 1, set it to 0 so we don't append a number to the name if ( typeDict[x] == 1 ): # Else, set to 1 so that we count up as we encounter the features typeDict[x] = 0 else: typeDict[x] = 1 if not topFeat: if makeGene: if midFeat: possibleStrand = midFeat.strand else: possibleStrand = bottomFeats[0].strand tempName = recID + ".gene." + str(geneNum) tempQuals = { identifier: [locus], "ID": [tempName], "Notes": [ "Gene feature automatically generated by Gbk to GFF conversion" ], } topFeat = gffSeqFeature( FeatureLocation(minLoc, maxLoc, possibleStrand), "gene", "", possibleStrand, tempName, tempQuals, None, None, None, ".", 0, "GbkToGff", ) else: sys.stderr.write( "Unable to create a feature heirarchy at location [%d, %d] with features: \n" % (minLoc, maxLoc) ) for x in locBucket[locus]: sys.stderr.write(str(x)) sys.stderr.write("\n") failed = 1 continue outFeats.append(makeGffFeat(topFeat, 0, recID, identifier)) if not midFeat and topFeat.type == "gene" and makeMRNA: if identifier in topFeat.qualifiers.keys(): tempName = topFeat.qualifiers[identifier][0] + ".mRNA" tempQuals = { identifier: topFeat.qualifiers[identifier], "ID": [tempName], "Notes": [ "mRNA feature automatically generated by Gbk to GFF conversion" ], } else: tempName = outFeats[-1].ID + ".mRNA" tempQuals = { identifier: topFeat.qualifiers[identifier], "ID": [tempName], "Notes": [ "mRNA feature automatically generated by Gbk to GFF conversion" ], } midFeat = gffSeqFeature( FeatureLocation(minLoc, maxLoc, topFeat.strand), "mRNA", "", topFeat.strand, tempName, tempQuals, None, None, None, ".", 0, "GbkToGff", ) if ( midFeat ): # Again, need a new if statement if we want to handle multiple mid-tier features outFeats[-1].sub_features.append( makeGffFeat(midFeat, 0, recID, identifier) ) outFeats[-1].sub_features[-1].qualifiers["Parent"] = [outFeats[-1].id] for x in bottomFeats: typeDict[x.type] += 1 outFeats[-1].sub_features[-1].sub_features.append( makeGffFeat(x, typeDict[x.type], recID, identifier) ) outFeats[-1].sub_features[-1].sub_features[-1].qualifiers[ "Parent" ] = [outFeats[-1].sub_features[-1].id] else: # No midFeat, append bottom feats directly to top feats for x in bottomFeats: typeDict[x.type] += 1 outFeats[-1].sub_features.append( makeGffFeat(x, typeDict[x.type], recID, identifier) ) outFeats[-1].sub_features[-1].qualifiers["Parent"] = [ outFeats[-1].id ] outRec.append( SeqRecord( rec.seq, recID, rec.name, rec.description, rec.dbxrefs, sorted(outFeats, key=lambda x: x.location.start), rec.annotations, rec.letter_annotations, ) ) SeqIO.write([outRec[-1]], fastaFile, "fasta") gffWrite(outRec, ofh) exit(failed) # 0 if all features handled, 1 if unable to handle some if __name__ == "__main__": parser = argparse.ArgumentParser( description="Biopython solution to Gbk to GFF conversion" ) parser.add_argument( "inFile", type=argparse.FileType("r"), help="Path to an input GBK file" ) parser.add_argument( "--makeMRNA", action="store_true", required=False, help="Automatically create mRNA features", ) parser.add_argument( "--makeGene", action="store_true", required=False, help="Automatically create missing Gene features", ) parser.add_argument( "--identifier", type=str, default="locus_tag", required=False, help="Qualifier to derive ID property from", ) parser.add_argument( "--fastaFile", type=argparse.FileType("w"), help="Fasta output for sequences" ) parser.add_argument( "--outFile", type=argparse.FileType("w"), help="GFF feature output" ) args = parser.parse_args() main(**vars(args))