diff gbk_to_gff3.py @ 1:bb6332a85aa6 draft

planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
author cpt
date Mon, 05 Jun 2023 02:43:04 +0000
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/gbk_to_gff3.py	Mon Jun 05 02:43:04 2023 +0000
@@ -0,0 +1,477 @@
+#!/usr/bin/env python
+
+import argparse
+import sys
+
+from Bio import SeqIO
+from Bio.SeqRecord import SeqRecord
+from Bio.SeqFeature import FeatureLocation
+from CPT_GFFParser import gffSeqFeature, gffWrite
+
+bottomFeatTypes = ["exon", "RBS", "CDS"]
+
+
+def makeGffFeat(inFeat, num, recName, identifier):
+    if inFeat.type == "RBS" or (
+        inFeat.type == "regulatory"
+        and "regulatory_class" in inFeat.qualifiers.keys()
+        and inFeat.qualifiers["regulatory_class"][0] == "ribosome_binding_site"
+    ):
+        inFeat.type = "Shine_Dalgarno_sequence"
+    if "codon_start" in inFeat.qualifiers.keys():
+        shift = int(inFeat.qualifiers["codon_start"][0]) - 1
+    else:
+        shift = "."
+    if identifier in inFeat.qualifiers.keys():
+        name = inFeat.qualifiers[identifier][0] + "." + inFeat.type
+        if num > 0:
+            name += "." + str(num)
+    else:
+        name = recName + "." + inFeat.type + "." + str(num)
+
+    outFeat = gffSeqFeature(
+        inFeat.location,
+        inFeat.type,
+        "",
+        inFeat.strand,
+        name,
+        inFeat.qualifiers,
+        None,
+        None,
+        None,
+        shift,
+        0,
+        "GbkToGff",
+    )
+    outFeat.qualifiers["ID"] = [name]
+    return outFeat
+
+
+def main(inFile, makeMRNA, makeGene, identifier, fastaFile, outFile):
+
+    ofh = sys.stdout
+    if outFile:
+        ofh = outFile
+
+    outRec = []
+    failed = 0
+    for rec in SeqIO.parse(inFile, "genbank"):
+        recID = rec.name
+
+        if len(str(rec.seq)) > 0:
+            seqs_pending_writes = True
+            outSeq = str(rec.seq)
+            seqLen = len(outSeq)
+
+        locBucket = {}
+        outFeats = []
+        topTypeDict = {}
+        seekingParent = []
+        geneNum = 0
+        autoGeneNum = 0
+        for feat in rec.features:
+            if (
+                identifier not in feat.qualifiers.keys()
+            ):  # Allow metadata features and other features with no ID (Output warning?) - AJC
+                if feat.type in bottomFeatTypes:
+                    seekingParent.append(
+                        [feat, [], []]
+                    )  # [Feature, all parent candidates, strongest parent candidates]
+                    continue
+                elif feat.type not in topTypeDict.keys():
+                    topTypeDict[feat.type] = 1
+                else:
+                    topTypeDict[feat.type] += 1
+                outFeats.append(
+                    makeGffFeat(feat, topTypeDict[feat.type], recID, identifier)
+                )
+                continue
+            elif feat.qualifiers[identifier][0] not in locBucket.keys():
+                locBucket[feat.qualifiers[identifier][0]] = []
+            locBucket[feat.qualifiers[identifier][0]].append(feat)
+
+        for locus in locBucket.keys():
+            minLoc = locBucket[locus][0].location.start
+            maxLoc = locBucket[locus][0].location.end
+            for feat in locBucket[locus]:
+                minLoc = min(minLoc, feat.location.start)
+                maxLoc = max(maxLoc, feat.location.end)
+            for x in seekingParent:
+                if x[0].location.start >= minLoc and x[0].location.end <= maxLoc:
+                    x[1].append(locus)
+                if x[0].location.start == minLoc or x[0].location.end == maxLoc:
+                    x[2].append(locus)
+
+        for x in seekingParent:  # Reformat to [Feature, Locus, Unused/Free]
+            if len(x[2]) == 1:
+                finList = ""
+                if len(x[1]) > 1:
+                    for loc in x[1]:
+                        if loc != x[2][0]:
+                            finList += loc + ", "
+                    finList = (
+                        str(x[0].type)
+                        + " had no locus tag set in .gbk file, automatically derived. Other, weaker candidate(s) were "
+                        + finList[0:-2]
+                        + "."
+                    )
+                else:
+                    finList = (
+                        str(x[0].type)
+                        + " had no locus tag set in .gbk file, automatically derived."
+                    )
+                if "Notes" not in x[0].qualifiers.keys():
+                    x[0].qualifiers["Notes"] = []
+                x[0].qualifiers["Notes"].append(finList)
+                x[1] = x[2][0]
+            elif len(x[2]) > 1:
+                candidate = x[2][0]  # Arbitrarily choose first one
+                finList = ""
+                strongList = ""
+                for loc in x[2]:
+                    if loc != candidate:
+                        finList += loc + ", "
+                        strongList += loc + ", "
+                for loc in x[1]:
+                    if loc not in x[2]:
+                        finList += loc + ", "
+                finList = (
+                    str(x[0].type)
+                    + " had no locus tag set in .gbk file, automatically derived. Other candidate(s) were "
+                    + finList[0:-2]
+                    + " (Equally strong candidate(s): "
+                    + strongList[0:-2]
+                    + ")."
+                )
+                if "Notes" not in x[0].qualifiers.keys():
+                    x[0].qualifiers["Notes"] = []
+                x[0].qualifiers["Notes"].append(finList)
+                x[1] = candidate
+            elif len(x[1]) == 1:
+                x[1] = x[1][0]
+                if "Notes" not in x[0].qualifiers.keys():
+                    x[0].qualifiers["Notes"] = []
+                finList = (
+                    str(x[0].type)
+                    + " had no locus tag set in .gbk file, automatically derived."
+                )
+                x[0].qualifiers["Notes"].append(finList)
+            elif len(x[1]) > 1:
+                candidate = x[1][0]  # Arbitrarily choose first one
+                finList = ""
+                for loc in x[1]:
+                    if loc != candidate:
+                        finList += loc + ", "
+                finList = (
+                    str(x[0].type)
+                    + " had no locus tag set in .gbk file, automatically derived. Other candidates were "
+                    + finList[0:-2]
+                    + "."
+                )
+                if "Notes" not in x[0].qualifiers.keys():
+                    x[0].qualifiers["Notes"] = []
+                x[0].qualifiers["Notes"].append(finList)
+                x[1] = candidate
+            else:
+                if makeGene:
+                    sys.stderr.write(
+                        "Warning: Unable to find potential parent for feature with no "
+                        + identifier
+                        + " of type "
+                        + str(x[0].type)
+                        + " at location ["
+                        + str(x[0].location.start + 1)
+                        + ", "
+                        + str(x[0].location.end)
+                        + "], creating standalone gene.\n"
+                    )
+                    autoGeneNum += 1
+                    x[0].source = "GbkToGff"
+                    x[0].score = 0
+                    x[0].shift = 0
+                    if "ID" not in x[0].qualifiers.keys():
+                        x[0].qualifiers["ID"] = [
+                            recID + ".standalone_" + x[0].type + "." + str(autoGeneNum)
+                        ]
+                    tempName = recID + ".derived_Gene." + str(autoGeneNum)
+                    tempQuals = {
+                        "ID": [tempName],
+                        "Notes": [
+                            "Gene feature automatically generated by Gbk to GFF conversion"
+                        ],
+                    }
+                    tempGene = gffSeqFeature(
+                        FeatureLocation(
+                            x[0].location.start, x[0].location.end, x[0].location.strand
+                        ),
+                        "gene",
+                        "",
+                        x[0].strand,
+                        tempName,
+                        tempQuals,
+                        None,
+                        None,
+                        None,
+                        ".",
+                        0,
+                        "GbkToGff",
+                    )
+                    if makeMRNA:
+                        tempName = recID + ".derived_mRNA." + str(autoGeneNum)
+                        tempQuals = {
+                            "ID": [tempName],
+                            "Notes": [
+                                "mRNA feature automatically generated by Gbk to GFF conversion"
+                            ],
+                        }
+                        tempGene.sub_features.append(
+                            gffSeqFeature(
+                                FeatureLocation(
+                                    x[0].location.start,
+                                    x[0].location.end,
+                                    x[0].location.strand,
+                                ),
+                                "mRNA",
+                                "",
+                                x[0].strand,
+                                tempName,
+                                tempQuals,
+                                None,
+                                None,
+                                None,
+                                ".",
+                                0,
+                                "GbkToGff",
+                            )
+                        )
+                        tempGene.sub_features[-1].sub_features.append(x[0])
+                    else:
+                        tempGene.sub_features.append(x[0])
+
+                    outFeats.append(tempGene)
+                else:
+                    sys.stderr.write(
+                        "Warning: Unable to find potential parent for feature with no "
+                        + identifier
+                        + " of type "
+                        + str(x[0].type)
+                        + " at location ["
+                        + str(x[0].location.start + 1)
+                        + ", "
+                        + str(x[0].location.end)
+                        + "].\n"
+                    )
+                    if x[0].type not in topTypeDict.keys():
+                        topTypeDict[x[0].type] = 1
+                    else:
+                        topTypeDict[x[0].type] += 1
+                    outFeats.append(
+                        makeGffFeat(x[0], topTypeDict[x[0].type], recID, identifier)
+                    )
+
+        for locus in locBucket.keys():
+            if len(locBucket[locus]) == 1:  # No heirarchy to be made
+                outFeats.append(makeGffFeat(locBucket[locus][0], 0, recID, identifier))
+                continue
+            topFeat = None
+            midFeat = None
+            bottomFeats = []
+            typeDict = {}
+            minLoc = locBucket[locus][0].location.start
+            maxLoc = locBucket[locus][0].location.end
+            geneNum += 1
+            for feat in locBucket[locus]:
+                # If we want to make our own top-level feat?
+                minLoc = min(minLoc, feat.location.start)
+                maxLoc = max(maxLoc, feat.location.end)
+
+                # Gene->mRNA->CDS included as example, to add other feature-heirarchys in the appropriate slot
+                if feat.type in ["gene"]:
+                    if not topFeat:
+                        topFeat = feat
+                    # Else handle multiple top features
+                elif feat.type in ["mRNA", "tRNA", "rRNA"]:
+                    if not midFeat:
+                        midFeat = feat
+                    # Else handle multiple mid feats (May need another elif type-in-list statement if we actually expect a list of mid feats)
+                else:
+                    if feat.type not in typeDict.keys():
+                        typeDict[feat.type] = 1
+                    else:
+                        typeDict[feat.type] += 1
+                    bottomFeats.append(feat)
+
+            for x in seekingParent:
+                if type(x[1]) != "list" and locus == x[1]:
+                    x[0].qualifiers[identifier] = [locus]
+                    bottomFeats.append(x[0])
+                    if x[0].type not in typeDict.keys():
+                        typeDict[x[0].type] = 1
+                    else:
+                        typeDict[x[0].type] += 1
+
+            # if not topFeat: # Make our own top-level feature based off minLoc, maxLoc bounds
+
+            for (
+                x
+            ) in (
+                typeDict.keys()
+            ):  # If only 1, set it to 0 so we don't append a number to the name
+                if (
+                    typeDict[x] == 1
+                ):  # Else, set to 1 so that we count up as we encounter the features
+                    typeDict[x] = 0
+                else:
+                    typeDict[x] = 1
+
+            if not topFeat:
+                if makeGene:
+                    if midFeat:
+                        possibleStrand = midFeat.strand
+                    else:
+                        possibleStrand = bottomFeats[0].strand
+                    tempName = recID + ".gene." + str(geneNum)
+                    tempQuals = {
+                        identifier: [locus],
+                        "ID": [tempName],
+                        "Notes": [
+                            "Gene feature automatically generated by Gbk to GFF conversion"
+                        ],
+                    }
+                    topFeat = gffSeqFeature(
+                        FeatureLocation(minLoc, maxLoc, possibleStrand),
+                        "gene",
+                        "",
+                        possibleStrand,
+                        tempName,
+                        tempQuals,
+                        None,
+                        None,
+                        None,
+                        ".",
+                        0,
+                        "GbkToGff",
+                    )
+                else:
+                    sys.stderr.write(
+                        "Unable to create a feature heirarchy at location [%d, %d] with features: \n"
+                        % (minLoc, maxLoc)
+                    )
+                    for x in locBucket[locus]:
+                        sys.stderr.write(str(x))
+                        sys.stderr.write("\n")
+                        failed = 1
+                    continue
+
+            outFeats.append(makeGffFeat(topFeat, 0, recID, identifier))
+            if not midFeat and topFeat.type == "gene" and makeMRNA:
+                if identifier in topFeat.qualifiers.keys():
+                    tempName = topFeat.qualifiers[identifier][0] + ".mRNA"
+                    tempQuals = {
+                        identifier: topFeat.qualifiers[identifier],
+                        "ID": [tempName],
+                        "Notes": [
+                            "mRNA feature automatically generated by Gbk to GFF conversion"
+                        ],
+                    }
+                else:
+                    tempName = outFeats[-1].ID + ".mRNA"
+                    tempQuals = {
+                        identifier: topFeat.qualifiers[identifier],
+                        "ID": [tempName],
+                        "Notes": [
+                            "mRNA feature automatically generated by Gbk to GFF conversion"
+                        ],
+                    }
+                midFeat = gffSeqFeature(
+                    FeatureLocation(minLoc, maxLoc, topFeat.strand),
+                    "mRNA",
+                    "",
+                    topFeat.strand,
+                    tempName,
+                    tempQuals,
+                    None,
+                    None,
+                    None,
+                    ".",
+                    0,
+                    "GbkToGff",
+                )
+
+            if (
+                midFeat
+            ):  # Again, need a new if statement if we want to handle multiple mid-tier features
+                outFeats[-1].sub_features.append(
+                    makeGffFeat(midFeat, 0, recID, identifier)
+                )
+                outFeats[-1].sub_features[-1].qualifiers["Parent"] = [outFeats[-1].id]
+                for x in bottomFeats:
+                    typeDict[x.type] += 1
+                    outFeats[-1].sub_features[-1].sub_features.append(
+                        makeGffFeat(x, typeDict[x.type], recID, identifier)
+                    )
+                    outFeats[-1].sub_features[-1].sub_features[-1].qualifiers[
+                        "Parent"
+                    ] = [outFeats[-1].sub_features[-1].id]
+            else:  # No midFeat, append bottom feats directly to top feats
+                for x in bottomFeats:
+                    typeDict[x.type] += 1
+                    outFeats[-1].sub_features.append(
+                        makeGffFeat(x, typeDict[x.type], recID, identifier)
+                    )
+                    outFeats[-1].sub_features[-1].qualifiers["Parent"] = [
+                        outFeats[-1].id
+                    ]
+
+        outRec.append(
+            SeqRecord(
+                rec.seq,
+                recID,
+                rec.name,
+                rec.description,
+                rec.dbxrefs,
+                sorted(outFeats, key=lambda x: x.location.start),
+                rec.annotations,
+                rec.letter_annotations,
+            )
+        )
+        SeqIO.write([outRec[-1]], fastaFile, "fasta")
+    gffWrite(outRec, ofh)
+    exit(failed)  # 0 if all features handled, 1 if unable to handle some
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(
+        description="Biopython solution to Gbk to GFF conversion"
+    )
+
+    parser.add_argument(
+        "inFile", type=argparse.FileType("r"), help="Path to an input GBK file"
+    )
+    parser.add_argument(
+        "--makeMRNA",
+        action="store_true",
+        required=False,
+        help="Automatically create mRNA features",
+    )
+    parser.add_argument(
+        "--makeGene",
+        action="store_true",
+        required=False,
+        help="Automatically create missing Gene features",
+    )
+    parser.add_argument(
+        "--identifier",
+        type=str,
+        default="locus_tag",
+        required=False,
+        help="Qualifier to derive ID property from",
+    )
+    parser.add_argument(
+        "--fastaFile", type=argparse.FileType("w"), help="Fasta output for sequences"
+    )
+    parser.add_argument(
+        "--outFile", type=argparse.FileType("w"), help="GFF feature output"
+    )
+    args = parser.parse_args()
+    main(**vars(args))