view gbk_to_gff3.py @ 2:a921d6148d88 draft default tip

planemo upload commit f33bdf952d796c5d7a240b132af3c4cbd102decc
author cpt
date Fri, 05 Jan 2024 05:51:44 +0000
parents bb6332a85aa6
children
line wrap: on
line source

#!/usr/bin/env python

import argparse
import sys

from Bio import SeqIO
from Bio.SeqRecord import SeqRecord
from Bio.SeqFeature import FeatureLocation
from CPT_GFFParser import gffSeqFeature, gffWrite

bottomFeatTypes = ["exon", "RBS", "CDS"]


def makeGffFeat(inFeat, num, recName, identifier):
    if inFeat.type == "RBS" or (
        inFeat.type == "regulatory"
        and "regulatory_class" in inFeat.qualifiers.keys()
        and inFeat.qualifiers["regulatory_class"][0] == "ribosome_binding_site"
    ):
        inFeat.type = "Shine_Dalgarno_sequence"
    if "codon_start" in inFeat.qualifiers.keys():
        shift = int(inFeat.qualifiers["codon_start"][0]) - 1
    else:
        shift = "."
    if identifier in inFeat.qualifiers.keys():
        name = inFeat.qualifiers[identifier][0] + "." + inFeat.type
        if num > 0:
            name += "." + str(num)
    else:
        name = recName + "." + inFeat.type + "." + str(num)

    outFeat = gffSeqFeature(
        inFeat.location,
        inFeat.type,
        "",
        inFeat.strand,
        name,
        inFeat.qualifiers,
        None,
        None,
        None,
        shift,
        0,
        "GbkToGff",
    )
    outFeat.qualifiers["ID"] = [name]
    return outFeat


def main(inFile, makeMRNA, makeGene, identifier, fastaFile, outFile):

    ofh = sys.stdout
    if outFile:
        ofh = outFile

    outRec = []
    failed = 0
    for rec in SeqIO.parse(inFile, "genbank"):
        recID = rec.name

        if len(str(rec.seq)) > 0:
            seqs_pending_writes = True
            outSeq = str(rec.seq)
            seqLen = len(outSeq)

        locBucket = {}
        outFeats = []
        topTypeDict = {}
        seekingParent = []
        geneNum = 0
        autoGeneNum = 0
        for feat in rec.features:
            if (
                identifier not in feat.qualifiers.keys()
            ):  # Allow metadata features and other features with no ID (Output warning?) - AJC
                if feat.type in bottomFeatTypes:
                    seekingParent.append(
                        [feat, [], []]
                    )  # [Feature, all parent candidates, strongest parent candidates]
                    continue
                elif feat.type not in topTypeDict.keys():
                    topTypeDict[feat.type] = 1
                else:
                    topTypeDict[feat.type] += 1
                outFeats.append(
                    makeGffFeat(feat, topTypeDict[feat.type], recID, identifier)
                )
                continue
            elif feat.qualifiers[identifier][0] not in locBucket.keys():
                locBucket[feat.qualifiers[identifier][0]] = []
            locBucket[feat.qualifiers[identifier][0]].append(feat)

        for locus in locBucket.keys():
            minLoc = locBucket[locus][0].location.start
            maxLoc = locBucket[locus][0].location.end
            for feat in locBucket[locus]:
                minLoc = min(minLoc, feat.location.start)
                maxLoc = max(maxLoc, feat.location.end)
            for x in seekingParent:
                if x[0].location.start >= minLoc and x[0].location.end <= maxLoc:
                    x[1].append(locus)
                if x[0].location.start == minLoc or x[0].location.end == maxLoc:
                    x[2].append(locus)

        for x in seekingParent:  # Reformat to [Feature, Locus, Unused/Free]
            if len(x[2]) == 1:
                finList = ""
                if len(x[1]) > 1:
                    for loc in x[1]:
                        if loc != x[2][0]:
                            finList += loc + ", "
                    finList = (
                        str(x[0].type)
                        + " had no locus tag set in .gbk file, automatically derived. Other, weaker candidate(s) were "
                        + finList[0:-2]
                        + "."
                    )
                else:
                    finList = (
                        str(x[0].type)
                        + " had no locus tag set in .gbk file, automatically derived."
                    )
                if "Notes" not in x[0].qualifiers.keys():
                    x[0].qualifiers["Notes"] = []
                x[0].qualifiers["Notes"].append(finList)
                x[1] = x[2][0]
            elif len(x[2]) > 1:
                candidate = x[2][0]  # Arbitrarily choose first one
                finList = ""
                strongList = ""
                for loc in x[2]:
                    if loc != candidate:
                        finList += loc + ", "
                        strongList += loc + ", "
                for loc in x[1]:
                    if loc not in x[2]:
                        finList += loc + ", "
                finList = (
                    str(x[0].type)
                    + " had no locus tag set in .gbk file, automatically derived. Other candidate(s) were "
                    + finList[0:-2]
                    + " (Equally strong candidate(s): "
                    + strongList[0:-2]
                    + ")."
                )
                if "Notes" not in x[0].qualifiers.keys():
                    x[0].qualifiers["Notes"] = []
                x[0].qualifiers["Notes"].append(finList)
                x[1] = candidate
            elif len(x[1]) == 1:
                x[1] = x[1][0]
                if "Notes" not in x[0].qualifiers.keys():
                    x[0].qualifiers["Notes"] = []
                finList = (
                    str(x[0].type)
                    + " had no locus tag set in .gbk file, automatically derived."
                )
                x[0].qualifiers["Notes"].append(finList)
            elif len(x[1]) > 1:
                candidate = x[1][0]  # Arbitrarily choose first one
                finList = ""
                for loc in x[1]:
                    if loc != candidate:
                        finList += loc + ", "
                finList = (
                    str(x[0].type)
                    + " had no locus tag set in .gbk file, automatically derived. Other candidates were "
                    + finList[0:-2]
                    + "."
                )
                if "Notes" not in x[0].qualifiers.keys():
                    x[0].qualifiers["Notes"] = []
                x[0].qualifiers["Notes"].append(finList)
                x[1] = candidate
            else:
                if makeGene:
                    sys.stderr.write(
                        "Warning: Unable to find potential parent for feature with no "
                        + identifier
                        + " of type "
                        + str(x[0].type)
                        + " at location ["
                        + str(x[0].location.start + 1)
                        + ", "
                        + str(x[0].location.end)
                        + "], creating standalone gene.\n"
                    )
                    autoGeneNum += 1
                    x[0].source = "GbkToGff"
                    x[0].score = 0
                    x[0].shift = 0
                    if "ID" not in x[0].qualifiers.keys():
                        x[0].qualifiers["ID"] = [
                            recID + ".standalone_" + x[0].type + "." + str(autoGeneNum)
                        ]
                    tempName = recID + ".derived_Gene." + str(autoGeneNum)
                    tempQuals = {
                        "ID": [tempName],
                        "Notes": [
                            "Gene feature automatically generated by Gbk to GFF conversion"
                        ],
                    }
                    tempGene = gffSeqFeature(
                        FeatureLocation(
                            x[0].location.start, x[0].location.end, x[0].location.strand
                        ),
                        "gene",
                        "",
                        x[0].strand,
                        tempName,
                        tempQuals,
                        None,
                        None,
                        None,
                        ".",
                        0,
                        "GbkToGff",
                    )
                    if makeMRNA:
                        tempName = recID + ".derived_mRNA." + str(autoGeneNum)
                        tempQuals = {
                            "ID": [tempName],
                            "Notes": [
                                "mRNA feature automatically generated by Gbk to GFF conversion"
                            ],
                        }
                        tempGene.sub_features.append(
                            gffSeqFeature(
                                FeatureLocation(
                                    x[0].location.start,
                                    x[0].location.end,
                                    x[0].location.strand,
                                ),
                                "mRNA",
                                "",
                                x[0].strand,
                                tempName,
                                tempQuals,
                                None,
                                None,
                                None,
                                ".",
                                0,
                                "GbkToGff",
                            )
                        )
                        tempGene.sub_features[-1].sub_features.append(x[0])
                    else:
                        tempGene.sub_features.append(x[0])

                    outFeats.append(tempGene)
                else:
                    sys.stderr.write(
                        "Warning: Unable to find potential parent for feature with no "
                        + identifier
                        + " of type "
                        + str(x[0].type)
                        + " at location ["
                        + str(x[0].location.start + 1)
                        + ", "
                        + str(x[0].location.end)
                        + "].\n"
                    )
                    if x[0].type not in topTypeDict.keys():
                        topTypeDict[x[0].type] = 1
                    else:
                        topTypeDict[x[0].type] += 1
                    outFeats.append(
                        makeGffFeat(x[0], topTypeDict[x[0].type], recID, identifier)
                    )

        for locus in locBucket.keys():
            if len(locBucket[locus]) == 1:  # No heirarchy to be made
                outFeats.append(makeGffFeat(locBucket[locus][0], 0, recID, identifier))
                continue
            topFeat = None
            midFeat = None
            bottomFeats = []
            typeDict = {}
            minLoc = locBucket[locus][0].location.start
            maxLoc = locBucket[locus][0].location.end
            geneNum += 1
            for feat in locBucket[locus]:
                # If we want to make our own top-level feat?
                minLoc = min(minLoc, feat.location.start)
                maxLoc = max(maxLoc, feat.location.end)

                # Gene->mRNA->CDS included as example, to add other feature-heirarchys in the appropriate slot
                if feat.type in ["gene"]:
                    if not topFeat:
                        topFeat = feat
                    # Else handle multiple top features
                elif feat.type in ["mRNA", "tRNA", "rRNA"]:
                    if not midFeat:
                        midFeat = feat
                    # Else handle multiple mid feats (May need another elif type-in-list statement if we actually expect a list of mid feats)
                else:
                    if feat.type not in typeDict.keys():
                        typeDict[feat.type] = 1
                    else:
                        typeDict[feat.type] += 1
                    bottomFeats.append(feat)

            for x in seekingParent:
                if type(x[1]) != "list" and locus == x[1]:
                    x[0].qualifiers[identifier] = [locus]
                    bottomFeats.append(x[0])
                    if x[0].type not in typeDict.keys():
                        typeDict[x[0].type] = 1
                    else:
                        typeDict[x[0].type] += 1

            # if not topFeat: # Make our own top-level feature based off minLoc, maxLoc bounds

            for (
                x
            ) in (
                typeDict.keys()
            ):  # If only 1, set it to 0 so we don't append a number to the name
                if (
                    typeDict[x] == 1
                ):  # Else, set to 1 so that we count up as we encounter the features
                    typeDict[x] = 0
                else:
                    typeDict[x] = 1

            if not topFeat:
                if makeGene:
                    if midFeat:
                        possibleStrand = midFeat.strand
                    else:
                        possibleStrand = bottomFeats[0].strand
                    tempName = recID + ".gene." + str(geneNum)
                    tempQuals = {
                        identifier: [locus],
                        "ID": [tempName],
                        "Notes": [
                            "Gene feature automatically generated by Gbk to GFF conversion"
                        ],
                    }
                    topFeat = gffSeqFeature(
                        FeatureLocation(minLoc, maxLoc, possibleStrand),
                        "gene",
                        "",
                        possibleStrand,
                        tempName,
                        tempQuals,
                        None,
                        None,
                        None,
                        ".",
                        0,
                        "GbkToGff",
                    )
                else:
                    sys.stderr.write(
                        "Unable to create a feature heirarchy at location [%d, %d] with features: \n"
                        % (minLoc, maxLoc)
                    )
                    for x in locBucket[locus]:
                        sys.stderr.write(str(x))
                        sys.stderr.write("\n")
                        failed = 1
                    continue

            outFeats.append(makeGffFeat(topFeat, 0, recID, identifier))
            if not midFeat and topFeat.type == "gene" and makeMRNA:
                if identifier in topFeat.qualifiers.keys():
                    tempName = topFeat.qualifiers[identifier][0] + ".mRNA"
                    tempQuals = {
                        identifier: topFeat.qualifiers[identifier],
                        "ID": [tempName],
                        "Notes": [
                            "mRNA feature automatically generated by Gbk to GFF conversion"
                        ],
                    }
                else:
                    tempName = outFeats[-1].ID + ".mRNA"
                    tempQuals = {
                        identifier: topFeat.qualifiers[identifier],
                        "ID": [tempName],
                        "Notes": [
                            "mRNA feature automatically generated by Gbk to GFF conversion"
                        ],
                    }
                midFeat = gffSeqFeature(
                    FeatureLocation(minLoc, maxLoc, topFeat.strand),
                    "mRNA",
                    "",
                    topFeat.strand,
                    tempName,
                    tempQuals,
                    None,
                    None,
                    None,
                    ".",
                    0,
                    "GbkToGff",
                )

            if (
                midFeat
            ):  # Again, need a new if statement if we want to handle multiple mid-tier features
                outFeats[-1].sub_features.append(
                    makeGffFeat(midFeat, 0, recID, identifier)
                )
                outFeats[-1].sub_features[-1].qualifiers["Parent"] = [outFeats[-1].id]
                for x in bottomFeats:
                    typeDict[x.type] += 1
                    outFeats[-1].sub_features[-1].sub_features.append(
                        makeGffFeat(x, typeDict[x.type], recID, identifier)
                    )
                    outFeats[-1].sub_features[-1].sub_features[-1].qualifiers[
                        "Parent"
                    ] = [outFeats[-1].sub_features[-1].id]
            else:  # No midFeat, append bottom feats directly to top feats
                for x in bottomFeats:
                    typeDict[x.type] += 1
                    outFeats[-1].sub_features.append(
                        makeGffFeat(x, typeDict[x.type], recID, identifier)
                    )
                    outFeats[-1].sub_features[-1].qualifiers["Parent"] = [
                        outFeats[-1].id
                    ]

        outRec.append(
            SeqRecord(
                rec.seq,
                recID,
                rec.name,
                rec.description,
                rec.dbxrefs,
                sorted(outFeats, key=lambda x: x.location.start),
                rec.annotations,
                rec.letter_annotations,
            )
        )
        SeqIO.write([outRec[-1]], fastaFile, "fasta")
    gffWrite(outRec, ofh)
    exit(failed)  # 0 if all features handled, 1 if unable to handle some


if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="Biopython solution to Gbk to GFF conversion"
    )

    parser.add_argument(
        "inFile", type=argparse.FileType("r"), help="Path to an input GBK file"
    )
    parser.add_argument(
        "--makeMRNA",
        action="store_true",
        required=False,
        help="Automatically create mRNA features",
    )
    parser.add_argument(
        "--makeGene",
        action="store_true",
        required=False,
        help="Automatically create missing Gene features",
    )
    parser.add_argument(
        "--identifier",
        type=str,
        default="locus_tag",
        required=False,
        help="Qualifier to derive ID property from",
    )
    parser.add_argument(
        "--fastaFile", type=argparse.FileType("w"), help="Fasta output for sequences"
    )
    parser.add_argument(
        "--outFile", type=argparse.FileType("w"), help="GFF feature output"
    )
    args = parser.parse_args()
    main(**vars(args))