view cpt_gff_apollo_prep/gff3.py @ 0:eb0c42719156 draft

Uploaded
author cpt
date Fri, 13 May 2022 04:55:55 +0000
parents
children
line wrap: on
line source

import copy
import logging

# Module-wide handle on the root logger; only warnings and above are emitted.
log = logging.getLogger()
log.setLevel(logging.WARNING)


def feature_lambda(
    feature_list,
    test,
    test_kwargs,
    subfeatures=True,
    parent=None,
    invert=False,
    recurse=True,
):
    """Recursively search through features, testing each with a test function, yielding matches.

    GFF3 is a hierarchical data structure, so we need to be able to recursively
    search through features. E.g. if you're looking for a feature with
    ID='bob.42', you can't just do a simple list comprehension with a test
    case. You don't know how deeply buried bob.42 will be in the feature tree.
    This is where feature_lambda steps in.

    As a side effect every visited feature gets a ``_parent`` attribute: the
    feature it was reached from, or the feature itself for top-level entries
    of ``feature_list`` when no ``parent`` is given.

    :type feature_list: list
    :param feature_list: an iterable of features

    :type test: function reference
    :param test: a closure with the method signature (feature, **kwargs) where
                 the kwargs are those passed in the next argument. Its return
                 value is interpreted by truthiness: truthy if the feature is
                 to be yielded as part of the main feature_lambda function,
                 falsy if it is to be ignored. This function CAN mutate the
                 features passed to it (think "apply").

    :type test_kwargs: dictionary
    :param test_kwargs: kwargs to pass to your closure when it is called.

    :type subfeatures: boolean
    :param subfeatures: when True, a matching feature is yielded as-is (with
                        its sub_feature tree still attached). When False, a
                        deep copy of the matching feature is yielded with its
                        ``sub_features`` replaced by an empty list.

    :type parent: feature or None
    :param parent: the feature that owns ``feature_list``; stored on each
                   feature as ``_parent``. Leave as None for a top-level call.

    :type invert: boolean
    :param invert: Negate/invert the result of the filter.

    :type recurse: boolean
    :param recurse: when True, descend into each feature's ``sub_features``
                    (if it has that attribute) regardless of whether the
                    feature itself matched.

    :rtype: generator
    :return: Yields matching features one at a time.
    """
    # Either the top level set of [features] or the subfeature attribute
    for feature in feature_list:
        # Roots point at themselves so a walk up the tree cannot go above root.
        feature._parent = feature if parent is None else parent

        # Compare by truthiness: a test may legitimately return None, a string
        # or a count, and bitwise XOR on those would raise or misbehave.
        matched = bool(test(feature, **test_kwargs)) != bool(invert)
        if matched:
            if subfeatures:
                yield feature
            else:
                feature_copy = copy.deepcopy(feature)
                feature_copy.sub_features = list()
                yield feature_copy

        if recurse and hasattr(feature, "sub_features"):
            for match in feature_lambda(
                feature.sub_features,
                test,
                test_kwargs,
                subfeatures=subfeatures,
                parent=feature,
                invert=invert,
                recurse=recurse,
            ):
                yield match


def fetchParent(feature):
    """Walk the ``_parent`` links upward and return the top-most feature.

    The walk stops at the first feature that has no ``_parent`` attribute,
    whose ``_parent`` is None, or whose ``_parent`` is the feature itself.
    The last case matters because feature_lambda marks root features by
    pointing ``_parent`` at themselves; without that check the walk would
    never terminate.

    :param feature: any object, optionally carrying a ``_parent`` attribute.
    :return: the root of the ``_parent`` chain (possibly ``feature`` itself).
    """
    current = feature
    while True:
        parent = getattr(current, "_parent", None)
        if parent is None or parent is current:
            return current
        current = parent


def feature_test_true(feature, **kwargs):
    """Test function that accepts every feature, ignoring all arguments."""
    return True


def feature_test_type(feature, **kwargs):
    """Case-insensitively compare ``feature.type`` against requested type(s).

    Pass ``type=<name>`` to compare against a single type, or
    ``types=<iterable>`` to accept any of several. ``type`` takes precedence
    when both are supplied.

    :raises Exception: if neither ``type`` nor ``types`` is given.
    """
    if "type" in kwargs:
        return str(feature.type).upper() == str(kwargs["type"]).upper()

    if "types" in kwargs:
        return any(
            str(feature.type).upper() == str(candidate).upper()
            for candidate in kwargs["types"]
        )

    raise Exception("Incorrect feature_test_type call, need type or types")


def feature_test_qual_value(feature, **kwargs):
    """Test qualifier values.

    Returns True when at least one value stored under the requested
    qualifier(s) in ``feature.qualifiers`` is contained in
    ``kwargs['attribute_list']``.

    ``kwargs['qualifier']`` is either one qualifier name or a list of names;
    qualifiers missing from the feature simply contribute no values.
    """
    requested = kwargs["qualifier"]
    # Only a real list is treated as "several names"; anything else is one key.
    qualifier_names = requested if isinstance(requested, list) else [requested]

    return any(
        value in kwargs["attribute_list"]
        for name in qualifier_names
        for value in feature.qualifiers.get(name, [])
    )


def feature_test_location(feature, **kwargs):
    """Check whether position ``kwargs['loc']`` lies on the feature.

    The bounds are inclusive on both ends (start <= loc <= end). When a
    ``strand`` keyword is supplied the feature must also be on that strand.
    """
    location = feature.location
    if "strand" in kwargs and location.strand != kwargs["strand"]:
        return False

    return location.start <= kwargs["loc"] <= location.end


def feature_test_quals(feature, **kwargs):
    """Test that a feature carries ALL of the requested qualifiers.

    Each keyword names a qualifier key. Its value is either ``None`` (the key
    merely has to be present) or an iterable of strings, at least one of
    which must occur as a substring of at least one of the feature's values
    for that key. Every keyword must be satisfied for the test to pass.

    Example::

        a = Feature(qualifiers={'Note': ['Some notes', 'Aasdf']})

        # Check if a contains a Note
        feature_test_quals(a, **{'Note': None})  # Returns True
        feature_test_quals(a, **{'Product': None})  # Returns False

        # Check if a contains a note with specific value
        feature_test_quals(a, **{'Note': ['ome']})  # Returns True

        # Check if a contains a note with specific value
        feature_test_quals(a, **{'Note': ['other']})  # Returns False

        # All keys are checked, not just the first one
        feature_test_quals(a, **{'Note': None, 'Product': None})  # Returns False
    """
    for key, wanted in kwargs.items():
        if key not in feature.qualifiers:
            return False

        # Key is present and no value was requested: this key is satisfied,
        # but the remaining keys still have to be checked.
        if wanted is None:
            continue

        # Otherwise at least one requested fragment must appear inside at
        # least one of the feature's values for this key.
        if not any(
            fragment in value
            for value in feature.qualifiers[key]
            for fragment in wanted
        ):
            return False

    return True


def feature_test_contains(feature, **kwargs):
    """Check that a position or a range lies strictly inside the feature.

    Use ``index=<pos>`` for a single position, or
    ``range={'start': ..., 'end': ...}`` for an interval whose two endpoints
    must both fall inside. Comparisons are strict, so a position equal to the
    feature's start or end does not count. ``index`` wins if both are given.

    :raises RuntimeError: if neither ``index`` nor ``range`` is supplied.
    """
    if "index" not in kwargs and "range" not in kwargs:
        raise RuntimeError("Must use index or range keyword")

    location = feature.location
    if "index" in kwargs:
        return location.start < kwargs["index"] < location.end

    span = kwargs["range"]
    return (
        location.start < span["start"] < location.end
        and location.start < span["end"] < location.end
    )


def get_id(feature=None, parent_prefix=None):
    """Build an identifier for a feature from its qualifiers.

    The first qualifier found, in this priority order, supplies the id (its
    first value is used): locus_tag, gene, Gene, product, Product, Name.
    When ``parent_prefix`` is given, the result is ``"<prefix>|<value>"``.

    If none of those qualifiers exist, ``feature.id`` is returned unchanged,
    without the prefix.
    """
    prefix = "" if parent_prefix is None else parent_prefix + "|"

    qualifiers = feature.qualifiers
    for key in ("locus_tag", "gene", "Gene", "product", "Product", "Name"):
        if key in qualifiers:
            return prefix + qualifiers[key][0]

    # No naming qualifier at all: fall back to the bare feature id.
    return feature.id


def get_gff3_id(gene):
    """Return the first ``Name`` qualifier value, or ``gene.id`` if there is no Name."""
    fallback = [gene.id]
    names = gene.qualifiers.get("Name", fallback)
    return names[0]


def ensure_location_in_bounds(start=0, end=0, parent_length=0):
    """Shift ``start`` and ``end`` in steps of 3 towards [0, parent_length].

    Each coordinate is first raised by 3 until it is no longer negative and
    then lowered by 3 until it no longer exceeds ``parent_length``. Moving in
    whole codons keeps the reading frame intact (prevents frameshift errors).

    :return: the adjusted ``(start, end)`` tuple.
    """

    def _shift_by_codons(position):
        while position < 0:
            position += 3
        while position > parent_length:
            position -= 3
        return position

    return (_shift_by_codons(start), _shift_by_codons(end))


def coding_genes(feature_list):
    """Yield every gene feature that has at least one CDS beneath it.

    Genes come from genes(feature_list); each one is kept if a CDS feature
    exists anywhere in its sub_feature tree.

    :param feature_list: an iterable of features to search for genes.
    :return: generator over the matching gene features (not copies).
    """
    for gene in genes(feature_list):
        # Only the presence of a CDS matters, so take the matches as-is
        # (subfeatures=True) instead of deep-copying each one. Passing
        # parent=gene keeps the children's _parent links pointing at the gene
        # rather than resetting each direct child to be its own root. The
        # whole subtree is still walked, as before, so _parent is populated
        # on every descendant.
        cds_features = list(
            feature_lambda(
                gene.sub_features,
                feature_test_type,
                {"type": "CDS"},
                subfeatures=True,
                parent=gene,
            )
        )
        if cds_features:
            yield gene


def genes(feature_list, feature_type="gene", sort=False):
    """
    Yield every feature of ``feature_type`` found anywhere in the feature set.

    Matches are yielded with their sub_feature trees attached. With
    ``sort=True`` all matches are collected first and then yielded ordered by
    ``location.start``; otherwise they are yielded lazily in traversal order.
    """
    matches = feature_lambda(
        feature_list, feature_test_type, {"type": feature_type}, subfeatures=True
    )
    if sort:
        matches = sorted(matches, key=lambda feature: feature.location.start)

    for match in matches:
        yield match


def wa_unified_product_name(feature):
    """
    Work out a product name for a feature.

    Annotators were given conflicting instructions, so the name may live in
    'product', 'Product' or 'Name'. Lookup order:

    1. first value of 'product', else first value of 'Product';
    2. if that yields nothing, the first 'Name' value, unless it looks like
       a UUID (see is_uuid);
    3. otherwise None.
    """
    qualifiers = feature.qualifiers

    # Manually applied tags take priority.
    capitalised = qualifiers.get("Product", [None])
    product = qualifiers.get("product", capitalised)[0]
    if product is not None:
        return product

    # Fall back to the Name, as long as it is not an auto-generated UUID.
    if "Name" in qualifiers:
        candidate = qualifiers["Name"][0]
        if not is_uuid(candidate):
            return candidate

    return product


def is_uuid(name):
    """Heuristic UUID check: exactly four dashes and 36 characters in total."""
    dash_total = name.count("-")
    return dash_total == 4 and len(name) == 36


def get_rbs_from(gene):
    """Collect the features under ``gene`` that may represent its RBS.

    Four sources are searched in the gene's sub_feature tree and concatenated
    in this order:

    1. features of type ``RBS``;
    2. features of type ``Shine_Dalgarno_sequence``;
    3. ``regulatory`` features whose ``regulatory_class`` qualifier contains
       ``ribosome_binding_site`` (NCBI style);
    4. ``exon`` features whose length is below 10 (per the original author's
       note, how Apollo ends up representing them).

    All returned features are copies without sub_features. Ideally exactly
    one is found, but nothing enforces that.
    """

    def _copies_of_type(type_name):
        # Matching features as detached copies (subfeatures=False).
        return list(
            feature_lambda(
                gene.sub_features,
                feature_test_type,
                {"type": type_name},
                subfeatures=False,
            )
        )

    # Normal RBS annotation types
    plain_rbs = _copies_of_type("RBS")
    shine_dalgarno = _copies_of_type("Shine_Dalgarno_sequence")

    # Apollo workaround: very short exons
    short_exons = [exon for exon in _copies_of_type("exon") if len(exon) < 10]

    # These are more NCBI's style
    regulatory = _copies_of_type("regulatory")
    regulatory_rbs = list(
        feature_lambda(
            regulatory,
            feature_test_quals,
            {"regulatory_class": ["ribosome_binding_site"]},
            subfeatures=False,
        )
    )

    return plain_rbs + shine_dalgarno + regulatory_rbs + short_exons


def nice_name(record):
    """
    Return a human-friendly name for a record instead of e.g. an NCBI ID.

    If the record holds exactly one feature of type "contig", the first value
    of that feature's ``organism`` qualifier is used. In every other case
    (or when that qualifier is absent) ``record.id`` is returned.
    """
    fallback = record.id
    contigs = list(genes(record.features, feature_type="contig"))
    if len(contigs) != 1:
        return fallback

    organism_values = contigs[0].qualifiers.get("organism", [fallback])
    return organism_values[0]


def fsort(it):
    """Yield the features of ``it`` ordered by ``int(location.start)``."""
    ordered = sorted(it, key=lambda feat: int(feat.location.start))
    for feat in ordered:
        yield feat