view cpt_renumber_gbk/renumber.py @ 0:8cac332dbc77 draft default tip

Uploaded
author cpt
date Fri, 17 Jun 2022 13:13:47 +0000
parents
children
line wrap: on
line source

#!/usr/bin/env python
import BIO_FIX_TOPO  # NOQA
import argparse
import sys  # noqa
from Bio import SeqIO

import logging

logging.basicConfig(level=logging.INFO)
log = logging.getLogger()

# gene and RBS features are also included in the tagged features list, but are dealt with specifically elsewhere.
# This is used to filter out just valid "in gene" features
TAGGED_FEATURES = ["CDS", "tRNA", "intron", "mat_peptide"]


def renumber_genes(
    gbk_files,
    tag_to_update="locus_tag",
    string_prefix="display_id",
    leading_zeros=3,
    forceTagMatch=False,
    change_table=None,
):

    for gbk_file in gbk_files:
        for record in SeqIO.parse(gbk_file, "genbank"):
            if string_prefix == "display_id":
                format_string = record.id + "_%0" + str(leading_zeros) + "d"
            else:
                format_string = string_prefix + "%0" + str(leading_zeros) + "d"

            # f_cds = [f for f in record.features if f.type == 'CDS']
            # f_rbs = [f for f in record.features if f.type == 'RBS']
            # f_gene = [f for f in record.features if f.type == 'gene']
            # f_intron = [f for f in record.features if f.type == 'intron']
            # f_trna = [f for f in record.features if f.type == 'tRNA']
            # f_pep = [f for f in record.features if f.type == 'mat_peptide']
            # f_oth = [f for f in record.features if f.type not in ['CDS', 'RBS',
            #                                                      'gene', 'intron',
            #                                                      'tRNA', 'mat_peptide']]
            # Apparently we're numbering tRNAs now, thanks for telling me.
            # f_oth2 = []
            # for q in sorted(f_oth, key=lambda x: x.location.start):
            #    if q.type == 'tRNA':
            #        q.qualifiers['locus_tag'] = format_string_t % tRNA_count
            #        tRNA_count += 1
            #        f_oth2.append(q)
            #    else:
            #        f_oth2.append(q)
            # f_oth = f_oth2

            # f_care_about = []

            # Make sure we've hit every RBS and gene
            # for cds in f_cds:
            # If there's an associated gene feature, it will share a stop codon
            #    if cds.location.strand > 0:
            #        associated_genes = [f for f in f_gene if f.location.end ==
            #                            cds.location.end]
            #    else:
            #        associated_genes = [f for f in f_gene if f.location.start ==
            #                            cds.location.start]

            #    # If there's an RBS it'll be upstream a bit.
            #    if cds.location.strand > 0:
            #        associated_rbss = [f for f in f_rbs if f.location.end <
            #                           cds.location.start and f.location.end >
            #                           cds.location.start - 24]
            #    else:
            #        associated_rbss = [f for f in f_rbs if f.location.start >
            #                           cds.location.end and f.location.start <
            #                           cds.location.end + 24]
            #    tmp_result = [cds]
            #    if len(associated_genes) > 0:
            #        tmp_result.append(associated_genes[0])

            #   if len(associated_rbss) == 1:
            #       tmp_result.append(associated_rbss[0])
            #   else:
            #       log.warning("%s RBSs found for %s", len(associated_rbss), cds.location)
            # We choose to append to f_other as that has all features not
            # already accessed. It may mean that some gene/RBS features are
            # missed if they aren't detected here, which we'll need to handle.
            #    f_care_about.append(tmp_result)

            #####-----------------------------------------------------------------------------------------------------------#####
            # Build list of genes, then iterate over non-gene features and sort into containing genes.
            # tags are assigned based on genes, so start the lists with the gene features
            f_gene = sorted(
                [f for f in record.features if f.type == "gene"],
                key=lambda x: x.location.start,
            )
            oldNames = []
            for x in f_gene:
              if tag_to_update in x.qualifiers.keys():
                oldNames.append(x.qualifiers[tag_to_update])
            
            f_rbs = sorted(
                [f for f in record.features if f.type == "RBS"],
                key=lambda x: x.location.start,
            )
            f_tag = list()
            f_sorted = sorted(
                [f for f in record.features if f.type in TAGGED_FEATURES],
                key=lambda x: x.location.start,
            )
            # genes not in the TAGGED_FEATURES list are exluded from the processing and assumed to already be clean
            clean_features = sorted(
                [
                    f
                    for f in record.features
                    if f.type not in TAGGED_FEATURES and f.type not in ["gene", "RBS"]
                ],
                key=lambda x: x.location.start,
            )

            f_processed = []
            for gene in f_gene:
                tag = [gene]
                if gene.location.strand >= 0:  # Be strict on where to find starting RBS
                    geneComp = gene.location.start
                else:
                    geneComp = gene.location.end
                # find the gene's RBS feature
                for rbs in [f for f in f_rbs if f not in f_processed]:
                    if is_within(rbs, gene) and (
                        rbs.location.start == geneComp or rbs.location.end == geneComp
                    ):
                        if (tag_to_update not in rbs.qualifiers.keys()): 
                          tag.append(rbs)
                          f_processed.append(rbs)
                          break
                        elif (tag_to_update not in gene.qualifiers.keys()): # This will gurantee qual is in gene and RBS for next check
                          tag.append(rbs)
                          f_processed.append(rbs)
                          break
                        elif (not forceTagMatch) or (rbs.qualifiers[tag_to_update] == gene.qualifiers[tag_to_update]):
                          tag.append(rbs)
                          f_processed.append(rbs)
                          break 
                        
                # find all other non-RBS features
                for feature in [f for f in f_sorted if f not in f_processed]:
                    # If the feature is within the gene boundaries (genes are the first entry in tag list),
                    # add it to the same locus tag group, does not process RBS
                    if is_within(feature, gene):
                        if tag_to_update not in feature.qualifiers.keys():
                        # catches genes and CDS feature that are intron-contained.
                          if feature.type == "CDS":
                            if (
                                feature.location.start == gene.location.start
                                or feature.location.end == gene.location.end
                            ):
                                
                                tag.append(feature)
                                f_processed.append(feature)
                          else:
                            tag.append(feature)
                            f_processed.append(feature)
                        elif (not forceTagMatch) or (tag_to_update in gene.qualifiers.keys() and feature.qualifiers[tag_to_update] == gene.qualifiers[tag_to_update]):
                          tag.append(feature)
                          f_processed.append(feature)
                    elif feature.location.start > gene.location.end:
                        # because the features are sorted by coordinates,
                        # no features further down  on the list will be in this gene
                        break
                f_tag.append(tag)

            # Process for frameshifts and mat_peptides (inteins)

            # check for overlapped genes
            # at this point, relevant features are put into tag buckets along with the containing gene
            # matin the form of [gene, feature1, feature2, ...]
            for rbs in [f for f in f_rbs if f not in f_processed]:
                dupeRBS = False
                for x in f_processed:
                  if x.type == "RBS" and (tag_to_update in rbs.qualifiers.keys() and tag_to_update in x.qualifiers.keys() and rbs.qualifiers[tag_to_update] == x.qualifiers[tag_to_update]):
                    dupeRBS = True
                if dupeRBS:
                  change_table.write(
                    record.id
                    + "\t"
                    + rbs.type
                    + ":"
                    + (rbs.qualifiers[tag_to_update][0])
                    + "\t[Removed: Parent gene already had an RBS]\n"
                  )
                else:
                  change_table.write(
                    record.id
                    + "\t"
                    + rbs.type
                    + ":"
                    + (rbs.qualifiers[tag_to_update][0])
                    + "\t[Removed: RBS did not both fall within boundary of gene and share a boundary with a gene]\n"
                  )


            tag_index = 1
            delta = []
            for tag in f_tag:  # each tag list is one 'bucket'
                new_tag_value = format_string % tag_index
                for feature in tag:
                    original_tag_value = delta_old(feature, tag_to_update)
                    feature.qualifiers[tag_to_update] = [new_tag_value]
                    # Once the tag is renumbered, it's added to the clean list for later output
                    clean_features.append(feature)
                    delta.append(
                        "\t".join((record.id, original_tag_value, new_tag_value))
                    )
                tag_index += 1

            # Why must these people start at 1
            # Because we have to modify the array we work on a separate one
            # clean_features = f_oth
            # delta = []
            # for index, feature_list in enumerate(sorted(f_care_about, key=lambda x: x[0].location.start)):
            #    for f in feature_list:
            #        original_tag_value = delta_old(f, tag_to_update)
            #        # Add 1 to index for 1-indexed counting for happy scientists
            #        new_tag_value = format_string % (index+1)
            #        f.qualifiers[tag_to_update] = [new_tag_value]
            #        clean_features.append(f)
            #        delta.append('\t'.join((record.id, original_tag_value, new_tag_value)))

            # Update all features
            record.features = sorted(clean_features, key=lambda x: x.location.start)
            
            for feature in [f for f in f_sorted if f not in f_processed]:
                if feature.type == "CDS":
                  if tag_to_update in feature.qualifiers.keys() and forceTagMatch:
                    failNameCheck = True
                    for x in oldNames:
                      for tag in feature.qualifiers[tag_to_update]:
                          if tag in x:
                            failNameCheck = False
                      if not failNameCheck:
                        break
                    if failNameCheck:
                      change_table.write(
                        record.id
                        + "\t"
                        + feature.type
                        + ":"
                        + (feature.qualifiers[tag_to_update][0])
                        + "\t[Removed: (Tag check enabled) CDS did not both share a start/end with and fall within a gene with the same " + tag_to_update + " value]\n"
                      )
                    else:
                      change_table.write(
                        record.id
                        + "\t"
                        + feature.type
                        + ":"
                        + (feature.qualifiers[tag_to_update][0])
                        + "\t[Removed: CDS did not both fall within boundary of gene and share a boundary with a gene]\n"
                      )  
                  elif tag_to_update in feature.qualifiers.keys():
                    change_table.write(
                        record.id
                        + "\t"
                        + feature.type
                        + ":"
                        + (feature.qualifiers[tag_to_update][0])
                        + "\t[Removed: CDS did not both fall within boundary of gene and share a boundary with a gene]\n"
                    )
                  else:
                    change_table.write(
                        record.id
                        + "\t"
                        + feature.type
                        + ": No "
                        + tag_to_update
                        + "\t[Removed: CDS at (" + str(feature.location.start) + "," + str(feature.location.end) + ") did not both fall within boundary of gene and share a boundary with a gene]\n"
                    )
                else:
                  if tag_to_update in feature.qualifiers.keys() and forceTagMatch:
                    failNameCheck = True
                    for x in oldNames:
                      for tag in feature.qualifiers[tag_to_update]:
                          if tag in x:
                            failNameCheck = False
                      if not failNameCheck:
                        break
                    if failNameCheck:
                      change_table.write(
                        record.id
                        + "\t"
                        + feature.type
                        + ":"
                        + (feature.qualifiers[tag_to_update][0])
                        + "\t[Removed: (Tag check enabled) Feature did not fall within a gene it shared a " + tag_to_update + " value with]\n"
                      )
                    else:
                      change_table.write(
                        record.id
                        + "\t"
                        + feature.type
                        + ":"
                        + (feature.qualifiers[tag_to_update][0])
                        + "\t[Removed: Feature not within boundary of a gene]\n"
                      )
                  elif tag_to_update in feature.qualifiers.keys():
                    change_table.write(
                        record.id
                        + "\t"
                        + feature.type
                        + ":"
                        + (feature.qualifiers[tag_to_update][0])
                        + "\t[Removed: Feature not within boundary of a gene]\n"
                    )
                  else:
                    change_table.write(
                        record.id
                        + "\t"
                        + feature.type
                        + ": (has no "
                        + tag_to_update
                        + ")\t[Removed: Feature not within boundary of a gene]\n"
                    )
            change_table.write("\n".join(delta) + "\n")

            # Output
            yield record


def delta_old(feature, tag_to_update):
    # First part of delta entry, old name
    if tag_to_update in feature.qualifiers:
        return feature.qualifiers[tag_to_update][0]
    else:
        return "%s %s %s" % (
            feature.location.start,
            feature.location.end,
            feature.location.strand,
        )


def is_within(query, feature):
    # checks if the query item is within the bounds of the given feature
    sortedList = sorted(query.location.parts, key=lambda x: x.start)
    for x in sortedList:
      if (
          feature.location.start <= x.start
          and feature.location.end >= x.end
      ):
        if x.strand < 0 and x == sortedList[-1]:
          return True
        elif x.strand >= 0 and x == sortedList[0]:
          return True
    #else:
    return False


# def fix_frameshift(a, b):
#    #checks if gene a and gene b are a frameshifted gene (either shares a start or an end and an RBS)
#    if a[0].location.start == b[0].location.start or a[0].location.end == b[0].location.end:
#        # It is likely a frameshift. Treat is as such. Find shared RBS, determine which CDS is which
#        big_gene = a if (a[0].location.end - a[0].location.start) > (b[0].location.end - b[0].location.start) else b
#        small_gene = a if big_gene==b else b
#        rbs = [f for f in a if f.type == 'RBS']
#        # In the way that the tag lists are generated, the larger gene should contain both CDS features.
#        # Retrieve and dermine big/small CDS
#        cdss = [f for f in big_gene if f.type == 'CDS']
#        big_cds = cdss[0] if (cdss[0].location.end - cdss[0].location.start) > (cdss[1].location.end - cdss[1].location.start) else cdss[1]
#        small_cds = cdss[0] if big_cds==cdss[1] else cdss[1]


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Renumber genbank files")
    parser.add_argument(
        "gbk_files", type=argparse.FileType("r"), nargs="+", help="Genbank files"
    )
    parser.add_argument(
        "--tag_to_update", type=str, help="Tag to update", default="locus_tag"
    )
    parser.add_argument(
        "--string_prefix", type=str, help="Prefix string", default="display_id"
    )
    parser.add_argument(
        "--leading_zeros", type=int, help="# of leading zeroes", default=3
    )

    parser.add_argument(
        "--forceTagMatch", action="store_true", help="Make non-CDS features match tag initially"
    )

    parser.add_argument(
        "--change_table",
        type=argparse.FileType("w"),
        help="Location to store change table in",
        default="renumber.tsv",
    )

    args = parser.parse_args()
    for record in renumber_genes(**vars(args)):
        SeqIO.write(record, sys.stdout, "genbank")