Mercurial > repos > cpt > cpt_gbk_renumber
diff cpt_renumber_gbk/renumber.py @ 0:8cac332dbc77 draft default tip
Uploaded
author | cpt |
---|---|
date | Fri, 17 Jun 2022 13:13:47 +0000 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/cpt_renumber_gbk/renumber.py Fri Jun 17 13:13:47 2022 +0000 @@ -0,0 +1,397 @@ +#!/usr/bin/env python +import BIO_FIX_TOPO # NOQA +import argparse +import sys # noqa +from Bio import SeqIO + +import logging + +logging.basicConfig(level=logging.INFO) +log = logging.getLogger() + +# gene and RBS features are also included in the tagged features list, but are dealt with specifically elsewhere. +# This is used to filter out just valid "in gene" features +TAGGED_FEATURES = ["CDS", "tRNA", "intron", "mat_peptide"] + + +def renumber_genes( + gbk_files, + tag_to_update="locus_tag", + string_prefix="display_id", + leading_zeros=3, + forceTagMatch=False, + change_table=None, +): + + for gbk_file in gbk_files: + for record in SeqIO.parse(gbk_file, "genbank"): + if string_prefix == "display_id": + format_string = record.id + "_%0" + str(leading_zeros) + "d" + else: + format_string = string_prefix + "%0" + str(leading_zeros) + "d" + + # f_cds = [f for f in record.features if f.type == 'CDS'] + # f_rbs = [f for f in record.features if f.type == 'RBS'] + # f_gene = [f for f in record.features if f.type == 'gene'] + # f_intron = [f for f in record.features if f.type == 'intron'] + # f_trna = [f for f in record.features if f.type == 'tRNA'] + # f_pep = [f for f in record.features if f.type == 'mat_peptide'] + # f_oth = [f for f in record.features if f.type not in ['CDS', 'RBS', + # 'gene', 'intron', + # 'tRNA', 'mat_peptide']] + # Apparently we're numbering tRNAs now, thanks for telling me. + # f_oth2 = [] + # for q in sorted(f_oth, key=lambda x: x.location.start): + # if q.type == 'tRNA': + # q.qualifiers['locus_tag'] = format_string_t % tRNA_count + # tRNA_count += 1 + # f_oth2.append(q) + # else: + # f_oth2.append(q) + # f_oth = f_oth2 + + # f_care_about = [] + + # Make sure we've hit every RBS and gene + # for cds in f_cds: + # If there's an associated gene feature, it will share a stop codon + # if cds.location.strand > 0: + # associated_genes = [f for f in f_gene if f.location.end == + # cds.location.end] + # else: + # associated_genes = [f for f in f_gene if f.location.start == + # cds.location.start] + + # # If there's an RBS it'll be upstream a bit. + # if cds.location.strand > 0: + # associated_rbss = [f for f in f_rbs if f.location.end < + # cds.location.start and f.location.end > + # cds.location.start - 24] + # else: + # associated_rbss = [f for f in f_rbs if f.location.start > + # cds.location.end and f.location.start < + # cds.location.end + 24] + # tmp_result = [cds] + # if len(associated_genes) > 0: + # tmp_result.append(associated_genes[0]) + + # if len(associated_rbss) == 1: + # tmp_result.append(associated_rbss[0]) + # else: + # log.warning("%s RBSs found for %s", len(associated_rbss), cds.location) + # We choose to append to f_other as that has all features not + # already accessed. It may mean that some gene/RBS features are + # missed if they aren't detected here, which we'll need to handle. + # f_care_about.append(tmp_result) + + #####-----------------------------------------------------------------------------------------------------------##### + # Build list of genes, then iterate over non-gene features and sort into containing genes. + # tags are assigned based on genes, so start the lists with the gene features + f_gene = sorted( + [f for f in record.features if f.type == "gene"], + key=lambda x: x.location.start, + ) + oldNames = [] + for x in f_gene: + if tag_to_update in x.qualifiers.keys(): + oldNames.append(x.qualifiers[tag_to_update]) + + f_rbs = sorted( + [f for f in record.features if f.type == "RBS"], + key=lambda x: x.location.start, + ) + f_tag = list() + f_sorted = sorted( + [f for f in record.features if f.type in TAGGED_FEATURES], + key=lambda x: x.location.start, + ) + # genes not in the TAGGED_FEATURES list are exluded from the processing and assumed to already be clean + clean_features = sorted( + [ + f + for f in record.features + if f.type not in TAGGED_FEATURES and f.type not in ["gene", "RBS"] + ], + key=lambda x: x.location.start, + ) + + f_processed = [] + for gene in f_gene: + tag = [gene] + if gene.location.strand >= 0: # Be strict on where to find starting RBS + geneComp = gene.location.start + else: + geneComp = gene.location.end + # find the gene's RBS feature + for rbs in [f for f in f_rbs if f not in f_processed]: + if is_within(rbs, gene) and ( + rbs.location.start == geneComp or rbs.location.end == geneComp + ): + if (tag_to_update not in rbs.qualifiers.keys()): + tag.append(rbs) + f_processed.append(rbs) + break + elif (tag_to_update not in gene.qualifiers.keys()): # This will gurantee qual is in gene and RBS for next check + tag.append(rbs) + f_processed.append(rbs) + break + elif (not forceTagMatch) or (rbs.qualifiers[tag_to_update] == gene.qualifiers[tag_to_update]): + tag.append(rbs) + f_processed.append(rbs) + break + + # find all other non-RBS features + for feature in [f for f in f_sorted if f not in f_processed]: + # If the feature is within the gene boundaries (genes are the first entry in tag list), + # add it to the same locus tag group, does not process RBS + if is_within(feature, gene): + if tag_to_update not in feature.qualifiers.keys(): + # catches genes and CDS feature that are intron-contained. + if feature.type == "CDS": + if ( + feature.location.start == gene.location.start + or feature.location.end == gene.location.end + ): + + tag.append(feature) + f_processed.append(feature) + else: + tag.append(feature) + f_processed.append(feature) + elif (not forceTagMatch) or (tag_to_update in gene.qualifiers.keys() and feature.qualifiers[tag_to_update] == gene.qualifiers[tag_to_update]): + tag.append(feature) + f_processed.append(feature) + elif feature.location.start > gene.location.end: + # because the features are sorted by coordinates, + # no features further down on the list will be in this gene + break + f_tag.append(tag) + + # Process for frameshifts and mat_peptides (inteins) + + # check for overlapped genes + # at this point, relevant features are put into tag buckets along with the containing gene + # matin the form of [gene, feature1, feature2, ...] + for rbs in [f for f in f_rbs if f not in f_processed]: + dupeRBS = False + for x in f_processed: + if x.type == "RBS" and (tag_to_update in rbs.qualifiers.keys() and tag_to_update in x.qualifiers.keys() and rbs.qualifiers[tag_to_update] == x.qualifiers[tag_to_update]): + dupeRBS = True + if dupeRBS: + change_table.write( + record.id + + "\t" + + rbs.type + + ":" + + (rbs.qualifiers[tag_to_update][0]) + + "\t[Removed: Parent gene already had an RBS]\n" + ) + else: + change_table.write( + record.id + + "\t" + + rbs.type + + ":" + + (rbs.qualifiers[tag_to_update][0]) + + "\t[Removed: RBS did not both fall within boundary of gene and share a boundary with a gene]\n" + ) + + + tag_index = 1 + delta = [] + for tag in f_tag: # each tag list is one 'bucket' + new_tag_value = format_string % tag_index + for feature in tag: + original_tag_value = delta_old(feature, tag_to_update) + feature.qualifiers[tag_to_update] = [new_tag_value] + # Once the tag is renumbered, it's added to the clean list for later output + clean_features.append(feature) + delta.append( + "\t".join((record.id, original_tag_value, new_tag_value)) + ) + tag_index += 1 + + # Why must these people start at 1 + # Because we have to modify the array we work on a separate one + # clean_features = f_oth + # delta = [] + # for index, feature_list in enumerate(sorted(f_care_about, key=lambda x: x[0].location.start)): + # for f in feature_list: + # original_tag_value = delta_old(f, tag_to_update) + # # Add 1 to index for 1-indexed counting for happy scientists + # new_tag_value = format_string % (index+1) + # f.qualifiers[tag_to_update] = [new_tag_value] + # clean_features.append(f) + # delta.append('\t'.join((record.id, original_tag_value, new_tag_value))) + + # Update all features + record.features = sorted(clean_features, key=lambda x: x.location.start) + + for feature in [f for f in f_sorted if f not in f_processed]: + if feature.type == "CDS": + if tag_to_update in feature.qualifiers.keys() and forceTagMatch: + failNameCheck = True + for x in oldNames: + for tag in feature.qualifiers[tag_to_update]: + if tag in x: + failNameCheck = False + if not failNameCheck: + break + if failNameCheck: + change_table.write( + record.id + + "\t" + + feature.type + + ":" + + (feature.qualifiers[tag_to_update][0]) + + "\t[Removed: (Tag check enabled) CDS did not both share a start/end with and fall within a gene with the same " + tag_to_update + " value]\n" + ) + else: + change_table.write( + record.id + + "\t" + + feature.type + + ":" + + (feature.qualifiers[tag_to_update][0]) + + "\t[Removed: CDS did not both fall within boundary of gene and share a boundary with a gene]\n" + ) + elif tag_to_update in feature.qualifiers.keys(): + change_table.write( + record.id + + "\t" + + feature.type + + ":" + + (feature.qualifiers[tag_to_update][0]) + + "\t[Removed: CDS did not both fall within boundary of gene and share a boundary with a gene]\n" + ) + else: + change_table.write( + record.id + + "\t" + + feature.type + + ": No " + + tag_to_update + + "\t[Removed: CDS at (" + str(feature.location.start) + "," + str(feature.location.end) + ") did not both fall within boundary of gene and share a boundary with a gene]\n" + ) + else: + if tag_to_update in feature.qualifiers.keys() and forceTagMatch: + failNameCheck = True + for x in oldNames: + for tag in feature.qualifiers[tag_to_update]: + if tag in x: + failNameCheck = False + if not failNameCheck: + break + if failNameCheck: + change_table.write( + record.id + + "\t" + + feature.type + + ":" + + (feature.qualifiers[tag_to_update][0]) + + "\t[Removed: (Tag check enabled) Feature did not fall within a gene it shared a " + tag_to_update + " value with]\n" + ) + else: + change_table.write( + record.id + + "\t" + + feature.type + + ":" + + (feature.qualifiers[tag_to_update][0]) + + "\t[Removed: Feature not within boundary of a gene]\n" + ) + elif tag_to_update in feature.qualifiers.keys(): + change_table.write( + record.id + + "\t" + + feature.type + + ":" + + (feature.qualifiers[tag_to_update][0]) + + "\t[Removed: Feature not within boundary of a gene]\n" + ) + else: + change_table.write( + record.id + + "\t" + + feature.type + + ": (has no " + + tag_to_update + + ")\t[Removed: Feature not within boundary of a gene]\n" + ) + change_table.write("\n".join(delta) + "\n") + + # Output + yield record + + +def delta_old(feature, tag_to_update): + # First part of delta entry, old name + if tag_to_update in feature.qualifiers: + return feature.qualifiers[tag_to_update][0] + else: + return "%s %s %s" % ( + feature.location.start, + feature.location.end, + feature.location.strand, + ) + + +def is_within(query, feature): + # checks if the query item is within the bounds of the given feature + sortedList = sorted(query.location.parts, key=lambda x: x.start) + for x in sortedList: + if ( + feature.location.start <= x.start + and feature.location.end >= x.end + ): + if x.strand < 0 and x == sortedList[-1]: + return True + elif x.strand >= 0 and x == sortedList[0]: + return True + #else: + return False + + +# def fix_frameshift(a, b): +# #checks if gene a and gene b are a frameshifted gene (either shares a start or an end and an RBS) +# if a[0].location.start == b[0].location.start or a[0].location.end == b[0].location.end: +# # It is likely a frameshift. Treat is as such. Find shared RBS, determine which CDS is which +# big_gene = a if (a[0].location.end - a[0].location.start) > (b[0].location.end - b[0].location.start) else b +# small_gene = a if big_gene==b else b +# rbs = [f for f in a if f.type == 'RBS'] +# # In the way that the tag lists are generated, the larger gene should contain both CDS features. +# # Retrieve and dermine big/small CDS +# cdss = [f for f in big_gene if f.type == 'CDS'] +# big_cds = cdss[0] if (cdss[0].location.end - cdss[0].location.start) > (cdss[1].location.end - cdss[1].location.start) else cdss[1] +# small_cds = cdss[0] if big_cds==cdss[1] else cdss[1] + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Renumber genbank files") + parser.add_argument( + "gbk_files", type=argparse.FileType("r"), nargs="+", help="Genbank files" + ) + parser.add_argument( + "--tag_to_update", type=str, help="Tag to update", default="locus_tag" + ) + parser.add_argument( + "--string_prefix", type=str, help="Prefix string", default="display_id" + ) + parser.add_argument( + "--leading_zeros", type=int, help="# of leading zeroes", default=3 + ) + + parser.add_argument( + "--forceTagMatch", action="store_true", help="Make non-CDS features match tag initially" + ) + + parser.add_argument( + "--change_table", + type=argparse.FileType("w"), + help="Location to store change table in", + default="renumber.tsv", + ) + + args = parser.parse_args() + for record in renumber_genes(**vars(args)): + SeqIO.write(record, sys.stdout, "genbank")