view cpt_stops/stop_stats.py @ 0:89c78d269676 draft

Uploaded
author cpt
date Fri, 13 May 2022 05:39:51 +0000
parents
children
line wrap: on
line source

#!/usr/bin/env python
import argparse
from CPT_GFFParser import gffParse, gffWrite
from Bio import SeqIO
from gff3 import feature_lambda, feature_test_type


def main(fasta, gff3):
    seq_dict = SeqIO.to_dict(SeqIO.parse(fasta, "fasta"))

    codon_usage = {}

    for rec in gffParse(gff3, base_dict=seq_dict):
        for feat in feature_lambda(
            rec.features, feature_test_type, {"type": "CDS"}, subfeatures=True
        ):
            seq = str(feat.extract(rec).seq)[-3:]
            try:
                codon_usage[seq] += 1
            except KeyError:
                codon_usage[seq] = 1

    names = {
        "TAG": "Amber",
        "TAA": "Ochre",
        "TGA": "Opal",
    }

    # TODO: print all actg combinations? Or just ones that are there
    print ("# Name\tCodon\tCount")
    for key in sorted(codon_usage):
        print ("\t".join((names.get(key.upper(), "None"), key, str(codon_usage[key]))))


if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="Summarise stop codon usage", epilog=""
    )
    parser.add_argument("fasta", type=argparse.FileType("r"), help="Fasta Genome")
    parser.add_argument("gff3", type=argparse.FileType("r"), help="GFF3 File")
    args = parser.parse_args()
    main(**vars(args))