Mercurial > repos > cpt > cpt_start_stats
diff cpt_starts/start_stats.py @ 0:9f2517655a1e draft
Uploaded
author | cpt |
---|---|
date | Fri, 13 May 2022 05:38:37 +0000 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/cpt_starts/start_stats.py Fri May 13 05:38:37 2022 +0000 @@ -0,0 +1,36 @@ +#!/usr/bin/env python +import argparse +from CPT_GFFParser import gffParse, gffWrite +from Bio import SeqIO +from gff3 import feature_lambda, feature_test_type + + +def main(fasta, gff3): + seq_dict = SeqIO.to_dict(SeqIO.parse(fasta, "fasta")) + + codon_usage = {} + + for rec in gffParse(gff3, base_dict=seq_dict): + for feat in feature_lambda( + rec.features, feature_test_type, {"type": "CDS"}, subfeatures=True + ): + seq = str(feat.extract(rec).seq)[0:3] + try: + codon_usage[seq] += 1 + except KeyError: + codon_usage[seq] = 1 + + # TODO: print all actg combinations? Or just ones that are there + print ("# Codon\tCount") + for key in sorted(codon_usage): + print ("\t".join((key, str(codon_usage[key])))) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser( + description="Summarise start codon usage", epilog="" + ) + parser.add_argument("fasta", type=argparse.FileType("r"), help="Fasta Genome") + parser.add_argument("gff3", type=argparse.FileType("r"), help="GFF3 File") + args = parser.parse_args() + main(**vars(args))