diff cpt_stops/stop_stats.py @ 0:89c78d269676 draft

Uploaded
author cpt
date Fri, 13 May 2022 05:39:51 +0000
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/cpt_stops/stop_stats.py	Fri May 13 05:39:51 2022 +0000
@@ -0,0 +1,42 @@
+#!/usr/bin/env python
+import argparse
+from CPT_GFFParser import gffParse, gffWrite
+from Bio import SeqIO
+from gff3 import feature_lambda, feature_test_type
+
+
+def main(fasta, gff3):
+    seq_dict = SeqIO.to_dict(SeqIO.parse(fasta, "fasta"))
+
+    codon_usage = {}
+
+    for rec in gffParse(gff3, base_dict=seq_dict):
+        for feat in feature_lambda(
+            rec.features, feature_test_type, {"type": "CDS"}, subfeatures=True
+        ):
+            seq = str(feat.extract(rec).seq)[-3:]
+            try:
+                codon_usage[seq] += 1
+            except KeyError:
+                codon_usage[seq] = 1
+
+    names = {
+        "TAG": "Amber",
+        "TAA": "Ochre",
+        "TGA": "Opal",
+    }
+
+    # TODO: print all actg combinations? Or just ones that are there
+    print ("# Name\tCodon\tCount")
+    for key in sorted(codon_usage):
+        print ("\t".join((names.get(key.upper(), "None"), key, str(codon_usage[key]))))
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(
+        description="Summarise stop codon usage", epilog=""
+    )
+    parser.add_argument("fasta", type=argparse.FileType("r"), help="Fasta Genome")
+    parser.add_argument("gff3", type=argparse.FileType("r"), help="GFF3 File")
+    args = parser.parse_args()
+    main(**vars(args))