diff lipory.py @ 4:b79df4966ebb draft

planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
author cpt
date Mon, 05 Jun 2023 02:45:43 +0000
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/lipory.py	Mon Jun 05 02:45:43 2023 +0000
@@ -0,0 +1,114 @@
+#!/usr/bin/env python
+import re
+import sys
+import argparse
+import logging
+from Bio import SeqIO
+from CPT_GFFParser import gffParse, gffWrite, gffSeqFeature
+from gff3 import feature_lambda, feature_test_type, get_id
+from Bio.SeqFeature import SeqFeature, FeatureLocation
+
+logging.basicConfig(level=logging.INFO)
+log = logging.getLogger(__name__)
+
+
+def find_lipoprotein(gff3_file, fasta_genome, lipobox_mindist=10, lipobox_maxdist=40):
+    seq_dict = SeqIO.to_dict(SeqIO.parse(fasta_genome, "fasta"))
+
+    CASES = [
+        re.compile(
+            "^.{%s,%s}[ILMFTV][^REKD][GAS]C" % (lipobox_mindist, lipobox_maxdist)
+        ),
+        re.compile("^.{%s,%s}AW[AGS]C" % (lipobox_mindist, lipobox_maxdist)),
+        # Make sure to not have multiple cases that share matches, will introduce duplicate features into gff3 file
+    ]
+
+    for record in gffParse(gff3_file, base_dict=seq_dict):
+        good_features = []
+
+        genes = list(
+            feature_lambda(
+                record.features, feature_test_type, {"type": "gene"}, subfeatures=True
+            )
+        )
+        for gene in genes:
+            cdss = list(
+                feature_lambda(
+                    gene.sub_features,
+                    feature_test_type,
+                    {"type": "CDS"},
+                    subfeatures=False,
+                )
+            )
+            if len(cdss) == 0:
+                continue
+
+            for cds in cdss:
+                try:
+                    tmpseq = str(
+                        cds.extract(record.seq).translate(table=11, cds=True)
+                    ).replace("*", "")
+                except:
+                    continue
+
+                for case in CASES:
+                    m = case.search(tmpseq)
+                    if m:
+                        if cds.location.strand > 0:
+                            start = cds.location.start + (3 * (m.end() - 4))
+                            end = cds.location.start + (3 * m.end())
+                        else:
+                            start = cds.location.end - (3 * (m.end() - 4))
+                            end = cds.location.end - (3 * m.end())
+
+                        tmp = gffSeqFeature(
+                            FeatureLocation(
+                                min(start, end),
+                                max(start, end),
+                                strand=cds.location.strand,
+                            ),
+                            type="Lipobox",
+                            qualifiers={
+                                "source": "CPT_LipoRy",
+                                "ID": "%s.lipobox" % get_id(gene),
+                            },
+                        )
+                        tmp.qualifiers["sequence"] = str(
+                            tmp.extract(record).seq.translate()
+                        )
+
+                        gene.sub_features.append(tmp)
+                        good_features.append(gene)
+
+            record.features = good_features
+        yield [record]
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description="Filter out lipoproteins", epilog="")
+    parser.add_argument(
+        "gff3_file", type=argparse.FileType("r"), help="Naive ORF Calls"
+    )
+    parser.add_argument(
+        "fasta_genome", type=argparse.FileType("r"), help="Fasta genome sequence"
+    )
+
+    parser.add_argument(
+        "--lipobox_mindist",
+        type=int,
+        help="Minimum distance in codons to start of lipobox",
+        default=10,
+    )
+    parser.add_argument(
+        "--lipobox_maxdist",
+        type=int,
+        help="Maximum distance in codons to start of lipobox",
+        default=40,
+    )
+
+    args = parser.parse_args()
+
+    args = vars(parser.parse_args())
+    for record in find_lipoprotein(**args):
+        record[0].annotations = {}
+        gffWrite(record, sys.stdout)