comparison lipory.py @ 4:b79df4966ebb draft

planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
author cpt
date Mon, 05 Jun 2023 02:45:43 +0000
parents
children
comparison
equal deleted inserted replaced
3:68e1e56e338a 4:b79df4966ebb
1 #!/usr/bin/env python
2 import re
3 import sys
4 import argparse
5 import logging
6 from Bio import SeqIO
7 from CPT_GFFParser import gffParse, gffWrite, gffSeqFeature
8 from gff3 import feature_lambda, feature_test_type, get_id
9 from Bio.SeqFeature import SeqFeature, FeatureLocation
10
11 logging.basicConfig(level=logging.INFO)
12 log = logging.getLogger(__name__)
13
14
15 def find_lipoprotein(gff3_file, fasta_genome, lipobox_mindist=10, lipobox_maxdist=40):
16 seq_dict = SeqIO.to_dict(SeqIO.parse(fasta_genome, "fasta"))
17
18 CASES = [
19 re.compile(
20 "^.{%s,%s}[ILMFTV][^REKD][GAS]C" % (lipobox_mindist, lipobox_maxdist)
21 ),
22 re.compile("^.{%s,%s}AW[AGS]C" % (lipobox_mindist, lipobox_maxdist)),
23 # Make sure to not have multiple cases that share matches, will introduce duplicate features into gff3 file
24 ]
25
26 for record in gffParse(gff3_file, base_dict=seq_dict):
27 good_features = []
28
29 genes = list(
30 feature_lambda(
31 record.features, feature_test_type, {"type": "gene"}, subfeatures=True
32 )
33 )
34 for gene in genes:
35 cdss = list(
36 feature_lambda(
37 gene.sub_features,
38 feature_test_type,
39 {"type": "CDS"},
40 subfeatures=False,
41 )
42 )
43 if len(cdss) == 0:
44 continue
45
46 for cds in cdss:
47 try:
48 tmpseq = str(
49 cds.extract(record.seq).translate(table=11, cds=True)
50 ).replace("*", "")
51 except:
52 continue
53
54 for case in CASES:
55 m = case.search(tmpseq)
56 if m:
57 if cds.location.strand > 0:
58 start = cds.location.start + (3 * (m.end() - 4))
59 end = cds.location.start + (3 * m.end())
60 else:
61 start = cds.location.end - (3 * (m.end() - 4))
62 end = cds.location.end - (3 * m.end())
63
64 tmp = gffSeqFeature(
65 FeatureLocation(
66 min(start, end),
67 max(start, end),
68 strand=cds.location.strand,
69 ),
70 type="Lipobox",
71 qualifiers={
72 "source": "CPT_LipoRy",
73 "ID": "%s.lipobox" % get_id(gene),
74 },
75 )
76 tmp.qualifiers["sequence"] = str(
77 tmp.extract(record).seq.translate()
78 )
79
80 gene.sub_features.append(tmp)
81 good_features.append(gene)
82
83 record.features = good_features
84 yield [record]
85
86
87 if __name__ == "__main__":
88 parser = argparse.ArgumentParser(description="Filter out lipoproteins", epilog="")
89 parser.add_argument(
90 "gff3_file", type=argparse.FileType("r"), help="Naive ORF Calls"
91 )
92 parser.add_argument(
93 "fasta_genome", type=argparse.FileType("r"), help="Fasta genome sequence"
94 )
95
96 parser.add_argument(
97 "--lipobox_mindist",
98 type=int,
99 help="Minimum distance in codons to start of lipobox",
100 default=10,
101 )
102 parser.add_argument(
103 "--lipobox_maxdist",
104 type=int,
105 help="Maximum distance in codons to start of lipobox",
106 default=40,
107 )
108
109 args = parser.parse_args()
110
111 args = vars(parser.parse_args())
112 for record in find_lipoprotein(**args):
113 record[0].annotations = {}
114 gffWrite(record, sys.stdout)