Mercurial > repos > cpt > cpt_lipory
comparison lipory.py @ 4:b79df4966ebb draft
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
author | cpt |
---|---|
date | Mon, 05 Jun 2023 02:45:43 +0000 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
3:68e1e56e338a | 4:b79df4966ebb |
---|---|
1 #!/usr/bin/env python | |
2 import re | |
3 import sys | |
4 import argparse | |
5 import logging | |
6 from Bio import SeqIO | |
7 from CPT_GFFParser import gffParse, gffWrite, gffSeqFeature | |
8 from gff3 import feature_lambda, feature_test_type, get_id | |
9 from Bio.SeqFeature import SeqFeature, FeatureLocation | |
10 | |
11 logging.basicConfig(level=logging.INFO) | |
12 log = logging.getLogger(__name__) | |
13 | |
14 | |
15 def find_lipoprotein(gff3_file, fasta_genome, lipobox_mindist=10, lipobox_maxdist=40): | |
16 seq_dict = SeqIO.to_dict(SeqIO.parse(fasta_genome, "fasta")) | |
17 | |
18 CASES = [ | |
19 re.compile( | |
20 "^.{%s,%s}[ILMFTV][^REKD][GAS]C" % (lipobox_mindist, lipobox_maxdist) | |
21 ), | |
22 re.compile("^.{%s,%s}AW[AGS]C" % (lipobox_mindist, lipobox_maxdist)), | |
23 # Make sure to not have multiple cases that share matches, will introduce duplicate features into gff3 file | |
24 ] | |
25 | |
26 for record in gffParse(gff3_file, base_dict=seq_dict): | |
27 good_features = [] | |
28 | |
29 genes = list( | |
30 feature_lambda( | |
31 record.features, feature_test_type, {"type": "gene"}, subfeatures=True | |
32 ) | |
33 ) | |
34 for gene in genes: | |
35 cdss = list( | |
36 feature_lambda( | |
37 gene.sub_features, | |
38 feature_test_type, | |
39 {"type": "CDS"}, | |
40 subfeatures=False, | |
41 ) | |
42 ) | |
43 if len(cdss) == 0: | |
44 continue | |
45 | |
46 for cds in cdss: | |
47 try: | |
48 tmpseq = str( | |
49 cds.extract(record.seq).translate(table=11, cds=True) | |
50 ).replace("*", "") | |
51 except: | |
52 continue | |
53 | |
54 for case in CASES: | |
55 m = case.search(tmpseq) | |
56 if m: | |
57 if cds.location.strand > 0: | |
58 start = cds.location.start + (3 * (m.end() - 4)) | |
59 end = cds.location.start + (3 * m.end()) | |
60 else: | |
61 start = cds.location.end - (3 * (m.end() - 4)) | |
62 end = cds.location.end - (3 * m.end()) | |
63 | |
64 tmp = gffSeqFeature( | |
65 FeatureLocation( | |
66 min(start, end), | |
67 max(start, end), | |
68 strand=cds.location.strand, | |
69 ), | |
70 type="Lipobox", | |
71 qualifiers={ | |
72 "source": "CPT_LipoRy", | |
73 "ID": "%s.lipobox" % get_id(gene), | |
74 }, | |
75 ) | |
76 tmp.qualifiers["sequence"] = str( | |
77 tmp.extract(record).seq.translate() | |
78 ) | |
79 | |
80 gene.sub_features.append(tmp) | |
81 good_features.append(gene) | |
82 | |
83 record.features = good_features | |
84 yield [record] | |
85 | |
86 | |
87 if __name__ == "__main__": | |
88 parser = argparse.ArgumentParser(description="Filter out lipoproteins", epilog="") | |
89 parser.add_argument( | |
90 "gff3_file", type=argparse.FileType("r"), help="Naive ORF Calls" | |
91 ) | |
92 parser.add_argument( | |
93 "fasta_genome", type=argparse.FileType("r"), help="Fasta genome sequence" | |
94 ) | |
95 | |
96 parser.add_argument( | |
97 "--lipobox_mindist", | |
98 type=int, | |
99 help="Minimum distance in codons to start of lipobox", | |
100 default=10, | |
101 ) | |
102 parser.add_argument( | |
103 "--lipobox_maxdist", | |
104 type=int, | |
105 help="Maximum distance in codons to start of lipobox", | |
106 default=40, | |
107 ) | |
108 | |
109 args = parser.parse_args() | |
110 | |
111 args = vars(parser.parse_args()) | |
112 for record in find_lipoprotein(**args): | |
113 record[0].annotations = {} | |
114 gffWrite(record, sys.stdout) |