Mercurial > repos > cpt > cpt_find_spanins
comparison cpt.py @ 0:9708448ce811 draft
Uploaded
author | cpt |
---|---|
date | Fri, 17 Jun 2022 12:39:30 +0000 |
parents | |
children | 46b252c89e9e |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:9708448ce811 |
---|---|
1 #!/usr/bin/env python | |
2 from Bio.Seq import Seq, reverse_complement, translate | |
3 from Bio.SeqRecord import SeqRecord | |
4 from Bio import SeqIO | |
5 from Bio.Data import CodonTable | |
6 import logging | |
7 | |
8 logging.basicConfig() | |
9 log = logging.getLogger() | |
10 | |
11 PHAGE_IN_MIDDLE = re.compile("^(?P<host>.*)\s*phage (?P<phage>.*)$") | |
12 BACTERIOPHAGE_IN_MIDDLE = re.compile("^(?P<host>.*)\s*bacteriophage (?P<phage>.*)$") | |
13 STARTS_WITH_PHAGE = re.compile( | |
14 "^(bacterio|vibrio|Bacterio|Vibrio|)?[Pp]hage (?P<phage>.*)$" | |
15 ) | |
16 NEW_STYLE_NAMES = re.compile("(?P<phage>v[A-Z]_[A-Z][a-z]{2}_.*)") | |
17 | |
18 | |
19 def phage_name_parser(name): | |
20 host = None | |
21 phage = None | |
22 name = name.replace(", complete genome.", "") | |
23 name = name.replace(", complete genome", "") | |
24 | |
25 m = BACTERIOPHAGE_IN_MIDDLE.match(name) | |
26 if m: | |
27 host = m.group("host") | |
28 phage = m.group("phage") | |
29 return (host, phage) | |
30 | |
31 m = PHAGE_IN_MIDDLE.match(name) | |
32 if m: | |
33 host = m.group("host") | |
34 phage = m.group("phage") | |
35 return (host, phage) | |
36 | |
37 m = STARTS_WITH_PHAGE.match(name) | |
38 if m: | |
39 phage = m.group("phage") | |
40 return (host, phage) | |
41 | |
42 m = NEW_STYLE_NAMES.match(name) | |
43 if m: | |
44 phage = m.group("phage") | |
45 return (host, phage) | |
46 | |
47 return (host, phage) | |
48 | |
49 | |
50 class OrfFinder(object): | |
51 def __init__(self, table, ftype, ends, min_len, strand): | |
52 self.table = table | |
53 self.table_obj = CodonTable.ambiguous_generic_by_id[table] | |
54 self.ends = ends | |
55 self.ftype = ftype | |
56 self.min_len = min_len | |
57 self.starts = sorted(self.table_obj.start_codons) | |
58 self.stops = sorted(self.table_obj.stop_codons) | |
59 self.re_starts = re.compile("|".join(self.starts)) | |
60 self.re_stops = re.compile("|".join(self.stops)) | |
61 self.strand = strand | |
62 | |
63 def locate(self, fasta_file, out_nuc, out_prot, out_bed, out_gff3): | |
64 seq_format = "fasta" | |
65 log.debug("Genetic code table %i" % self.table) | |
66 log.debug("Minimum length %i aa" % self.min_len) | |
67 | |
68 out_count = 0 | |
69 | |
70 out_gff3.write("##gff-version 3\n") | |
71 | |
72 for idx, record in enumerate(SeqIO.parse(fasta_file, seq_format)): | |
73 for i, (f_start, f_end, f_strand, n, t) in enumerate( | |
74 self.get_all_peptides(str(record.seq).upper()) | |
75 ): | |
76 out_count += 1 | |
77 | |
78 descr = "length %i aa, %i bp, from %s..%s[%s] of %s" % ( | |
79 len(t), | |
80 len(n), | |
81 f_start, | |
82 f_end, | |
83 f_strand, | |
84 record.description, | |
85 ) | |
86 fid = record.id + "|%s%i" % (self.ftype, i + 1) | |
87 | |
88 r = SeqRecord(Seq(n), id=fid, name="", description=descr) | |
89 t = SeqRecord(Seq(t), id=fid, name="", description=descr) | |
90 | |
91 SeqIO.write(r, out_nuc, "fasta") | |
92 SeqIO.write(t, out_prot, "fasta") | |
93 | |
94 nice_strand = "+" if f_strand == +1 else "-" | |
95 | |
96 out_bed.write( | |
97 "\t".join( | |
98 map(str, [record.id, f_start, f_end, fid, 0, nice_strand]) | |
99 ) | |
100 + "\n" | |
101 ) | |
102 | |
103 out_gff3.write( | |
104 "\t".join( | |
105 map( | |
106 str, | |
107 [ | |
108 record.id, | |
109 "getOrfsOrCds", | |
110 "CDS", | |
111 f_start + 1, | |
112 f_end, | |
113 ".", | |
114 nice_strand, | |
115 0, | |
116 "ID=%s.%s.%s" % (self.ftype, idx, i + 1), | |
117 ], | |
118 ) | |
119 ) | |
120 + "\n" | |
121 ) | |
122 log.info("Found %i %ss", out_count, self.ftype) | |
123 | |
124 def start_chop_and_trans(self, s, strict=True): | |
125 """Returns offset, trimmed nuc, protein.""" | |
126 if strict: | |
127 assert s[-3:] in self.stops, s | |
128 assert len(s) % 3 == 0 | |
129 for match in self.re_starts.finditer(s, overlapped=True): | |
130 # Must check the start is in frame | |
131 start = match.start() | |
132 if start % 3 == 0: | |
133 n = s[start:] | |
134 assert len(n) % 3 == 0, "%s is len %i" % (n, len(n)) | |
135 if strict: | |
136 t = translate(n, self.table) | |
137 else: | |
138 # Use when missing stop codon, | |
139 t = "M" + translate(n[3:], self.table, to_stop=True) | |
140 yield start, n, t # Edited by CPT to be a generator | |
141 | |
142 def break_up_frame(self, s): | |
143 """Returns offset, nuc, protein.""" | |
144 start = 0 | |
145 for match in self.re_stops.finditer(s, overlapped=True): | |
146 index = match.start() + 3 | |
147 if index % 3 != 0: | |
148 continue | |
149 n = s[start:index] | |
150 for (offset, n, t) in self.start_chop_and_trans(n): | |
151 if n and len(t) >= self.min_len: | |
152 yield start + offset, n, t | |
153 start = index | |
154 | |
155 def putative_genes_in_sequence(self, nuc_seq): | |
156 """Returns start, end, strand, nucleotides, protein. | |
157 Co-ordinates are Python style zero-based. | |
158 """ | |
159 nuc_seq = nuc_seq.upper() | |
160 # TODO - Refactor to use a generator function (in start order) | |
161 # rather than making a list and sorting? | |
162 answer = [] | |
163 full_len = len(nuc_seq) | |
164 | |
165 for frame in range(0, 3): | |
166 for offset, n, t in self.break_up_frame(nuc_seq[frame:]): | |
167 start = frame + offset # zero based | |
168 answer.append((start, start + len(n), +1, n, t)) | |
169 | |
170 rc = reverse_complement(nuc_seq) | |
171 for frame in range(0, 3): | |
172 for offset, n, t in self.break_up_frame(rc[frame:]): | |
173 start = full_len - frame - offset # zero based | |
174 answer.append((start, start - len(n), -1, n, t)) | |
175 answer.sort() | |
176 return answer | |
177 | |
178 def get_all_peptides(self, nuc_seq): | |
179 """Returns start, end, strand, nucleotides, protein. | |
180 | |
181 Co-ordinates are Python style zero-based. | |
182 """ | |
183 # Refactored into generator by CPT | |
184 full_len = len(nuc_seq) | |
185 if self.strand != "reverse": | |
186 for frame in range(0, 3): | |
187 for offset, n, t in self.break_up_frame(nuc_seq[frame:]): | |
188 start = frame + offset # zero based | |
189 yield (start, start + len(n), +1, n, t) | |
190 if self.strand != "forward": | |
191 rc = reverse_complement(nuc_seq) | |
192 for frame in range(0, 3): | |
193 for offset, n, t in self.break_up_frame(rc[frame:]): | |
194 start = full_len - frame - offset # zero based | |
195 yield (start - len(n), start, -1, n, t) | |
196 | |
197 | |
198 class MGAFinder(object): | |
199 def __init__(self, table, ftype, ends, min_len): | |
200 self.table = table | |
201 self.table_obj = CodonTable.ambiguous_generic_by_id[table] | |
202 self.ends = ends | |
203 self.ftype = ftype | |
204 self.min_len = min_len | |
205 self.starts = sorted(self.table_obj.start_codons) | |
206 self.stops = sorted(self.table_obj.stop_codons) | |
207 self.re_starts = re.compile("|".join(self.starts)) | |
208 self.re_stops = re.compile("|".join(self.stops)) | |
209 | |
210 def locate(self, fasta_file, out_nuc, out_prot, out_bed, out_gff3): | |
211 seq_format = "fasta" | |
212 log.debug("Genetic code table %i" % self.table) | |
213 log.debug("Minimum length %i aa" % self.min_len) | |
214 | |
215 out_count = 0 | |
216 | |
217 out_gff3.write("##gff-version 3\n") | |
218 | |
219 for idx, record in enumerate(SeqIO.parse(fasta_file, seq_format)): | |
220 for i, (f_start, f_end, f_strand, n, t) in enumerate( | |
221 self.get_all_peptides(str(record.seq).upper()) | |
222 ): | |
223 out_count += 1 | |
224 | |
225 descr = "length %i aa, %i bp, from %s..%s[%s] of %s" % ( | |
226 len(t), | |
227 len(n), | |
228 f_start, | |
229 f_end, | |
230 f_strand, | |
231 record.description, | |
232 ) | |
233 fid = record.id + "|%s%i" % (self.ftype, i + 1) | |
234 | |
235 r = SeqRecord(Seq(n), id=fid, name="", description=descr) | |
236 t = SeqRecord(Seq(t), id=fid, name="", description=descr) | |
237 | |
238 SeqIO.write(r, out_nuc, "fasta") | |
239 SeqIO.write(t, out_prot, "fasta") | |
240 | |
241 nice_strand = "+" if f_strand == +1 else "-" | |
242 | |
243 out_bed.write( | |
244 "\t".join( | |
245 map(str, [record.id, f_start, f_end, fid, 0, nice_strand]) | |
246 ) | |
247 + "\n" | |
248 ) | |
249 | |
250 out_gff3.write( | |
251 "\t".join( | |
252 map( | |
253 str, | |
254 [ | |
255 record.id, | |
256 "getOrfsOrCds", | |
257 "CDS", | |
258 f_start + 1, | |
259 f_end, | |
260 ".", | |
261 nice_strand, | |
262 0, | |
263 "ID=%s.%s.%s" % (self.ftype, idx, i + 1), | |
264 ], | |
265 ) | |
266 ) | |
267 + "\n" | |
268 ) | |
269 log.info("Found %i %ss", out_count, self.ftype) | |
270 | |
271 def start_chop_and_trans(self, s, strict=True): | |
272 """Returns offset, trimmed nuc, protein.""" | |
273 if strict: | |
274 assert s[-3:] in self.stops, s | |
275 assert len(s) % 3 == 0 | |
276 for match in self.re_starts.finditer(s, overlapped=True): | |
277 # Must check the start is in frame | |
278 start = match.start() | |
279 if start % 3 == 0: | |
280 n = s[start:] | |
281 assert len(n) % 3 == 0, "%s is len %i" % (n, len(n)) | |
282 if strict: | |
283 t = translate(n, self.table) | |
284 else: | |
285 # Use when missing stop codon, | |
286 t = "M" + translate(n[3:], self.table, to_stop=True) | |
287 yield start, n, t | |
288 | |
289 def break_up_frame(self, s): | |
290 """Returns offset, nuc, protein.""" | |
291 start = 0 | |
292 for match in self.re_stops.finditer(s, overlapped=True): | |
293 index = match.start() + 3 | |
294 if index % 3 != 0: | |
295 continue | |
296 n = s[start:index] | |
297 for (offset, n, t) in self.start_chop_and_trans(n): | |
298 if n and len(t) >= self.min_len: | |
299 yield start + offset, n, t | |
300 start = index | |
301 | |
302 def putative_genes_in_sequence(self, nuc_seq): | |
303 """Returns start, end, strand, nucleotides, protein. | |
304 Co-ordinates are Python style zero-based. | |
305 """ | |
306 nuc_seq = nuc_seq.upper() | |
307 # TODO - Refactor to use a generator function (in start order) | |
308 # rather than making a list and sorting? | |
309 answer = [] | |
310 full_len = len(nuc_seq) | |
311 | |
312 for frame in range(0, 3): | |
313 for offset, n, t in self.break_up_frame(nuc_seq[frame:]): | |
314 start = frame + offset # zero based | |
315 answer.append((start, start + len(n), +1, n, t)) | |
316 | |
317 rc = reverse_complement(nuc_seq) | |
318 for frame in range(0, 3): | |
319 for offset, n, t in self.break_up_frame(rc[frame:]): | |
320 start = full_len - frame - offset # zero based | |
321 answer.append((start, start - len(n), -1, n, t)) | |
322 answer.sort() | |
323 return answer | |
324 | |
325 def get_all_peptides(self, nuc_seq): | |
326 """Returns start, end, strand, nucleotides, protein. | |
327 | |
328 Co-ordinates are Python style zero-based. | |
329 """ | |
330 # Refactored into generator by CPT | |
331 | |
332 full_len = len(nuc_seq) | |
333 for frame in range(0, 3): | |
334 for offset, n, t in self.break_up_frame(nuc_seq[frame:]): | |
335 start = frame + offset # zero based | |
336 yield (start, start + len(n), +1, n, t) | |
337 rc = reverse_complement(nuc_seq) | |
338 for frame in range(0, 3): | |
339 for offset, n, t in self.break_up_frame(rc[frame:]): | |
340 start = full_len - frame - offset # zero based | |
341 yield (start - len(n), start, -1, n, t) |