0
|
1 #!/usr/bin/env python
|
|
2 from Bio.Seq import Seq, reverse_complement, translate
|
|
3 from Bio.SeqRecord import SeqRecord
|
|
4 from Bio import SeqIO
|
|
5 from Bio.Data import CodonTable
|
|
6 import logging
|
|
7
|
|
8 logging.basicConfig()
|
|
9 log = logging.getLogger()
|
|
10
|
|
11 PHAGE_IN_MIDDLE = re.compile("^(?P<host>.*)\s*phage (?P<phage>.*)$")
|
|
12 BACTERIOPHAGE_IN_MIDDLE = re.compile("^(?P<host>.*)\s*bacteriophage (?P<phage>.*)$")
|
|
13 STARTS_WITH_PHAGE = re.compile(
|
|
14 "^(bacterio|vibrio|Bacterio|Vibrio|)?[Pp]hage (?P<phage>.*)$"
|
|
15 )
|
|
16 NEW_STYLE_NAMES = re.compile("(?P<phage>v[A-Z]_[A-Z][a-z]{2}_.*)")
|
|
17
|
|
18
|
|
19 def phage_name_parser(name):
|
|
20 host = None
|
|
21 phage = None
|
|
22 name = name.replace(", complete genome.", "")
|
|
23 name = name.replace(", complete genome", "")
|
|
24
|
|
25 m = BACTERIOPHAGE_IN_MIDDLE.match(name)
|
|
26 if m:
|
|
27 host = m.group("host")
|
|
28 phage = m.group("phage")
|
|
29 return (host, phage)
|
|
30
|
|
31 m = PHAGE_IN_MIDDLE.match(name)
|
|
32 if m:
|
|
33 host = m.group("host")
|
|
34 phage = m.group("phage")
|
|
35 return (host, phage)
|
|
36
|
|
37 m = STARTS_WITH_PHAGE.match(name)
|
|
38 if m:
|
|
39 phage = m.group("phage")
|
|
40 return (host, phage)
|
|
41
|
|
42 m = NEW_STYLE_NAMES.match(name)
|
|
43 if m:
|
|
44 phage = m.group("phage")
|
|
45 return (host, phage)
|
|
46
|
|
47 return (host, phage)
|
|
48
|
|
49
|
|
50 class OrfFinder(object):
|
|
51 def __init__(self, table, ftype, ends, min_len, strand):
|
|
52 self.table = table
|
|
53 self.table_obj = CodonTable.ambiguous_generic_by_id[table]
|
|
54 self.ends = ends
|
|
55 self.ftype = ftype
|
|
56 self.min_len = min_len
|
|
57 self.starts = sorted(self.table_obj.start_codons)
|
|
58 self.stops = sorted(self.table_obj.stop_codons)
|
|
59 self.re_starts = re.compile("|".join(self.starts))
|
|
60 self.re_stops = re.compile("|".join(self.stops))
|
|
61 self.strand = strand
|
|
62
|
|
63 def locate(self, fasta_file, out_nuc, out_prot, out_bed, out_gff3):
|
|
64 seq_format = "fasta"
|
|
65 log.debug("Genetic code table %i" % self.table)
|
|
66 log.debug("Minimum length %i aa" % self.min_len)
|
|
67
|
|
68 out_count = 0
|
|
69
|
|
70 out_gff3.write("##gff-version 3\n")
|
|
71
|
|
72 for idx, record in enumerate(SeqIO.parse(fasta_file, seq_format)):
|
|
73 for i, (f_start, f_end, f_strand, n, t) in enumerate(
|
|
74 self.get_all_peptides(str(record.seq).upper())
|
|
75 ):
|
|
76 out_count += 1
|
|
77
|
|
78 descr = "length %i aa, %i bp, from %s..%s[%s] of %s" % (
|
|
79 len(t),
|
|
80 len(n),
|
|
81 f_start,
|
|
82 f_end,
|
|
83 f_strand,
|
|
84 record.description,
|
|
85 )
|
|
86 fid = record.id + "|%s%i" % (self.ftype, i + 1)
|
|
87
|
|
88 r = SeqRecord(Seq(n), id=fid, name="", description=descr)
|
|
89 t = SeqRecord(Seq(t), id=fid, name="", description=descr)
|
|
90
|
|
91 SeqIO.write(r, out_nuc, "fasta")
|
|
92 SeqIO.write(t, out_prot, "fasta")
|
|
93
|
|
94 nice_strand = "+" if f_strand == +1 else "-"
|
|
95
|
|
96 out_bed.write(
|
|
97 "\t".join(
|
|
98 map(str, [record.id, f_start, f_end, fid, 0, nice_strand])
|
|
99 )
|
|
100 + "\n"
|
|
101 )
|
|
102
|
|
103 out_gff3.write(
|
|
104 "\t".join(
|
|
105 map(
|
|
106 str,
|
|
107 [
|
|
108 record.id,
|
|
109 "getOrfsOrCds",
|
|
110 "CDS",
|
|
111 f_start + 1,
|
|
112 f_end,
|
|
113 ".",
|
|
114 nice_strand,
|
|
115 0,
|
|
116 "ID=%s.%s.%s" % (self.ftype, idx, i + 1),
|
|
117 ],
|
|
118 )
|
|
119 )
|
|
120 + "\n"
|
|
121 )
|
|
122 log.info("Found %i %ss", out_count, self.ftype)
|
|
123
|
|
124 def start_chop_and_trans(self, s, strict=True):
|
|
125 """Returns offset, trimmed nuc, protein."""
|
|
126 if strict:
|
|
127 assert s[-3:] in self.stops, s
|
|
128 assert len(s) % 3 == 0
|
|
129 for match in self.re_starts.finditer(s, overlapped=True):
|
|
130 # Must check the start is in frame
|
|
131 start = match.start()
|
|
132 if start % 3 == 0:
|
|
133 n = s[start:]
|
|
134 assert len(n) % 3 == 0, "%s is len %i" % (n, len(n))
|
|
135 if strict:
|
|
136 t = translate(n, self.table)
|
|
137 else:
|
|
138 # Use when missing stop codon,
|
|
139 t = "M" + translate(n[3:], self.table, to_stop=True)
|
|
140 yield start, n, t # Edited by CPT to be a generator
|
|
141
|
|
142 def break_up_frame(self, s):
|
|
143 """Returns offset, nuc, protein."""
|
|
144 start = 0
|
|
145 for match in self.re_stops.finditer(s, overlapped=True):
|
|
146 index = match.start() + 3
|
|
147 if index % 3 != 0:
|
|
148 continue
|
|
149 n = s[start:index]
|
|
150 for (offset, n, t) in self.start_chop_and_trans(n):
|
|
151 if n and len(t) >= self.min_len:
|
|
152 yield start + offset, n, t
|
|
153 start = index
|
|
154
|
|
155 def putative_genes_in_sequence(self, nuc_seq):
|
|
156 """Returns start, end, strand, nucleotides, protein.
|
|
157 Co-ordinates are Python style zero-based.
|
|
158 """
|
|
159 nuc_seq = nuc_seq.upper()
|
|
160 # TODO - Refactor to use a generator function (in start order)
|
|
161 # rather than making a list and sorting?
|
|
162 answer = []
|
|
163 full_len = len(nuc_seq)
|
|
164
|
|
165 for frame in range(0, 3):
|
|
166 for offset, n, t in self.break_up_frame(nuc_seq[frame:]):
|
|
167 start = frame + offset # zero based
|
|
168 answer.append((start, start + len(n), +1, n, t))
|
|
169
|
|
170 rc = reverse_complement(nuc_seq)
|
|
171 for frame in range(0, 3):
|
|
172 for offset, n, t in self.break_up_frame(rc[frame:]):
|
|
173 start = full_len - frame - offset # zero based
|
|
174 answer.append((start, start - len(n), -1, n, t))
|
|
175 answer.sort()
|
|
176 return answer
|
|
177
|
|
178 def get_all_peptides(self, nuc_seq):
|
|
179 """Returns start, end, strand, nucleotides, protein.
|
|
180
|
|
181 Co-ordinates are Python style zero-based.
|
|
182 """
|
|
183 # Refactored into generator by CPT
|
|
184 full_len = len(nuc_seq)
|
|
185 if self.strand != "reverse":
|
|
186 for frame in range(0, 3):
|
|
187 for offset, n, t in self.break_up_frame(nuc_seq[frame:]):
|
|
188 start = frame + offset # zero based
|
|
189 yield (start, start + len(n), +1, n, t)
|
|
190 if self.strand != "forward":
|
|
191 rc = reverse_complement(nuc_seq)
|
|
192 for frame in range(0, 3):
|
|
193 for offset, n, t in self.break_up_frame(rc[frame:]):
|
|
194 start = full_len - frame - offset # zero based
|
|
195 yield (start - len(n), start, -1, n, t)
|
|
196
|
|
197
|
|
198 class MGAFinder(object):
|
|
199 def __init__(self, table, ftype, ends, min_len):
|
|
200 self.table = table
|
|
201 self.table_obj = CodonTable.ambiguous_generic_by_id[table]
|
|
202 self.ends = ends
|
|
203 self.ftype = ftype
|
|
204 self.min_len = min_len
|
|
205 self.starts = sorted(self.table_obj.start_codons)
|
|
206 self.stops = sorted(self.table_obj.stop_codons)
|
|
207 self.re_starts = re.compile("|".join(self.starts))
|
|
208 self.re_stops = re.compile("|".join(self.stops))
|
|
209
|
|
210 def locate(self, fasta_file, out_nuc, out_prot, out_bed, out_gff3):
|
|
211 seq_format = "fasta"
|
|
212 log.debug("Genetic code table %i" % self.table)
|
|
213 log.debug("Minimum length %i aa" % self.min_len)
|
|
214
|
|
215 out_count = 0
|
|
216
|
|
217 out_gff3.write("##gff-version 3\n")
|
|
218
|
|
219 for idx, record in enumerate(SeqIO.parse(fasta_file, seq_format)):
|
|
220 for i, (f_start, f_end, f_strand, n, t) in enumerate(
|
|
221 self.get_all_peptides(str(record.seq).upper())
|
|
222 ):
|
|
223 out_count += 1
|
|
224
|
|
225 descr = "length %i aa, %i bp, from %s..%s[%s] of %s" % (
|
|
226 len(t),
|
|
227 len(n),
|
|
228 f_start,
|
|
229 f_end,
|
|
230 f_strand,
|
|
231 record.description,
|
|
232 )
|
|
233 fid = record.id + "|%s%i" % (self.ftype, i + 1)
|
|
234
|
|
235 r = SeqRecord(Seq(n), id=fid, name="", description=descr)
|
|
236 t = SeqRecord(Seq(t), id=fid, name="", description=descr)
|
|
237
|
|
238 SeqIO.write(r, out_nuc, "fasta")
|
|
239 SeqIO.write(t, out_prot, "fasta")
|
|
240
|
|
241 nice_strand = "+" if f_strand == +1 else "-"
|
|
242
|
|
243 out_bed.write(
|
|
244 "\t".join(
|
|
245 map(str, [record.id, f_start, f_end, fid, 0, nice_strand])
|
|
246 )
|
|
247 + "\n"
|
|
248 )
|
|
249
|
|
250 out_gff3.write(
|
|
251 "\t".join(
|
|
252 map(
|
|
253 str,
|
|
254 [
|
|
255 record.id,
|
|
256 "getOrfsOrCds",
|
|
257 "CDS",
|
|
258 f_start + 1,
|
|
259 f_end,
|
|
260 ".",
|
|
261 nice_strand,
|
|
262 0,
|
|
263 "ID=%s.%s.%s" % (self.ftype, idx, i + 1),
|
|
264 ],
|
|
265 )
|
|
266 )
|
|
267 + "\n"
|
|
268 )
|
|
269 log.info("Found %i %ss", out_count, self.ftype)
|
|
270
|
|
271 def start_chop_and_trans(self, s, strict=True):
|
|
272 """Returns offset, trimmed nuc, protein."""
|
|
273 if strict:
|
|
274 assert s[-3:] in self.stops, s
|
|
275 assert len(s) % 3 == 0
|
|
276 for match in self.re_starts.finditer(s, overlapped=True):
|
|
277 # Must check the start is in frame
|
|
278 start = match.start()
|
|
279 if start % 3 == 0:
|
|
280 n = s[start:]
|
|
281 assert len(n) % 3 == 0, "%s is len %i" % (n, len(n))
|
|
282 if strict:
|
|
283 t = translate(n, self.table)
|
|
284 else:
|
|
285 # Use when missing stop codon,
|
|
286 t = "M" + translate(n[3:], self.table, to_stop=True)
|
|
287 yield start, n, t
|
|
288
|
|
289 def break_up_frame(self, s):
|
|
290 """Returns offset, nuc, protein."""
|
|
291 start = 0
|
|
292 for match in self.re_stops.finditer(s, overlapped=True):
|
|
293 index = match.start() + 3
|
|
294 if index % 3 != 0:
|
|
295 continue
|
|
296 n = s[start:index]
|
|
297 for (offset, n, t) in self.start_chop_and_trans(n):
|
|
298 if n and len(t) >= self.min_len:
|
|
299 yield start + offset, n, t
|
|
300 start = index
|
|
301
|
|
302 def putative_genes_in_sequence(self, nuc_seq):
|
|
303 """Returns start, end, strand, nucleotides, protein.
|
|
304 Co-ordinates are Python style zero-based.
|
|
305 """
|
|
306 nuc_seq = nuc_seq.upper()
|
|
307 # TODO - Refactor to use a generator function (in start order)
|
|
308 # rather than making a list and sorting?
|
|
309 answer = []
|
|
310 full_len = len(nuc_seq)
|
|
311
|
|
312 for frame in range(0, 3):
|
|
313 for offset, n, t in self.break_up_frame(nuc_seq[frame:]):
|
|
314 start = frame + offset # zero based
|
|
315 answer.append((start, start + len(n), +1, n, t))
|
|
316
|
|
317 rc = reverse_complement(nuc_seq)
|
|
318 for frame in range(0, 3):
|
|
319 for offset, n, t in self.break_up_frame(rc[frame:]):
|
|
320 start = full_len - frame - offset # zero based
|
|
321 answer.append((start, start - len(n), -1, n, t))
|
|
322 answer.sort()
|
|
323 return answer
|
|
324
|
|
325 def get_all_peptides(self, nuc_seq):
|
|
326 """Returns start, end, strand, nucleotides, protein.
|
|
327
|
|
328 Co-ordinates are Python style zero-based.
|
|
329 """
|
|
330 # Refactored into generator by CPT
|
|
331
|
|
332 full_len = len(nuc_seq)
|
|
333 for frame in range(0, 3):
|
|
334 for offset, n, t in self.break_up_frame(nuc_seq[frame:]):
|
|
335 start = frame + offset # zero based
|
|
336 yield (start, start + len(n), +1, n, t)
|
|
337 rc = reverse_complement(nuc_seq)
|
|
338 for frame in range(0, 3):
|
|
339 for offset, n, t in self.break_up_frame(rc[frame:]):
|
|
340 start = full_len - frame - offset # zero based
|
|
341 yield (start - len(n), start, -1, n, t)
|