annotate cpt.py @ 0:9708448ce811 draft

Uploaded
author cpt
date Fri, 17 Jun 2022 12:39:30 +0000
parents
children 46b252c89e9e
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
9708448ce811 Uploaded
cpt
parents:
diff changeset
1 #!/usr/bin/env python
9708448ce811 Uploaded
cpt
parents:
diff changeset
import logging
import re

from Bio import SeqIO
from Bio.Data import CodonTable
from Bio.Seq import Seq, reverse_complement, translate
from Bio.SeqRecord import SeqRecord
9708448ce811 Uploaded
cpt
parents:
diff changeset
7
9708448ce811 Uploaded
cpt
parents:
diff changeset
# Install the default logging handler on the root logger and use the root
# logger for all messages emitted by this module.
logging.basicConfig()
log = logging.getLogger()

# Patterns used by phage_name_parser() to pull a host and/or phage name out of
# a free-text name.  They are raw strings so that "\s" reaches the regex
# engine as written instead of being parsed as an (invalid) string escape.

# "<host> phage <phage>"
PHAGE_IN_MIDDLE = re.compile(r"^(?P<host>.*)\s*phage (?P<phage>.*)$")
# "<host> bacteriophage <phage>"
BACTERIOPHAGE_IN_MIDDLE = re.compile(r"^(?P<host>.*)\s*bacteriophage (?P<phage>.*)$")
# "Phage <phage>", optionally prefixed with bacterio/vibrio in either case.
STARTS_WITH_PHAGE = re.compile(
    r"^(bacterio|vibrio|Bacterio|Vibrio|)?[Pp]hage (?P<phage>.*)$"
)
# Names such as "vB_Eco_...": "v", an upper-case letter, "_", an upper-case
# letter followed by two lower-case letters, "_", then anything.
NEW_STYLE_NAMES = re.compile(r"(?P<phage>v[A-Z]_[A-Z][a-z]{2}_.*)")
9708448ce811 Uploaded
cpt
parents:
diff changeset
17
9708448ce811 Uploaded
cpt
parents:
diff changeset
18
9708448ce811 Uploaded
cpt
parents:
diff changeset
def phage_name_parser(name):
    """Split a free-text phage name into a ``(host, phage)`` tuple.

    A ", complete genome" suffix (with or without a trailing period) is
    removed first.  The module-level patterns are then tried in a fixed
    order and the first one that matches decides the result:

    1. ``BACTERIOPHAGE_IN_MIDDLE`` and 2. ``PHAGE_IN_MIDDLE`` give both host
       and phage, exactly as captured (no stripping is applied).
    3. ``STARTS_WITH_PHAGE`` and 4. ``NEW_STYLE_NAMES`` give only the phage;
       host is ``None``.

    Returns ``(None, None)`` when no pattern matches.
    """
    cleaned = name.replace(", complete genome.", "").replace(
        ", complete genome", ""
    )

    # Patterns that capture both a host and a phage group.
    for pattern in (BACTERIOPHAGE_IN_MIDDLE, PHAGE_IN_MIDDLE):
        found = pattern.match(cleaned)
        if found is not None:
            return (found.group("host"), found.group("phage"))

    # Patterns that only capture a phage group.
    for pattern in (STARTS_WITH_PHAGE, NEW_STYLE_NAMES):
        found = pattern.match(cleaned)
        if found is not None:
            return (None, found.group("phage"))

    return (None, None)
9708448ce811 Uploaded
cpt
parents:
diff changeset
48
9708448ce811 Uploaded
cpt
parents:
diff changeset
49
9708448ce811 Uploaded
cpt
parents:
diff changeset
class OrfFinder(object):
    """Find ORFs in the records of a FASTA file and write them out.

    An ORF here runs from an in-frame start codon up to and including the
    next in-frame stop codon, using the start/stop codons of the selected
    Biopython codon table.
    """

    def __init__(self, table, ftype, ends, min_len, strand):
        """Store the search settings and pre-compile the codon patterns.

        Args:
            table: key into ``CodonTable.ambiguous_generic_by_id``; also
                passed to ``translate``.
            ftype: feature type label used in the generated IDs and in the
                final log message.
            ends: stored on the instance; not read by any method of this
                class.
            min_len: minimum length of the translation for an ORF to be
                reported.
            strand: ``"forward"`` or ``"reverse"`` restricts
                ``get_all_peptides`` to that strand; any other value scans
                both strands.
        """
        self.table = table
        self.table_obj = CodonTable.ambiguous_generic_by_id[table]
        self.ends = ends
        self.ftype = ftype
        self.min_len = min_len
        self.starts = sorted(self.table_obj.start_codons)
        self.stops = sorted(self.table_obj.stop_codons)
        self.re_starts = self._overlapping_pattern(self.starts)
        self.re_stops = self._overlapping_pattern(self.stops)
        self.strand = strand

    @staticmethod
    def _overlapping_pattern(codons):
        """Compile a pattern that finds any of *codons* at every position.

        The alternation is wrapped in a capturing lookahead, so ``finditer``
        reports a (zero-width) match at each position where a codon begins,
        including positions that overlap an earlier hit.  This does not rely
        on the ``overlapped=True`` argument, which the standard library
        ``re`` module does not accept.  ``match.start()`` is the codon
        position and ``match.group(1)`` is the codon itself.
        """
        return re.compile("(?=(%s))" % "|".join(codons))

    def locate(self, fasta_file, out_nuc, out_prot, out_bed, out_gff3):
        """Scan every record of *fasta_file* and write all ORFs found.

        Args:
            fasta_file: FASTA input, handed to ``SeqIO.parse``.
            out_nuc: handle receiving the ORF nucleotide sequences (FASTA).
            out_prot: handle receiving the translations (FASTA).
            out_bed: handle receiving one tab-separated BED line per ORF.
            out_gff3: handle receiving the GFF3 header and one CDS line per
                ORF.
        """
        seq_format = "fasta"
        log.debug("Genetic code table %i", self.table)
        log.debug("Minimum length %i aa", self.min_len)

        out_count = 0

        out_gff3.write("##gff-version 3\n")

        for idx, record in enumerate(SeqIO.parse(fasta_file, seq_format)):
            peptides = self.get_all_peptides(str(record.seq).upper())
            # ``i`` restarts at 0 for every record.
            for i, (f_start, f_end, f_strand, n, t) in enumerate(peptides):
                out_count += 1

                descr = "length %i aa, %i bp, from %s..%s[%s] of %s" % (
                    len(t),
                    len(n),
                    f_start,
                    f_end,
                    f_strand,
                    record.description,
                )
                fid = record.id + "|%s%i" % (self.ftype, i + 1)

                nuc_record = SeqRecord(Seq(n), id=fid, name="", description=descr)
                prot_record = SeqRecord(Seq(t), id=fid, name="", description=descr)

                SeqIO.write(nuc_record, out_nuc, "fasta")
                SeqIO.write(prot_record, out_prot, "fasta")

                nice_strand = "+" if f_strand == +1 else "-"

                # BED takes the zero-based coordinates unchanged.
                bed_fields = [record.id, f_start, f_end, fid, 0, nice_strand]
                out_bed.write("\t".join(map(str, bed_fields)) + "\n")

                # GFF3 start is one-based, hence f_start + 1.
                gff_fields = [
                    record.id,
                    "getOrfsOrCds",
                    "CDS",
                    f_start + 1,
                    f_end,
                    ".",
                    nice_strand,
                    0,
                    "ID=%s.%s.%s" % (self.ftype, idx, i + 1),
                ]
                out_gff3.write("\t".join(map(str, gff_fields)) + "\n")
        log.info("Found %i %ss", out_count, self.ftype)

    def start_chop_and_trans(self, s, strict=True):
        """Yield ``(offset, trimmed nuc, protein)`` for each in-frame start.

        Args:
            s: nucleotide string whose length is a multiple of three.
            strict: when true, *s* must end with a stop codon and the whole
                trimmed sequence is translated.  When false, the first codon
                is reported as "M" and the rest is translated up to the
                first stop (for sequences missing a stop codon).

        Raises:
            AssertionError: if the length or (in strict mode) the trailing
                stop codon precondition does not hold.
        """
        if strict:
            assert s[-3:] in self.stops, s
        assert len(s) % 3 == 0
        for match in self.re_starts.finditer(s):
            # Must check the start is in frame
            start = match.start()
            if start % 3 == 0:
                n = s[start:]
                assert len(n) % 3 == 0, "%s is len %i" % (n, len(n))
                if strict:
                    t = translate(n, self.table)
                else:
                    # Use when missing stop codon,
                    t = "M" + translate(n[3:], self.table, to_stop=True)
                yield start, n, t  # Edited by CPT to be a generator

    def break_up_frame(self, s):
        """Yield ``(offset, nuc, protein)`` for the ORFs in one reading frame.

        *s* is cut after every in-frame stop codon; each piece is passed to
        ``start_chop_and_trans`` and every candidate whose translation has at
        least ``min_len`` characters is yielded.  Offsets are relative to the
        start of *s*.  Sequence after the last in-frame stop is not examined.
        """
        start = 0
        for match in self.re_stops.finditer(s):
            index = match.start() + 3
            if index % 3 != 0:
                # Stop codon is out of frame.
                continue
            n = s[start:index]
            for (offset, n, t) in self.start_chop_and_trans(n):
                if n and len(t) >= self.min_len:
                    yield start + offset, n, t
            start = index

    def putative_genes_in_sequence(self, nuc_seq):
        """Return a sorted list of ``(start, end, strand, nucleotides, protein)``.

        Both strands are always scanned, regardless of ``self.strand``.
        Co-ordinates are Python style zero-based.

        NOTE(review): for the reverse strand the tuple is
        ``(start, start - len(n), ...)``, i.e. the first co-ordinate is the
        larger one, unlike ``get_all_peptides`` -- confirm this is intended.
        """
        nuc_seq = nuc_seq.upper()
        # TODO - Refactor to use a generator function (in start order)
        # rather than making a list and sorting?
        answer = []
        full_len = len(nuc_seq)

        for frame in range(0, 3):
            for offset, n, t in self.break_up_frame(nuc_seq[frame:]):
                start = frame + offset  # zero based
                answer.append((start, start + len(n), +1, n, t))

        rc = reverse_complement(nuc_seq)
        for frame in range(0, 3):
            for offset, n, t in self.break_up_frame(rc[frame:]):
                start = full_len - frame - offset  # zero based
                answer.append((start, start - len(n), -1, n, t))
        answer.sort()
        return answer

    def get_all_peptides(self, nuc_seq):
        """Yield ``(start, end, strand, nucleotides, protein)`` for each ORF.

        Co-ordinates are Python style zero-based.  The forward strand is
        skipped when ``self.strand == "reverse"`` and the reverse strand is
        skipped when ``self.strand == "forward"``.  Results are yielded frame
        by frame, not sorted by position.
        """
        # Refactored into generator by CPT
        full_len = len(nuc_seq)
        if self.strand != "reverse":
            for frame in range(0, 3):
                for offset, n, t in self.break_up_frame(nuc_seq[frame:]):
                    start = frame + offset  # zero based
                    yield (start, start + len(n), +1, n, t)
        if self.strand != "forward":
            rc = reverse_complement(nuc_seq)
            for frame in range(0, 3):
                for offset, n, t in self.break_up_frame(rc[frame:]):
                    start = full_len - frame - offset  # zero based
                    yield (start - len(n), start, -1, n, t)
9708448ce811 Uploaded
cpt
parents:
diff changeset
196
9708448ce811 Uploaded
cpt
parents:
diff changeset
197
9708448ce811 Uploaded
cpt
parents:
diff changeset
class MGAFinder(object):
    """Find ORFs on both strands of the records of a FASTA file.

    An ORF here runs from an in-frame start codon up to and including the
    next in-frame stop codon, using the start/stop codons of the selected
    Biopython codon table.  This class has no strand setting: both strands
    are always scanned.
    """

    def __init__(self, table, ftype, ends, min_len):
        """Store the search settings and pre-compile the codon patterns.

        Args:
            table: key into ``CodonTable.ambiguous_generic_by_id``; also
                passed to ``translate``.
            ftype: feature type label used in the generated IDs and in the
                final log message.
            ends: stored on the instance; not read by any method of this
                class.
            min_len: minimum length of the translation for an ORF to be
                reported.
        """
        self.table = table
        self.table_obj = CodonTable.ambiguous_generic_by_id[table]
        self.ends = ends
        self.ftype = ftype
        self.min_len = min_len
        self.starts = sorted(self.table_obj.start_codons)
        self.stops = sorted(self.table_obj.stop_codons)
        self.re_starts = self._overlapping_pattern(self.starts)
        self.re_stops = self._overlapping_pattern(self.stops)

    @staticmethod
    def _overlapping_pattern(codons):
        """Compile a pattern that finds any of *codons* at every position.

        The alternation is wrapped in a capturing lookahead, so ``finditer``
        reports a (zero-width) match at each position where a codon begins,
        including positions that overlap an earlier hit.  This does not rely
        on the ``overlapped=True`` argument, which the standard library
        ``re`` module does not accept.  ``match.start()`` is the codon
        position and ``match.group(1)`` is the codon itself.
        """
        return re.compile("(?=(%s))" % "|".join(codons))

    def locate(self, fasta_file, out_nuc, out_prot, out_bed, out_gff3):
        """Scan every record of *fasta_file* and write all ORFs found.

        Args:
            fasta_file: FASTA input, handed to ``SeqIO.parse``.
            out_nuc: handle receiving the ORF nucleotide sequences (FASTA).
            out_prot: handle receiving the translations (FASTA).
            out_bed: handle receiving one tab-separated BED line per ORF.
            out_gff3: handle receiving the GFF3 header and one CDS line per
                ORF.
        """
        seq_format = "fasta"
        log.debug("Genetic code table %i", self.table)
        log.debug("Minimum length %i aa", self.min_len)

        out_count = 0

        out_gff3.write("##gff-version 3\n")

        for idx, record in enumerate(SeqIO.parse(fasta_file, seq_format)):
            peptides = self.get_all_peptides(str(record.seq).upper())
            # ``i`` restarts at 0 for every record.
            for i, (f_start, f_end, f_strand, n, t) in enumerate(peptides):
                out_count += 1

                descr = "length %i aa, %i bp, from %s..%s[%s] of %s" % (
                    len(t),
                    len(n),
                    f_start,
                    f_end,
                    f_strand,
                    record.description,
                )
                fid = record.id + "|%s%i" % (self.ftype, i + 1)

                nuc_record = SeqRecord(Seq(n), id=fid, name="", description=descr)
                prot_record = SeqRecord(Seq(t), id=fid, name="", description=descr)

                SeqIO.write(nuc_record, out_nuc, "fasta")
                SeqIO.write(prot_record, out_prot, "fasta")

                nice_strand = "+" if f_strand == +1 else "-"

                # BED takes the zero-based coordinates unchanged.
                bed_fields = [record.id, f_start, f_end, fid, 0, nice_strand]
                out_bed.write("\t".join(map(str, bed_fields)) + "\n")

                # GFF3 start is one-based, hence f_start + 1.
                gff_fields = [
                    record.id,
                    "getOrfsOrCds",
                    "CDS",
                    f_start + 1,
                    f_end,
                    ".",
                    nice_strand,
                    0,
                    "ID=%s.%s.%s" % (self.ftype, idx, i + 1),
                ]
                out_gff3.write("\t".join(map(str, gff_fields)) + "\n")
        log.info("Found %i %ss", out_count, self.ftype)

    def start_chop_and_trans(self, s, strict=True):
        """Yield ``(offset, trimmed nuc, protein)`` for each in-frame start.

        Args:
            s: nucleotide string whose length is a multiple of three.
            strict: when true, *s* must end with a stop codon and the whole
                trimmed sequence is translated.  When false, the first codon
                is reported as "M" and the rest is translated up to the
                first stop (for sequences missing a stop codon).

        Raises:
            AssertionError: if the length or (in strict mode) the trailing
                stop codon precondition does not hold.
        """
        if strict:
            assert s[-3:] in self.stops, s
        assert len(s) % 3 == 0
        for match in self.re_starts.finditer(s):
            # Must check the start is in frame
            start = match.start()
            if start % 3 == 0:
                n = s[start:]
                assert len(n) % 3 == 0, "%s is len %i" % (n, len(n))
                if strict:
                    t = translate(n, self.table)
                else:
                    # Use when missing stop codon,
                    t = "M" + translate(n[3:], self.table, to_stop=True)
                yield start, n, t

    def break_up_frame(self, s):
        """Yield ``(offset, nuc, protein)`` for the ORFs in one reading frame.

        *s* is cut after every in-frame stop codon; each piece is passed to
        ``start_chop_and_trans`` and every candidate whose translation has at
        least ``min_len`` characters is yielded.  Offsets are relative to the
        start of *s*.  Sequence after the last in-frame stop is not examined.
        """
        start = 0
        for match in self.re_stops.finditer(s):
            index = match.start() + 3
            if index % 3 != 0:
                # Stop codon is out of frame.
                continue
            n = s[start:index]
            for (offset, n, t) in self.start_chop_and_trans(n):
                if n and len(t) >= self.min_len:
                    yield start + offset, n, t
            start = index

    def putative_genes_in_sequence(self, nuc_seq):
        """Return a sorted list of ``(start, end, strand, nucleotides, protein)``.

        Co-ordinates are Python style zero-based.

        NOTE(review): for the reverse strand the tuple is
        ``(start, start - len(n), ...)``, i.e. the first co-ordinate is the
        larger one, unlike ``get_all_peptides`` -- confirm this is intended.
        """
        nuc_seq = nuc_seq.upper()
        # TODO - Refactor to use a generator function (in start order)
        # rather than making a list and sorting?
        answer = []
        full_len = len(nuc_seq)

        for frame in range(0, 3):
            for offset, n, t in self.break_up_frame(nuc_seq[frame:]):
                start = frame + offset  # zero based
                answer.append((start, start + len(n), +1, n, t))

        rc = reverse_complement(nuc_seq)
        for frame in range(0, 3):
            for offset, n, t in self.break_up_frame(rc[frame:]):
                start = full_len - frame - offset  # zero based
                answer.append((start, start - len(n), -1, n, t))
        answer.sort()
        return answer

    def get_all_peptides(self, nuc_seq):
        """Yield ``(start, end, strand, nucleotides, protein)`` for each ORF.

        Co-ordinates are Python style zero-based.  The three forward frames
        are scanned first, then the three frames of the reverse complement;
        results are not sorted by position.
        """
        # Refactored into generator by CPT

        full_len = len(nuc_seq)
        for frame in range(0, 3):
            for offset, n, t in self.break_up_frame(nuc_seq[frame:]):
                start = frame + offset  # zero based
                yield (start, start + len(n), +1, n, t)
        rc = reverse_complement(nuc_seq)
        for frame in range(0, 3):
            for offset, n, t in self.break_up_frame(rc[frame:]):
                start = full_len - frame - offset  # zero based
                yield (start - len(n), start, -1, n, t)