annotate cpt.py @ 8:bf32db94fe46 draft default tip

planemo upload commit cd3216893a76b4b9485c2615448b41dbf7133107
author cpt
date Fri, 20 Sep 2024 05:17:06 +0000
parents 46b252c89e9e
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
9708448ce811 Uploaded
cpt
parents:
diff changeset
1 #!/usr/bin/env python
9708448ce811 Uploaded
cpt
parents:
diff changeset
2 from Bio.Seq import Seq, reverse_complement, translate
9708448ce811 Uploaded
cpt
parents:
diff changeset
3 from Bio.SeqRecord import SeqRecord
9708448ce811 Uploaded
cpt
parents:
diff changeset
4 from Bio import SeqIO
9708448ce811 Uploaded
cpt
parents:
diff changeset
5 from Bio.Data import CodonTable
9708448ce811 Uploaded
cpt
parents:
diff changeset
6 import logging
5
46b252c89e9e planemo upload commit 25fff8b3887beeb66c2d53e2a32f9af9f34e40b6
cpt
parents: 0
diff changeset
7 import regex as re
0
9708448ce811 Uploaded
cpt
parents:
diff changeset
8
9708448ce811 Uploaded
cpt
parents:
diff changeset
# Configure the root logger with the logging defaults and expose it as the
# module-wide ``log`` handle.
logging.basicConfig()
log = logging.getLogger()

# Name patterns tried by phage_name_parser.  Raw strings are used so that
# ``\s`` reaches the regex engine untouched instead of relying on Python
# passing an invalid string escape through (a SyntaxWarning on Python 3.12+).

# "<host> phage <phage>"
PHAGE_IN_MIDDLE = re.compile(r"^(?P<host>.*)\s*phage (?P<phage>.*)$")
# "<host> bacteriophage <phage>"
BACTERIOPHAGE_IN_MIDDLE = re.compile(r"^(?P<host>.*)\s*bacteriophage (?P<phage>.*)$")
# "[Pp]hage <phage>", optionally prefixed with bacterio/vibrio (either case of
# the first letter); there is no host group in this pattern.
STARTS_WITH_PHAGE = re.compile(
    r"^(bacterio|vibrio|Bacterio|Vibrio|)?[Pp]hage (?P<phage>.*)$"
)
# Names shaped like "vX_Xxx_<rest>"; the whole name is captured as the phage.
NEW_STYLE_NAMES = re.compile(r"(?P<phage>v[A-Z]_[A-Z][a-z]{2}_.*)")
9708448ce811 Uploaded
cpt
parents:
diff changeset
18
9708448ce811 Uploaded
cpt
parents:
diff changeset
19
9708448ce811 Uploaded
cpt
parents:
diff changeset
def phage_name_parser(name):
    """Split a phage description into its ``(host, phage)`` parts.

    The text ", complete genome" (with or without a trailing period) is
    removed first.  What remains is tried against the module-level patterns
    in this order, and the first one that matches decides the result:
    BACTERIOPHAGE_IN_MIDDLE, PHAGE_IN_MIDDLE, STARTS_WITH_PHAGE,
    NEW_STYLE_NAMES.

    :param name: free-text name or description string.
    :returns: ``(host, phage)``.  ``host`` is None for the two patterns that
        have no host group, and both values are None when nothing matches.
    """
    name = name.replace(", complete genome.", "").replace(", complete genome", "")

    # Patterns that capture both a host and a phage name.
    for pattern in (BACTERIOPHAGE_IN_MIDDLE, PHAGE_IN_MIDDLE):
        m = pattern.match(name)
        if m:
            # The greedy host group swallows the whitespace that the
            # following ``\s*`` was meant to absorb, so trim it here rather
            # than handing back e.g. "Escherichia ".
            return (m.group("host").rstrip(), m.group("phage"))

    # Patterns that only capture a phage name.
    for pattern in (STARTS_WITH_PHAGE, NEW_STYLE_NAMES):
        m = pattern.match(name)
        if m:
            return (None, m.group("phage"))

    return (None, None)
9708448ce811 Uploaded
cpt
parents:
diff changeset
49
9708448ce811 Uploaded
cpt
parents:
diff changeset
50
9708448ce811 Uploaded
cpt
parents:
diff changeset
class OrfFinder(object):
    """Find start-to-stop open reading frames in nucleotide FASTA records.

    Start and stop codons are taken from Biopython's
    ``CodonTable.ambiguous_generic_by_id`` entry for the requested table and
    compiled into alternation patterns that are searched with overlapping
    matches (the ``overlapped=True`` feature of the ``regex`` module).
    """

    def __init__(self, table, ftype, ends, min_len, strand):
        """Store the search settings and pre-compile the codon patterns.

        :param table: genetic code table id, used as the key into
            ``CodonTable.ambiguous_generic_by_id`` and passed to ``translate``.
        :param ftype: feature label used in record ids, GFF3 ids and the log
            summary.
        :param ends: stored on the instance; no method of this class reads it.
        :param min_len: minimum length of the translated sequence for a
            candidate to be reported.
        :param strand: ``"forward"`` skips the reverse strand and
            ``"reverse"`` skips the forward strand in ``get_all_peptides``;
            any other value searches both.
        """
        self.table = table
        self.table_obj = CodonTable.ambiguous_generic_by_id[table]
        self.ends = ends
        self.ftype = ftype
        self.min_len = min_len
        self.starts = sorted(self.table_obj.start_codons)
        self.stops = sorted(self.table_obj.stop_codons)
        self.re_starts = re.compile("|".join(self.starts))
        self.re_stops = re.compile("|".join(self.stops))
        self.strand = strand

    def locate(self, fasta_file, out_nuc, out_prot, out_bed, out_gff3):
        """Write every candidate found in ``fasta_file`` to the four outputs.

        :param fasta_file: FASTA input handed to ``SeqIO.parse``.
        :param out_nuc: destination for nucleotide FASTA records
            (passed to ``SeqIO.write``).
        :param out_prot: destination for protein FASTA records
            (passed to ``SeqIO.write``).
        :param out_bed: object with ``write``; receives one tab-separated
            line per candidate using the zero-based start/end as yielded.
        :param out_gff3: object with ``write``; receives the
            ``##gff-version 3`` header and one CDS line per candidate with
            the start shifted by one.
        """
        seq_format = "fasta"
        # Lazy %-arguments: the message is only formatted if it is emitted.
        log.debug("Genetic code table %i", self.table)
        log.debug("Minimum length %i aa", self.min_len)

        out_count = 0

        out_gff3.write("##gff-version 3\n")

        for idx, record in enumerate(SeqIO.parse(fasta_file, seq_format)):
            peptides = self.get_all_peptides(str(record.seq).upper())
            for i, (f_start, f_end, f_strand, n, t) in enumerate(peptides):
                out_count += 1

                descr = "length %i aa, %i bp, from %s..%s[%s] of %s" % (
                    len(t),
                    len(n),
                    f_start,
                    f_end,
                    f_strand,
                    record.description,
                )
                fid = record.id + "|%s%i" % (self.ftype, i + 1)

                nuc_record = SeqRecord(Seq(n), id=fid, name="", description=descr)
                prot_record = SeqRecord(Seq(t), id=fid, name="", description=descr)

                SeqIO.write(nuc_record, out_nuc, "fasta")
                SeqIO.write(prot_record, out_prot, "fasta")

                nice_strand = "+" if f_strand == +1 else "-"

                bed_fields = [record.id, f_start, f_end, fid, 0, nice_strand]
                out_bed.write("\t".join(map(str, bed_fields)) + "\n")

                gff3_fields = [
                    record.id,
                    "getOrfsOrCds",
                    "CDS",
                    f_start + 1,  # GFF3 start is one-based
                    f_end,
                    ".",
                    nice_strand,
                    0,
                    "ID=%s.%s.%s" % (self.ftype, idx, i + 1),
                ]
                out_gff3.write("\t".join(map(str, gff3_fields)) + "\n")
        log.info("Found %i %ss", out_count, self.ftype)

    def start_chop_and_trans(self, s, strict=True):
        """Yield ``(offset, trimmed nuc, protein)`` per in-frame start codon.

        :param s: nucleotide string whose length must be a multiple of three.
        :param strict: when true, ``s`` must end with a stop codon and the
            whole trimmed sequence is translated; otherwise the translation
            is "M" followed by the translation of everything after the start
            codon, up to the first stop.
        :raises AssertionError: if the length or stop-codon checks fail.
        """
        if strict:
            assert s[-3:] in self.stops, s
        assert len(s) % 3 == 0
        for match in self.re_starts.finditer(s, overlapped=True):
            # Must check the start is in frame
            start = match.start()
            if start % 3 == 0:
                n = s[start:]
                assert len(n) % 3 == 0, "%s is len %i" % (n, len(n))
                if strict:
                    t = translate(n, self.table)
                else:
                    # Use when missing stop codon,
                    t = "M" + translate(n[3:], self.table, to_stop=True)
                yield start, n, t  # Edited by CPT to be a generator

    def break_up_frame(self, s):
        """Yield ``(offset, nuc, protein)`` for candidates in one frame.

        ``s`` is cut after each in-frame stop codon, and every in-frame start
        inside each piece is reported if its translation is at least
        ``min_len`` long.  Sequence after the last in-frame stop codon is
        not examined.
        """
        start = 0
        for match in self.re_stops.finditer(s, overlapped=True):
            index = match.start() + 3
            if index % 3 != 0:
                # Stop codon is not in this reading frame.
                continue
            for offset, n, t in self.start_chop_and_trans(s[start:index]):
                if n and len(t) >= self.min_len:
                    yield start + offset, n, t
            start = index

    def putative_genes_in_sequence(self, nuc_seq):
        """Return a sorted list of (start, end, strand, nucleotides, protein).

        Co-ordinates are Python style zero-based.  Both strands are always
        searched; ``self.strand`` is not consulted here.  Reverse-strand
        entries are reported with start < end, the same convention that
        ``get_all_peptides`` uses.
        """
        nuc_seq = nuc_seq.upper()
        # TODO - Refactor to use a generator function (in start order)
        # rather than making a list and sorting?
        answer = []
        full_len = len(nuc_seq)

        for frame in range(0, 3):
            for offset, n, t in self.break_up_frame(nuc_seq[frame:]):
                start = frame + offset  # zero based
                answer.append((start, start + len(n), +1, n, t))

        rc = reverse_complement(nuc_seq)
        for frame in range(0, 3):
            for offset, n, t in self.break_up_frame(rc[frame:]):
                # ``end`` is the forward-strand position just past the ORF;
                # the ORF occupies [end - len(n), end).
                end = full_len - frame - offset  # zero based
                answer.append((end - len(n), end, -1, n, t))
        answer.sort()
        return answer

    def get_all_peptides(self, nuc_seq):
        """Yield (start, end, strand, nucleotides, protein) per candidate.

        Co-ordinates are Python style zero-based.  Forward-strand hits are
        skipped when ``self.strand`` is "reverse" and reverse-strand hits are
        skipped when it is "forward".  Results are yielded frame by frame,
        not sorted.
        """
        # Refactored into generator by CPT
        full_len = len(nuc_seq)
        if self.strand != "reverse":
            for frame in range(0, 3):
                for offset, n, t in self.break_up_frame(nuc_seq[frame:]):
                    start = frame + offset  # zero based
                    yield (start, start + len(n), +1, n, t)
        if self.strand != "forward":
            rc = reverse_complement(nuc_seq)
            for frame in range(0, 3):
                for offset, n, t in self.break_up_frame(rc[frame:]):
                    start = full_len - frame - offset  # zero based
                    yield (start - len(n), start, -1, n, t)
9708448ce811 Uploaded
cpt
parents:
diff changeset
197
9708448ce811 Uploaded
cpt
parents:
diff changeset
198
9708448ce811 Uploaded
cpt
parents:
diff changeset
class MGAFinder(object):
    """Find start-to-stop open reading frames on both strands of FASTA records.

    Start and stop codons are taken from Biopython's
    ``CodonTable.ambiguous_generic_by_id`` entry for the requested table and
    compiled into alternation patterns that are searched with overlapping
    matches (the ``overlapped=True`` feature of the ``regex`` module).
    """

    def __init__(self, table, ftype, ends, min_len):
        """Store the search settings and pre-compile the codon patterns.

        :param table: genetic code table id, used as the key into
            ``CodonTable.ambiguous_generic_by_id`` and passed to ``translate``.
        :param ftype: feature label used in record ids, GFF3 ids and the log
            summary.
        :param ends: stored on the instance; no method of this class reads it.
        :param min_len: minimum length of the translated sequence for a
            candidate to be reported.
        """
        self.table = table
        self.table_obj = CodonTable.ambiguous_generic_by_id[table]
        self.ends = ends
        self.ftype = ftype
        self.min_len = min_len
        self.starts = sorted(self.table_obj.start_codons)
        self.stops = sorted(self.table_obj.stop_codons)
        self.re_starts = re.compile("|".join(self.starts))
        self.re_stops = re.compile("|".join(self.stops))

    def locate(self, fasta_file, out_nuc, out_prot, out_bed, out_gff3):
        """Write every candidate found in ``fasta_file`` to the four outputs.

        :param fasta_file: FASTA input handed to ``SeqIO.parse``.
        :param out_nuc: destination for nucleotide FASTA records
            (passed to ``SeqIO.write``).
        :param out_prot: destination for protein FASTA records
            (passed to ``SeqIO.write``).
        :param out_bed: object with ``write``; receives one tab-separated
            line per candidate using the zero-based start/end as yielded.
        :param out_gff3: object with ``write``; receives the
            ``##gff-version 3`` header and one CDS line per candidate with
            the start shifted by one.
        """
        seq_format = "fasta"
        # Lazy %-arguments: the message is only formatted if it is emitted.
        log.debug("Genetic code table %i", self.table)
        log.debug("Minimum length %i aa", self.min_len)

        out_count = 0

        out_gff3.write("##gff-version 3\n")

        for idx, record in enumerate(SeqIO.parse(fasta_file, seq_format)):
            peptides = self.get_all_peptides(str(record.seq).upper())
            for i, (f_start, f_end, f_strand, n, t) in enumerate(peptides):
                out_count += 1

                descr = "length %i aa, %i bp, from %s..%s[%s] of %s" % (
                    len(t),
                    len(n),
                    f_start,
                    f_end,
                    f_strand,
                    record.description,
                )
                fid = record.id + "|%s%i" % (self.ftype, i + 1)

                nuc_record = SeqRecord(Seq(n), id=fid, name="", description=descr)
                prot_record = SeqRecord(Seq(t), id=fid, name="", description=descr)

                SeqIO.write(nuc_record, out_nuc, "fasta")
                SeqIO.write(prot_record, out_prot, "fasta")

                nice_strand = "+" if f_strand == +1 else "-"

                bed_fields = [record.id, f_start, f_end, fid, 0, nice_strand]
                out_bed.write("\t".join(map(str, bed_fields)) + "\n")

                gff3_fields = [
                    record.id,
                    "getOrfsOrCds",
                    "CDS",
                    f_start + 1,  # GFF3 start is one-based
                    f_end,
                    ".",
                    nice_strand,
                    0,
                    "ID=%s.%s.%s" % (self.ftype, idx, i + 1),
                ]
                out_gff3.write("\t".join(map(str, gff3_fields)) + "\n")
        log.info("Found %i %ss", out_count, self.ftype)

    def start_chop_and_trans(self, s, strict=True):
        """Yield ``(offset, trimmed nuc, protein)`` per in-frame start codon.

        :param s: nucleotide string whose length must be a multiple of three.
        :param strict: when true, ``s`` must end with a stop codon and the
            whole trimmed sequence is translated; otherwise the translation
            is "M" followed by the translation of everything after the start
            codon, up to the first stop.
        :raises AssertionError: if the length or stop-codon checks fail.
        """
        if strict:
            assert s[-3:] in self.stops, s
        assert len(s) % 3 == 0
        for match in self.re_starts.finditer(s, overlapped=True):
            # Must check the start is in frame
            start = match.start()
            if start % 3 == 0:
                n = s[start:]
                assert len(n) % 3 == 0, "%s is len %i" % (n, len(n))
                if strict:
                    t = translate(n, self.table)
                else:
                    # Use when missing stop codon,
                    t = "M" + translate(n[3:], self.table, to_stop=True)
                yield start, n, t

    def break_up_frame(self, s):
        """Yield ``(offset, nuc, protein)`` for candidates in one frame.

        ``s`` is cut after each in-frame stop codon, and every in-frame start
        inside each piece is reported if its translation is at least
        ``min_len`` long.  Sequence after the last in-frame stop codon is
        not examined.
        """
        start = 0
        for match in self.re_stops.finditer(s, overlapped=True):
            index = match.start() + 3
            if index % 3 != 0:
                # Stop codon is not in this reading frame.
                continue
            for offset, n, t in self.start_chop_and_trans(s[start:index]):
                if n and len(t) >= self.min_len:
                    yield start + offset, n, t
            start = index

    def putative_genes_in_sequence(self, nuc_seq):
        """Return a sorted list of (start, end, strand, nucleotides, protein).

        Co-ordinates are Python style zero-based.  Reverse-strand entries are
        reported with start < end, the same convention that
        ``get_all_peptides`` uses.
        """
        nuc_seq = nuc_seq.upper()
        # TODO - Refactor to use a generator function (in start order)
        # rather than making a list and sorting?
        answer = []
        full_len = len(nuc_seq)

        for frame in range(0, 3):
            for offset, n, t in self.break_up_frame(nuc_seq[frame:]):
                start = frame + offset  # zero based
                answer.append((start, start + len(n), +1, n, t))

        rc = reverse_complement(nuc_seq)
        for frame in range(0, 3):
            for offset, n, t in self.break_up_frame(rc[frame:]):
                # ``end`` is the forward-strand position just past the ORF;
                # the ORF occupies [end - len(n), end).
                end = full_len - frame - offset  # zero based
                answer.append((end - len(n), end, -1, n, t))
        answer.sort()
        return answer

    def get_all_peptides(self, nuc_seq):
        """Yield (start, end, strand, nucleotides, protein) per candidate.

        Co-ordinates are Python style zero-based.  The three forward frames
        are searched first, then the three frames of the reverse complement;
        results are yielded frame by frame, not sorted.
        """
        # Refactored into generator by CPT

        full_len = len(nuc_seq)
        for frame in range(0, 3):
            for offset, n, t in self.break_up_frame(nuc_seq[frame:]):
                start = frame + offset  # zero based
                yield (start, start + len(n), +1, n, t)
        rc = reverse_complement(nuc_seq)
        for frame in range(0, 3):
            for offset, n, t in self.break_up_frame(rc[frame:]):
                start = full_len - frame - offset  # zero based
                yield (start - len(n), start, -1, n, t)