annotate cpt_phageqc_annotation/cpt.py @ 0:c3140b08d703 draft default tip

Uploaded
author cpt
date Fri, 17 Jun 2022 13:00:50 +0000
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
c3140b08d703 Uploaded
cpt
parents:
diff changeset
1 #!/usr/bin/env python
c3140b08d703 Uploaded
cpt
parents:
diff changeset
2 import regex as re
c3140b08d703 Uploaded
cpt
parents:
diff changeset
3 from Bio.Seq import Seq, reverse_complement, translate
c3140b08d703 Uploaded
cpt
parents:
diff changeset
4 from Bio.SeqRecord import SeqRecord
c3140b08d703 Uploaded
cpt
parents:
diff changeset
5 from Bio import SeqIO
c3140b08d703 Uploaded
cpt
parents:
diff changeset
6 from Bio.Data import CodonTable
c3140b08d703 Uploaded
cpt
parents:
diff changeset
7 import logging
c3140b08d703 Uploaded
cpt
parents:
diff changeset
8
c3140b08d703 Uploaded
cpt
parents:
diff changeset
9 logging.basicConfig()
c3140b08d703 Uploaded
cpt
parents:
diff changeset
10 log = logging.getLogger()
c3140b08d703 Uploaded
cpt
parents:
diff changeset
11
c3140b08d703 Uploaded
cpt
parents:
diff changeset
12 PHAGE_IN_MIDDLE = re.compile("^(?P<host>.*)\s*phage (?P<phage>.*)$")
c3140b08d703 Uploaded
cpt
parents:
diff changeset
13 BACTERIOPHAGE_IN_MIDDLE = re.compile("^(?P<host>.*)\s*bacteriophage (?P<phage>.*)$")
c3140b08d703 Uploaded
cpt
parents:
diff changeset
14 STARTS_WITH_PHAGE = re.compile(
c3140b08d703 Uploaded
cpt
parents:
diff changeset
15 "^(bacterio|vibrio|Bacterio|Vibrio|)?[Pp]hage (?P<phage>.*)$"
c3140b08d703 Uploaded
cpt
parents:
diff changeset
16 )
c3140b08d703 Uploaded
cpt
parents:
diff changeset
17 NEW_STYLE_NAMES = re.compile("(?P<phage>v[A-Z]_[A-Z][a-z]{2}_.*)")
c3140b08d703 Uploaded
cpt
parents:
diff changeset
18
c3140b08d703 Uploaded
cpt
parents:
diff changeset
19
c3140b08d703 Uploaded
cpt
parents:
diff changeset
20 def phage_name_parser(name):
c3140b08d703 Uploaded
cpt
parents:
diff changeset
21 host = None
c3140b08d703 Uploaded
cpt
parents:
diff changeset
22 phage = None
c3140b08d703 Uploaded
cpt
parents:
diff changeset
23 name = name.replace(", complete genome.", "")
c3140b08d703 Uploaded
cpt
parents:
diff changeset
24 name = name.replace(", complete genome", "")
c3140b08d703 Uploaded
cpt
parents:
diff changeset
25
c3140b08d703 Uploaded
cpt
parents:
diff changeset
26 m = BACTERIOPHAGE_IN_MIDDLE.match(name)
c3140b08d703 Uploaded
cpt
parents:
diff changeset
27 if m:
c3140b08d703 Uploaded
cpt
parents:
diff changeset
28 host = m.group("host")
c3140b08d703 Uploaded
cpt
parents:
diff changeset
29 phage = m.group("phage")
c3140b08d703 Uploaded
cpt
parents:
diff changeset
30 return (host, phage)
c3140b08d703 Uploaded
cpt
parents:
diff changeset
31
c3140b08d703 Uploaded
cpt
parents:
diff changeset
32 m = PHAGE_IN_MIDDLE.match(name)
c3140b08d703 Uploaded
cpt
parents:
diff changeset
33 if m:
c3140b08d703 Uploaded
cpt
parents:
diff changeset
34 host = m.group("host")
c3140b08d703 Uploaded
cpt
parents:
diff changeset
35 phage = m.group("phage")
c3140b08d703 Uploaded
cpt
parents:
diff changeset
36 return (host, phage)
c3140b08d703 Uploaded
cpt
parents:
diff changeset
37
c3140b08d703 Uploaded
cpt
parents:
diff changeset
38 m = STARTS_WITH_PHAGE.match(name)
c3140b08d703 Uploaded
cpt
parents:
diff changeset
39 if m:
c3140b08d703 Uploaded
cpt
parents:
diff changeset
40 phage = m.group("phage")
c3140b08d703 Uploaded
cpt
parents:
diff changeset
41 return (host, phage)
c3140b08d703 Uploaded
cpt
parents:
diff changeset
42
c3140b08d703 Uploaded
cpt
parents:
diff changeset
43 m = NEW_STYLE_NAMES.match(name)
c3140b08d703 Uploaded
cpt
parents:
diff changeset
44 if m:
c3140b08d703 Uploaded
cpt
parents:
diff changeset
45 phage = m.group("phage")
c3140b08d703 Uploaded
cpt
parents:
diff changeset
46 return (host, phage)
c3140b08d703 Uploaded
cpt
parents:
diff changeset
47
c3140b08d703 Uploaded
cpt
parents:
diff changeset
48 return (host, phage)
c3140b08d703 Uploaded
cpt
parents:
diff changeset
49
c3140b08d703 Uploaded
cpt
parents:
diff changeset
50
c3140b08d703 Uploaded
cpt
parents:
diff changeset
51 class OrfFinder(object):
c3140b08d703 Uploaded
cpt
parents:
diff changeset
52 def __init__(self, table, ftype, ends, min_len, strand):
c3140b08d703 Uploaded
cpt
parents:
diff changeset
53 self.table = table
c3140b08d703 Uploaded
cpt
parents:
diff changeset
54 self.table_obj = CodonTable.ambiguous_generic_by_id[table]
c3140b08d703 Uploaded
cpt
parents:
diff changeset
55 self.ends = ends
c3140b08d703 Uploaded
cpt
parents:
diff changeset
56 self.ftype = ftype
c3140b08d703 Uploaded
cpt
parents:
diff changeset
57 self.min_len = min_len
c3140b08d703 Uploaded
cpt
parents:
diff changeset
58 self.starts = sorted(self.table_obj.start_codons)
c3140b08d703 Uploaded
cpt
parents:
diff changeset
59 self.stops = sorted(self.table_obj.stop_codons)
c3140b08d703 Uploaded
cpt
parents:
diff changeset
60 self.re_starts = re.compile("|".join(self.starts))
c3140b08d703 Uploaded
cpt
parents:
diff changeset
61 self.re_stops = re.compile("|".join(self.stops))
c3140b08d703 Uploaded
cpt
parents:
diff changeset
62 self.strand = strand
c3140b08d703 Uploaded
cpt
parents:
diff changeset
63
c3140b08d703 Uploaded
cpt
parents:
diff changeset
64 def locate(self, fasta_file, out_nuc, out_prot, out_bed, out_gff3):
c3140b08d703 Uploaded
cpt
parents:
diff changeset
65 seq_format = "fasta"
c3140b08d703 Uploaded
cpt
parents:
diff changeset
66 log.debug("Genetic code table %i" % self.table)
c3140b08d703 Uploaded
cpt
parents:
diff changeset
67 log.debug("Minimum length %i aa" % self.min_len)
c3140b08d703 Uploaded
cpt
parents:
diff changeset
68
c3140b08d703 Uploaded
cpt
parents:
diff changeset
69 out_count = 0
c3140b08d703 Uploaded
cpt
parents:
diff changeset
70
c3140b08d703 Uploaded
cpt
parents:
diff changeset
71 out_gff3.write("##gff-version 3\n")
c3140b08d703 Uploaded
cpt
parents:
diff changeset
72
c3140b08d703 Uploaded
cpt
parents:
diff changeset
73 for idx, record in enumerate(SeqIO.parse(fasta_file, seq_format)):
c3140b08d703 Uploaded
cpt
parents:
diff changeset
74 for i, (f_start, f_end, f_strand, n, t) in enumerate(
c3140b08d703 Uploaded
cpt
parents:
diff changeset
75 self.get_all_peptides(str(record.seq).upper())
c3140b08d703 Uploaded
cpt
parents:
diff changeset
76 ):
c3140b08d703 Uploaded
cpt
parents:
diff changeset
77 out_count += 1
c3140b08d703 Uploaded
cpt
parents:
diff changeset
78
c3140b08d703 Uploaded
cpt
parents:
diff changeset
79 descr = "length %i aa, %i bp, from %s..%s[%s] of %s" % (
c3140b08d703 Uploaded
cpt
parents:
diff changeset
80 len(t),
c3140b08d703 Uploaded
cpt
parents:
diff changeset
81 len(n),
c3140b08d703 Uploaded
cpt
parents:
diff changeset
82 f_start,
c3140b08d703 Uploaded
cpt
parents:
diff changeset
83 f_end,
c3140b08d703 Uploaded
cpt
parents:
diff changeset
84 f_strand,
c3140b08d703 Uploaded
cpt
parents:
diff changeset
85 record.description,
c3140b08d703 Uploaded
cpt
parents:
diff changeset
86 )
c3140b08d703 Uploaded
cpt
parents:
diff changeset
87 fid = record.id + "|%s%i" % (self.ftype, i + 1)
c3140b08d703 Uploaded
cpt
parents:
diff changeset
88
c3140b08d703 Uploaded
cpt
parents:
diff changeset
89 r = SeqRecord(Seq(n), id=fid, name="", description=descr)
c3140b08d703 Uploaded
cpt
parents:
diff changeset
90 t = SeqRecord(Seq(t), id=fid, name="", description=descr)
c3140b08d703 Uploaded
cpt
parents:
diff changeset
91
c3140b08d703 Uploaded
cpt
parents:
diff changeset
92 SeqIO.write(r, out_nuc, "fasta")
c3140b08d703 Uploaded
cpt
parents:
diff changeset
93 SeqIO.write(t, out_prot, "fasta")
c3140b08d703 Uploaded
cpt
parents:
diff changeset
94
c3140b08d703 Uploaded
cpt
parents:
diff changeset
95 nice_strand = "+" if f_strand == +1 else "-"
c3140b08d703 Uploaded
cpt
parents:
diff changeset
96
c3140b08d703 Uploaded
cpt
parents:
diff changeset
97 out_bed.write(
c3140b08d703 Uploaded
cpt
parents:
diff changeset
98 "\t".join(
c3140b08d703 Uploaded
cpt
parents:
diff changeset
99 map(str, [record.id, f_start, f_end, fid, 0, nice_strand])
c3140b08d703 Uploaded
cpt
parents:
diff changeset
100 )
c3140b08d703 Uploaded
cpt
parents:
diff changeset
101 + "\n"
c3140b08d703 Uploaded
cpt
parents:
diff changeset
102 )
c3140b08d703 Uploaded
cpt
parents:
diff changeset
103
c3140b08d703 Uploaded
cpt
parents:
diff changeset
104 out_gff3.write(
c3140b08d703 Uploaded
cpt
parents:
diff changeset
105 "\t".join(
c3140b08d703 Uploaded
cpt
parents:
diff changeset
106 map(
c3140b08d703 Uploaded
cpt
parents:
diff changeset
107 str,
c3140b08d703 Uploaded
cpt
parents:
diff changeset
108 [
c3140b08d703 Uploaded
cpt
parents:
diff changeset
109 record.id,
c3140b08d703 Uploaded
cpt
parents:
diff changeset
110 "getOrfsOrCds",
c3140b08d703 Uploaded
cpt
parents:
diff changeset
111 "CDS",
c3140b08d703 Uploaded
cpt
parents:
diff changeset
112 f_start + 1,
c3140b08d703 Uploaded
cpt
parents:
diff changeset
113 f_end,
c3140b08d703 Uploaded
cpt
parents:
diff changeset
114 ".",
c3140b08d703 Uploaded
cpt
parents:
diff changeset
115 nice_strand,
c3140b08d703 Uploaded
cpt
parents:
diff changeset
116 0,
c3140b08d703 Uploaded
cpt
parents:
diff changeset
117 "ID=%s.%s.%s" % (self.ftype, idx, i + 1),
c3140b08d703 Uploaded
cpt
parents:
diff changeset
118 ],
c3140b08d703 Uploaded
cpt
parents:
diff changeset
119 )
c3140b08d703 Uploaded
cpt
parents:
diff changeset
120 )
c3140b08d703 Uploaded
cpt
parents:
diff changeset
121 + "\n"
c3140b08d703 Uploaded
cpt
parents:
diff changeset
122 )
c3140b08d703 Uploaded
cpt
parents:
diff changeset
123 log.info("Found %i %ss", out_count, self.ftype)
c3140b08d703 Uploaded
cpt
parents:
diff changeset
124
c3140b08d703 Uploaded
cpt
parents:
diff changeset
125 def start_chop_and_trans(self, s, strict=True):
c3140b08d703 Uploaded
cpt
parents:
diff changeset
126 """Returns offset, trimmed nuc, protein."""
c3140b08d703 Uploaded
cpt
parents:
diff changeset
127 if strict:
c3140b08d703 Uploaded
cpt
parents:
diff changeset
128 assert s[-3:] in self.stops, s
c3140b08d703 Uploaded
cpt
parents:
diff changeset
129 assert len(s) % 3 == 0
c3140b08d703 Uploaded
cpt
parents:
diff changeset
130 for match in self.re_starts.finditer(s, overlapped=True):
c3140b08d703 Uploaded
cpt
parents:
diff changeset
131 # Must check the start is in frame
c3140b08d703 Uploaded
cpt
parents:
diff changeset
132 start = match.start()
c3140b08d703 Uploaded
cpt
parents:
diff changeset
133 if start % 3 == 0:
c3140b08d703 Uploaded
cpt
parents:
diff changeset
134 n = s[start:]
c3140b08d703 Uploaded
cpt
parents:
diff changeset
135 assert len(n) % 3 == 0, "%s is len %i" % (n, len(n))
c3140b08d703 Uploaded
cpt
parents:
diff changeset
136 if strict:
c3140b08d703 Uploaded
cpt
parents:
diff changeset
137 t = translate(n, self.table)
c3140b08d703 Uploaded
cpt
parents:
diff changeset
138 else:
c3140b08d703 Uploaded
cpt
parents:
diff changeset
139 # Use when missing stop codon,
c3140b08d703 Uploaded
cpt
parents:
diff changeset
140 t = "M" + translate(n[3:], self.table, to_stop=True)
c3140b08d703 Uploaded
cpt
parents:
diff changeset
141 yield start, n, t # Edited by CPT to be a generator
c3140b08d703 Uploaded
cpt
parents:
diff changeset
142
c3140b08d703 Uploaded
cpt
parents:
diff changeset
143 def break_up_frame(self, s):
c3140b08d703 Uploaded
cpt
parents:
diff changeset
144 """Returns offset, nuc, protein."""
c3140b08d703 Uploaded
cpt
parents:
diff changeset
145 start = 0
c3140b08d703 Uploaded
cpt
parents:
diff changeset
146 for match in self.re_stops.finditer(s, overlapped=True):
c3140b08d703 Uploaded
cpt
parents:
diff changeset
147 index = match.start() + 3
c3140b08d703 Uploaded
cpt
parents:
diff changeset
148 if index % 3 != 0:
c3140b08d703 Uploaded
cpt
parents:
diff changeset
149 continue
c3140b08d703 Uploaded
cpt
parents:
diff changeset
150 n = s[start:index]
c3140b08d703 Uploaded
cpt
parents:
diff changeset
151 for (offset, n, t) in self.start_chop_and_trans(n):
c3140b08d703 Uploaded
cpt
parents:
diff changeset
152 if n and len(t) >= self.min_len:
c3140b08d703 Uploaded
cpt
parents:
diff changeset
153 yield start + offset, n, t
c3140b08d703 Uploaded
cpt
parents:
diff changeset
154 start = index
c3140b08d703 Uploaded
cpt
parents:
diff changeset
155
c3140b08d703 Uploaded
cpt
parents:
diff changeset
156 def putative_genes_in_sequence(self, nuc_seq):
c3140b08d703 Uploaded
cpt
parents:
diff changeset
157 """Returns start, end, strand, nucleotides, protein.
c3140b08d703 Uploaded
cpt
parents:
diff changeset
158 Co-ordinates are Python style zero-based.
c3140b08d703 Uploaded
cpt
parents:
diff changeset
159 """
c3140b08d703 Uploaded
cpt
parents:
diff changeset
160 nuc_seq = nuc_seq.upper()
c3140b08d703 Uploaded
cpt
parents:
diff changeset
161 # TODO - Refactor to use a generator function (in start order)
c3140b08d703 Uploaded
cpt
parents:
diff changeset
162 # rather than making a list and sorting?
c3140b08d703 Uploaded
cpt
parents:
diff changeset
163 answer = []
c3140b08d703 Uploaded
cpt
parents:
diff changeset
164 full_len = len(nuc_seq)
c3140b08d703 Uploaded
cpt
parents:
diff changeset
165
c3140b08d703 Uploaded
cpt
parents:
diff changeset
166 for frame in range(0, 3):
c3140b08d703 Uploaded
cpt
parents:
diff changeset
167 for offset, n, t in self.break_up_frame(nuc_seq[frame:]):
c3140b08d703 Uploaded
cpt
parents:
diff changeset
168 start = frame + offset # zero based
c3140b08d703 Uploaded
cpt
parents:
diff changeset
169 answer.append((start, start + len(n), +1, n, t))
c3140b08d703 Uploaded
cpt
parents:
diff changeset
170
c3140b08d703 Uploaded
cpt
parents:
diff changeset
171 rc = reverse_complement(nuc_seq)
c3140b08d703 Uploaded
cpt
parents:
diff changeset
172 for frame in range(0, 3):
c3140b08d703 Uploaded
cpt
parents:
diff changeset
173 for offset, n, t in self.break_up_frame(rc[frame:]):
c3140b08d703 Uploaded
cpt
parents:
diff changeset
174 start = full_len - frame - offset # zero based
c3140b08d703 Uploaded
cpt
parents:
diff changeset
175 answer.append((start, start - len(n), -1, n, t))
c3140b08d703 Uploaded
cpt
parents:
diff changeset
176 answer.sort()
c3140b08d703 Uploaded
cpt
parents:
diff changeset
177 return answer
c3140b08d703 Uploaded
cpt
parents:
diff changeset
178
c3140b08d703 Uploaded
cpt
parents:
diff changeset
179 def get_all_peptides(self, nuc_seq):
c3140b08d703 Uploaded
cpt
parents:
diff changeset
180 """Returns start, end, strand, nucleotides, protein.
c3140b08d703 Uploaded
cpt
parents:
diff changeset
181
c3140b08d703 Uploaded
cpt
parents:
diff changeset
182 Co-ordinates are Python style zero-based.
c3140b08d703 Uploaded
cpt
parents:
diff changeset
183 """
c3140b08d703 Uploaded
cpt
parents:
diff changeset
184 # Refactored into generator by CPT
c3140b08d703 Uploaded
cpt
parents:
diff changeset
185 full_len = len(nuc_seq)
c3140b08d703 Uploaded
cpt
parents:
diff changeset
186 if self.strand != "reverse":
c3140b08d703 Uploaded
cpt
parents:
diff changeset
187 for frame in range(0, 3):
c3140b08d703 Uploaded
cpt
parents:
diff changeset
188 for offset, n, t in self.break_up_frame(nuc_seq[frame:]):
c3140b08d703 Uploaded
cpt
parents:
diff changeset
189 start = frame + offset # zero based
c3140b08d703 Uploaded
cpt
parents:
diff changeset
190 yield (start, start + len(n), +1, n, t)
c3140b08d703 Uploaded
cpt
parents:
diff changeset
191 if self.strand != "forward":
c3140b08d703 Uploaded
cpt
parents:
diff changeset
192 rc = reverse_complement(nuc_seq)
c3140b08d703 Uploaded
cpt
parents:
diff changeset
193 for frame in range(0, 3):
c3140b08d703 Uploaded
cpt
parents:
diff changeset
194 for offset, n, t in self.break_up_frame(rc[frame:]):
c3140b08d703 Uploaded
cpt
parents:
diff changeset
195 start = full_len - frame - offset # zero based
c3140b08d703 Uploaded
cpt
parents:
diff changeset
196 yield (start - len(n), start, -1, n, t)
c3140b08d703 Uploaded
cpt
parents:
diff changeset
197
c3140b08d703 Uploaded
cpt
parents:
diff changeset
198
c3140b08d703 Uploaded
cpt
parents:
diff changeset
199 class MGAFinder(object):
c3140b08d703 Uploaded
cpt
parents:
diff changeset
200 def __init__(self, table, ftype, ends, min_len):
c3140b08d703 Uploaded
cpt
parents:
diff changeset
201 self.table = table
c3140b08d703 Uploaded
cpt
parents:
diff changeset
202 self.table_obj = CodonTable.ambiguous_generic_by_id[table]
c3140b08d703 Uploaded
cpt
parents:
diff changeset
203 self.ends = ends
c3140b08d703 Uploaded
cpt
parents:
diff changeset
204 self.ftype = ftype
c3140b08d703 Uploaded
cpt
parents:
diff changeset
205 self.min_len = min_len
c3140b08d703 Uploaded
cpt
parents:
diff changeset
206 self.starts = sorted(self.table_obj.start_codons)
c3140b08d703 Uploaded
cpt
parents:
diff changeset
207 self.stops = sorted(self.table_obj.stop_codons)
c3140b08d703 Uploaded
cpt
parents:
diff changeset
208 self.re_starts = re.compile("|".join(self.starts))
c3140b08d703 Uploaded
cpt
parents:
diff changeset
209 self.re_stops = re.compile("|".join(self.stops))
c3140b08d703 Uploaded
cpt
parents:
diff changeset
210
c3140b08d703 Uploaded
cpt
parents:
diff changeset
211 def locate(self, fasta_file, out_nuc, out_prot, out_bed, out_gff3):
c3140b08d703 Uploaded
cpt
parents:
diff changeset
212 seq_format = "fasta"
c3140b08d703 Uploaded
cpt
parents:
diff changeset
213 log.debug("Genetic code table %i" % self.table)
c3140b08d703 Uploaded
cpt
parents:
diff changeset
214 log.debug("Minimum length %i aa" % self.min_len)
c3140b08d703 Uploaded
cpt
parents:
diff changeset
215
c3140b08d703 Uploaded
cpt
parents:
diff changeset
216 out_count = 0
c3140b08d703 Uploaded
cpt
parents:
diff changeset
217
c3140b08d703 Uploaded
cpt
parents:
diff changeset
218 out_gff3.write("##gff-version 3\n")
c3140b08d703 Uploaded
cpt
parents:
diff changeset
219
c3140b08d703 Uploaded
cpt
parents:
diff changeset
220 for idx, record in enumerate(SeqIO.parse(fasta_file, seq_format)):
c3140b08d703 Uploaded
cpt
parents:
diff changeset
221 for i, (f_start, f_end, f_strand, n, t) in enumerate(
c3140b08d703 Uploaded
cpt
parents:
diff changeset
222 self.get_all_peptides(str(record.seq).upper())
c3140b08d703 Uploaded
cpt
parents:
diff changeset
223 ):
c3140b08d703 Uploaded
cpt
parents:
diff changeset
224 out_count += 1
c3140b08d703 Uploaded
cpt
parents:
diff changeset
225
c3140b08d703 Uploaded
cpt
parents:
diff changeset
226 descr = "length %i aa, %i bp, from %s..%s[%s] of %s" % (
c3140b08d703 Uploaded
cpt
parents:
diff changeset
227 len(t),
c3140b08d703 Uploaded
cpt
parents:
diff changeset
228 len(n),
c3140b08d703 Uploaded
cpt
parents:
diff changeset
229 f_start,
c3140b08d703 Uploaded
cpt
parents:
diff changeset
230 f_end,
c3140b08d703 Uploaded
cpt
parents:
diff changeset
231 f_strand,
c3140b08d703 Uploaded
cpt
parents:
diff changeset
232 record.description,
c3140b08d703 Uploaded
cpt
parents:
diff changeset
233 )
c3140b08d703 Uploaded
cpt
parents:
diff changeset
234 fid = record.id + "|%s%i" % (self.ftype, i + 1)
c3140b08d703 Uploaded
cpt
parents:
diff changeset
235
c3140b08d703 Uploaded
cpt
parents:
diff changeset
236 r = SeqRecord(Seq(n), id=fid, name="", description=descr)
c3140b08d703 Uploaded
cpt
parents:
diff changeset
237 t = SeqRecord(Seq(t), id=fid, name="", description=descr)
c3140b08d703 Uploaded
cpt
parents:
diff changeset
238
c3140b08d703 Uploaded
cpt
parents:
diff changeset
239 SeqIO.write(r, out_nuc, "fasta")
c3140b08d703 Uploaded
cpt
parents:
diff changeset
240 SeqIO.write(t, out_prot, "fasta")
c3140b08d703 Uploaded
cpt
parents:
diff changeset
241
c3140b08d703 Uploaded
cpt
parents:
diff changeset
242 nice_strand = "+" if f_strand == +1 else "-"
c3140b08d703 Uploaded
cpt
parents:
diff changeset
243
c3140b08d703 Uploaded
cpt
parents:
diff changeset
244 out_bed.write(
c3140b08d703 Uploaded
cpt
parents:
diff changeset
245 "\t".join(
c3140b08d703 Uploaded
cpt
parents:
diff changeset
246 map(str, [record.id, f_start, f_end, fid, 0, nice_strand])
c3140b08d703 Uploaded
cpt
parents:
diff changeset
247 )
c3140b08d703 Uploaded
cpt
parents:
diff changeset
248 + "\n"
c3140b08d703 Uploaded
cpt
parents:
diff changeset
249 )
c3140b08d703 Uploaded
cpt
parents:
diff changeset
250
c3140b08d703 Uploaded
cpt
parents:
diff changeset
251 out_gff3.write(
c3140b08d703 Uploaded
cpt
parents:
diff changeset
252 "\t".join(
c3140b08d703 Uploaded
cpt
parents:
diff changeset
253 map(
c3140b08d703 Uploaded
cpt
parents:
diff changeset
254 str,
c3140b08d703 Uploaded
cpt
parents:
diff changeset
255 [
c3140b08d703 Uploaded
cpt
parents:
diff changeset
256 record.id,
c3140b08d703 Uploaded
cpt
parents:
diff changeset
257 "getOrfsOrCds",
c3140b08d703 Uploaded
cpt
parents:
diff changeset
258 "CDS",
c3140b08d703 Uploaded
cpt
parents:
diff changeset
259 f_start + 1,
c3140b08d703 Uploaded
cpt
parents:
diff changeset
260 f_end,
c3140b08d703 Uploaded
cpt
parents:
diff changeset
261 ".",
c3140b08d703 Uploaded
cpt
parents:
diff changeset
262 nice_strand,
c3140b08d703 Uploaded
cpt
parents:
diff changeset
263 0,
c3140b08d703 Uploaded
cpt
parents:
diff changeset
264 "ID=%s.%s.%s" % (self.ftype, idx, i + 1),
c3140b08d703 Uploaded
cpt
parents:
diff changeset
265 ],
c3140b08d703 Uploaded
cpt
parents:
diff changeset
266 )
c3140b08d703 Uploaded
cpt
parents:
diff changeset
267 )
c3140b08d703 Uploaded
cpt
parents:
diff changeset
268 + "\n"
c3140b08d703 Uploaded
cpt
parents:
diff changeset
269 )
c3140b08d703 Uploaded
cpt
parents:
diff changeset
270 log.info("Found %i %ss", out_count, self.ftype)
c3140b08d703 Uploaded
cpt
parents:
diff changeset
271
c3140b08d703 Uploaded
cpt
parents:
diff changeset
272 def start_chop_and_trans(self, s, strict=True):
c3140b08d703 Uploaded
cpt
parents:
diff changeset
273 """Returns offset, trimmed nuc, protein."""
c3140b08d703 Uploaded
cpt
parents:
diff changeset
274 if strict:
c3140b08d703 Uploaded
cpt
parents:
diff changeset
275 assert s[-3:] in self.stops, s
c3140b08d703 Uploaded
cpt
parents:
diff changeset
276 assert len(s) % 3 == 0
c3140b08d703 Uploaded
cpt
parents:
diff changeset
277 for match in self.re_starts.finditer(s, overlapped=True):
c3140b08d703 Uploaded
cpt
parents:
diff changeset
278 # Must check the start is in frame
c3140b08d703 Uploaded
cpt
parents:
diff changeset
279 start = match.start()
c3140b08d703 Uploaded
cpt
parents:
diff changeset
280 if start % 3 == 0:
c3140b08d703 Uploaded
cpt
parents:
diff changeset
281 n = s[start:]
c3140b08d703 Uploaded
cpt
parents:
diff changeset
282 assert len(n) % 3 == 0, "%s is len %i" % (n, len(n))
c3140b08d703 Uploaded
cpt
parents:
diff changeset
283 if strict:
c3140b08d703 Uploaded
cpt
parents:
diff changeset
284 t = translate(n, self.table)
c3140b08d703 Uploaded
cpt
parents:
diff changeset
285 else:
c3140b08d703 Uploaded
cpt
parents:
diff changeset
286 # Use when missing stop codon,
c3140b08d703 Uploaded
cpt
parents:
diff changeset
287 t = "M" + translate(n[3:], self.table, to_stop=True)
c3140b08d703 Uploaded
cpt
parents:
diff changeset
288 yield start, n, t
c3140b08d703 Uploaded
cpt
parents:
diff changeset
289
c3140b08d703 Uploaded
cpt
parents:
diff changeset
290 def break_up_frame(self, s):
c3140b08d703 Uploaded
cpt
parents:
diff changeset
291 """Returns offset, nuc, protein."""
c3140b08d703 Uploaded
cpt
parents:
diff changeset
292 start = 0
c3140b08d703 Uploaded
cpt
parents:
diff changeset
293 for match in self.re_stops.finditer(s, overlapped=True):
c3140b08d703 Uploaded
cpt
parents:
diff changeset
294 index = match.start() + 3
c3140b08d703 Uploaded
cpt
parents:
diff changeset
295 if index % 3 != 0:
c3140b08d703 Uploaded
cpt
parents:
diff changeset
296 continue
c3140b08d703 Uploaded
cpt
parents:
diff changeset
297 n = s[start:index]
c3140b08d703 Uploaded
cpt
parents:
diff changeset
298 for (offset, n, t) in self.start_chop_and_trans(n):
c3140b08d703 Uploaded
cpt
parents:
diff changeset
299 if n and len(t) >= self.min_len:
c3140b08d703 Uploaded
cpt
parents:
diff changeset
300 yield start + offset, n, t
c3140b08d703 Uploaded
cpt
parents:
diff changeset
301 start = index
c3140b08d703 Uploaded
cpt
parents:
diff changeset
302
c3140b08d703 Uploaded
cpt
parents:
diff changeset
303 def putative_genes_in_sequence(self, nuc_seq):
c3140b08d703 Uploaded
cpt
parents:
diff changeset
304 """Returns start, end, strand, nucleotides, protein.
c3140b08d703 Uploaded
cpt
parents:
diff changeset
305 Co-ordinates are Python style zero-based.
c3140b08d703 Uploaded
cpt
parents:
diff changeset
306 """
c3140b08d703 Uploaded
cpt
parents:
diff changeset
307 nuc_seq = nuc_seq.upper()
c3140b08d703 Uploaded
cpt
parents:
diff changeset
308 # TODO - Refactor to use a generator function (in start order)
c3140b08d703 Uploaded
cpt
parents:
diff changeset
309 # rather than making a list and sorting?
c3140b08d703 Uploaded
cpt
parents:
diff changeset
310 answer = []
c3140b08d703 Uploaded
cpt
parents:
diff changeset
311 full_len = len(nuc_seq)
c3140b08d703 Uploaded
cpt
parents:
diff changeset
312
c3140b08d703 Uploaded
cpt
parents:
diff changeset
313 for frame in range(0, 3):
c3140b08d703 Uploaded
cpt
parents:
diff changeset
314 for offset, n, t in self.break_up_frame(nuc_seq[frame:]):
c3140b08d703 Uploaded
cpt
parents:
diff changeset
315 start = frame + offset # zero based
c3140b08d703 Uploaded
cpt
parents:
diff changeset
316 answer.append((start, start + len(n), +1, n, t))
c3140b08d703 Uploaded
cpt
parents:
diff changeset
317
c3140b08d703 Uploaded
cpt
parents:
diff changeset
318 rc = reverse_complement(nuc_seq)
c3140b08d703 Uploaded
cpt
parents:
diff changeset
319 for frame in range(0, 3):
c3140b08d703 Uploaded
cpt
parents:
diff changeset
320 for offset, n, t in self.break_up_frame(rc[frame:]):
c3140b08d703 Uploaded
cpt
parents:
diff changeset
321 start = full_len - frame - offset # zero based
c3140b08d703 Uploaded
cpt
parents:
diff changeset
322 answer.append((start, start - len(n), -1, n, t))
c3140b08d703 Uploaded
cpt
parents:
diff changeset
323 answer.sort()
c3140b08d703 Uploaded
cpt
parents:
diff changeset
324 return answer
c3140b08d703 Uploaded
cpt
parents:
diff changeset
325
c3140b08d703 Uploaded
cpt
parents:
diff changeset
326 def get_all_peptides(self, nuc_seq):
c3140b08d703 Uploaded
cpt
parents:
diff changeset
327 """Returns start, end, strand, nucleotides, protein.
c3140b08d703 Uploaded
cpt
parents:
diff changeset
328
c3140b08d703 Uploaded
cpt
parents:
diff changeset
329 Co-ordinates are Python style zero-based.
c3140b08d703 Uploaded
cpt
parents:
diff changeset
330 """
c3140b08d703 Uploaded
cpt
parents:
diff changeset
331 # Refactored into generator by CPT
c3140b08d703 Uploaded
cpt
parents:
diff changeset
332
c3140b08d703 Uploaded
cpt
parents:
diff changeset
333 full_len = len(nuc_seq)
c3140b08d703 Uploaded
cpt
parents:
diff changeset
334 for frame in range(0, 3):
c3140b08d703 Uploaded
cpt
parents:
diff changeset
335 for offset, n, t in self.break_up_frame(nuc_seq[frame:]):
c3140b08d703 Uploaded
cpt
parents:
diff changeset
336 start = frame + offset # zero based
c3140b08d703 Uploaded
cpt
parents:
diff changeset
337 yield (start, start + len(n), +1, n, t)
c3140b08d703 Uploaded
cpt
parents:
diff changeset
338 rc = reverse_complement(nuc_seq)
c3140b08d703 Uploaded
cpt
parents:
diff changeset
339 for frame in range(0, 3):
c3140b08d703 Uploaded
cpt
parents:
diff changeset
340 for offset, n, t in self.break_up_frame(rc[frame:]):
c3140b08d703 Uploaded
cpt
parents:
diff changeset
341 start = full_len - frame - offset # zero based
c3140b08d703 Uploaded
cpt
parents:
diff changeset
342 yield (start - len(n), start, -1, n, t)