comparison syngenic.py @ 0:5397da1ef896 draft

Uploaded
author gianmarco_piccinno
date Tue, 21 May 2019 05:05:15 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:5397da1ef896
1 #!/usr/bin/env python
2
3 __author__= "Gianmarco Piccinno"
4 __version__ = "1.0.0"
5
6 import Bio
7 from Bio import SeqIO
8 from Bio.Seq import Seq
9 from Bio.Alphabet import IUPAC
10 from Bio.Data import IUPACData
11 from Bio.Data import CodonTable
12 import re
13 import sre_yield
14
15 import re
16 import itertools
17 from functools import reduce
18
19 import Bio
20 from Bio import Data
21 from Bio.Data import IUPACData
22 from Bio.Data import CodonTable
23
24 from pprint import pprint
25
26 import pandas as pd
27
28 def _check_bases(seq_string):
29 """
30 Check characters in a string (PRIVATE).
31 Remove digits and white space present in string. Allows any valid ambiguous
32 IUPAC DNA single letters codes (ABCDGHKMNRSTVWY, upper case are converted).
33
34 Other characters (e.g. symbols) trigger a TypeError.
35
36 Returns the string WITH A LEADING SPACE (!). This is for backwards
37 compatibility, and may in part be explained by the fact that
38 Bio.Restriction doesn't use zero based counting.
39 """
40 # Remove white space and make upper case:
41 seq_string = "".join(seq_string.split()).upper()
42 # Remove digits
43 for c in "0123456789":
44 seq_string = seq_string.replace(c, "")
45 # Check only allowed IUPAC letters
46 if not set(seq_string).issubset(set("ABCDGHKMNRSTVWY")):
47 raise TypeError("Invalid character found in %s" % repr(seq_string))
48 return " " + seq_string
49
50 matching = {'A': 'ARWMHVDN', 'C': 'CYSMHBVN', 'G': 'GRSKBVDN',
51 'T': 'TYWKHBDN', 'R': 'ABDGHKMNSRWV', 'Y': 'CBDHKMNSTWVY',
52 'W': 'ABDHKMNRTWVY', 'S': 'CBDGHKMNSRVY', 'M': 'ACBDHMNSRWVY',
53 'K': 'BDGHKNSRTWVY', 'H': 'ACBDHKMNSRTWVY',
54 'B': 'CBDGHKMNSRTWVY', 'V': 'ACBDGHKMNSRWVY',
55 'D': 'ABDGHKMNSRTWVY', 'N': 'ACBDGHKMNSRTWVY'}
56
57 class pattern(object):
58
59
60 def __init__(self, pattern_input):
61 s = str(pattern_input)
62 self.upper = s.isupper()
63 self.data = _check_bases(s)
64 self.pattern = s
65
66 def plan_ambiguity(self):
67 val = Bio.Data.IUPACData.ambiguous_dna_values
68 re_pattern = ""
69 for el in self.pattern:
70 re_pattern = re_pattern + "[" + val[el] + "]"
71 return re_pattern
72
73 class annotated_genome(object):
74
75 def __init__(self, seq):
76 s = str(seq)
77 self.upper = s.isupper()
78 self.data = _check_bases(s)
79 self.seq = s
80
81 def codon_usage(self, codonTable):
82
83 codon_usage = {}
84 tmp = [x for x in re.split(r'(\w{3})', self.seq) if x != ""]
85
86 b_cod_table = CodonTable.unambiguous_dna_by_name["Bacterial"].forward_table
87 aas = set(b_cod_table.values())
88
89 for aa in aas:
90 codon_usage[aa] = {}
91 for codon in b_cod_table.keys():
92 if b_cod_table[codon] == aa:
93 codon_usage[aa][codon] = tmp.count(codon)
94
95 tups = {(outerKey, innerKey): values for outerKey, innerDict in codon_usage.iteritems() for innerKey, values in innerDict.iteritems()}
96
97 codon_usage_ = pd.DataFrame(pd.Series(tups), columns = ["Count"])
98 codon_usage_.index = codon_usage_.index.set_names(["AA", "Codon"])
99 codon_usage_['Proportion'] = codon_usage_.groupby(level=0).transform(lambda x: (x / x.sum()).round(2))
100
101 codon_usage_.reset_index(inplace=True)
102 codon_usage_.index = codon_usage_["Codon"]
103
104 return {"Dictionary": codon_usage, "Tuples": tups, "Table": codon_usage_}
105
106 class plasmid(object):
107 """
108 This class represents a circular plasmid
109 """
110
111 def __init__(self, seq = "", circular=True, features = None):
112
113 if type(seq) in [Bio.SeqRecord.SeqRecord, plasmid, Seq]:
114 s = str(seq.seq)
115 self.features = seq.features
116 else:
117 s = str(seq)
118 if features != None:
119 self.features = features
120 self.upper = s.isupper()
121 #self.data = _check_bases(s)
122 self.data = s
123 self.circular = circular
124 self.Class = s.__class__
125 self.size = len(s)
126 self.seq = self.data
127
128 def __len__(self):
129 return len(self.data)
130
131 def __repr__(self):
132 return "plasmid(%s, circular=%s)" % (repr(self[:]),repr(self.circular))
133
134 def __eq__(self, other):
135 if isinstance(other, plasmid):
136 if repr(self) == repr(other):
137 return True
138 else:
139 return False
140 return False
141
142 def __getitem__(self, i):
143 if self.upper:
144 return self.Class((self.data[i]).upper())
145 return self.Class(self.data[i])
146
147 def sequence(self):
148 return self.data
149
150
151 def extract(self, feature):
152 return self.data[feature.location.start:feature.location.end]#[::feature.strand]
153
154
155 def findpattern(self, pattern, ambiguous_pattern):
156 """
157 Return a list of a given pattern which occurs in the sequence.
158 The list is made of tuple (location, pattern.group).
159 The latter is used with non palindromic sites.
160 Pattern is the regular expression pattern corresponding to the
161 enzyme restriction site.
162 Size is the size of the restriction enzyme recognition-site size.
163 """
164 if not self.circular:
165 data = self.data
166 else:
167 data = self.data + self.data[:len(ambiguous_pattern)]
168 print(data)
169 print(pattern)
170 return [(data[i.start():i.end()], i.start(), i.end(), "innner") if (i.end()<=self.size) else (data[i.start():i.end()], i.start(), i.end()-self.size, "junction") for i in re.finditer(pattern, data)]
171
172 def findpatterns(self, patterns, ambiguous_patterns):
173 """
174 Return a list of a given pattern which occurs in the sequence.
175 The list is made of tuple (location, pattern.group).
176 The latter is used with non palindromic sites.
177 Pattern is the regular expression pattern corresponding to the
178 enzyme restriction site.
179 Size is the size of the restriction enzyme recognition-site size.
180 """
181 patts = {}
182 for i in range(len(patterns)):
183 #print(patterns[i])
184 #print(ambiguous_patterns[i])
185 if not self.circular:
186 data = self.data
187 else:
188 data = self.data + self.data[:len(ambiguous_patterns[i])]
189 #print(data)
190 patts[ambiguous_patterns[i]] = [(data[j.start():j.end()], j.start(), j.end(), "innner") if (j.end()<=self.size) else (data[j.start():j.end()], j.start(), j.end()-self.size, "junction") for j in re.finditer(patterns[i], data)]
191
192 return patts
193
194
195 def codon_switch(self, patterns, annotation, codon_usage):
196
197 table = {}
198
199 for pattern in patterns:
200 if pattern[1] >= annotation & pattern[2] <= annotation:
201 # the pattern is in a coding sequence
202 # do codon switch
203 print("Do codon switch!")
204 else:
205 next
206 return table
207
208 def transversion(self, patterns):
209 return
210
211 def read_patterns(input):
212
213 patterns = []
214
215 with open(input, "rU") as handle:
216 for pat in handle.readlines():
217
218 tmp = Seq(pat.rstrip(), IUPAC.ambiguous_dna)
219
220 patterns.append(tmp)
221 #patterns.append(tmp.reverse_complement())
222
223
224 return patterns
225
226
227 def codon_usage(seqs, codonTable):
228
229 codon_usage = {}
230 tmp = [x for x in re.split(r'(\w{3})', seqs) if x != ""]
231
232 b_cod_table = CodonTable.unambiguous_dna_by_name[codonTable].forward_table
233
234 for cod in CodonTable.unambiguous_dna_by_name[codonTable].stop_codons:
235 b_cod_table[cod] = "_Stop"
236
237 for cod in CodonTable.unambiguous_dna_by_name[codonTable].start_codons:
238 #print(cod)
239 b_cod_table[cod] = b_cod_table[cod]
240
241 aas = set(b_cod_table.values())
242
243 for aa in aas:
244 #print(aa)
245 #codon_usage[aa] = {}
246 for codon in b_cod_table.keys():
247 if b_cod_table[codon] == aa:
248 codon_usage[codon] = tmp.count(codon.split(" ")[0])
249
250 return codon_usage
251
252
253 def read_annotated_genome(data="example.fna", type_="fasta"):
254 """
255 Accepted formats:
256 - fasta (multifasta)
257 - gbk
258
259 """
260
261 seqs = ""
262
263 if type_ == "fasta":
264 with open(data, "rU") as handle:
265 for record in SeqIO.parse(handle, type_):
266 seqs = seqs + str(record.seq)
267
268 elif type_ == "genbank":
269 with open(data, "rU") as input_handle:
270 types = []
271 for record in SeqIO.parse(input_handle, "genbank"):
272 for feature in record.features:
273 types.append(feature.type)
274 if feature.type == "CDS":
275 if feature.location.strand == +1:
276 seq = record.seq[feature.location.start:feature.location.end]
277 seqs = seqs + str(seq)
278 elif feature.location.strand == -1:
279 seq = record.seq[feature.location.start:
280 feature.location.end].reverse_complement()
281 seqs = seqs + str(seq)
282 return seqs
283
284
285 def synonims_(table_name="Bacterial"):
286
287 b_cod_table = CodonTable.unambiguous_dna_by_name[table_name].forward_table
288
289 print(b_cod_table)
290
291 for cod in CodonTable.unambiguous_dna_by_name[table_name].stop_codons:
292 b_cod_table[cod] = "_Stop"
293
294 for cod in CodonTable.unambiguous_dna_by_name[table_name].start_codons:
295 b_cod_table[cod] = "_Start"
296
297 #pprint(b_cod_table)
298 codons = {}
299
300 aas = set(b_cod_table.values())
301
302 for aa in aas:
303 codons[aa] = []
304 for codon in b_cod_table.keys():
305 if b_cod_table[codon] == aa:
306 codons[aa].append(codon)
307
308 #break
309
310 synonims = {}
311
312 for el1 in codons.keys():
313 print(el1)
314 for el2 in codons[el1]:
315 print(el2)
316 synonims[el2] = codons[el1]
317 #synonims[el2] = []
318 #for el3 in codons[el1]#set.difference(set(codons[el1]), {el2}):
319 # print(el3)
320 # synonims[el2].append(el3)
321 #break
322 #break
323 #break
324
325
326 anti_codons = {}
327
328 for codon in synonims.keys():
329 tmp_codon = Bio.Seq.Seq(codon, IUPAC.unambiguous_dna)
330 tmp_anticodon = str(tmp_codon.reverse_complement())
331
332 anti_codons[tmp_anticodon] = []
333
334 for synonim in synonims[codon]:
335 tmp_synonim = Bio.Seq.Seq(synonim, IUPAC.unambiguous_dna)
336 tmp_antisynonim = str(tmp_synonim.reverse_complement())
337 anti_codons[tmp_anticodon].append(tmp_antisynonim)
338
339 check = Bio.Seq.Seq("CTT")
340 anti_check = check.reverse_complement()
341 print("\nCheck:\n" + str(check))
342 print("\nCodons:\n")
343
344 for key in codons.keys():
345 if str(check) in codons[key]:
346 print(codons[key])
347
348 #pprint(codons)
349 print("\nSynonims:\n")
350 pprint(synonims[str(check)])
351 print("\nAnti_Codons:\n")
352 pprint(anti_codons[str(anti_check)])
353
354 #i = synonims.keys()
355 #right = True
356 #while len(i) > 0:
357 # tmp = i.pop()
358 # check = Bio.Seq.Seq(tmp)
359 # anti_check = check.reverse_complement()
360
361
362 return {"synonims":synonims, "anti_synonims":anti_codons}