Mercurial > repos > gianmarco_piccinno > cs_tool_project_rm
comparison syngenic.py @ 0:5397da1ef896 draft
Uploaded
author | gianmarco_piccinno |
---|---|
date | Tue, 21 May 2019 05:05:15 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:5397da1ef896 |
---|---|
1 #!/usr/bin/env python | |
2 | |
3 __author__= "Gianmarco Piccinno" | |
4 __version__ = "1.0.0" | |
5 | |
6 import Bio | |
7 from Bio import SeqIO | |
8 from Bio.Seq import Seq | |
9 from Bio.Alphabet import IUPAC | |
10 from Bio.Data import IUPACData | |
11 from Bio.Data import CodonTable | |
12 import re | |
13 import sre_yield | |
14 | |
15 import re | |
16 import itertools | |
17 from functools import reduce | |
18 | |
19 import Bio | |
20 from Bio import Data | |
21 from Bio.Data import IUPACData | |
22 from Bio.Data import CodonTable | |
23 | |
24 from pprint import pprint | |
25 | |
26 import pandas as pd | |
27 | |
28 def _check_bases(seq_string): | |
29 """ | |
30 Check characters in a string (PRIVATE). | |
31 Remove digits and white space present in string. Allows any valid ambiguous | |
32 IUPAC DNA single letters codes (ABCDGHKMNRSTVWY, upper case are converted). | |
33 | |
34 Other characters (e.g. symbols) trigger a TypeError. | |
35 | |
36 Returns the string WITH A LEADING SPACE (!). This is for backwards | |
37 compatibility, and may in part be explained by the fact that | |
38 Bio.Restriction doesn't use zero based counting. | |
39 """ | |
40 # Remove white space and make upper case: | |
41 seq_string = "".join(seq_string.split()).upper() | |
42 # Remove digits | |
43 for c in "0123456789": | |
44 seq_string = seq_string.replace(c, "") | |
45 # Check only allowed IUPAC letters | |
46 if not set(seq_string).issubset(set("ABCDGHKMNRSTVWY")): | |
47 raise TypeError("Invalid character found in %s" % repr(seq_string)) | |
48 return " " + seq_string | |
49 | |
50 matching = {'A': 'ARWMHVDN', 'C': 'CYSMHBVN', 'G': 'GRSKBVDN', | |
51 'T': 'TYWKHBDN', 'R': 'ABDGHKMNSRWV', 'Y': 'CBDHKMNSTWVY', | |
52 'W': 'ABDHKMNRTWVY', 'S': 'CBDGHKMNSRVY', 'M': 'ACBDHMNSRWVY', | |
53 'K': 'BDGHKNSRTWVY', 'H': 'ACBDHKMNSRTWVY', | |
54 'B': 'CBDGHKMNSRTWVY', 'V': 'ACBDGHKMNSRWVY', | |
55 'D': 'ABDGHKMNSRTWVY', 'N': 'ACBDGHKMNSRTWVY'} | |
56 | |
57 class pattern(object): | |
58 | |
59 | |
60 def __init__(self, pattern_input): | |
61 s = str(pattern_input) | |
62 self.upper = s.isupper() | |
63 self.data = _check_bases(s) | |
64 self.pattern = s | |
65 | |
66 def plan_ambiguity(self): | |
67 val = Bio.Data.IUPACData.ambiguous_dna_values | |
68 re_pattern = "" | |
69 for el in self.pattern: | |
70 re_pattern = re_pattern + "[" + val[el] + "]" | |
71 return re_pattern | |
72 | |
73 class annotated_genome(object): | |
74 | |
75 def __init__(self, seq): | |
76 s = str(seq) | |
77 self.upper = s.isupper() | |
78 self.data = _check_bases(s) | |
79 self.seq = s | |
80 | |
81 def codon_usage(self, codonTable): | |
82 | |
83 codon_usage = {} | |
84 tmp = [x for x in re.split(r'(\w{3})', self.seq) if x != ""] | |
85 | |
86 b_cod_table = CodonTable.unambiguous_dna_by_name["Bacterial"].forward_table | |
87 aas = set(b_cod_table.values()) | |
88 | |
89 for aa in aas: | |
90 codon_usage[aa] = {} | |
91 for codon in b_cod_table.keys(): | |
92 if b_cod_table[codon] == aa: | |
93 codon_usage[aa][codon] = tmp.count(codon) | |
94 | |
95 tups = {(outerKey, innerKey): values for outerKey, innerDict in codon_usage.iteritems() for innerKey, values in innerDict.iteritems()} | |
96 | |
97 codon_usage_ = pd.DataFrame(pd.Series(tups), columns = ["Count"]) | |
98 codon_usage_.index = codon_usage_.index.set_names(["AA", "Codon"]) | |
99 codon_usage_['Proportion'] = codon_usage_.groupby(level=0).transform(lambda x: (x / x.sum()).round(2)) | |
100 | |
101 codon_usage_.reset_index(inplace=True) | |
102 codon_usage_.index = codon_usage_["Codon"] | |
103 | |
104 return {"Dictionary": codon_usage, "Tuples": tups, "Table": codon_usage_} | |
105 | |
106 class plasmid(object): | |
107 """ | |
108 This class represents a circular plasmid | |
109 """ | |
110 | |
111 def __init__(self, seq = "", circular=True, features = None): | |
112 | |
113 if type(seq) in [Bio.SeqRecord.SeqRecord, plasmid, Seq]: | |
114 s = str(seq.seq) | |
115 self.features = seq.features | |
116 else: | |
117 s = str(seq) | |
118 if features != None: | |
119 self.features = features | |
120 self.upper = s.isupper() | |
121 #self.data = _check_bases(s) | |
122 self.data = s | |
123 self.circular = circular | |
124 self.Class = s.__class__ | |
125 self.size = len(s) | |
126 self.seq = self.data | |
127 | |
128 def __len__(self): | |
129 return len(self.data) | |
130 | |
131 def __repr__(self): | |
132 return "plasmid(%s, circular=%s)" % (repr(self[:]),repr(self.circular)) | |
133 | |
134 def __eq__(self, other): | |
135 if isinstance(other, plasmid): | |
136 if repr(self) == repr(other): | |
137 return True | |
138 else: | |
139 return False | |
140 return False | |
141 | |
142 def __getitem__(self, i): | |
143 if self.upper: | |
144 return self.Class((self.data[i]).upper()) | |
145 return self.Class(self.data[i]) | |
146 | |
147 def sequence(self): | |
148 return self.data | |
149 | |
150 | |
151 def extract(self, feature): | |
152 return self.data[feature.location.start:feature.location.end]#[::feature.strand] | |
153 | |
154 | |
155 def findpattern(self, pattern, ambiguous_pattern): | |
156 """ | |
157 Return a list of a given pattern which occurs in the sequence. | |
158 The list is made of tuple (location, pattern.group). | |
159 The latter is used with non palindromic sites. | |
160 Pattern is the regular expression pattern corresponding to the | |
161 enzyme restriction site. | |
162 Size is the size of the restriction enzyme recognition-site size. | |
163 """ | |
164 if not self.circular: | |
165 data = self.data | |
166 else: | |
167 data = self.data + self.data[:len(ambiguous_pattern)] | |
168 print(data) | |
169 print(pattern) | |
170 return [(data[i.start():i.end()], i.start(), i.end(), "innner") if (i.end()<=self.size) else (data[i.start():i.end()], i.start(), i.end()-self.size, "junction") for i in re.finditer(pattern, data)] | |
171 | |
172 def findpatterns(self, patterns, ambiguous_patterns): | |
173 """ | |
174 Return a list of a given pattern which occurs in the sequence. | |
175 The list is made of tuple (location, pattern.group). | |
176 The latter is used with non palindromic sites. | |
177 Pattern is the regular expression pattern corresponding to the | |
178 enzyme restriction site. | |
179 Size is the size of the restriction enzyme recognition-site size. | |
180 """ | |
181 patts = {} | |
182 for i in range(len(patterns)): | |
183 #print(patterns[i]) | |
184 #print(ambiguous_patterns[i]) | |
185 if not self.circular: | |
186 data = self.data | |
187 else: | |
188 data = self.data + self.data[:len(ambiguous_patterns[i])] | |
189 #print(data) | |
190 patts[ambiguous_patterns[i]] = [(data[j.start():j.end()], j.start(), j.end(), "innner") if (j.end()<=self.size) else (data[j.start():j.end()], j.start(), j.end()-self.size, "junction") for j in re.finditer(patterns[i], data)] | |
191 | |
192 return patts | |
193 | |
194 | |
195 def codon_switch(self, patterns, annotation, codon_usage): | |
196 | |
197 table = {} | |
198 | |
199 for pattern in patterns: | |
200 if pattern[1] >= annotation & pattern[2] <= annotation: | |
201 # the pattern is in a coding sequence | |
202 # do codon switch | |
203 print("Do codon switch!") | |
204 else: | |
205 next | |
206 return table | |
207 | |
208 def transversion(self, patterns): | |
209 return | |
210 | |
211 def read_patterns(input): | |
212 | |
213 patterns = [] | |
214 | |
215 with open(input, "rU") as handle: | |
216 for pat in handle.readlines(): | |
217 | |
218 tmp = Seq(pat.rstrip(), IUPAC.ambiguous_dna) | |
219 | |
220 patterns.append(tmp) | |
221 #patterns.append(tmp.reverse_complement()) | |
222 | |
223 | |
224 return patterns | |
225 | |
226 | |
227 def codon_usage(seqs, codonTable): | |
228 | |
229 codon_usage = {} | |
230 tmp = [x for x in re.split(r'(\w{3})', seqs) if x != ""] | |
231 | |
232 b_cod_table = CodonTable.unambiguous_dna_by_name[codonTable].forward_table | |
233 | |
234 for cod in CodonTable.unambiguous_dna_by_name[codonTable].stop_codons: | |
235 b_cod_table[cod] = "_Stop" | |
236 | |
237 for cod in CodonTable.unambiguous_dna_by_name[codonTable].start_codons: | |
238 #print(cod) | |
239 b_cod_table[cod] = b_cod_table[cod] | |
240 | |
241 aas = set(b_cod_table.values()) | |
242 | |
243 for aa in aas: | |
244 #print(aa) | |
245 #codon_usage[aa] = {} | |
246 for codon in b_cod_table.keys(): | |
247 if b_cod_table[codon] == aa: | |
248 codon_usage[codon] = tmp.count(codon.split(" ")[0]) | |
249 | |
250 return codon_usage | |
251 | |
252 | |
253 def read_annotated_genome(data="example.fna", type_="fasta"): | |
254 """ | |
255 Accepted formats: | |
256 - fasta (multifasta) | |
257 - gbk | |
258 | |
259 """ | |
260 | |
261 seqs = "" | |
262 | |
263 if type_ == "fasta": | |
264 with open(data, "rU") as handle: | |
265 for record in SeqIO.parse(handle, type_): | |
266 seqs = seqs + str(record.seq) | |
267 | |
268 elif type_ == "genbank": | |
269 with open(data, "rU") as input_handle: | |
270 types = [] | |
271 for record in SeqIO.parse(input_handle, "genbank"): | |
272 for feature in record.features: | |
273 types.append(feature.type) | |
274 if feature.type == "CDS": | |
275 if feature.location.strand == +1: | |
276 seq = record.seq[feature.location.start:feature.location.end] | |
277 seqs = seqs + str(seq) | |
278 elif feature.location.strand == -1: | |
279 seq = record.seq[feature.location.start: | |
280 feature.location.end].reverse_complement() | |
281 seqs = seqs + str(seq) | |
282 return seqs | |
283 | |
284 | |
285 def synonims_(table_name="Bacterial"): | |
286 | |
287 b_cod_table = CodonTable.unambiguous_dna_by_name[table_name].forward_table | |
288 | |
289 print(b_cod_table) | |
290 | |
291 for cod in CodonTable.unambiguous_dna_by_name[table_name].stop_codons: | |
292 b_cod_table[cod] = "_Stop" | |
293 | |
294 for cod in CodonTable.unambiguous_dna_by_name[table_name].start_codons: | |
295 b_cod_table[cod] = "_Start" | |
296 | |
297 #pprint(b_cod_table) | |
298 codons = {} | |
299 | |
300 aas = set(b_cod_table.values()) | |
301 | |
302 for aa in aas: | |
303 codons[aa] = [] | |
304 for codon in b_cod_table.keys(): | |
305 if b_cod_table[codon] == aa: | |
306 codons[aa].append(codon) | |
307 | |
308 #break | |
309 | |
310 synonims = {} | |
311 | |
312 for el1 in codons.keys(): | |
313 print(el1) | |
314 for el2 in codons[el1]: | |
315 print(el2) | |
316 synonims[el2] = codons[el1] | |
317 #synonims[el2] = [] | |
318 #for el3 in codons[el1]#set.difference(set(codons[el1]), {el2}): | |
319 # print(el3) | |
320 # synonims[el2].append(el3) | |
321 #break | |
322 #break | |
323 #break | |
324 | |
325 | |
326 anti_codons = {} | |
327 | |
328 for codon in synonims.keys(): | |
329 tmp_codon = Bio.Seq.Seq(codon, IUPAC.unambiguous_dna) | |
330 tmp_anticodon = str(tmp_codon.reverse_complement()) | |
331 | |
332 anti_codons[tmp_anticodon] = [] | |
333 | |
334 for synonim in synonims[codon]: | |
335 tmp_synonim = Bio.Seq.Seq(synonim, IUPAC.unambiguous_dna) | |
336 tmp_antisynonim = str(tmp_synonim.reverse_complement()) | |
337 anti_codons[tmp_anticodon].append(tmp_antisynonim) | |
338 | |
339 check = Bio.Seq.Seq("CTT") | |
340 anti_check = check.reverse_complement() | |
341 print("\nCheck:\n" + str(check)) | |
342 print("\nCodons:\n") | |
343 | |
344 for key in codons.keys(): | |
345 if str(check) in codons[key]: | |
346 print(codons[key]) | |
347 | |
348 #pprint(codons) | |
349 print("\nSynonims:\n") | |
350 pprint(synonims[str(check)]) | |
351 print("\nAnti_Codons:\n") | |
352 pprint(anti_codons[str(anti_check)]) | |
353 | |
354 #i = synonims.keys() | |
355 #right = True | |
356 #while len(i) > 0: | |
357 # tmp = i.pop() | |
358 # check = Bio.Seq.Seq(tmp) | |
359 # anti_check = check.reverse_complement() | |
360 | |
361 | |
362 return {"synonims":synonims, "anti_synonims":anti_codons} |