annotate CodonSwitchTool/syngenic.py @ 49:640db7b6847b draft default tip

Uploaded
author gianmarco_piccinno
date Mon, 20 May 2019 18:10:52 -0400
parents bd35b13fabfb
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
41
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
1 #!/usr/bin/env python
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
2
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
3 __author__= "Gianmarco Piccinno"
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
4 __version__ = "1.0.0"
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
5
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
6 import Bio
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
7 from Bio import SeqIO
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
8 from Bio.Seq import Seq
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
9 from Bio.Alphabet import IUPAC
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
10 from Bio.Data import IUPACData
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
11 from Bio.Data import CodonTable
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
12 import re
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
13 import sre_yield
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
14
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
15 import re
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
16 import itertools
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
17 from functools import reduce
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
18
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
19 import Bio
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
20 from Bio import Data
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
21 from Bio.Data import IUPACData
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
22 from Bio.Data import CodonTable
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
23
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
24 from pprint import pprint
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
25
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
26 import pandas as pd
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
27
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
28 def _check_bases(seq_string):
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
29 """
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
30 Check characters in a string (PRIVATE).
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
31 Remove digits and white space present in string. Allows any valid ambiguous
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
32 IUPAC DNA single letters codes (ABCDGHKMNRSTVWY, upper case are converted).
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
33
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
34 Other characters (e.g. symbols) trigger a TypeError.
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
35
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
36 Returns the string WITH A LEADING SPACE (!). This is for backwards
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
37 compatibility, and may in part be explained by the fact that
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
38 Bio.Restriction doesn't use zero based counting.
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
39 """
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
40 # Remove white space and make upper case:
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
41 seq_string = "".join(seq_string.split()).upper()
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
42 # Remove digits
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
43 for c in "0123456789":
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
44 seq_string = seq_string.replace(c, "")
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
45 # Check only allowed IUPAC letters
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
46 if not set(seq_string).issubset(set("ABCDGHKMNRSTVWY")):
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
47 raise TypeError("Invalid character found in %s" % repr(seq_string))
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
48 return " " + seq_string
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
49
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
50 matching = {'A': 'ARWMHVDN', 'C': 'CYSMHBVN', 'G': 'GRSKBVDN',
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
51 'T': 'TYWKHBDN', 'R': 'ABDGHKMNSRWV', 'Y': 'CBDHKMNSTWVY',
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
52 'W': 'ABDHKMNRTWVY', 'S': 'CBDGHKMNSRVY', 'M': 'ACBDHMNSRWVY',
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
53 'K': 'BDGHKNSRTWVY', 'H': 'ACBDHKMNSRTWVY',
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
54 'B': 'CBDGHKMNSRTWVY', 'V': 'ACBDGHKMNSRWVY',
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
55 'D': 'ABDGHKMNSRTWVY', 'N': 'ACBDGHKMNSRTWVY'}
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
56
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
57 class pattern(object):
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
58
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
59
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
60 def __init__(self, pattern_input):
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
61 s = str(pattern_input)
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
62 self.upper = s.isupper()
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
63 self.data = _check_bases(s)
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
64 self.pattern = s
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
65
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
66 def plan_ambiguity(self):
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
67 val = Bio.Data.IUPACData.ambiguous_dna_values
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
68 re_pattern = ""
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
69 for el in self.pattern:
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
70 re_pattern = re_pattern + "[" + val[el] + "]"
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
71 return re_pattern
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
72
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
73 class annotated_genome(object):
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
74
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
75 def __init__(self, seq):
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
76 s = str(seq)
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
77 self.upper = s.isupper()
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
78 self.data = _check_bases(s)
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
79 self.seq = s
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
80
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
81 def codon_usage(self, codonTable):
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
82
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
83 codon_usage = {}
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
84 tmp = [x for x in re.split(r'(\w{3})', self.seq) if x != ""]
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
85
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
86 b_cod_table = CodonTable.unambiguous_dna_by_name["Bacterial"].forward_table
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
87 aas = set(b_cod_table.values())
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
88
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
89 for aa in aas:
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
90 codon_usage[aa] = {}
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
91 for codon in b_cod_table.keys():
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
92 if b_cod_table[codon] == aa:
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
93 codon_usage[aa][codon] = tmp.count(codon)
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
94
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
95 tups = {(outerKey, innerKey): values for outerKey, innerDict in codon_usage.iteritems() for innerKey, values in innerDict.iteritems()}
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
96
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
97 codon_usage_ = pd.DataFrame(pd.Series(tups), columns = ["Count"])
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
98 codon_usage_.index = codon_usage_.index.set_names(["AA", "Codon"])
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
99 codon_usage_['Proportion'] = codon_usage_.groupby(level=0).transform(lambda x: (x / x.sum()).round(2))
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
100
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
101 codon_usage_.reset_index(inplace=True)
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
102 codon_usage_.index = codon_usage_["Codon"]
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
103
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
104 return {"Dictionary": codon_usage, "Tuples": tups, "Table": codon_usage_}
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
105
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
106 class plasmid(object):
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
107 """
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
108 This class represents a circular plasmid
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
109 """
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
110
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
111 def __init__(self, seq = "", circular=True, features = None):
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
112
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
113 if type(seq) in [Bio.SeqRecord.SeqRecord, plasmid, Seq]:
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
114 s = str(seq.seq)
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
115 self.features = seq.features
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
116 else:
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
117 s = str(seq)
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
118 if features != None:
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
119 self.features = features
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
120 self.upper = s.isupper()
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
121 #self.data = _check_bases(s)
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
122 self.data = s
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
123 self.circular = circular
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
124 self.Class = s.__class__
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
125 self.size = len(s)
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
126 self.seq = self.data
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
127
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
128 def __len__(self):
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
129 return len(self.data)
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
130
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
131 def __repr__(self):
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
132 return "plasmid(%s, circular=%s)" % (repr(self[:]),repr(self.circular))
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
133
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
134 def __eq__(self, other):
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
135 if isinstance(other, plasmid):
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
136 if repr(self) == repr(other):
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
137 return True
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
138 else:
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
139 return False
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
140 return False
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
141
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
142 def __getitem__(self, i):
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
143 if self.upper:
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
144 return self.Class((self.data[i]).upper())
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
145 return self.Class(self.data[i])
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
146
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
147 def sequence(self):
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
148 return self.data
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
149
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
150
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
151 def extract(self, feature):
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
152 return self.data[feature.location.start:feature.location.end]#[::feature.strand]
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
153
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
154
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
155 def findpattern(self, pattern, ambiguous_pattern):
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
156 """
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
157 Return a list of a given pattern which occurs in the sequence.
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
158 The list is made of tuple (location, pattern.group).
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
159 The latter is used with non palindromic sites.
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
160 Pattern is the regular expression pattern corresponding to the
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
161 enzyme restriction site.
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
162 Size is the size of the restriction enzyme recognition-site size.
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
163 """
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
164 if not self.circular:
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
165 data = self.data
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
166 else:
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
167 data = self.data + self.data[:len(ambiguous_pattern)]
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
168 print(data)
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
169 print(pattern)
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
170 return [(data[i.start():i.end()], i.start(), i.end(), "innner") if (i.end()<=self.size) else (data[i.start():i.end()], i.start(), i.end()-self.size, "junction") for i in re.finditer(pattern, data)]
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
171
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
172 def findpatterns(self, patterns, ambiguous_patterns):
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
173 """
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
174 Return a list of a given pattern which occurs in the sequence.
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
175 The list is made of tuple (location, pattern.group).
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
176 The latter is used with non palindromic sites.
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
177 Pattern is the regular expression pattern corresponding to the
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
178 enzyme restriction site.
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
179 Size is the size of the restriction enzyme recognition-site size.
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
180 """
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
181 patts = {}
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
182 for i in range(len(patterns)):
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
183 #print(patterns[i])
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
184 #print(ambiguous_patterns[i])
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
185 if not self.circular:
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
186 data = self.data
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
187 else:
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
188 data = self.data + self.data[:len(ambiguous_patterns[i])]
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
189 #print(data)
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
190 patts[ambiguous_patterns[i]] = [(data[j.start():j.end()], j.start(), j.end(), "innner") if (j.end()<=self.size) else (data[j.start():j.end()], j.start(), j.end()-self.size, "junction") for j in re.finditer(patterns[i], data)]
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
191
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
192 return patts
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
193
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
194
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
195 def codon_switch(self, patterns, annotation, codon_usage):
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
196
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
197 table = {}
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
198
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
199 for pattern in patterns:
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
200 if pattern[1] >= annotation & pattern[2] <= annotation:
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
201 # the pattern is in a coding sequence
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
202 # do codon switch
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
203 print("Do codon switch!")
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
204 else:
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
205 next
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
206 return table
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
207
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
208 def transversion(self, patterns):
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
209 return
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
210
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
211 def read_patterns(input):
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
212
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
213 patterns = []
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
214
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
215 with open(input, "rU") as handle:
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
216 for pat in handle.readlines():
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
217
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
218 tmp = Seq(pat.rstrip(), IUPAC.ambiguous_dna)
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
219
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
220 patterns.append(tmp)
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
221 #patterns.append(tmp.reverse_complement())
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
222
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
223
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
224 return patterns
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
225
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
226
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
227 def codon_usage(seqs, codonTable):
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
228
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
229 codon_usage = {}
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
230 tmp = [x for x in re.split(r'(\w{3})', seqs) if x != ""]
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
231
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
232 b_cod_table = CodonTable.unambiguous_dna_by_name[codonTable].forward_table
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
233
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
234 for cod in CodonTable.unambiguous_dna_by_name[codonTable].stop_codons:
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
235 b_cod_table[cod] = "_Stop"
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
236
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
237 for cod in CodonTable.unambiguous_dna_by_name[codonTable].start_codons:
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
238 #print(cod)
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
239 b_cod_table[cod] = b_cod_table[cod]
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
240
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
241 aas = set(b_cod_table.values())
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
242
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
243 for aa in aas:
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
244 #print(aa)
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
245 #codon_usage[aa] = {}
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
246 for codon in b_cod_table.keys():
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
247 if b_cod_table[codon] == aa:
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
248 codon_usage[codon] = tmp.count(codon.split(" ")[0])
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
249
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
250 return codon_usage
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
251
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
252
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
253 def read_annotated_genome(data="example.fna", type_="fasta"):
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
254 """
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
255 Accepted formats:
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
256 - fasta (multifasta)
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
257 - gbk
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
258
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
259 """
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
260
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
261 seqs = ""
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
262
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
263 if type_ == "fasta":
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
264 with open(data, "rU") as handle:
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
265 for record in SeqIO.parse(handle, type_):
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
266 seqs = seqs + str(record.seq)
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
267
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
268 elif type_ == "genbank":
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
269 with open(data, "rU") as input_handle:
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
270 types = []
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
271 for record in SeqIO.parse(input_handle, "genbank"):
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
272 for feature in record.features:
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
273 types.append(feature.type)
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
274 if feature.type == "CDS":
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
275 if feature.location.strand == +1:
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
276 seq = record.seq[feature.location.start:feature.location.end]
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
277 seqs = seqs + str(seq)
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
278 elif feature.location.strand == -1:
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
279 seq = record.seq[feature.location.start:
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
280 feature.location.end].reverse_complement()
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
281 seqs = seqs + str(seq)
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
282 return seqs
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
283
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
284
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
285 def synonims_(table_name="Bacterial"):
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
286
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
287 b_cod_table = CodonTable.unambiguous_dna_by_name[table_name].forward_table
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
288
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
289 print(b_cod_table)
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
290
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
291 for cod in CodonTable.unambiguous_dna_by_name[table_name].stop_codons:
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
292 b_cod_table[cod] = "_Stop"
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
293
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
294 for cod in CodonTable.unambiguous_dna_by_name[table_name].start_codons:
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
295 b_cod_table[cod] = "_Start"
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
296
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
297 #pprint(b_cod_table)
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
298 codons = {}
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
299
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
300 aas = set(b_cod_table.values())
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
301
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
302 for aa in aas:
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
303 codons[aa] = []
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
304 for codon in b_cod_table.keys():
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
305 if b_cod_table[codon] == aa:
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
306 codons[aa].append(codon)
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
307
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
308 #break
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
309
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
310 synonims = {}
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
311
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
312 for el1 in codons.keys():
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
313 print(el1)
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
314 for el2 in codons[el1]:
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
315 print(el2)
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
316 synonims[el2] = codons[el1]
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
317 #synonims[el2] = []
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
318 #for el3 in codons[el1]#set.difference(set(codons[el1]), {el2}):
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
319 # print(el3)
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
320 # synonims[el2].append(el3)
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
321 #break
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
322 #break
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
323 #break
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
324
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
325
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
326 anti_codons = {}
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
327
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
328 for codon in synonims.keys():
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
329 tmp_codon = Bio.Seq.Seq(codon, IUPAC.unambiguous_dna)
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
330 tmp_anticodon = str(tmp_codon.reverse_complement())
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
331
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
332 anti_codons[tmp_anticodon] = []
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
333
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
334 for synonim in synonims[codon]:
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
335 tmp_synonim = Bio.Seq.Seq(synonim, IUPAC.unambiguous_dna)
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
336 tmp_antisynonim = str(tmp_synonim.reverse_complement())
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
337 anti_codons[tmp_anticodon].append(tmp_antisynonim)
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
338
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
339 check = Bio.Seq.Seq("CTT")
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
340 anti_check = check.reverse_complement()
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
341 print("\nCheck:\n" + str(check))
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
342 print("\nCodons:\n")
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
343
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
344 for key in codons.keys():
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
345 if str(check) in codons[key]:
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
346 print(codons[key])
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
347
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
348 #pprint(codons)
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
349 print("\nSynonims:\n")
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
350 pprint(synonims[str(check)])
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
351 print("\nAnti_Codons:\n")
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
352 pprint(anti_codons[str(anti_check)])
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
353
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
354 #i = synonims.keys()
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
355 #right = True
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
356 #while len(i) > 0:
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
357 # tmp = i.pop()
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
358 # check = Bio.Seq.Seq(tmp)
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
359 # anti_check = check.reverse_complement()
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
360
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
361
bd35b13fabfb Uploaded
gianmarco_piccinno
parents:
diff changeset
362 return {"synonims":synonims, "anti_synonims":anti_codons}