annotate syngenic.py @ 0:5397da1ef896 draft

Uploaded
author gianmarco_piccinno
date Tue, 21 May 2019 05:05:15 -0400
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
1 #!/usr/bin/env python
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
2
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
3 __author__= "Gianmarco Piccinno"
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
4 __version__ = "1.0.0"
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
5
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
6 import Bio
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
7 from Bio import SeqIO
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
8 from Bio.Seq import Seq
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
9 from Bio.Alphabet import IUPAC
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
10 from Bio.Data import IUPACData
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
11 from Bio.Data import CodonTable
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
12 import re
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
13 import sre_yield
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
14
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
15 import re
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
16 import itertools
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
17 from functools import reduce
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
18
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
19 import Bio
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
20 from Bio import Data
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
21 from Bio.Data import IUPACData
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
22 from Bio.Data import CodonTable
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
23
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
24 from pprint import pprint
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
25
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
26 import pandas as pd
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
27
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
28 def _check_bases(seq_string):
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
29 """
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
30 Check characters in a string (PRIVATE).
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
31 Remove digits and white space present in string. Allows any valid ambiguous
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
32 IUPAC DNA single letters codes (ABCDGHKMNRSTVWY, upper case are converted).
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
33
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
34 Other characters (e.g. symbols) trigger a TypeError.
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
35
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
36 Returns the string WITH A LEADING SPACE (!). This is for backwards
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
37 compatibility, and may in part be explained by the fact that
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
38 Bio.Restriction doesn't use zero based counting.
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
39 """
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
40 # Remove white space and make upper case:
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
41 seq_string = "".join(seq_string.split()).upper()
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
42 # Remove digits
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
43 for c in "0123456789":
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
44 seq_string = seq_string.replace(c, "")
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
45 # Check only allowed IUPAC letters
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
46 if not set(seq_string).issubset(set("ABCDGHKMNRSTVWY")):
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
47 raise TypeError("Invalid character found in %s" % repr(seq_string))
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
48 return " " + seq_string
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
49
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
50 matching = {'A': 'ARWMHVDN', 'C': 'CYSMHBVN', 'G': 'GRSKBVDN',
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
51 'T': 'TYWKHBDN', 'R': 'ABDGHKMNSRWV', 'Y': 'CBDHKMNSTWVY',
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
52 'W': 'ABDHKMNRTWVY', 'S': 'CBDGHKMNSRVY', 'M': 'ACBDHMNSRWVY',
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
53 'K': 'BDGHKNSRTWVY', 'H': 'ACBDHKMNSRTWVY',
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
54 'B': 'CBDGHKMNSRTWVY', 'V': 'ACBDGHKMNSRWVY',
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
55 'D': 'ABDGHKMNSRTWVY', 'N': 'ACBDGHKMNSRTWVY'}
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
56
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
57 class pattern(object):
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
58
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
59
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
60 def __init__(self, pattern_input):
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
61 s = str(pattern_input)
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
62 self.upper = s.isupper()
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
63 self.data = _check_bases(s)
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
64 self.pattern = s
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
65
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
66 def plan_ambiguity(self):
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
67 val = Bio.Data.IUPACData.ambiguous_dna_values
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
68 re_pattern = ""
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
69 for el in self.pattern:
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
70 re_pattern = re_pattern + "[" + val[el] + "]"
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
71 return re_pattern
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
72
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
73 class annotated_genome(object):
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
74
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
75 def __init__(self, seq):
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
76 s = str(seq)
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
77 self.upper = s.isupper()
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
78 self.data = _check_bases(s)
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
79 self.seq = s
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
80
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
81 def codon_usage(self, codonTable):
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
82
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
83 codon_usage = {}
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
84 tmp = [x for x in re.split(r'(\w{3})', self.seq) if x != ""]
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
85
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
86 b_cod_table = CodonTable.unambiguous_dna_by_name["Bacterial"].forward_table
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
87 aas = set(b_cod_table.values())
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
88
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
89 for aa in aas:
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
90 codon_usage[aa] = {}
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
91 for codon in b_cod_table.keys():
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
92 if b_cod_table[codon] == aa:
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
93 codon_usage[aa][codon] = tmp.count(codon)
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
94
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
95 tups = {(outerKey, innerKey): values for outerKey, innerDict in codon_usage.iteritems() for innerKey, values in innerDict.iteritems()}
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
96
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
97 codon_usage_ = pd.DataFrame(pd.Series(tups), columns = ["Count"])
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
98 codon_usage_.index = codon_usage_.index.set_names(["AA", "Codon"])
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
99 codon_usage_['Proportion'] = codon_usage_.groupby(level=0).transform(lambda x: (x / x.sum()).round(2))
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
100
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
101 codon_usage_.reset_index(inplace=True)
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
102 codon_usage_.index = codon_usage_["Codon"]
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
103
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
104 return {"Dictionary": codon_usage, "Tuples": tups, "Table": codon_usage_}
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
105
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
106 class plasmid(object):
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
107 """
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
108 This class represents a circular plasmid
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
109 """
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
110
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
111 def __init__(self, seq = "", circular=True, features = None):
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
112
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
113 if type(seq) in [Bio.SeqRecord.SeqRecord, plasmid, Seq]:
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
114 s = str(seq.seq)
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
115 self.features = seq.features
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
116 else:
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
117 s = str(seq)
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
118 if features != None:
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
119 self.features = features
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
120 self.upper = s.isupper()
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
121 #self.data = _check_bases(s)
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
122 self.data = s
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
123 self.circular = circular
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
124 self.Class = s.__class__
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
125 self.size = len(s)
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
126 self.seq = self.data
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
127
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
128 def __len__(self):
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
129 return len(self.data)
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
130
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
131 def __repr__(self):
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
132 return "plasmid(%s, circular=%s)" % (repr(self[:]),repr(self.circular))
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
133
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
134 def __eq__(self, other):
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
135 if isinstance(other, plasmid):
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
136 if repr(self) == repr(other):
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
137 return True
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
138 else:
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
139 return False
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
140 return False
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
141
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
142 def __getitem__(self, i):
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
143 if self.upper:
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
144 return self.Class((self.data[i]).upper())
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
145 return self.Class(self.data[i])
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
146
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
147 def sequence(self):
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
148 return self.data
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
149
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
150
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
151 def extract(self, feature):
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
152 return self.data[feature.location.start:feature.location.end]#[::feature.strand]
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
153
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
154
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
155 def findpattern(self, pattern, ambiguous_pattern):
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
156 """
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
157 Return a list of a given pattern which occurs in the sequence.
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
158 The list is made of tuple (location, pattern.group).
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
159 The latter is used with non palindromic sites.
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
160 Pattern is the regular expression pattern corresponding to the
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
161 enzyme restriction site.
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
162 Size is the size of the restriction enzyme recognition-site size.
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
163 """
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
164 if not self.circular:
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
165 data = self.data
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
166 else:
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
167 data = self.data + self.data[:len(ambiguous_pattern)]
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
168 print(data)
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
169 print(pattern)
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
170 return [(data[i.start():i.end()], i.start(), i.end(), "innner") if (i.end()<=self.size) else (data[i.start():i.end()], i.start(), i.end()-self.size, "junction") for i in re.finditer(pattern, data)]
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
171
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
172 def findpatterns(self, patterns, ambiguous_patterns):
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
173 """
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
174 Return a list of a given pattern which occurs in the sequence.
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
175 The list is made of tuple (location, pattern.group).
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
176 The latter is used with non palindromic sites.
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
177 Pattern is the regular expression pattern corresponding to the
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
178 enzyme restriction site.
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
179 Size is the size of the restriction enzyme recognition-site size.
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
180 """
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
181 patts = {}
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
182 for i in range(len(patterns)):
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
183 #print(patterns[i])
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
184 #print(ambiguous_patterns[i])
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
185 if not self.circular:
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
186 data = self.data
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
187 else:
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
188 data = self.data + self.data[:len(ambiguous_patterns[i])]
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
189 #print(data)
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
190 patts[ambiguous_patterns[i]] = [(data[j.start():j.end()], j.start(), j.end(), "innner") if (j.end()<=self.size) else (data[j.start():j.end()], j.start(), j.end()-self.size, "junction") for j in re.finditer(patterns[i], data)]
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
191
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
192 return patts
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
193
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
194
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
195 def codon_switch(self, patterns, annotation, codon_usage):
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
196
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
197 table = {}
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
198
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
199 for pattern in patterns:
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
200 if pattern[1] >= annotation & pattern[2] <= annotation:
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
201 # the pattern is in a coding sequence
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
202 # do codon switch
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
203 print("Do codon switch!")
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
204 else:
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
205 next
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
206 return table
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
207
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
208 def transversion(self, patterns):
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
209 return
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
210
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
211 def read_patterns(input):
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
212
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
213 patterns = []
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
214
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
215 with open(input, "rU") as handle:
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
216 for pat in handle.readlines():
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
217
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
218 tmp = Seq(pat.rstrip(), IUPAC.ambiguous_dna)
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
219
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
220 patterns.append(tmp)
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
221 #patterns.append(tmp.reverse_complement())
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
222
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
223
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
224 return patterns
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
225
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
226
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
227 def codon_usage(seqs, codonTable):
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
228
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
229 codon_usage = {}
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
230 tmp = [x for x in re.split(r'(\w{3})', seqs) if x != ""]
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
231
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
232 b_cod_table = CodonTable.unambiguous_dna_by_name[codonTable].forward_table
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
233
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
234 for cod in CodonTable.unambiguous_dna_by_name[codonTable].stop_codons:
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
235 b_cod_table[cod] = "_Stop"
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
236
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
237 for cod in CodonTable.unambiguous_dna_by_name[codonTable].start_codons:
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
238 #print(cod)
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
239 b_cod_table[cod] = b_cod_table[cod]
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
240
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
241 aas = set(b_cod_table.values())
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
242
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
243 for aa in aas:
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
244 #print(aa)
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
245 #codon_usage[aa] = {}
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
246 for codon in b_cod_table.keys():
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
247 if b_cod_table[codon] == aa:
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
248 codon_usage[codon] = tmp.count(codon.split(" ")[0])
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
249
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
250 return codon_usage
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
251
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
252
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
253 def read_annotated_genome(data="example.fna", type_="fasta"):
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
254 """
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
255 Accepted formats:
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
256 - fasta (multifasta)
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
257 - gbk
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
258
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
259 """
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
260
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
261 seqs = ""
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
262
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
263 if type_ == "fasta":
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
264 with open(data, "rU") as handle:
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
265 for record in SeqIO.parse(handle, type_):
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
266 seqs = seqs + str(record.seq)
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
267
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
268 elif type_ == "genbank":
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
269 with open(data, "rU") as input_handle:
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
270 types = []
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
271 for record in SeqIO.parse(input_handle, "genbank"):
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
272 for feature in record.features:
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
273 types.append(feature.type)
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
274 if feature.type == "CDS":
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
275 if feature.location.strand == +1:
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
276 seq = record.seq[feature.location.start:feature.location.end]
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
277 seqs = seqs + str(seq)
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
278 elif feature.location.strand == -1:
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
279 seq = record.seq[feature.location.start:
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
280 feature.location.end].reverse_complement()
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
281 seqs = seqs + str(seq)
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
282 return seqs
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
283
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
284
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
285 def synonims_(table_name="Bacterial"):
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
286
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
287 b_cod_table = CodonTable.unambiguous_dna_by_name[table_name].forward_table
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
288
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
289 print(b_cod_table)
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
290
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
291 for cod in CodonTable.unambiguous_dna_by_name[table_name].stop_codons:
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
292 b_cod_table[cod] = "_Stop"
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
293
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
294 for cod in CodonTable.unambiguous_dna_by_name[table_name].start_codons:
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
295 b_cod_table[cod] = "_Start"
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
296
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
297 #pprint(b_cod_table)
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
298 codons = {}
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
299
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
300 aas = set(b_cod_table.values())
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
301
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
302 for aa in aas:
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
303 codons[aa] = []
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
304 for codon in b_cod_table.keys():
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
305 if b_cod_table[codon] == aa:
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
306 codons[aa].append(codon)
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
307
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
308 #break
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
309
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
310 synonims = {}
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
311
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
312 for el1 in codons.keys():
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
313 print(el1)
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
314 for el2 in codons[el1]:
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
315 print(el2)
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
316 synonims[el2] = codons[el1]
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
317 #synonims[el2] = []
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
318 #for el3 in codons[el1]#set.difference(set(codons[el1]), {el2}):
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
319 # print(el3)
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
320 # synonims[el2].append(el3)
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
321 #break
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
322 #break
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
323 #break
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
324
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
325
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
326 anti_codons = {}
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
327
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
328 for codon in synonims.keys():
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
329 tmp_codon = Bio.Seq.Seq(codon, IUPAC.unambiguous_dna)
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
330 tmp_anticodon = str(tmp_codon.reverse_complement())
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
331
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
332 anti_codons[tmp_anticodon] = []
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
333
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
334 for synonim in synonims[codon]:
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
335 tmp_synonim = Bio.Seq.Seq(synonim, IUPAC.unambiguous_dna)
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
336 tmp_antisynonim = str(tmp_synonim.reverse_complement())
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
337 anti_codons[tmp_anticodon].append(tmp_antisynonim)
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
338
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
339 check = Bio.Seq.Seq("CTT")
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
340 anti_check = check.reverse_complement()
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
341 print("\nCheck:\n" + str(check))
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
342 print("\nCodons:\n")
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
343
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
344 for key in codons.keys():
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
345 if str(check) in codons[key]:
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
346 print(codons[key])
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
347
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
348 #pprint(codons)
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
349 print("\nSynonims:\n")
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
350 pprint(synonims[str(check)])
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
351 print("\nAnti_Codons:\n")
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
352 pprint(anti_codons[str(anti_check)])
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
353
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
354 #i = synonims.keys()
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
355 #right = True
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
356 #while len(i) > 0:
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
357 # tmp = i.pop()
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
358 # check = Bio.Seq.Seq(tmp)
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
359 # anti_check = check.reverse_complement()
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
360
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
361
5397da1ef896 Uploaded
gianmarco_piccinno
parents:
diff changeset
362 return {"synonims":synonims, "anti_synonims":anti_codons}