cs_tool_project_rm: syngenic.py comparison

comparison syngenic.py @ 0:5397da1ef896 draft

Uploaded

author	gianmarco_piccinno
date	Tue, 21 May 2019 05:05:15 -0400
parents
children

comparison

equal deleted inserted replaced

--1:000000000000
+:5397da1ef896
+#!/usr/bin/env python
+__author__= "Gianmarco Piccinno"
+__version__ = "1.0.0"
+import Bio
+from Bio import SeqIO
+from Bio.Seq import Seq
+from Bio.Alphabet import IUPAC
+from Bio.Data import IUPACData
+from Bio.Data import CodonTable
+import re
+import sre_yield
+import re
+import itertools
+from functools import reduce
+import Bio
+from Bio import Data
+from Bio.Data import IUPACData
+from Bio.Data import CodonTable
+from pprint import pprint
+import pandas as pd
+def _check_bases(seq_string):
+"""
+Check characters in a string (PRIVATE).
+Remove digits and white space present in string. Allows any valid ambiguous
+IUPAC DNA single letters codes (ABCDGHKMNRSTVWY, upper case are converted).
+Other characters (e.g. symbols) trigger a TypeError.
+Returns the string WITH A LEADING SPACE (!). This is for backwards
+compatibility, and may in part be explained by the fact that
+Bio.Restriction doesn't use zero based counting.
+"""
+# Remove white space and make upper case:
+seq_string = "".join(seq_string.split()).upper()
+# Remove digits
+for c in "0123456789":
+seq_string = seq_string.replace(c, "")
+# Check only allowed IUPAC letters
+if not set(seq_string).issubset(set("ABCDGHKMNRSTVWY")):
+raise TypeError("Invalid character found in %s" % repr(seq_string))
+return " " + seq_string
+matching = {'A': 'ARWMHVDN', 'C': 'CYSMHBVN', 'G': 'GRSKBVDN',
+'T': 'TYWKHBDN', 'R': 'ABDGHKMNSRWV', 'Y': 'CBDHKMNSTWVY',
+'W': 'ABDHKMNRTWVY', 'S': 'CBDGHKMNSRVY', 'M': 'ACBDHMNSRWVY',
+'K': 'BDGHKNSRTWVY', 'H': 'ACBDHKMNSRTWVY',
+'B': 'CBDGHKMNSRTWVY', 'V': 'ACBDGHKMNSRWVY',
+'D': 'ABDGHKMNSRTWVY', 'N': 'ACBDGHKMNSRTWVY'}
+class pattern(object):
+def __init__(self, pattern_input):
+s = str(pattern_input)
+self.upper = s.isupper()
+self.data = _check_bases(s)
+self.pattern = s
+def plan_ambiguity(self):
+val = Bio.Data.IUPACData.ambiguous_dna_values
+re_pattern = ""
+for el in self.pattern:
+re_pattern = re_pattern + "[" + val[el] + "]"
+return re_pattern
+class annotated_genome(object):
+def __init__(self, seq):
+s = str(seq)
+self.upper = s.isupper()
+self.data = _check_bases(s)
+self.seq = s
+def codon_usage(self, codonTable):
+codon_usage = {}
+tmp = [x for x in re.split(r'(\w{3})', self.seq) if x != ""]
+b_cod_table = CodonTable.unambiguous_dna_by_name["Bacterial"].forward_table
+aas = set(b_cod_table.values())
+for aa in aas:
+codon_usage[aa] = {}
+for codon in b_cod_table.keys():
+if b_cod_table[codon] == aa:
+codon_usage[aa][codon] = tmp.count(codon)
+tups = {(outerKey, innerKey): values for outerKey, innerDict in codon_usage.iteritems() for innerKey, values in innerDict.iteritems()}
+codon_usage_ = pd.DataFrame(pd.Series(tups), columns = ["Count"])
+codon_usage_.index = codon_usage_.index.set_names(["AA", "Codon"])
+codon_usage_['Proportion'] = codon_usage_.groupby(level=0).transform(lambda x: (x / x.sum()).round(2))
+codon_usage_.reset_index(inplace=True)
+codon_usage_.index = codon_usage_["Codon"]
+return {"Dictionary": codon_usage, "Tuples": tups, "Table": codon_usage_}
+class plasmid(object):
+"""
+This class represents a circular plasmid
+"""
+def __init__(self, seq = "", circular=True, features = None):
+if type(seq) in [Bio.SeqRecord.SeqRecord, plasmid, Seq]:
+s = str(seq.seq)
+self.features = seq.features
+else:
+s = str(seq)
+if features != None:
+self.features = features
+self.upper = s.isupper()
+#self.data = _check_bases(s)
+self.data = s
+self.circular = circular
+self.Class = s.__class__
+self.size = len(s)
+self.seq = self.data
+def __len__(self):
+return len(self.data)
+def __repr__(self):
+return "plasmid(%s, circular=%s)" % (repr(self[:]),repr(self.circular))
+def __eq__(self, other):
+if isinstance(other, plasmid):
+if repr(self) == repr(other):
+return True
+else:
+return False
+return False
+def __getitem__(self, i):
+if self.upper:
+return self.Class((self.data[i]).upper())
+return self.Class(self.data[i])
+def sequence(self):
+return self.data
+def extract(self, feature):
+return self.data[feature.location.start:feature.location.end]#[::feature.strand]
+def findpattern(self, pattern, ambiguous_pattern):
+"""
+Return a list of a given pattern which occurs in the sequence.
+The list is made of tuple (location, pattern.group).
+The latter is used with non palindromic sites.
+Pattern is the regular expression pattern corresponding to the
+enzyme restriction site.
+Size is the size of the restriction enzyme recognition-site size.
+"""
+if not self.circular:
+data = self.data
+else:
+data = self.data + self.data[:len(ambiguous_pattern)]
+print(data)
+print(pattern)
+return [(data[i.start():i.end()], i.start(), i.end(), "innner") if (i.end()<=self.size) else (data[i.start():i.end()], i.start(), i.end()-self.size, "junction") for i in re.finditer(pattern, data)]
+def findpatterns(self, patterns, ambiguous_patterns):
+"""
+Return a list of a given pattern which occurs in the sequence.
+The list is made of tuple (location, pattern.group).
+The latter is used with non palindromic sites.
+Pattern is the regular expression pattern corresponding to the
+enzyme restriction site.
+Size is the size of the restriction enzyme recognition-site size.
+"""
+patts = {}
+for i in range(len(patterns)):
+#print(patterns[i])
+#print(ambiguous_patterns[i])
+if not self.circular:
+data = self.data
+else:
+data = self.data + self.data[:len(ambiguous_patterns[i])]
+#print(data)
+patts[ambiguous_patterns[i]] = [(data[j.start():j.end()], j.start(), j.end(), "innner") if (j.end()<=self.size) else (data[j.start():j.end()], j.start(), j.end()-self.size, "junction") for j in re.finditer(patterns[i], data)]
+return patts
+def codon_switch(self, patterns, annotation, codon_usage):
+table = {}
+for pattern in patterns:
+if pattern[1] >= annotation & pattern[2] <= annotation:
+# the pattern is in a coding sequence
+# do codon switch
+print("Do codon switch!")
+else:
+next
+return table
+def transversion(self, patterns):
+return
+def read_patterns(input):
+patterns = []
+with open(input, "rU") as handle:
+for pat in handle.readlines():
+tmp = Seq(pat.rstrip(), IUPAC.ambiguous_dna)
+patterns.append(tmp)
+#patterns.append(tmp.reverse_complement())
+return patterns
+def codon_usage(seqs, codonTable):
+codon_usage = {}
+tmp = [x for x in re.split(r'(\w{3})', seqs) if x != ""]
+b_cod_table = CodonTable.unambiguous_dna_by_name[codonTable].forward_table
+for cod in CodonTable.unambiguous_dna_by_name[codonTable].stop_codons:
+b_cod_table[cod] = "_Stop"
+for cod in CodonTable.unambiguous_dna_by_name[codonTable].start_codons:
+#print(cod)
+b_cod_table[cod] = b_cod_table[cod]
+aas = set(b_cod_table.values())
+for aa in aas:
+#print(aa)
+#codon_usage[aa] = {}
+for codon in b_cod_table.keys():
+if b_cod_table[codon] == aa:
+codon_usage[codon] = tmp.count(codon.split(" ")[0])
+return codon_usage
+def read_annotated_genome(data="example.fna", type_="fasta"):
+"""
+Accepted formats:
+- fasta (multifasta)
+- gbk
+"""
+seqs = ""
+if type_ == "fasta":
+with open(data, "rU") as handle:
+for record in SeqIO.parse(handle, type_):
+seqs = seqs + str(record.seq)
+elif type_ == "genbank":
+with open(data, "rU") as input_handle:
+types = []
+for record in SeqIO.parse(input_handle, "genbank"):
+for feature in record.features:
+types.append(feature.type)
+if feature.type == "CDS":
+if feature.location.strand == +1:
+seq = record.seq[feature.location.start:feature.location.end]
+seqs = seqs + str(seq)
+elif feature.location.strand == -1:
+seq = record.seq[feature.location.start:
+feature.location.end].reverse_complement()
+seqs = seqs + str(seq)
+return seqs
+def synonims_(table_name="Bacterial"):
+b_cod_table = CodonTable.unambiguous_dna_by_name[table_name].forward_table
+print(b_cod_table)
+for cod in CodonTable.unambiguous_dna_by_name[table_name].stop_codons:
+b_cod_table[cod] = "_Stop"
+for cod in CodonTable.unambiguous_dna_by_name[table_name].start_codons:
+b_cod_table[cod] = "_Start"
+#pprint(b_cod_table)
+codons = {}
+aas = set(b_cod_table.values())
+for aa in aas:
+codons[aa] = []
+for codon in b_cod_table.keys():
+if b_cod_table[codon] == aa:
+codons[aa].append(codon)
+#break
+synonims = {}
+for el1 in codons.keys():
+print(el1)
+for el2 in codons[el1]:
+print(el2)
+synonims[el2] = codons[el1]
+#synonims[el2] = []
+#for el3 in codons[el1]#set.difference(set(codons[el1]), {el2}):
+#    print(el3)
+#    synonims[el2].append(el3)
+#break
+#break
+#break
+anti_codons = {}
+for codon in synonims.keys():
+tmp_codon = Bio.Seq.Seq(codon, IUPAC.unambiguous_dna)
+tmp_anticodon = str(tmp_codon.reverse_complement())
+anti_codons[tmp_anticodon] = []
+for synonim in synonims[codon]:
+tmp_synonim = Bio.Seq.Seq(synonim, IUPAC.unambiguous_dna)
+tmp_antisynonim = str(tmp_synonim.reverse_complement())
+anti_codons[tmp_anticodon].append(tmp_antisynonim)
+check = Bio.Seq.Seq("CTT")
+anti_check = check.reverse_complement()
+print("\nCheck:\n" + str(check))
+print("\nCodons:\n")
+for key in codons.keys():
+if str(check) in codons[key]:
+print(codons[key])
+#pprint(codons)
+print("\nSynonims:\n")
+pprint(synonims[str(check)])
+print("\nAnti_Codons:\n")
+pprint(anti_codons[str(anti_check)])
+#i = synonims.keys()
+#right = True
+#while len(i) > 0:
+#    tmp = i.pop()
+#    check = Bio.Seq.Seq(tmp)
+#    anti_check = check.reverse_complement()
+return {"synonims":synonims, "anti_synonims":anti_codons}

Mercurial > repos > gianmarco_piccinno > cs_tool_project_rm

comparison syngenic.py @ 0:5397da1ef896 draft