Mercurial > repos > gianmarco_piccinno > cs_tool_project_rm
view syngenic.py @ 0:5397da1ef896 draft
Uploaded
author | gianmarco_piccinno |
---|---|
date | Tue, 21 May 2019 05:05:15 -0400 |
parents | |
children |
line wrap: on
line source
#!/usr/bin/env python __author__= "Gianmarco Piccinno" __version__ = "1.0.0" import Bio from Bio import SeqIO from Bio.Seq import Seq from Bio.Alphabet import IUPAC from Bio.Data import IUPACData from Bio.Data import CodonTable import re import sre_yield import re import itertools from functools import reduce import Bio from Bio import Data from Bio.Data import IUPACData from Bio.Data import CodonTable from pprint import pprint import pandas as pd def _check_bases(seq_string): """ Check characters in a string (PRIVATE). Remove digits and white space present in string. Allows any valid ambiguous IUPAC DNA single letters codes (ABCDGHKMNRSTVWY, upper case are converted). Other characters (e.g. symbols) trigger a TypeError. Returns the string WITH A LEADING SPACE (!). This is for backwards compatibility, and may in part be explained by the fact that Bio.Restriction doesn't use zero based counting. """ # Remove white space and make upper case: seq_string = "".join(seq_string.split()).upper() # Remove digits for c in "0123456789": seq_string = seq_string.replace(c, "") # Check only allowed IUPAC letters if not set(seq_string).issubset(set("ABCDGHKMNRSTVWY")): raise TypeError("Invalid character found in %s" % repr(seq_string)) return " " + seq_string matching = {'A': 'ARWMHVDN', 'C': 'CYSMHBVN', 'G': 'GRSKBVDN', 'T': 'TYWKHBDN', 'R': 'ABDGHKMNSRWV', 'Y': 'CBDHKMNSTWVY', 'W': 'ABDHKMNRTWVY', 'S': 'CBDGHKMNSRVY', 'M': 'ACBDHMNSRWVY', 'K': 'BDGHKNSRTWVY', 'H': 'ACBDHKMNSRTWVY', 'B': 'CBDGHKMNSRTWVY', 'V': 'ACBDGHKMNSRWVY', 'D': 'ABDGHKMNSRTWVY', 'N': 'ACBDGHKMNSRTWVY'} class pattern(object): def __init__(self, pattern_input): s = str(pattern_input) self.upper = s.isupper() self.data = _check_bases(s) self.pattern = s def plan_ambiguity(self): val = Bio.Data.IUPACData.ambiguous_dna_values re_pattern = "" for el in self.pattern: re_pattern = re_pattern + "[" + val[el] + "]" return re_pattern class annotated_genome(object): def __init__(self, seq): s = str(seq) self.upper = s.isupper() self.data = _check_bases(s) self.seq = s def codon_usage(self, codonTable): codon_usage = {} tmp = [x for x in re.split(r'(\w{3})', self.seq) if x != ""] b_cod_table = CodonTable.unambiguous_dna_by_name["Bacterial"].forward_table aas = set(b_cod_table.values()) for aa in aas: codon_usage[aa] = {} for codon in b_cod_table.keys(): if b_cod_table[codon] == aa: codon_usage[aa][codon] = tmp.count(codon) tups = {(outerKey, innerKey): values for outerKey, innerDict in codon_usage.iteritems() for innerKey, values in innerDict.iteritems()} codon_usage_ = pd.DataFrame(pd.Series(tups), columns = ["Count"]) codon_usage_.index = codon_usage_.index.set_names(["AA", "Codon"]) codon_usage_['Proportion'] = codon_usage_.groupby(level=0).transform(lambda x: (x / x.sum()).round(2)) codon_usage_.reset_index(inplace=True) codon_usage_.index = codon_usage_["Codon"] return {"Dictionary": codon_usage, "Tuples": tups, "Table": codon_usage_} class plasmid(object): """ This class represents a circular plasmid """ def __init__(self, seq = "", circular=True, features = None): if type(seq) in [Bio.SeqRecord.SeqRecord, plasmid, Seq]: s = str(seq.seq) self.features = seq.features else: s = str(seq) if features != None: self.features = features self.upper = s.isupper() #self.data = _check_bases(s) self.data = s self.circular = circular self.Class = s.__class__ self.size = len(s) self.seq = self.data def __len__(self): return len(self.data) def __repr__(self): return "plasmid(%s, circular=%s)" % (repr(self[:]),repr(self.circular)) def __eq__(self, other): if isinstance(other, plasmid): if repr(self) == repr(other): return True else: return False return False def __getitem__(self, i): if self.upper: return self.Class((self.data[i]).upper()) return self.Class(self.data[i]) def sequence(self): return self.data def extract(self, feature): return self.data[feature.location.start:feature.location.end]#[::feature.strand] def findpattern(self, pattern, ambiguous_pattern): """ Return a list of a given pattern which occurs in the sequence. The list is made of tuple (location, pattern.group). The latter is used with non palindromic sites. Pattern is the regular expression pattern corresponding to the enzyme restriction site. Size is the size of the restriction enzyme recognition-site size. """ if not self.circular: data = self.data else: data = self.data + self.data[:len(ambiguous_pattern)] print(data) print(pattern) return [(data[i.start():i.end()], i.start(), i.end(), "innner") if (i.end()<=self.size) else (data[i.start():i.end()], i.start(), i.end()-self.size, "junction") for i in re.finditer(pattern, data)] def findpatterns(self, patterns, ambiguous_patterns): """ Return a list of a given pattern which occurs in the sequence. The list is made of tuple (location, pattern.group). The latter is used with non palindromic sites. Pattern is the regular expression pattern corresponding to the enzyme restriction site. Size is the size of the restriction enzyme recognition-site size. """ patts = {} for i in range(len(patterns)): #print(patterns[i]) #print(ambiguous_patterns[i]) if not self.circular: data = self.data else: data = self.data + self.data[:len(ambiguous_patterns[i])] #print(data) patts[ambiguous_patterns[i]] = [(data[j.start():j.end()], j.start(), j.end(), "innner") if (j.end()<=self.size) else (data[j.start():j.end()], j.start(), j.end()-self.size, "junction") for j in re.finditer(patterns[i], data)] return patts def codon_switch(self, patterns, annotation, codon_usage): table = {} for pattern in patterns: if pattern[1] >= annotation & pattern[2] <= annotation: # the pattern is in a coding sequence # do codon switch print("Do codon switch!") else: next return table def transversion(self, patterns): return def read_patterns(input): patterns = [] with open(input, "rU") as handle: for pat in handle.readlines(): tmp = Seq(pat.rstrip(), IUPAC.ambiguous_dna) patterns.append(tmp) #patterns.append(tmp.reverse_complement()) return patterns def codon_usage(seqs, codonTable): codon_usage = {} tmp = [x for x in re.split(r'(\w{3})', seqs) if x != ""] b_cod_table = CodonTable.unambiguous_dna_by_name[codonTable].forward_table for cod in CodonTable.unambiguous_dna_by_name[codonTable].stop_codons: b_cod_table[cod] = "_Stop" for cod in CodonTable.unambiguous_dna_by_name[codonTable].start_codons: #print(cod) b_cod_table[cod] = b_cod_table[cod] aas = set(b_cod_table.values()) for aa in aas: #print(aa) #codon_usage[aa] = {} for codon in b_cod_table.keys(): if b_cod_table[codon] == aa: codon_usage[codon] = tmp.count(codon.split(" ")[0]) return codon_usage def read_annotated_genome(data="example.fna", type_="fasta"): """ Accepted formats: - fasta (multifasta) - gbk """ seqs = "" if type_ == "fasta": with open(data, "rU") as handle: for record in SeqIO.parse(handle, type_): seqs = seqs + str(record.seq) elif type_ == "genbank": with open(data, "rU") as input_handle: types = [] for record in SeqIO.parse(input_handle, "genbank"): for feature in record.features: types.append(feature.type) if feature.type == "CDS": if feature.location.strand == +1: seq = record.seq[feature.location.start:feature.location.end] seqs = seqs + str(seq) elif feature.location.strand == -1: seq = record.seq[feature.location.start: feature.location.end].reverse_complement() seqs = seqs + str(seq) return seqs def synonims_(table_name="Bacterial"): b_cod_table = CodonTable.unambiguous_dna_by_name[table_name].forward_table print(b_cod_table) for cod in CodonTable.unambiguous_dna_by_name[table_name].stop_codons: b_cod_table[cod] = "_Stop" for cod in CodonTable.unambiguous_dna_by_name[table_name].start_codons: b_cod_table[cod] = "_Start" #pprint(b_cod_table) codons = {} aas = set(b_cod_table.values()) for aa in aas: codons[aa] = [] for codon in b_cod_table.keys(): if b_cod_table[codon] == aa: codons[aa].append(codon) #break synonims = {} for el1 in codons.keys(): print(el1) for el2 in codons[el1]: print(el2) synonims[el2] = codons[el1] #synonims[el2] = [] #for el3 in codons[el1]#set.difference(set(codons[el1]), {el2}): # print(el3) # synonims[el2].append(el3) #break #break #break anti_codons = {} for codon in synonims.keys(): tmp_codon = Bio.Seq.Seq(codon, IUPAC.unambiguous_dna) tmp_anticodon = str(tmp_codon.reverse_complement()) anti_codons[tmp_anticodon] = [] for synonim in synonims[codon]: tmp_synonim = Bio.Seq.Seq(synonim, IUPAC.unambiguous_dna) tmp_antisynonim = str(tmp_synonim.reverse_complement()) anti_codons[tmp_anticodon].append(tmp_antisynonim) check = Bio.Seq.Seq("CTT") anti_check = check.reverse_complement() print("\nCheck:\n" + str(check)) print("\nCodons:\n") for key in codons.keys(): if str(check) in codons[key]: print(codons[key]) #pprint(codons) print("\nSynonims:\n") pprint(synonims[str(check)]) print("\nAnti_Codons:\n") pprint(anti_codons[str(anti_check)]) #i = synonims.keys() #right = True #while len(i) > 0: # tmp = i.pop() # check = Bio.Seq.Seq(tmp) # anti_check = check.reverse_complement() return {"synonims":synonims, "anti_synonims":anti_codons}