Mercurial > repos > gianmarco_piccinno > cs_tool_project_rm
diff syngenic.py @ 0:5397da1ef896 draft
Uploaded
author | gianmarco_piccinno |
---|---|
date | Tue, 21 May 2019 05:05:15 -0400 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/syngenic.py Tue May 21 05:05:15 2019 -0400 @@ -0,0 +1,362 @@ +#!/usr/bin/env python + +__author__= "Gianmarco Piccinno" +__version__ = "1.0.0" + +import Bio +from Bio import SeqIO +from Bio.Seq import Seq +from Bio.Alphabet import IUPAC +from Bio.Data import IUPACData +from Bio.Data import CodonTable +import re +import sre_yield + +import re +import itertools +from functools import reduce + +import Bio +from Bio import Data +from Bio.Data import IUPACData +from Bio.Data import CodonTable + +from pprint import pprint + +import pandas as pd + +def _check_bases(seq_string): + """ + Check characters in a string (PRIVATE). + Remove digits and white space present in string. Allows any valid ambiguous + IUPAC DNA single letters codes (ABCDGHKMNRSTVWY, upper case are converted). + + Other characters (e.g. symbols) trigger a TypeError. + + Returns the string WITH A LEADING SPACE (!). This is for backwards + compatibility, and may in part be explained by the fact that + Bio.Restriction doesn't use zero based counting. + """ + # Remove white space and make upper case: + seq_string = "".join(seq_string.split()).upper() + # Remove digits + for c in "0123456789": + seq_string = seq_string.replace(c, "") + # Check only allowed IUPAC letters + if not set(seq_string).issubset(set("ABCDGHKMNRSTVWY")): + raise TypeError("Invalid character found in %s" % repr(seq_string)) + return " " + seq_string + + matching = {'A': 'ARWMHVDN', 'C': 'CYSMHBVN', 'G': 'GRSKBVDN', + 'T': 'TYWKHBDN', 'R': 'ABDGHKMNSRWV', 'Y': 'CBDHKMNSTWVY', + 'W': 'ABDHKMNRTWVY', 'S': 'CBDGHKMNSRVY', 'M': 'ACBDHMNSRWVY', + 'K': 'BDGHKNSRTWVY', 'H': 'ACBDHKMNSRTWVY', + 'B': 'CBDGHKMNSRTWVY', 'V': 'ACBDGHKMNSRWVY', + 'D': 'ABDGHKMNSRTWVY', 'N': 'ACBDGHKMNSRTWVY'} + +class pattern(object): + + + def __init__(self, pattern_input): + s = str(pattern_input) + self.upper = s.isupper() + self.data = _check_bases(s) + self.pattern = s + + def plan_ambiguity(self): + val = Bio.Data.IUPACData.ambiguous_dna_values + re_pattern = "" + for el in self.pattern: + re_pattern = re_pattern + "[" + val[el] + "]" + return re_pattern + +class annotated_genome(object): + + def __init__(self, seq): + s = str(seq) + self.upper = s.isupper() + self.data = _check_bases(s) + self.seq = s + + def codon_usage(self, codonTable): + + codon_usage = {} + tmp = [x for x in re.split(r'(\w{3})', self.seq) if x != ""] + + b_cod_table = CodonTable.unambiguous_dna_by_name["Bacterial"].forward_table + aas = set(b_cod_table.values()) + + for aa in aas: + codon_usage[aa] = {} + for codon in b_cod_table.keys(): + if b_cod_table[codon] == aa: + codon_usage[aa][codon] = tmp.count(codon) + + tups = {(outerKey, innerKey): values for outerKey, innerDict in codon_usage.iteritems() for innerKey, values in innerDict.iteritems()} + + codon_usage_ = pd.DataFrame(pd.Series(tups), columns = ["Count"]) + codon_usage_.index = codon_usage_.index.set_names(["AA", "Codon"]) + codon_usage_['Proportion'] = codon_usage_.groupby(level=0).transform(lambda x: (x / x.sum()).round(2)) + + codon_usage_.reset_index(inplace=True) + codon_usage_.index = codon_usage_["Codon"] + + return {"Dictionary": codon_usage, "Tuples": tups, "Table": codon_usage_} + +class plasmid(object): + """ + This class represents a circular plasmid + """ + + def __init__(self, seq = "", circular=True, features = None): + + if type(seq) in [Bio.SeqRecord.SeqRecord, plasmid, Seq]: + s = str(seq.seq) + self.features = seq.features + else: + s = str(seq) + if features != None: + self.features = features + self.upper = s.isupper() + #self.data = _check_bases(s) + self.data = s + self.circular = circular + self.Class = s.__class__ + self.size = len(s) + self.seq = self.data + + def __len__(self): + return len(self.data) + + def __repr__(self): + return "plasmid(%s, circular=%s)" % (repr(self[:]),repr(self.circular)) + + def __eq__(self, other): + if isinstance(other, plasmid): + if repr(self) == repr(other): + return True + else: + return False + return False + + def __getitem__(self, i): + if self.upper: + return self.Class((self.data[i]).upper()) + return self.Class(self.data[i]) + + def sequence(self): + return self.data + + + def extract(self, feature): + return self.data[feature.location.start:feature.location.end]#[::feature.strand] + + + def findpattern(self, pattern, ambiguous_pattern): + """ + Return a list of a given pattern which occurs in the sequence. + The list is made of tuple (location, pattern.group). + The latter is used with non palindromic sites. + Pattern is the regular expression pattern corresponding to the + enzyme restriction site. + Size is the size of the restriction enzyme recognition-site size. + """ + if not self.circular: + data = self.data + else: + data = self.data + self.data[:len(ambiguous_pattern)] + print(data) + print(pattern) + return [(data[i.start():i.end()], i.start(), i.end(), "innner") if (i.end()<=self.size) else (data[i.start():i.end()], i.start(), i.end()-self.size, "junction") for i in re.finditer(pattern, data)] + + def findpatterns(self, patterns, ambiguous_patterns): + """ + Return a list of a given pattern which occurs in the sequence. + The list is made of tuple (location, pattern.group). + The latter is used with non palindromic sites. + Pattern is the regular expression pattern corresponding to the + enzyme restriction site. + Size is the size of the restriction enzyme recognition-site size. + """ + patts = {} + for i in range(len(patterns)): + #print(patterns[i]) + #print(ambiguous_patterns[i]) + if not self.circular: + data = self.data + else: + data = self.data + self.data[:len(ambiguous_patterns[i])] + #print(data) + patts[ambiguous_patterns[i]] = [(data[j.start():j.end()], j.start(), j.end(), "innner") if (j.end()<=self.size) else (data[j.start():j.end()], j.start(), j.end()-self.size, "junction") for j in re.finditer(patterns[i], data)] + + return patts + + + def codon_switch(self, patterns, annotation, codon_usage): + + table = {} + + for pattern in patterns: + if pattern[1] >= annotation & pattern[2] <= annotation: + # the pattern is in a coding sequence + # do codon switch + print("Do codon switch!") + else: + next + return table + + def transversion(self, patterns): + return + +def read_patterns(input): + + patterns = [] + + with open(input, "rU") as handle: + for pat in handle.readlines(): + + tmp = Seq(pat.rstrip(), IUPAC.ambiguous_dna) + + patterns.append(tmp) + #patterns.append(tmp.reverse_complement()) + + + return patterns + + +def codon_usage(seqs, codonTable): + + codon_usage = {} + tmp = [x for x in re.split(r'(\w{3})', seqs) if x != ""] + + b_cod_table = CodonTable.unambiguous_dna_by_name[codonTable].forward_table + + for cod in CodonTable.unambiguous_dna_by_name[codonTable].stop_codons: + b_cod_table[cod] = "_Stop" + + for cod in CodonTable.unambiguous_dna_by_name[codonTable].start_codons: + #print(cod) + b_cod_table[cod] = b_cod_table[cod] + + aas = set(b_cod_table.values()) + + for aa in aas: + #print(aa) + #codon_usage[aa] = {} + for codon in b_cod_table.keys(): + if b_cod_table[codon] == aa: + codon_usage[codon] = tmp.count(codon.split(" ")[0]) + + return codon_usage + + +def read_annotated_genome(data="example.fna", type_="fasta"): + """ + Accepted formats: + - fasta (multifasta) + - gbk + + """ + + seqs = "" + + if type_ == "fasta": + with open(data, "rU") as handle: + for record in SeqIO.parse(handle, type_): + seqs = seqs + str(record.seq) + + elif type_ == "genbank": + with open(data, "rU") as input_handle: + types = [] + for record in SeqIO.parse(input_handle, "genbank"): + for feature in record.features: + types.append(feature.type) + if feature.type == "CDS": + if feature.location.strand == +1: + seq = record.seq[feature.location.start:feature.location.end] + seqs = seqs + str(seq) + elif feature.location.strand == -1: + seq = record.seq[feature.location.start: + feature.location.end].reverse_complement() + seqs = seqs + str(seq) + return seqs + + +def synonims_(table_name="Bacterial"): + + b_cod_table = CodonTable.unambiguous_dna_by_name[table_name].forward_table + + print(b_cod_table) + + for cod in CodonTable.unambiguous_dna_by_name[table_name].stop_codons: + b_cod_table[cod] = "_Stop" + + for cod in CodonTable.unambiguous_dna_by_name[table_name].start_codons: + b_cod_table[cod] = "_Start" + + #pprint(b_cod_table) + codons = {} + + aas = set(b_cod_table.values()) + + for aa in aas: + codons[aa] = [] + for codon in b_cod_table.keys(): + if b_cod_table[codon] == aa: + codons[aa].append(codon) + + #break + + synonims = {} + + for el1 in codons.keys(): + print(el1) + for el2 in codons[el1]: + print(el2) + synonims[el2] = codons[el1] + #synonims[el2] = [] + #for el3 in codons[el1]#set.difference(set(codons[el1]), {el2}): + # print(el3) + # synonims[el2].append(el3) + #break + #break + #break + + + anti_codons = {} + + for codon in synonims.keys(): + tmp_codon = Bio.Seq.Seq(codon, IUPAC.unambiguous_dna) + tmp_anticodon = str(tmp_codon.reverse_complement()) + + anti_codons[tmp_anticodon] = [] + + for synonim in synonims[codon]: + tmp_synonim = Bio.Seq.Seq(synonim, IUPAC.unambiguous_dna) + tmp_antisynonim = str(tmp_synonim.reverse_complement()) + anti_codons[tmp_anticodon].append(tmp_antisynonim) + + check = Bio.Seq.Seq("CTT") + anti_check = check.reverse_complement() + print("\nCheck:\n" + str(check)) + print("\nCodons:\n") + + for key in codons.keys(): + if str(check) in codons[key]: + print(codons[key]) + + #pprint(codons) + print("\nSynonims:\n") + pprint(synonims[str(check)]) + print("\nAnti_Codons:\n") + pprint(anti_codons[str(anti_check)]) + + #i = synonims.keys() + #right = True + #while len(i) > 0: + # tmp = i.pop() + # check = Bio.Seq.Seq(tmp) + # anti_check = check.reverse_complement() + + + return {"synonims":synonims, "anti_synonims":anti_codons}