view syngenic.py @ 0:5397da1ef896 draft

Uploaded
author gianmarco_piccinno
date Tue, 21 May 2019 05:05:15 -0400
parents
children
line wrap: on
line source

#!/usr/bin/env python

__author__= "Gianmarco Piccinno"
__version__ = "1.0.0"

import Bio
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.Alphabet import IUPAC
from Bio.Data import IUPACData
from Bio.Data import CodonTable
import re
import sre_yield

import re
import itertools
from functools import reduce

import Bio
from Bio import Data
from Bio.Data import IUPACData
from Bio.Data import CodonTable

from pprint import pprint

import pandas as pd

def _check_bases(seq_string):
    """
    Check characters in a string (PRIVATE).
    Remove digits and white space present in string. Allows any valid ambiguous
    IUPAC DNA single letters codes (ABCDGHKMNRSTVWY, upper case are converted).

    Other characters (e.g. symbols) trigger a TypeError.

    Returns the string WITH A LEADING SPACE (!). This is for backwards
    compatibility, and may in part be explained by the fact that
    Bio.Restriction doesn't use zero based counting.
    """
    # Remove white space and make upper case:
    seq_string = "".join(seq_string.split()).upper()
    # Remove digits
    for c in "0123456789":
        seq_string = seq_string.replace(c, "")
    # Check only allowed IUPAC letters
    if not set(seq_string).issubset(set("ABCDGHKMNRSTVWY")):
        raise TypeError("Invalid character found in %s" % repr(seq_string))
    return " " + seq_string

    matching = {'A': 'ARWMHVDN', 'C': 'CYSMHBVN', 'G': 'GRSKBVDN',
                'T': 'TYWKHBDN', 'R': 'ABDGHKMNSRWV', 'Y': 'CBDHKMNSTWVY',
                'W': 'ABDHKMNRTWVY', 'S': 'CBDGHKMNSRVY', 'M': 'ACBDHMNSRWVY',
                'K': 'BDGHKNSRTWVY', 'H': 'ACBDHKMNSRTWVY',
                'B': 'CBDGHKMNSRTWVY', 'V': 'ACBDGHKMNSRWVY',
                'D': 'ABDGHKMNSRTWVY', 'N': 'ACBDGHKMNSRTWVY'}

class pattern(object):


    def __init__(self, pattern_input):
        s = str(pattern_input)
        self.upper = s.isupper()
        self.data = _check_bases(s)
        self.pattern = s

    def plan_ambiguity(self):
        val = Bio.Data.IUPACData.ambiguous_dna_values
        re_pattern = ""
        for el in self.pattern:
            re_pattern = re_pattern + "[" + val[el] + "]"
        return re_pattern

class annotated_genome(object):

    def __init__(self, seq):
        s = str(seq)
        self.upper = s.isupper()
        self.data = _check_bases(s)
        self.seq = s

    def codon_usage(self, codonTable):

        codon_usage = {}
        tmp = [x for x in re.split(r'(\w{3})', self.seq) if x != ""]

        b_cod_table = CodonTable.unambiguous_dna_by_name["Bacterial"].forward_table
        aas = set(b_cod_table.values())

        for aa in aas:
            codon_usage[aa] = {}
            for codon in b_cod_table.keys():
                if b_cod_table[codon] == aa:
                    codon_usage[aa][codon] = tmp.count(codon)

        tups = {(outerKey, innerKey): values for outerKey, innerDict in codon_usage.iteritems() for innerKey, values in innerDict.iteritems()}

        codon_usage_ = pd.DataFrame(pd.Series(tups), columns = ["Count"])
        codon_usage_.index = codon_usage_.index.set_names(["AA", "Codon"])
        codon_usage_['Proportion'] = codon_usage_.groupby(level=0).transform(lambda x: (x / x.sum()).round(2))

        codon_usage_.reset_index(inplace=True)
        codon_usage_.index = codon_usage_["Codon"]

        return {"Dictionary": codon_usage, "Tuples": tups, "Table": codon_usage_}

class plasmid(object):
    """
    This class represents a circular plasmid
    """

    def __init__(self, seq = "", circular=True, features = None):

        if type(seq) in [Bio.SeqRecord.SeqRecord, plasmid, Seq]:
            s = str(seq.seq)
            self.features = seq.features
        else:
            s = str(seq)
            if features != None:
                self.features = features
        self.upper = s.isupper()
        #self.data = _check_bases(s)
        self.data = s
        self.circular = circular
        self.Class = s.__class__
        self.size = len(s)
        self.seq = self.data

    def __len__(self):
        return len(self.data)

    def __repr__(self):
        return "plasmid(%s, circular=%s)" % (repr(self[:]),repr(self.circular))

    def __eq__(self, other):
        if isinstance(other, plasmid):
            if repr(self) == repr(other):
                return True
            else:
                return False
        return False

    def __getitem__(self, i):
        if self.upper:
            return self.Class((self.data[i]).upper())
        return self.Class(self.data[i])

    def sequence(self):
        return self.data


    def extract(self, feature):
        return self.data[feature.location.start:feature.location.end]#[::feature.strand]


    def findpattern(self, pattern, ambiguous_pattern):
        """
        Return a list of a given pattern which occurs in the sequence.
        The list is made of tuple (location, pattern.group).
        The latter is used with non palindromic sites.
        Pattern is the regular expression pattern corresponding to the
        enzyme restriction site.
        Size is the size of the restriction enzyme recognition-site size.
        """
        if not self.circular:
            data = self.data
        else:
            data = self.data + self.data[:len(ambiguous_pattern)]
            print(data)
            print(pattern)
        return [(data[i.start():i.end()], i.start(), i.end(), "innner") if (i.end()<=self.size) else (data[i.start():i.end()], i.start(), i.end()-self.size, "junction") for i in re.finditer(pattern, data)]

    def findpatterns(self, patterns, ambiguous_patterns):
        """
        Return a list of a given pattern which occurs in the sequence.
        The list is made of tuple (location, pattern.group).
        The latter is used with non palindromic sites.
        Pattern is the regular expression pattern corresponding to the
        enzyme restriction site.
        Size is the size of the restriction enzyme recognition-site size.
        """
        patts = {}
        for i in range(len(patterns)):
            #print(patterns[i])
            #print(ambiguous_patterns[i])
            if not self.circular:
                data = self.data
            else:
                data = self.data + self.data[:len(ambiguous_patterns[i])]
                #print(data)
            patts[ambiguous_patterns[i]] = [(data[j.start():j.end()], j.start(), j.end(), "innner") if (j.end()<=self.size) else (data[j.start():j.end()], j.start(), j.end()-self.size, "junction") for j in re.finditer(patterns[i], data)]

        return patts


    def codon_switch(self, patterns, annotation, codon_usage):

        table = {}

        for pattern in patterns:
            if pattern[1] >= annotation & pattern[2] <= annotation:
                # the pattern is in a coding sequence
                # do codon switch
                print("Do codon switch!")
            else:
                next
        return table

    def transversion(self, patterns):
        return

def read_patterns(input):

    patterns = []

    with open(input, "rU") as handle:
        for pat in handle.readlines():

            tmp = Seq(pat.rstrip(), IUPAC.ambiguous_dna)

            patterns.append(tmp)
            #patterns.append(tmp.reverse_complement())


    return patterns


def codon_usage(seqs, codonTable):

    codon_usage = {}
    tmp = [x for x in re.split(r'(\w{3})', seqs) if x != ""]

    b_cod_table = CodonTable.unambiguous_dna_by_name[codonTable].forward_table

    for cod in CodonTable.unambiguous_dna_by_name[codonTable].stop_codons:
        b_cod_table[cod] = "_Stop"

    for cod in CodonTable.unambiguous_dna_by_name[codonTable].start_codons:
        #print(cod)
        b_cod_table[cod] = b_cod_table[cod]

    aas = set(b_cod_table.values())

    for aa in aas:
        #print(aa)
        #codon_usage[aa] = {}
        for codon in b_cod_table.keys():
            if b_cod_table[codon] == aa:
                codon_usage[codon] = tmp.count(codon.split(" ")[0])

    return codon_usage


def read_annotated_genome(data="example.fna", type_="fasta"):
    """
    Accepted formats:
        - fasta (multifasta)
        - gbk

    """

    seqs = ""

    if type_ == "fasta":
        with open(data, "rU") as handle:
            for record in SeqIO.parse(handle, type_):
                seqs = seqs + str(record.seq)

    elif type_ == "genbank":
        with open(data, "rU") as input_handle:
            types = []
            for record in SeqIO.parse(input_handle, "genbank"):
                for feature in record.features:
                    types.append(feature.type)
                    if feature.type == "CDS":
                        if feature.location.strand == +1:
                            seq = record.seq[feature.location.start:feature.location.end]
                            seqs = seqs + str(seq)
                        elif feature.location.strand == -1:
                            seq = record.seq[feature.location.start:
                                             feature.location.end].reverse_complement()
                            seqs = seqs + str(seq)
    return seqs


def synonims_(table_name="Bacterial"):

    b_cod_table = CodonTable.unambiguous_dna_by_name[table_name].forward_table

    print(b_cod_table)

    for cod in CodonTable.unambiguous_dna_by_name[table_name].stop_codons:
        b_cod_table[cod] = "_Stop"

    for cod in CodonTable.unambiguous_dna_by_name[table_name].start_codons:
        b_cod_table[cod] = "_Start"

    #pprint(b_cod_table)
    codons = {}

    aas = set(b_cod_table.values())

    for aa in aas:
        codons[aa] = []
        for codon in b_cod_table.keys():
            if b_cod_table[codon] == aa:
                codons[aa].append(codon)

        #break

    synonims = {}

    for el1 in codons.keys():
        print(el1)
        for el2 in codons[el1]:
            print(el2)
            synonims[el2] = codons[el1]
            #synonims[el2] = []
            #for el3 in codons[el1]#set.difference(set(codons[el1]), {el2}):
            #    print(el3)
            #    synonims[el2].append(el3)
                #break
            #break
        #break


    anti_codons = {}

    for codon in synonims.keys():
            tmp_codon = Bio.Seq.Seq(codon, IUPAC.unambiguous_dna)
            tmp_anticodon = str(tmp_codon.reverse_complement())

            anti_codons[tmp_anticodon] = []

            for synonim in synonims[codon]:
                    tmp_synonim = Bio.Seq.Seq(synonim, IUPAC.unambiguous_dna)
                    tmp_antisynonim = str(tmp_synonim.reverse_complement())
                    anti_codons[tmp_anticodon].append(tmp_antisynonim)

    check = Bio.Seq.Seq("CTT")
    anti_check = check.reverse_complement()
    print("\nCheck:\n" + str(check))
    print("\nCodons:\n")

    for key in codons.keys():
        if str(check) in codons[key]:
            print(codons[key])

    #pprint(codons)
    print("\nSynonims:\n")
    pprint(synonims[str(check)])
    print("\nAnti_Codons:\n")
    pprint(anti_codons[str(anti_check)])

    #i = synonims.keys()
    #right = True
    #while len(i) > 0:
    #    tmp = i.pop()
    #    check = Bio.Seq.Seq(tmp)
    #    anti_check = check.reverse_complement()


    return {"synonims":synonims, "anti_synonims":anti_codons}