diff syngenic.py @ 0:5397da1ef896 draft

Uploaded
author gianmarco_piccinno
date Tue, 21 May 2019 05:05:15 -0400
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/syngenic.py	Tue May 21 05:05:15 2019 -0400
@@ -0,0 +1,362 @@
+#!/usr/bin/env python
+
+__author__= "Gianmarco Piccinno"
+__version__ = "1.0.0"
+
+import Bio
+from Bio import SeqIO
+from Bio.Seq import Seq
+from Bio.Alphabet import IUPAC
+from Bio.Data import IUPACData
+from Bio.Data import CodonTable
+import re
+import sre_yield
+
+import re
+import itertools
+from functools import reduce
+
+import Bio
+from Bio import Data
+from Bio.Data import IUPACData
+from Bio.Data import CodonTable
+
+from pprint import pprint
+
+import pandas as pd
+
+def _check_bases(seq_string):
+    """
+    Check characters in a string (PRIVATE).
+    Remove digits and white space present in string. Allows any valid ambiguous
+    IUPAC DNA single letters codes (ABCDGHKMNRSTVWY, upper case are converted).
+
+    Other characters (e.g. symbols) trigger a TypeError.
+
+    Returns the string WITH A LEADING SPACE (!). This is for backwards
+    compatibility, and may in part be explained by the fact that
+    Bio.Restriction doesn't use zero based counting.
+    """
+    # Remove white space and make upper case:
+    seq_string = "".join(seq_string.split()).upper()
+    # Remove digits
+    for c in "0123456789":
+        seq_string = seq_string.replace(c, "")
+    # Check only allowed IUPAC letters
+    if not set(seq_string).issubset(set("ABCDGHKMNRSTVWY")):
+        raise TypeError("Invalid character found in %s" % repr(seq_string))
+    return " " + seq_string
+
+    matching = {'A': 'ARWMHVDN', 'C': 'CYSMHBVN', 'G': 'GRSKBVDN',
+                'T': 'TYWKHBDN', 'R': 'ABDGHKMNSRWV', 'Y': 'CBDHKMNSTWVY',
+                'W': 'ABDHKMNRTWVY', 'S': 'CBDGHKMNSRVY', 'M': 'ACBDHMNSRWVY',
+                'K': 'BDGHKNSRTWVY', 'H': 'ACBDHKMNSRTWVY',
+                'B': 'CBDGHKMNSRTWVY', 'V': 'ACBDGHKMNSRWVY',
+                'D': 'ABDGHKMNSRTWVY', 'N': 'ACBDGHKMNSRTWVY'}
+
+class pattern(object):
+
+
+    def __init__(self, pattern_input):
+        s = str(pattern_input)
+        self.upper = s.isupper()
+        self.data = _check_bases(s)
+        self.pattern = s
+
+    def plan_ambiguity(self):
+        val = Bio.Data.IUPACData.ambiguous_dna_values
+        re_pattern = ""
+        for el in self.pattern:
+            re_pattern = re_pattern + "[" + val[el] + "]"
+        return re_pattern
+
+class annotated_genome(object):
+
+    def __init__(self, seq):
+        s = str(seq)
+        self.upper = s.isupper()
+        self.data = _check_bases(s)
+        self.seq = s
+
+    def codon_usage(self, codonTable):
+
+        codon_usage = {}
+        tmp = [x for x in re.split(r'(\w{3})', self.seq) if x != ""]
+
+        b_cod_table = CodonTable.unambiguous_dna_by_name["Bacterial"].forward_table
+        aas = set(b_cod_table.values())
+
+        for aa in aas:
+            codon_usage[aa] = {}
+            for codon in b_cod_table.keys():
+                if b_cod_table[codon] == aa:
+                    codon_usage[aa][codon] = tmp.count(codon)
+
+        tups = {(outerKey, innerKey): values for outerKey, innerDict in codon_usage.iteritems() for innerKey, values in innerDict.iteritems()}
+
+        codon_usage_ = pd.DataFrame(pd.Series(tups), columns = ["Count"])
+        codon_usage_.index = codon_usage_.index.set_names(["AA", "Codon"])
+        codon_usage_['Proportion'] = codon_usage_.groupby(level=0).transform(lambda x: (x / x.sum()).round(2))
+
+        codon_usage_.reset_index(inplace=True)
+        codon_usage_.index = codon_usage_["Codon"]
+
+        return {"Dictionary": codon_usage, "Tuples": tups, "Table": codon_usage_}
+
+class plasmid(object):
+    """
+    This class represents a circular plasmid
+    """
+
+    def __init__(self, seq = "", circular=True, features = None):
+
+        if type(seq) in [Bio.SeqRecord.SeqRecord, plasmid, Seq]:
+            s = str(seq.seq)
+            self.features = seq.features
+        else:
+            s = str(seq)
+            if features != None:
+                self.features = features
+        self.upper = s.isupper()
+        #self.data = _check_bases(s)
+        self.data = s
+        self.circular = circular
+        self.Class = s.__class__
+        self.size = len(s)
+        self.seq = self.data
+
+    def __len__(self):
+        return len(self.data)
+
+    def __repr__(self):
+        return "plasmid(%s, circular=%s)" % (repr(self[:]),repr(self.circular))
+
+    def __eq__(self, other):
+        if isinstance(other, plasmid):
+            if repr(self) == repr(other):
+                return True
+            else:
+                return False
+        return False
+
+    def __getitem__(self, i):
+        if self.upper:
+            return self.Class((self.data[i]).upper())
+        return self.Class(self.data[i])
+
+    def sequence(self):
+        return self.data
+
+
+    def extract(self, feature):
+        return self.data[feature.location.start:feature.location.end]#[::feature.strand]
+
+
+    def findpattern(self, pattern, ambiguous_pattern):
+        """
+        Return a list of a given pattern which occurs in the sequence.
+        The list is made of tuple (location, pattern.group).
+        The latter is used with non palindromic sites.
+        Pattern is the regular expression pattern corresponding to the
+        enzyme restriction site.
+        Size is the size of the restriction enzyme recognition-site size.
+        """
+        if not self.circular:
+            data = self.data
+        else:
+            data = self.data + self.data[:len(ambiguous_pattern)]
+            print(data)
+            print(pattern)
+        return [(data[i.start():i.end()], i.start(), i.end(), "innner") if (i.end()<=self.size) else (data[i.start():i.end()], i.start(), i.end()-self.size, "junction") for i in re.finditer(pattern, data)]
+
+    def findpatterns(self, patterns, ambiguous_patterns):
+        """
+        Return a list of a given pattern which occurs in the sequence.
+        The list is made of tuple (location, pattern.group).
+        The latter is used with non palindromic sites.
+        Pattern is the regular expression pattern corresponding to the
+        enzyme restriction site.
+        Size is the size of the restriction enzyme recognition-site size.
+        """
+        patts = {}
+        for i in range(len(patterns)):
+            #print(patterns[i])
+            #print(ambiguous_patterns[i])
+            if not self.circular:
+                data = self.data
+            else:
+                data = self.data + self.data[:len(ambiguous_patterns[i])]
+                #print(data)
+            patts[ambiguous_patterns[i]] = [(data[j.start():j.end()], j.start(), j.end(), "innner") if (j.end()<=self.size) else (data[j.start():j.end()], j.start(), j.end()-self.size, "junction") for j in re.finditer(patterns[i], data)]
+
+        return patts
+
+
+    def codon_switch(self, patterns, annotation, codon_usage):
+
+        table = {}
+
+        for pattern in patterns:
+            if pattern[1] >= annotation & pattern[2] <= annotation:
+                # the pattern is in a coding sequence
+                # do codon switch
+                print("Do codon switch!")
+            else:
+                next
+        return table
+
+    def transversion(self, patterns):
+        return
+
+def read_patterns(input):
+
+    patterns = []
+
+    with open(input, "rU") as handle:
+        for pat in handle.readlines():
+
+            tmp = Seq(pat.rstrip(), IUPAC.ambiguous_dna)
+
+            patterns.append(tmp)
+            #patterns.append(tmp.reverse_complement())
+
+
+    return patterns
+
+
+def codon_usage(seqs, codonTable):
+
+    codon_usage = {}
+    tmp = [x for x in re.split(r'(\w{3})', seqs) if x != ""]
+
+    b_cod_table = CodonTable.unambiguous_dna_by_name[codonTable].forward_table
+
+    for cod in CodonTable.unambiguous_dna_by_name[codonTable].stop_codons:
+        b_cod_table[cod] = "_Stop"
+
+    for cod in CodonTable.unambiguous_dna_by_name[codonTable].start_codons:
+        #print(cod)
+        b_cod_table[cod] = b_cod_table[cod]
+
+    aas = set(b_cod_table.values())
+
+    for aa in aas:
+        #print(aa)
+        #codon_usage[aa] = {}
+        for codon in b_cod_table.keys():
+            if b_cod_table[codon] == aa:
+                codon_usage[codon] = tmp.count(codon.split(" ")[0])
+
+    return codon_usage
+
+
+def read_annotated_genome(data="example.fna", type_="fasta"):
+    """
+    Accepted formats:
+        - fasta (multifasta)
+        - gbk
+
+    """
+
+    seqs = ""
+
+    if type_ == "fasta":
+        with open(data, "rU") as handle:
+            for record in SeqIO.parse(handle, type_):
+                seqs = seqs + str(record.seq)
+
+    elif type_ == "genbank":
+        with open(data, "rU") as input_handle:
+            types = []
+            for record in SeqIO.parse(input_handle, "genbank"):
+                for feature in record.features:
+                    types.append(feature.type)
+                    if feature.type == "CDS":
+                        if feature.location.strand == +1:
+                            seq = record.seq[feature.location.start:feature.location.end]
+                            seqs = seqs + str(seq)
+                        elif feature.location.strand == -1:
+                            seq = record.seq[feature.location.start:
+                                             feature.location.end].reverse_complement()
+                            seqs = seqs + str(seq)
+    return seqs
+
+
+def synonims_(table_name="Bacterial"):
+
+    b_cod_table = CodonTable.unambiguous_dna_by_name[table_name].forward_table
+
+    print(b_cod_table)
+
+    for cod in CodonTable.unambiguous_dna_by_name[table_name].stop_codons:
+        b_cod_table[cod] = "_Stop"
+
+    for cod in CodonTable.unambiguous_dna_by_name[table_name].start_codons:
+        b_cod_table[cod] = "_Start"
+
+    #pprint(b_cod_table)
+    codons = {}
+
+    aas = set(b_cod_table.values())
+
+    for aa in aas:
+        codons[aa] = []
+        for codon in b_cod_table.keys():
+            if b_cod_table[codon] == aa:
+                codons[aa].append(codon)
+
+        #break
+
+    synonims = {}
+
+    for el1 in codons.keys():
+        print(el1)
+        for el2 in codons[el1]:
+            print(el2)
+            synonims[el2] = codons[el1]
+            #synonims[el2] = []
+            #for el3 in codons[el1]#set.difference(set(codons[el1]), {el2}):
+            #    print(el3)
+            #    synonims[el2].append(el3)
+                #break
+            #break
+        #break
+
+
+    anti_codons = {}
+
+    for codon in synonims.keys():
+            tmp_codon = Bio.Seq.Seq(codon, IUPAC.unambiguous_dna)
+            tmp_anticodon = str(tmp_codon.reverse_complement())
+
+            anti_codons[tmp_anticodon] = []
+
+            for synonim in synonims[codon]:
+                    tmp_synonim = Bio.Seq.Seq(synonim, IUPAC.unambiguous_dna)
+                    tmp_antisynonim = str(tmp_synonim.reverse_complement())
+                    anti_codons[tmp_anticodon].append(tmp_antisynonim)
+
+    check = Bio.Seq.Seq("CTT")
+    anti_check = check.reverse_complement()
+    print("\nCheck:\n" + str(check))
+    print("\nCodons:\n")
+
+    for key in codons.keys():
+        if str(check) in codons[key]:
+            print(codons[key])
+
+    #pprint(codons)
+    print("\nSynonims:\n")
+    pprint(synonims[str(check)])
+    print("\nAnti_Codons:\n")
+    pprint(anti_codons[str(anti_check)])
+
+    #i = synonims.keys()
+    #right = True
+    #while len(i) > 0:
+    #    tmp = i.pop()
+    #    check = Bio.Seq.Seq(tmp)
+    #    anti_check = check.reverse_complement()
+
+
+    return {"synonims":synonims, "anti_synonims":anti_codons}