# Source: Mercurial > repos > cpt > cpt_sar_finder

#!/usr/bin/env python

import sys
import argparse
import os
import re
from Bio import SeqIO


class CheckSequence:
    """
    SAR endolysin verification class.

    Wraps a single protein record and screens it in stages (length filter,
    hydrophobic-stretch scan, redundancy pruning) so that only the best SAR
    endolysin candidates remain.

    Hits are accumulated in ``self.store``. Every scan appends to that same
    dictionary, so run the scan (directly or through ``shrink_results``) once
    per instance; repeated calls would record the same windows again.
    """

    # Residues accepted inside the hydrophobic core of a candidate SAR domain.
    HYDROPHOBIC_RESIDUES = "FIWLVMYCATGSP"
    # Only this many leading residues of the protein are scanned.
    SCAN_WINDOW = 50

    def __init__(self, protein_name, protein_data):
        """
        Args:
            protein_name: identifier used as the key in the hits dictionary.
            protein_data: record exposing ``.seq`` and ``.description``
                (presumably a Biopython SeqRecord -- confirm against caller).
        """
        self.name = protein_name
        self.seq = protein_data.seq
        self.description = protein_data.description
        self.size = len(self.seq)
        self.store = {}

    def check_sizes(self, min, max):
        """
        Check the peptide length against inclusive minimum/maximum bounds.

        Prints "too small" / "too large" when the length is out of range,
        otherwise prints the name and sequence.

        Returns:
            True when ``min <= size <= max``, False otherwise.
        """
        # NOTE: ``min``/``max`` shadow the builtins inside this method; the
        # names are kept because callers may pass them by keyword.
        if self.size < min:
            print("too small")
            return False
        if self.size > max:
            print("too large")
            return False
        print(f"{self.name} : {self.seq}")
        return True

    @classmethod
    def _is_sar_window(cls, sar_seq):
        """Return True if the window is all-hydrophobic, allowing one terminal K "snorkel"."""

        def hydrophobic(stretch):
            return all(aa in cls.HYDROPHOBIC_RESIDUES for aa in stretch)

        if hydrophobic(sar_seq):
            return True
        # An empty window is trivially hydrophobic, so sar_seq is non-empty here.
        if sar_seq[0] == "K" and hydrophobic(sar_seq[1:]):  # frontend snorkel
            return True
        return sar_seq[-1] == "K" and hydrophobic(sar_seq[:-1])  # backend snorkel

    def check_hydrophobicity_and_charge(
        self, sar_min=15, sar_max=20, perc_residues="SGAT"
    ):
        """
        Scan the start of the sequence for hydrophobic stretches.

        Every window of length ``sar_size`` inside the first ``SCAN_WINDOW``
        residues is tested; a window qualifies when all residues are
        hydrophobic, or when all but a single leading/trailing lysine are.
        Each qualifying window is annotated by ``rep_funcs`` and recorded via
        ``storage_dict`` under ``hits[self.name]["TMD_<sar_size>"]``.

        Args:
            sar_min: smallest window length tried (inclusive).
            sar_max: upper bound of the window lengths (exclusive).
            perc_residues: residues whose percentage within each window is
                reported.

        Returns:
            ``self.store`` (the same dictionary object, mutated in place).
        """
        hits = self.store
        pos_res = "RK"
        neg_res = "DE"

        if self.size > self.SCAN_WINDOW:
            seq = self.seq[0 : self.SCAN_WINDOW]
        else:
            seq = self.seq

        # NOTE(review): ``sar_max`` is exclusive here while shrink_results
        # iterates it inclusively, and the final window (ending on the last
        # scanned residue) is never tested. Both look like off-by-one slips
        # but are kept because fixing them would change reported hits.
        for sar_size in range(sar_min, sar_max):
            for i in range(len(seq) - sar_size):
                sar_seq = str(seq[i : i + sar_size])
                if not self._is_sar_window(sar_seq):
                    continue
                (
                    charge_seq,
                    charge,
                    perc_cont,
                    sar_coords,
                    nterm_coords,
                    cterm_coords,
                    sar_start,
                    sar_end,
                ) = rep_funcs(
                    self, seq, i, pos_res, neg_res, sar_seq, perc_residues, sar_size
                )
                storage_dict(
                    self=self,
                    sar_size=sar_size,
                    sar_seq=sar_seq,
                    hits=hits,
                    charge_seq=charge_seq,
                    charge=charge,
                    perc_cont=perc_cont,
                    nterm_coords=nterm_coords,
                    sar_coords=sar_coords,
                    cterm_coords=cterm_coords,
                    sar_start=sar_start,
                    sar_end=sar_end,
                )

        return hits

    def shrink_results(self, sar_min=15, sar_max=20, perc_residues="SGAT"):
        """
        Remove repetitive hits, keeping only the shortest and longest window
        for each SAR start position.

        Runs ``check_hydrophobicity_and_charge`` and then, for every start
        position that was hit at three or more window sizes, replaces the
        intermediate-size records with the placeholder ``[""]`` (records are
        blanked rather than removed so list positions stay stable). Also
        stores the largest window size found under ``"biggest_sar"``.

        Args:
            sar_min: smallest window length (inclusive).
            sar_max: upper bound of window lengths, forwarded to the scan.
            perc_residues: residues whose percentage is reported per window.

        Returns:
            The pruned hits dictionary (``self.store``).
        """
        # perc_residues is forwarded so the caller's choice is honoured.
        hits = self.check_hydrophobicity_and_charge(
            sar_min=sar_min, sar_max=sar_max, perc_residues=perc_residues
        )

        # protein name -> SAR start index -> sizes seen and where each record sits
        compare_candidates = {}
        for sar_name, data in hits.items():
            by_start = compare_candidates[sar_name] = {}
            tmd_log = []
            # Largest size first, so tmd_log[0] is the biggest window found.
            for sar_size in range(sar_max, sar_min - 1, -1):
                key = f"TMD_{sar_size}"
                if key not in data:
                    continue
                tmd_log.append(sar_size)
                for idx, the_data in enumerate(data[key]):
                    sar_start = the_data[7]  # position 7 of a record is sar_start
                    entry = by_start.setdefault(
                        sar_start, {"count": 0, "size": [], "index": []}
                    )
                    entry["count"] += 1
                    entry["size"].append(sar_size)
                    entry["index"].append(idx)
            data["biggest_sar"] = tmd_log[0]

        for sar_name, by_start in compare_candidates.items():
            for entry in by_start.values():
                sizes = entry["size"]
                if len(sizes) < 3:
                    continue
                extremes = (min(sizes), max(sizes))
                for sar_size, idx in zip(sizes, entry["index"]):
                    if sar_size not in extremes:
                        hits[sar_name][f"TMD_{sar_size}"][idx] = [""]

        return hits


def rep_funcs(self, seq, loc, pos_res, neg_res, sar_seq, perc_residues, sar_size):
    """
    Compute every per-window annotation in one go, ready for the storage dictionary.

    Returns an 8-tuple: the sequence preceding the window, its net charge, the
    residue percentages inside the window, the window / preceding / following
    coordinate strings ("start..end"), and the integer window start and end.
    """
    span = "{}..{}".format
    sar_start, sar_end = loc, loc + sar_size

    # Everything before the window is what gets charge-scored.
    upstream = str(seq[:loc])
    net_charge = charge_check(upstream, pos_res, neg_res)
    composition = percent_calc(sar_seq, perc_residues, int(sar_size))

    return (
        upstream,
        net_charge,
        composition,
        span(sar_start, sar_end),
        span("0", loc - 1),
        span(sar_end + 1, self.size),
        sar_start,
        sar_end,
    )


### Extra "helper" functions
def storage_dict(
    self,
    sar_size,
    sar_seq,
    hits,
    charge_seq,
    charge,
    perc_cont,
    nterm_coords,
    sar_coords,
    cterm_coords,
    sar_start,
    sar_end,
):
    """
    Record one SAR window in the hits dictionary (mutated in place).

    On the first hit for a protein, its description, sequence and size are
    stored as strings under ``hits[self.name]``. Every hit is then appended to
    ``hits[self.name]["TMD_<sar_size>"]`` as a 9-item list:
    ``[sar_seq, charge_seq, charge, perc_cont, nterm_coords, sar_coords,
    cterm_coords, sar_start, sar_end]``.

    Args:
        self: object exposing ``name``, ``description``, ``seq`` and ``size``.
            (Naming a plain-function parameter ``self`` is unconventional; it
            is kept because callers pass it by keyword.)
        hits: dictionary keyed by protein name.

    Returns:
        None.
    """
    if self.name not in hits:
        hits[self.name] = {
            "description": str(self.description),
            "sequence": str(self.seq),
            "size": str(self.size),
        }

    record = [
        sar_seq,
        charge_seq,
        charge,
        perc_cont,
        nterm_coords,
        sar_coords,
        cterm_coords,
        sar_start,
        sar_end,
    ]
    hits[self.name].setdefault("TMD_" + str(sar_size), []).append(record)


def percent_calc(sequence, residues, size):
    """
    Report how much of a sequence each residue of interest accounts for.

    Each residue's occurrence count is divided by ``size`` (supplied by the
    caller, not derived from ``sequence``) and expressed as a percentage
    rounded to two decimals.

    Returns:
        A list of ``(residue, percentage)`` tuples in the order of ``residues``.
    """
    # Tally every residue once so each lookup below is a plain dict access.
    tally = {}
    for aa in sequence:
        tally[aa] = tally.get(aa, 0) + 1

    percentages = [round(tally.get(res, 0) / size * 100, 2) for res in residues]

    return list(zip(residues, percentages))


def charge_check(charge_seq, pos_res, neg_res):
    """
    Return the net charge of a sequence.

    Each residue found in ``pos_res`` contributes +1 and each residue found in
    ``neg_res`` contributes -1; the two memberships are evaluated independently.
    """
    return sum((aa in pos_res) - (aa in neg_res) for aa in charge_seq)


if __name__ == "__main__":
    # Quick manual check of percent_calc: print the sequence length, the full
    # list of (residue, percentage) pairs, then each pair on its own line.
    demo_seq = "MAGBYYYTRLCVRKLRKGGGHP"
    demo_residues = "YL"
    demo_len = len(demo_seq)
    print(demo_len)
    ratios = percent_calc(demo_seq, demo_residues, demo_len)
    print(ratios)
    for pair in ratios:
        print(pair)
# author: cpt
# date: Fri, 05 Jan 2024 05:57:09 +0000
# parents: 112751823323
# children: