Mercurial > repos > cpt > cpt_sar_finder
changeset 0:9f62910edcc9 draft
Uploaded
author | cpt |
---|---|
date | Fri, 17 Jun 2022 13:15:55 +0000 |
parents | |
children | 112751823323 |
files | cpt_sar_finder/SAR_finder.py cpt_sar_finder/SAR_finder.xml cpt_sar_finder/SAR_functions.py cpt_sar_finder/biopython_parsing.py cpt_sar_finder/cpt-macros.xml cpt_sar_finder/file_operations.py cpt_sar_finder/macros.xml cpt_sar_finder/test-data/candidate_SAR.fa cpt_sar_finder/test-data/candidate_SAR.gff3 cpt_sar_finder/test-data/candidate_SAR_stats.tsv cpt_sar_finder/test-data/simple-proteins.fa |
diffstat | 11 files changed, 583 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/cpt_sar_finder/SAR_finder.py Fri Jun 17 13:15:55 2022 +0000 @@ -0,0 +1,44 @@ +import sys +import argparse +import os +import re +from biopython_parsing import FASTA_parser +from file_operations import fasta_from_SAR_dict, gff3_from_SAR_dict, tab_from_SAR_dict +from SAR_functions import CheckSequence + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="SAR Finder") + + parser.add_argument("fa",type=argparse.FileType("r"),help="organism's multi fasta file") + + parser.add_argument("--min",type=int,default=20,help="minimum size of candidate peptide") + + parser.add_argument("--max",type=int,default=200,help="maximum size of candidate peptide") + + parser.add_argument("--sar_min",type=int,default=15,help="minimum size of candidate peptide TMD domain") + + parser.add_argument("--sar_max",type=int,default=24,help="maximum size of candidate peptide TMD domain") + + parser.add_argument("--out_fa",type=argparse.FileType("w"),help="multifasta output of candidate SAR proteins",default="candidate_SAR.fa") + + parser.add_argument("--out_stat",type=argparse.FileType("w"),help="summary statistic file for candidate SAR proteins, tab separated",default="candidate_SAR_stats.tsv") + + parser.add_argument("--out_gff3",type=argparse.FileType("w"),help="multigff3 file for candidate SAR proteins",default="candidate_SAR.gff3") + + args = parser.parse_args() + + fa_dict = FASTA_parser(fa=args.fa).multifasta_dict() + + sars = {} + + for protein_name, protein_data in fa_dict.items(): + sar = CheckSequence(protein_name, protein_data) + #sar.check_sizes(min=args.min,max=args.max) + hydros = sar.shrink_results(sar_min=args.sar_min, sar_max=args.sar_max) + sars.update(hydros) + + + gff3_from_SAR_dict(sars, args.out_gff3) + tab_from_SAR_dict(sars,args.out_stat,"SGAT",sar_min=args.sar_min, sar_max=args.sar_max) + fasta_from_SAR_dict(sars,args.out_fa) + #stat_file_from_SAR_dict(sars,args.out_stat,sar_min=args.sar_min,sar_max=args.sar_max) # fix this whenever ready. \ No newline at end of file
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/cpt_sar_finder/SAR_finder.xml Fri Jun 17 13:15:55 2022 +0000 @@ -0,0 +1,73 @@ +<tool id="edu.tamu.cpt.sar.sar_finder" name="SAR Finder" version="1.0"> + <description>SAR Domain Finder</description> + <macros> + <import>macros.xml</import> + </macros> + <expand macro="requirements"> + </expand> + <command detect_errors="aggressive"><![CDATA[ +python $__tool_directory__/SAR_finder.py +$fa +--sar_min $sar_min +--sar_max $sar_max +--out_fa $out_fa +--out_gff3 $out_gff3 +--out_stat $out_stat + ]]></command> + <inputs> + <param label="Multi FASTA File" name="fa" type="data" format="fasta" /> + <param label="SAR domain minimal size" name="sar_min" type="integer" value="15" /> + <param label="SAR domain maximum size" name="sar_max" type="integer" value="20" /> + </inputs> + <outputs> + <data format="tabular" name="out_stat" label="candidate_SAR_stats.tsv"/> + <data format="fasta" name="out_fa" label="candidate_SAR.fa"/> + <data format="gff3" name="out_gff3" label="candidate_SAR.gff3"/> + </outputs> + <tests> + <test> + <param name="fa" value="simple-proteins.fa"/> + <param name="sar_min" value="15"/> + <param name="sar_max" value="20"/> + <output name="out_stat" file="candidate_SAR_stats.tsv"/> + <output name="out_fa" file="candidate_SAR.fa"/> + <output name="out_gff3" file="candidate_SAR.gff3"/> + </test> + </tests> + <help><![CDATA[ + +**What it does** +A tool that analyzes the sequence within the first 50 residues of a protein for a weakly hydrophobic domain called Signal-Anchor-Release (aka SAR). +The tool finds proteins that contain a stretch (default 15-20 residues) of hydrophobic residues (Ile, Leu, Val, Phe, Tyr, Trp, Met, Gly, Ala, Ser) and +calculates the % Gly/Ala/Ser/Thr residues in the hydrophobic stretch. The net charge on the N-terminus is also displayed to aid in determining the +SAR orientation in the membrane.[2] + +Definition: A Signal-Anchor-Release (SAR) domain is an N-terminal, weakly hydrophobic transmembrane region rich in Gly/Ala and/or Ser (and sometimes Thr) residues. +The SAR domain is sometimes found in phage lysis proteins, including endolysins and holins. The SAR domain can be released from the membrane in a proton +motive force-dependent manner. Known SAR domains in phage endolysins often have >50-60% Gly/Ala/Ser/Thr content. SAR endolysins are expected to have a net positive +charge on the N-terminus by the positive-inside rule. + +**INPUT** --> Protein Multi FASTA + +**OUTPUT** --> + +* Multi FASTA with candidate proteins that pass the SAR domain criteria + +* Tabular summary file that lists every subdomain fitting the criteria for each potential SAR domain-containing protein with the following: protein name/sequence/length, SAR length/start/sequence/end, individual and total GAST% content in SAR, and N-terminal sequence/net charge + +* Multi GFF3 for unique candidate SAR domain-containing proteins + + ]]></help> + <citations> + <citation type="doi">10.1371/journal.pcbi.1008214</citation> + <citation type="doi">https://dx.doi.org/10.1016/bs.aivir.2018.09.003</citation> + <citation type="bibtex"> + @unpublished{galaxyTools, + author = {C. Ross}, + title = {CPT Galaxy Tools}, + year = {2020-}, + note = {https://github.com/tamu-cpt/galaxy-tools/} + } + </citation> + </citations> +</tool>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/cpt_sar_finder/SAR_functions.py Fri Jun 17 13:15:55 2022 +0000 @@ -0,0 +1,193 @@ +#!/usr/bin/env python + +import sys +import argparse +import os +import re +from Bio import SeqIO + + +class CheckSequence: + """ + SAR endolysin Verification class, which starts with complete FA file, and is shrunk by each function to reveal best candidates of SAR endolysin proteins + """ + + + def __init__(self, protein_name, protein_data): + self.name = protein_name + self.seq = protein_data.seq + self.description = protein_data.description + self.size = len(self.seq) + self.store = {} + + + def check_sizes(self,min,max): + """ check the minimum and maximum peptide lengths """ + if self.size < min: + print("too small") + elif self.size > max: + print("too large") + else: + print(f"{self.name} : {self.seq}") + return True + + + def check_hydrophobicity_and_charge(self,sar_min=15,sar_max=20,perc_residues="SGAT"): + """ verifies the existence of a hydrophobic region within the sequence """ + hydrophobic_residues = "['FIWLVMYCATGSP']" # fed through regex + hits = self.store + pos_res = "RK" + neg_res = "DE" + + if self.size > 50: + seq = self.seq[0:50] + else: + seq = self.seq + for sar_size in range(sar_min, sar_max, 1): + for i in range(0,len(seq)-sar_size,1): + sar_seq = str(seq[i:i+sar_size]) + if re.search((hydrophobic_residues+"{"+str(sar_size)+"}"),sar_seq): + charge_seq, charge, perc_cont, sar_coords, nterm_coords, cterm_coords, sar_start, sar_end = rep_funcs(self,seq,i,pos_res,neg_res,sar_seq,perc_residues,sar_size) + storage_dict(self=self,sar_size=sar_size,sar_seq=sar_seq,hits=hits,charge_seq=charge_seq,charge=charge,perc_cont=perc_cont,nterm_coords=nterm_coords,sar_coords=sar_coords,cterm_coords=cterm_coords,sar_start=sar_start,sar_end=sar_end) + #print("TMDSIZE: {}\tINDEX: {}".format(sar_size,i+1)) + elif "K" in sar_seq[0] and re.search((hydrophobic_residues+"{"+str(sar_size-1)+"}"),sar_seq[1:]): # check frontend snorkels + charge_seq, charge, perc_cont, sar_coords, nterm_coords, cterm_coords, sar_start, sar_end = rep_funcs(self,seq,i,pos_res,neg_res,sar_seq,perc_residues,sar_size) + storage_dict(self=self,sar_size=sar_size,sar_seq=sar_seq,hits=hits,charge_seq=charge_seq,charge=charge,perc_cont=perc_cont,nterm_coords=nterm_coords,sar_coords=sar_coords,cterm_coords=cterm_coords,sar_start=sar_start,sar_end=sar_end) + #print("TMDSIZE: {}\tINDEX: {}".format(sar_size,i+1)) + elif "K" in sar_seq[-1] and re.search((hydrophobic_residues+"{"+str(sar_size-1)+"}"),sar_seq[:-1]): # check backend snorkels + charge_seq, charge, perc_cont, sar_coords, nterm_coords, cterm_coords, sar_start, sar_end = rep_funcs(self,seq,i,pos_res,neg_res,sar_seq,perc_residues,sar_size) + storage_dict(self=self,sar_size=sar_size,sar_seq=sar_seq,hits=hits,charge_seq=charge_seq,charge=charge,perc_cont=perc_cont,nterm_coords=nterm_coords,sar_coords=sar_coords,cterm_coords=cterm_coords,sar_start=sar_start,sar_end=sar_end) + #print("TMDSIZE: {}\tINDEX: {}".format(sar_size,i+1)) + continue + + return hits + + def shrink_results(self,sar_min=15,sar_max=20,perc_residues="SGAT"): + """ removes repetiive hits, keeps only the shortest and longest of each SAR domain """ + compare_candidates = {} + hits = self.check_hydrophobicity_and_charge(sar_min=sar_min,sar_max=sar_max) + for sar_name, data in hits.items(): + #print(sar_name) + compare_candidates[sar_name] = {} + #print("\nThese are the values: {}".format(v)) + #count_of_times = 0 + tmd_log = [] + for sar_size in range(sar_max,sar_min-1,-1): + if "TMD_"+str(sar_size) in data: + tmd_log.append(sar_size) + #print(tmd_log) + for idx,the_data in enumerate(data["TMD_"+str(sar_size)]): + #print(the_data[7]) + #print(the_data) + #print(f"This is the index: {idx}") + #print(f"This is the list of data at this index: {the_data}") + if the_data[7] in compare_candidates[sar_name]: # index to start + compare_candidates[sar_name][the_data[7]]["count"] += 1 + compare_candidates[sar_name][the_data[7]]["size"].append(sar_size) + compare_candidates[sar_name][the_data[7]]["index"].append(idx) + else: + compare_candidates[sar_name][the_data[7]] = {} + compare_candidates[sar_name][the_data[7]]["count"] = 1 + compare_candidates[sar_name][the_data[7]]["size"] = [sar_size] + compare_candidates[sar_name][the_data[7]]["index"] = [idx] + hits[sar_name]["biggest_sar"] = tmd_log[0] + for sar_name, compare_data in compare_candidates.items(): + for data in compare_data.values(): + if len(data["size"]) >= 3: + #print(f"{each_size} --> {data}") + minmax = [min(data["size"]),max(data["size"])] + nonminmax = [x for x in data["size"] if x not in minmax] + nonminmax_index = [] + for each_nonminmax in nonminmax: + v = data["size"].index(each_nonminmax) + x = data["index"][v] + nonminmax_index.append(x) + nons = zip(nonminmax,nonminmax_index) + for value in nons: + #hits[sar_name]["TMD_"+str(value[0])] = hits[sar_name]["TMD_"+str(value[0])].pop(value[1]) + hits[sar_name]["TMD_"+str(value[0])][value[1]] = [""] + + return hits + + +def rep_funcs(self,seq,loc,pos_res,neg_res,sar_seq,perc_residues,sar_size): + """ run a set of functions together before sending the results to the storage dictionary """ + + charge_seq = str(seq[:loc]) + charge = charge_check(charge_seq,pos_res,neg_res) + perc_cont = percent_calc(sar_seq,perc_residues,int(sar_size)) + sar_start = loc + sar_end = loc + sar_size + sar_coords = "{}..{}".format(loc,loc+sar_size) + nterm_coords = "{}..{}".format("0",loc-1) + cterm_coords = "{}..{}".format(loc+sar_size+1,self.size) + + return charge_seq, charge, perc_cont, sar_coords, nterm_coords, cterm_coords, sar_start, sar_end + + +### Extra "helper" functions +def storage_dict(self,sar_size,sar_seq,hits,charge_seq,charge,perc_cont,nterm_coords,sar_coords,cterm_coords,sar_start,sar_end): # probably not good to call "self" a param here...definitley not PEP approved... + """ organize dictionary for hydrophobicity check """ + if self.name not in hits: + hits[self.name] = {} + hits[self.name]["description"] = str(self.description) + hits[self.name]["sequence"] = str(self.seq) + hits[self.name]["size"] = str(self.size) + #GAcont = str((str(self.seq).count("G")+str(self.seq).count("A"))/int(self.size)*100) + #hits[self.name]["GAcont"] = "{:.2f}%".format(float(GAcont)) + if "TMD_"+str(sar_size) not in hits[self.name]: + hits[self.name]["TMD_"+str(sar_size)] = [] + hits[self.name]["TMD_"+str(sar_size)].append([sar_seq,charge_seq,charge,perc_cont,nterm_coords,sar_coords,cterm_coords,sar_start,sar_end]) + else: + hits[self.name]["TMD_"+str(sar_size)].append([sar_seq,charge_seq,charge,perc_cont,nterm_coords,sar_coords,cterm_coords,sar_start,sar_end]) + else: + if "TMD_"+str(sar_size) not in hits[self.name]: + hits[self.name]["TMD_"+str(sar_size)] = [] + hits[self.name]["TMD_"+str(sar_size)].append([sar_seq,charge_seq,charge,perc_cont,nterm_coords,sar_coords,cterm_coords,sar_start,sar_end]) + else: + hits[self.name]["TMD_"+str(sar_size)].append([sar_seq,charge_seq,charge,perc_cont,nterm_coords,sar_coords,cterm_coords,sar_start,sar_end]) + + +def percent_calc(sequence,residues,size): + """ Calculate the percent of a set of residues within an input sequence """ + counted = {} + for aa in sequence: + #print(aa) + if aa in counted: + counted[aa] += 1 + else: + counted[aa] = 1 + residue_amt = 0 + my_ratios = [] + for res_of_interest in residues: + try: + residue_amt = counted[res_of_interest] + except KeyError: + residue_amt = 0 + ratio = residue_amt/size + my_ratios.append((round(ratio*100,2))) + + res_rat = list(zip(residues,my_ratios)) + + return res_rat + + +def charge_check(charge_seq,pos_res,neg_res): + charge = 0 + for aa in charge_seq: + if aa in pos_res: + charge += 1 + if aa in neg_res: + charge -= 1 + return charge + +if __name__ == "__main__": + sequence = "MAGBYYYTRLCVRKLRKGGGHP" + residues = "YL" + size = len(sequence) + print(size) + v = percent_calc(sequence,residues,size) + print(v) + for i in v: + print(i) +
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/cpt_sar_finder/biopython_parsing.py Fri Jun 17 13:15:55 2022 +0000 @@ -0,0 +1,24 @@ +#!/usr/bin/env python +# Biopython parsing module. Uses in conjunction with the sar_finder script, and potential future scripts down the line. + +from Bio import SeqIO + +class FASTA_parser: + """ Parses multi fasta file, and zips together header with sequence """ + + def __init__(self, fa): + self.fa = fa + + def multifasta_dict(self): + """ parses the input multi fasta, and puts results into dictionary """ + + return SeqIO.to_dict(SeqIO.parse(self.fa,"fasta")) + + +if __name__ == "__main__": + fa_file = "test-data/mu-proteins.fa" + d = FASTA_parser(fa_file).multifasta_dict() + print(d) + for k, v in d.items(): + print(v.description) +
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/cpt_sar_finder/cpt-macros.xml Fri Jun 17 13:15:55 2022 +0000 @@ -0,0 +1,115 @@ +<?xml version="1.0"?> +<macros> + <xml name="gff_requirements"> + <requirements> + <requirement type="package" version="2.7">python</requirement> + <requirement type="package" version="1.65">biopython</requirement> + <requirement type="package" version="2.12.1">requests</requirement> + <yield/> + </requirements> + <version_command> + <![CDATA[ + cd $__tool_directory__ && git rev-parse HEAD + ]]> + </version_command> + </xml> + <xml name="citation/mijalisrasche"> + <citation type="doi">10.1371/journal.pcbi.1008214</citation> + <citation type="bibtex">@unpublished{galaxyTools, + author = {E. Mijalis, H. Rasche}, + title = {CPT Galaxy Tools}, + year = {2013-2017}, + note = {https://github.com/tamu-cpt/galaxy-tools/} + } + </citation> + </xml> + <xml name="citations"> + <citations> + <citation type="doi">10.1371/journal.pcbi.1008214</citation> + <citation type="bibtex"> + @unpublished{galaxyTools, + author = {E. Mijalis, H. Rasche}, + title = {CPT Galaxy Tools}, + year = {2013-2017}, + note = {https://github.com/tamu-cpt/galaxy-tools/} + } + </citation> + <yield/> + </citations> + </xml> + <xml name="citations-crr"> + <citations> + <citation type="doi">10.1371/journal.pcbi.1008214</citation> + <citation type="bibtex"> + @unpublished{galaxyTools, + author = {C. Ross}, + title = {CPT Galaxy Tools}, + year = {2020-}, + note = {https://github.com/tamu-cpt/galaxy-tools/} + } + </citation> + <yield/> + </citations> + </xml> + <xml name="citations-2020"> + <citations> + <citation type="doi">10.1371/journal.pcbi.1008214</citation> + <citation type="bibtex"> + @unpublished{galaxyTools, + author = {E. Mijalis, H. Rasche}, + title = {CPT Galaxy Tools}, + year = {2013-2017}, + note = {https://github.com/tamu-cpt/galaxy-tools/} + } + </citation> + <citation type="bibtex"> + @unpublished{galaxyTools, + author = {A. Criscione}, + title = {CPT Galaxy Tools}, + year = {2019-2021}, + note = {https://github.com/tamu-cpt/galaxy-tools/} + } + </citation> + <yield/> + </citations> + </xml> + <xml name="citations-2020-AJC-solo"> + <citations> + <citation type="doi">10.1371/journal.pcbi.1008214</citation> + <citation type="bibtex"> + @unpublished{galaxyTools, + author = {A. Criscione}, + title = {CPT Galaxy Tools}, + year = {2019-2021}, + note = {https://github.com/tamu-cpt/galaxy-tools/} + } + </citation> + <yield/> + </citations> + </xml> + <xml name="citations-clm"> + <citations> + <citation type="doi">10.1371/journal.pcbi.1008214</citation> + <citation type="bibtex"> + @unpublished{galaxyTools, + author = {C. Maughmer}, + title = {CPT Galaxy Tools}, + year = {2017-2020}, + note = {https://github.com/tamu-cpt/galaxy-tools/} + } + </citation> + <yield/> + </citations> + </xml> + <xml name="sl-citations-clm"> + <citation type="bibtex"> + @unpublished{galaxyTools, + author = {C. Maughmer}, + title = {CPT Galaxy Tools}, + year = {2017-2020}, + note = {https://github.com/tamu-cpt/galaxy-tools/} + } + </citation> + <yield/> + </xml> +</macros>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/cpt_sar_finder/file_operations.py Fri Jun 17 13:15:55 2022 +0000 @@ -0,0 +1,98 @@ + +def fasta_from_SAR_dict(sar_dict,fa_file): + """ makes a multi fasta with candidates from SAR dictionary """ + with fa_file as f: + for data in sar_dict.values(): + f.writelines(">{}\n".format(data["description"])) + f.writelines("{}\n".format(data["sequence"])) + +def gff3_from_SAR_dict(sar_dict,gff3_file): + """ make a multi gff3 with candidates from SAR dictionary """ + gff3_cols = ["Seqid","Source","Type","Start","End","Score","Strand","Phase","Attributes"] + with gff3_file as f: + f.writelines(f"{gff3_cols[0]}\t{gff3_cols[1]}\t{gff3_cols[2]}\t{gff3_cols[3]}\t{gff3_cols[4]}\t{gff3_cols[5]}\t{gff3_cols[6]}\t{gff3_cols[7]}\t{gff3_cols[8]}\n") + if sar_dict: + #print(sar_dict) + for name, data in sar_dict.items(): + min_idx = 0 + f.writelines("##gff-version 3\n") + f.writelines(f"##sequence-region {name}\n") + n_start, n_end = split_seq_string(data["TMD_"+str(data["biggest_sar"])][min_idx][4]) + sar_start, sar_end = split_seq_string(data["TMD_"+str(data["biggest_sar"])][min_idx][5]) + c_start, c_end = split_seq_string(data["TMD_"+str(data["biggest_sar"])][min_idx][6]) + f.writelines(f'{name}\tSAR_finder\tTopological domain\t{n_start}\t{n_end}\t.\t.\t.\tNote=N-terminal net charge is {data["TMD_"+str(data["biggest_sar"])][min_idx][2]}\n') + f.writelines(f'{name}\tSAR_finder\tSAR domain\t{sar_start}\t{sar_end}\t.\t.\t.\tNote=residue % in SAR {[perc for perc in data["TMD_"+str(data["biggest_sar"])][min_idx][3]]},Total % is {round(sum(j for i,j in data["TMD_"+str(data["biggest_sar"])][min_idx][3]),2)}\n') + f.writelines(f'{name}\tSAR_finder\tTopological domain\t{c_start}\t{c_end}\t.\t.\t.\tNote=C-terminus\n') + else: + f.writelines("##gff-version 3\n") + f.writelines(f"##sequence-region\n") + + +def tab_from_SAR_dict(sar_dict,stat_file,hydrophillic_res, sar_min, sar_max): + """ convert SAR dict to a dataframe """ + columns = ["Name","Protein Sequence","Protein Length","SAR Length","SAR Start","Putative SAR Sequence","SAR End",[f"{res}%" for res in hydrophillic_res],"% Total","N-term Sequence","N-term net Charge"] # using different residues for percent calc: [f"{res}%" for res in hydrophillic_res] + with stat_file as f: + f.writelines(f"{columns[0]}\t{columns[1]}\t{columns[2]}\t{columns[3]}\t{columns[4]}\t{columns[5]}\t{columns[6]}\t{columns[7]}\t{columns[8]}\t{columns[9]}\t{columns[10]}\n") + if sar_dict: + #print(sar_dict) + for name, data in sar_dict.items(): + for tmd_size in range(sar_max, sar_min-1, -1): + if "TMD_"+str(tmd_size) in data: + for each_match in data["TMD_"+str(tmd_size)]: + if each_match != [""]: + #print(f"{name} - {data}") + #print(each_match) + #for perc in each_match[3]: + # print(perc) + try: + f.writelines(f'{name}\t{data["sequence"]}\t{data["size"]}\t{tmd_size}\t{int(each_match[7])+1}\t{each_match[0]}\t{int(each_match[8])+1}\t{[perc for perc in each_match[3]]}\t{round(sum(j for i,j in each_match[3]),2)}\t{each_match[1]}\t{each_match[2]}\n') + except IndexError: + f.writelines(f'ERROR\tERROR\tERROR\tERROR\tERROR\tERROR\tERROR\tERROR\tERROR\tERROR\tERROR\n') + else: + continue + +def stat_file_from_SAR_dict(sar_dict, stat_file, sar_min, sar_max): + """ summary statistics from SAR finder function """ + with stat_file as f: + f.writelines("..........:::::: Candidate SAR Proteins ::::::..........\n\n") + if sar_dict: + for data in sar_dict.values(): + f.writelines("Protein Description and Name: {}\n".format(data["description"])) + f.writelines("Protein Sequence: {}\n".format(data["sequence"])) + f.writelines("Protein Length: {}\n".format(data["size"])) + f.writelines("SAR Criteria matching region(s)\n") + for tmd_size in range(sar_max, sar_min-1, -1): + if "TMD_"+str(tmd_size) in data: + f.writelines("\nSAR length of {}:\n".format(tmd_size)) + for each_match in data["TMD_"+str(tmd_size)]: + if each_match != ['']: + f.writelines("\nPotential SAR domain sequence: {}\n".format(each_match[0])) + f.writelines("N-term sequence: {}\n".format(each_match[1])) + f.writelines("N-term net charge: {}\n".format(each_match[2])) + for each_perc_calc in each_match[3]: + f.writelines("Percent {} content: {}%\n".format(each_perc_calc[0],each_perc_calc[1])) + f.writelines("N-term coords: {}\n".format(each_match[4])) + f.writelines("SAR coords: {}\n".format(each_match[5])) + f.writelines("C-term coords: {}\n".format(each_match[6])) + f.writelines("SAR start: {}\n".format(each_match[7])) + else: + continue + f.writelines("========================================================\n\n") + else: + f.writelines("No candidate SAR Proteins found") + +def split_seq_string(input_range, python_indexing=True): + """ splits a #..# sequence into the two respective starts and ends, if python indexing, adds 1, otherwise keeps """ + if python_indexing: + values = input_range.split("..") + start =int(values[0]) + 1 + end = int(values[1]) + 1 + else: + values = input_range.split("..") + start = values[0] + end = values[1] + + return start, end + +if __name__ == "__main__": + pass \ No newline at end of file
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/cpt_sar_finder/macros.xml Fri Jun 17 13:15:55 2022 +0000 @@ -0,0 +1,15 @@ +<?xml version="1.0"?> +<macros> + <xml name="requirements"> + <requirements> + <requirement type="package" version="3.6.1">python</requirement> + <requirement type="package" version="1.67">biopython</requirement> + <yield/> + </requirements> + <version_command> + <![CDATA[ + cd $__tool_directory__ && git rev-parse HEAD + ]]> + </version_command> + </xml> +</macros> \ No newline at end of file
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/cpt_sar_finder/test-data/candidate_SAR.fa Fri Jun 17 13:15:55 2022 +0000 @@ -0,0 +1,2 @@ +>SAR-endolysin +MAGIPKKLKAALLAVTIAGGGVGGYQEMTRQSLIHLENIAYMPYRDIAGVLTVCVGHTGPDIEMRRYSHAECMALLDSDLKPVYAAIDRLVRVPLTPYQKTALATFIFNTGVTAFSKSTLLKKLNAGDYAGARDQMARWVFAAGHKWKGLMNRREVEMAIWNIRGADDLRQ
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/cpt_sar_finder/test-data/candidate_SAR.gff3 Fri Jun 17 13:15:55 2022 +0000 @@ -0,0 +1,6 @@ +Seqid Source Type Start End Score Strand Phase Attributes +##gff-version 3 +##sequence-region SAR-endolysin +SAR-endolysin SAR_finder Topological domain 1 8 . . . Note=N-terminal net charge is 2 +SAR-endolysin SAR_finder SAR domain 9 26 . . . Note=residue % in SAR [('S', 0.0), ('G', 29.41), ('A', 23.53), ('T', 5.88)],Total % is 58.82 +SAR-endolysin SAR_finder Topological domain 27 172 . . . Note=C-terminus
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/cpt_sar_finder/test-data/candidate_SAR_stats.tsv Fri Jun 17 13:15:55 2022 +0000 @@ -0,0 +1,6 @@ +Name Protein Sequence Protein Length SAR Length SAR Start Putative SAR Sequence SAR End ['S%', 'G%', 'A%', 'T%'] % Total N-term Sequence N-term net Charge +SAR-endolysin MAGIPKKLKAALLAVTIAGGGVGGYQEMTRQSLIHLENIAYMPYRDIAGVLTVCVGHTGPDIEMRRYSHAECMALLDSDLKPVYAAIDRLVRVPLTPYQKTALATFIFNTGVTAFSKSTLLKKLNAGDYAGARDQMARWVFAAGHKWKGLMNRREVEMAIWNIRGADDLRQ 171 17 9 KAALLAVTIAGGGVGGY 26 [('S', 0.0), ('G', 29.41), ('A', 23.53), ('T', 5.88)] 58.82 MAGIPKKL 2 +SAR-endolysin MAGIPKKLKAALLAVTIAGGGVGGYQEMTRQSLIHLENIAYMPYRDIAGVLTVCVGHTGPDIEMRRYSHAECMALLDSDLKPVYAAIDRLVRVPLTPYQKTALATFIFNTGVTAFSKSTLLKKLNAGDYAGARDQMARWVFAAGHKWKGLMNRREVEMAIWNIRGADDLRQ 171 16 10 AALLAVTIAGGGVGGY 26 [('S', 0.0), ('G', 31.25), ('A', 25.0), ('T', 6.25)] 62.5 MAGIPKKLK 3 +SAR-endolysin MAGIPKKLKAALLAVTIAGGGVGGYQEMTRQSLIHLENIAYMPYRDIAGVLTVCVGHTGPDIEMRRYSHAECMALLDSDLKPVYAAIDRLVRVPLTPYQKTALATFIFNTGVTAFSKSTLLKKLNAGDYAGARDQMARWVFAAGHKWKGLMNRREVEMAIWNIRGADDLRQ 171 15 9 KAALLAVTIAGGGVG 24 [('S', 0.0), ('G', 26.67), ('A', 26.67), ('T', 6.67)] 60.01 MAGIPKKL 2 +SAR-endolysin MAGIPKKLKAALLAVTIAGGGVGGYQEMTRQSLIHLENIAYMPYRDIAGVLTVCVGHTGPDIEMRRYSHAECMALLDSDLKPVYAAIDRLVRVPLTPYQKTALATFIFNTGVTAFSKSTLLKKLNAGDYAGARDQMARWVFAAGHKWKGLMNRREVEMAIWNIRGADDLRQ 171 15 10 AALLAVTIAGGGVGG 25 [('S', 0.0), ('G', 33.33), ('A', 26.67), ('T', 6.67)] 66.67 MAGIPKKLK 3 +SAR-endolysin MAGIPKKLKAALLAVTIAGGGVGGYQEMTRQSLIHLENIAYMPYRDIAGVLTVCVGHTGPDIEMRRYSHAECMALLDSDLKPVYAAIDRLVRVPLTPYQKTALATFIFNTGVTAFSKSTLLKKLNAGDYAGARDQMARWVFAAGHKWKGLMNRREVEMAIWNIRGADDLRQ 171 15 11 ALLAVTIAGGGVGGY 26 [('S', 0.0), ('G', 33.33), ('A', 20.0), ('T', 6.67)] 60.0 MAGIPKKLKA 3
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/cpt_sar_finder/test-data/simple-proteins.fa Fri Jun 17 13:15:55 2022 +0000 @@ -0,0 +1,7 @@ +>SAR-endolysin +MAGIPKKLKAALLAVTIAGGGVGGYQEMTRQSLIHLENIAYMPYRDIAGVLTVCVGHTGP +DIEMRRYSHAECMALLDSDLKPVYAAIDRLVRVPLTPYQKTALATFIFNTGVTAFSKSTL +LKKLNAGDYAGARDQMARWVFAAGHKWKGLMNRREVEMAIWNIRGADDLRQ +>CPT_NC_000929.1_038 +MLKIKPAAGKAIRDPLTMKLLASEGEEKPRNSFWIRRLAAGDVVEVGSTENTADDTDAAP +KKRSKSK \ No newline at end of file