# HG changeset patch
# User cpt
# Date 1655471755 0
# Node ID 9f62910edcc9e8d673fb092d6c24ba3e3eef4a83
Uploaded
diff -r 000000000000 -r 9f62910edcc9 cpt_sar_finder/SAR_finder.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/cpt_sar_finder/SAR_finder.py Fri Jun 17 13:15:55 2022 +0000
@@ -0,0 +1,44 @@
+import sys
+import argparse
+import os
+import re
+from biopython_parsing import FASTA_parser
+from file_operations import fasta_from_SAR_dict, gff3_from_SAR_dict, tab_from_SAR_dict
+from SAR_functions import CheckSequence
+
+if __name__ == "__main__":
+ parser = argparse.ArgumentParser(description="SAR Finder")
+
+ parser.add_argument("fa",type=argparse.FileType("r"),help="organism's multi fasta file")
+
+ parser.add_argument("--min",type=int,default=20,help="minimum size of candidate peptide")
+
+ parser.add_argument("--max",type=int,default=200,help="maximum size of candidate peptide")
+
+ parser.add_argument("--sar_min",type=int,default=15,help="minimum size of candidate peptide TMD domain")
+
+ parser.add_argument("--sar_max",type=int,default=24,help="maximum size of candidate peptide TMD domain")
+
+ parser.add_argument("--out_fa",type=argparse.FileType("w"),help="multifasta output of candidate SAR proteins",default="candidate_SAR.fa")
+
+ parser.add_argument("--out_stat",type=argparse.FileType("w"),help="summary statistic file for candidate SAR proteins, tab separated",default="candidate_SAR_stats.tsv")
+
+ parser.add_argument("--out_gff3",type=argparse.FileType("w"),help="multigff3 file for candidate SAR proteins",default="candidate_SAR.gff3")
+
+ args = parser.parse_args()
+
+ fa_dict = FASTA_parser(fa=args.fa).multifasta_dict()
+
+ sars = {}
+
+ for protein_name, protein_data in fa_dict.items():
+ sar = CheckSequence(protein_name, protein_data)
+ #sar.check_sizes(min=args.min,max=args.max)
+ hydros = sar.shrink_results(sar_min=args.sar_min, sar_max=args.sar_max)
+ sars.update(hydros)
+
+
+ gff3_from_SAR_dict(sars, args.out_gff3)
+ tab_from_SAR_dict(sars,args.out_stat,"SGAT",sar_min=args.sar_min, sar_max=args.sar_max)
+ fasta_from_SAR_dict(sars,args.out_fa)
+ #stat_file_from_SAR_dict(sars,args.out_stat,sar_min=args.sar_min,sar_max=args.sar_max) # fix this whenever ready.
\ No newline at end of file
diff -r 000000000000 -r 9f62910edcc9 cpt_sar_finder/SAR_finder.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/cpt_sar_finder/SAR_finder.xml Fri Jun 17 13:15:55 2022 +0000
@@ -0,0 +1,73 @@
+
+ SAR Domain Finder
+
+ macros.xml
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Candidate SAR (signal-arrest-release) domains are weakly hydrophobic transmembrane stretches with 50-60% Gly/Ala/Ser/Thr content.
+SAR endolysins are expected to have a net positive charge on the N-terminus by the positive-inside rule.
+
+**INPUT** --> Protein Multi FASTA
+
+**OUTPUT** -->
+
+* Multi FASTA with candidate proteins that pass the SAR domain criteria
+
+* Tabular summary file that lists every subdomain fitting the criteria for each potential SAR domain-containing protein, with the following fields: protein name/sequence/length; SAR length/start/sequence/end; individual and total GAST% content in the SAR; and N-terminal sequence/net charge
+
+* Multi GFF3 for unique candidate SAR domain-containing proteins
+
+ ]]>
+
+ 10.1371/journal.pcbi.1008214
+ https://dx.doi.org/10.1016/bs.aivir.2018.09.003
+
+ @unpublished{galaxyTools,
+ author = {C. Ross},
+ title = {CPT Galaxy Tools},
+ year = {2020-},
+ note = {https://github.com/tamu-cpt/galaxy-tools/}
+ }
+
+
+
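The help text above describes a single protein multi-FASTA input and three outputs (candidate FASTA, tabular summary, GFF3). As a minimal sketch of driving the same pipeline outside Galaxy, assuming Biopython is installed and the working directory is cpt_sar_finder (the input path points at the bundled test file; the output names are illustrative):

from biopython_parsing import FASTA_parser
from SAR_functions import CheckSequence
from file_operations import fasta_from_SAR_dict, gff3_from_SAR_dict, tab_from_SAR_dict

# Parse the input multi-FASTA into a {record id: SeqRecord} dictionary.
fa_dict = FASTA_parser(fa=open("test-data/simple-proteins.fa")).multifasta_dict()

# Collect candidate SAR domains for each protein.
sars = {}
for protein_name, protein_data in fa_dict.items():
    sars.update(CheckSequence(protein_name, protein_data).shrink_results(sar_min=15, sar_max=24))

# Write the three outputs described in the help text.
gff3_from_SAR_dict(sars, open("candidate_SAR.gff3", "w"))
tab_from_SAR_dict(sars, open("candidate_SAR_stats.tsv", "w"), "SGAT", sar_min=15, sar_max=24)
fasta_from_SAR_dict(sars, open("candidate_SAR.fa", "w"))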
diff -r 000000000000 -r 9f62910edcc9 cpt_sar_finder/SAR_functions.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/cpt_sar_finder/SAR_functions.py Fri Jun 17 13:15:55 2022 +0000
@@ -0,0 +1,193 @@
+#!/usr/bin/env python
+
+import sys
+import argparse
+import os
+import re
+from Bio import SeqIO
+
+
+class CheckSequence:
+ """
+ SAR endolysin verification class; starts from a complete FASTA record and is narrowed by each method to reveal the best SAR endolysin candidates
+ """
+
+
+ def __init__(self, protein_name, protein_data):
+ self.name = protein_name
+ self.seq = protein_data.seq
+ self.description = protein_data.description
+ self.size = len(self.seq)
+ self.store = {}
+
+
+ def check_sizes(self,min,max):
+ """ check the minimum and maximum peptide lengths """
+ if self.size < min:
+ print("too small")
+ elif self.size > max:
+ print("too large")
+ else:
+ print(f"{self.name} : {self.seq}")
+ return True
+
+
+ def check_hydrophobicity_and_charge(self,sar_min=15,sar_max=20,perc_residues="SGAT"):
+ """ verifies the existence of a hydrophobic region within the sequence """
+ hydrophobic_residues = "[FIWLVMYCATGSP]" # fed through regex
+ hits = self.store
+ pos_res = "RK"
+ neg_res = "DE"
+
+ if self.size > 50:
+ seq = self.seq[0:50]
+ else:
+ seq = self.seq
+ for sar_size in range(sar_min, sar_max, 1):
+ for i in range(0,len(seq)-sar_size,1):
+ sar_seq = str(seq[i:i+sar_size])
+ if re.search((hydrophobic_residues+"{"+str(sar_size)+"}"),sar_seq):
+ charge_seq, charge, perc_cont, sar_coords, nterm_coords, cterm_coords, sar_start, sar_end = rep_funcs(self,seq,i,pos_res,neg_res,sar_seq,perc_residues,sar_size)
+ storage_dict(self=self,sar_size=sar_size,sar_seq=sar_seq,hits=hits,charge_seq=charge_seq,charge=charge,perc_cont=perc_cont,nterm_coords=nterm_coords,sar_coords=sar_coords,cterm_coords=cterm_coords,sar_start=sar_start,sar_end=sar_end)
+ #print("TMDSIZE: {}\tINDEX: {}".format(sar_size,i+1))
+ elif "K" in sar_seq[0] and re.search((hydrophobic_residues+"{"+str(sar_size-1)+"}"),sar_seq[1:]): # check frontend snorkels
+ charge_seq, charge, perc_cont, sar_coords, nterm_coords, cterm_coords, sar_start, sar_end = rep_funcs(self,seq,i,pos_res,neg_res,sar_seq,perc_residues,sar_size)
+ storage_dict(self=self,sar_size=sar_size,sar_seq=sar_seq,hits=hits,charge_seq=charge_seq,charge=charge,perc_cont=perc_cont,nterm_coords=nterm_coords,sar_coords=sar_coords,cterm_coords=cterm_coords,sar_start=sar_start,sar_end=sar_end)
+ #print("TMDSIZE: {}\tINDEX: {}".format(sar_size,i+1))
+ elif "K" in sar_seq[-1] and re.search((hydrophobic_residues+"{"+str(sar_size-1)+"}"),sar_seq[:-1]): # check backend snorkels
+ charge_seq, charge, perc_cont, sar_coords, nterm_coords, cterm_coords, sar_start, sar_end = rep_funcs(self,seq,i,pos_res,neg_res,sar_seq,perc_residues,sar_size)
+ storage_dict(self=self,sar_size=sar_size,sar_seq=sar_seq,hits=hits,charge_seq=charge_seq,charge=charge,perc_cont=perc_cont,nterm_coords=nterm_coords,sar_coords=sar_coords,cterm_coords=cterm_coords,sar_start=sar_start,sar_end=sar_end)
+ #print("TMDSIZE: {}\tINDEX: {}".format(sar_size,i+1))
+ continue
+
+ return hits
+
+ def shrink_results(self,sar_min=15,sar_max=20,perc_residues="SGAT"):
+ """ removes repetiive hits, keeps only the shortest and longest of each SAR domain """
+ compare_candidates = {}
+ hits = self.check_hydrophobicity_and_charge(sar_min=sar_min,sar_max=sar_max)
+ for sar_name, data in hits.items():
+ #print(sar_name)
+ compare_candidates[sar_name] = {}
+ #print("\nThese are the values: {}".format(v))
+ #count_of_times = 0
+ tmd_log = []
+ for sar_size in range(sar_max,sar_min-1,-1):
+ if "TMD_"+str(sar_size) in data:
+ tmd_log.append(sar_size)
+ #print(tmd_log)
+ for idx,the_data in enumerate(data["TMD_"+str(sar_size)]):
+ #print(the_data[7])
+ #print(the_data)
+ #print(f"This is the index: {idx}")
+ #print(f"This is the list of data at this index: {the_data}")
+ if the_data[7] in compare_candidates[sar_name]: # index to start
+ compare_candidates[sar_name][the_data[7]]["count"] += 1
+ compare_candidates[sar_name][the_data[7]]["size"].append(sar_size)
+ compare_candidates[sar_name][the_data[7]]["index"].append(idx)
+ else:
+ compare_candidates[sar_name][the_data[7]] = {}
+ compare_candidates[sar_name][the_data[7]]["count"] = 1
+ compare_candidates[sar_name][the_data[7]]["size"] = [sar_size]
+ compare_candidates[sar_name][the_data[7]]["index"] = [idx]
+ hits[sar_name]["biggest_sar"] = tmd_log[0]
+ for sar_name, compare_data in compare_candidates.items():
+ for data in compare_data.values():
+ if len(data["size"]) >= 3:
+ #print(f"{each_size} --> {data}")
+ minmax = [min(data["size"]),max(data["size"])]
+ nonminmax = [x for x in data["size"] if x not in minmax]
+ nonminmax_index = []
+ for each_nonminmax in nonminmax:
+ v = data["size"].index(each_nonminmax)
+ x = data["index"][v]
+ nonminmax_index.append(x)
+ nons = zip(nonminmax,nonminmax_index)
+ for value in nons:
+ #hits[sar_name]["TMD_"+str(value[0])] = hits[sar_name]["TMD_"+str(value[0])].pop(value[1])
+ hits[sar_name]["TMD_"+str(value[0])][value[1]] = [""]
+
+ return hits
+
+
+def rep_funcs(self,seq,loc,pos_res,neg_res,sar_seq,perc_residues,sar_size):
+ """ run a set of functions together before sending the results to the storage dictionary """
+
+ charge_seq = str(seq[:loc])
+ charge = charge_check(charge_seq,pos_res,neg_res)
+ perc_cont = percent_calc(sar_seq,perc_residues,int(sar_size))
+ sar_start = loc
+ sar_end = loc + sar_size
+ sar_coords = "{}..{}".format(loc,loc+sar_size)
+ nterm_coords = "{}..{}".format("0",loc-1)
+ cterm_coords = "{}..{}".format(loc+sar_size+1,self.size)
+
+ return charge_seq, charge, perc_cont, sar_coords, nterm_coords, cterm_coords, sar_start, sar_end
+
+
+### Extra "helper" functions
+def storage_dict(self,sar_size,sar_seq,hits,charge_seq,charge,perc_cont,nterm_coords,sar_coords,cterm_coords,sar_start,sar_end): # probably not good to call "self" a param here...definitely not PEP approved...
+ """ organize dictionary for hydrophobicity check """
+ if self.name not in hits:
+ hits[self.name] = {}
+ hits[self.name]["description"] = str(self.description)
+ hits[self.name]["sequence"] = str(self.seq)
+ hits[self.name]["size"] = str(self.size)
+ #GAcont = str((str(self.seq).count("G")+str(self.seq).count("A"))/int(self.size)*100)
+ #hits[self.name]["GAcont"] = "{:.2f}%".format(float(GAcont))
+ if "TMD_"+str(sar_size) not in hits[self.name]:
+ hits[self.name]["TMD_"+str(sar_size)] = []
+ hits[self.name]["TMD_"+str(sar_size)].append([sar_seq,charge_seq,charge,perc_cont,nterm_coords,sar_coords,cterm_coords,sar_start,sar_end])
+ else:
+ hits[self.name]["TMD_"+str(sar_size)].append([sar_seq,charge_seq,charge,perc_cont,nterm_coords,sar_coords,cterm_coords,sar_start,sar_end])
+ else:
+ if "TMD_"+str(sar_size) not in hits[self.name]:
+ hits[self.name]["TMD_"+str(sar_size)] = []
+ hits[self.name]["TMD_"+str(sar_size)].append([sar_seq,charge_seq,charge,perc_cont,nterm_coords,sar_coords,cterm_coords,sar_start,sar_end])
+ else:
+ hits[self.name]["TMD_"+str(sar_size)].append([sar_seq,charge_seq,charge,perc_cont,nterm_coords,sar_coords,cterm_coords,sar_start,sar_end])
+
+
+def percent_calc(sequence,residues,size):
+ """ Calculate the percent of a set of residues within an input sequence """
+ counted = {}
+ for aa in sequence:
+ #print(aa)
+ if aa in counted:
+ counted[aa] += 1
+ else:
+ counted[aa] = 1
+ residue_amt = 0
+ my_ratios = []
+ for res_of_interest in residues:
+ try:
+ residue_amt = counted[res_of_interest]
+ except KeyError:
+ residue_amt = 0
+ ratio = residue_amt/size
+ my_ratios.append((round(ratio*100,2)))
+
+ res_rat = list(zip(residues,my_ratios))
+
+ return res_rat
+
+
+def charge_check(charge_seq,pos_res,neg_res):
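+ """ net charge of charge_seq: +1 for every residue in pos_res, -1 for every residue in neg_res """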
+ charge = 0
+ for aa in charge_seq:
+ if aa in pos_res:
+ charge += 1
+ if aa in neg_res:
+ charge -= 1
+ return charge
+
+if __name__ == "__main__":
+ sequence = "MAGBYYYTRLCVRKLRKGGGHP"
+ residues = "YL"
+ size = len(sequence)
+ print(size)
+ v = percent_calc(sequence,residues,size)
+ print(v)
+ for i in v:
+ print(i)
+
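For reference when reading storage_dict and shrink_results above: the hits dictionary they build has the shape sketched below. The values mirror the first SAR-endolysin hit in the bundled test data, and the nine-element inner list follows the append order in storage_dict; treat it as an illustration rather than exact program output.

hits = {
    "SAR-endolysin": {                   # keyed by protein name
        "description": "SAR-endolysin",
        "sequence": "MAGIPKKLKAAL...",   # full protein sequence (truncated here)
        "size": "171",                   # protein length, stored as a string
        "biggest_sar": 17,               # largest window size with a hit (set in shrink_results)
        "TMD_17": [                      # one entry per matching window of that size
            ["KAALLAVTIAGGGVGGY",        # sar_seq
             "MAGIPKKL",                 # charge_seq (N-terminal stretch before the window)
             2,                          # charge (net N-terminal charge)
             [("S", 0.0), ("G", 29.41), ("A", 23.53), ("T", 5.88)],  # perc_cont
             "0..7",                     # nterm_coords (0-based)
             "8..25",                    # sar_coords (0-based)
             "26..171",                  # cterm_coords
             8,                          # sar_start (0-based index)
             25],                        # sar_end
        ],
        # ...plus TMD_16 / TMD_15 lists for the smaller window sizes.
    },
}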
diff -r 000000000000 -r 9f62910edcc9 cpt_sar_finder/biopython_parsing.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/cpt_sar_finder/biopython_parsing.py Fri Jun 17 13:15:55 2022 +0000
@@ -0,0 +1,24 @@
+#!/usr/bin/env python
+# Biopython parsing module. Used in conjunction with the SAR_finder script and, potentially, future scripts down the line.
+
+from Bio import SeqIO
+
+class FASTA_parser:
+ """ Parses multi fasta file, and zips together header with sequence """
+
+ def __init__(self, fa):
+ self.fa = fa
+
+ def multifasta_dict(self):
+ """ parses the input multi fasta, and puts results into dictionary """
+
+ return SeqIO.to_dict(SeqIO.parse(self.fa,"fasta"))
+
+
+if __name__ == "__main__":
+ fa_file = "test-data/mu-proteins.fa"
+ d = FASTA_parser(fa_file).multifasta_dict()
+ print(d)
+ for k, v in d.items():
+ print(v.description)
+
diff -r 000000000000 -r 9f62910edcc9 cpt_sar_finder/cpt-macros.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/cpt_sar_finder/cpt-macros.xml Fri Jun 17 13:15:55 2022 +0000
@@ -0,0 +1,115 @@
+
+
+
+
+ python
+ biopython
+ requests
+
+
+
+
+
+
+
+ 10.1371/journal.pcbi.1008214
+ @unpublished{galaxyTools,
+ author = {E. Mijalis, H. Rasche},
+ title = {CPT Galaxy Tools},
+ year = {2013-2017},
+ note = {https://github.com/tamu-cpt/galaxy-tools/}
+ }
+
+
+
+
+ 10.1371/journal.pcbi.1008214
+
+ @unpublished{galaxyTools,
+ author = {E. Mijalis, H. Rasche},
+ title = {CPT Galaxy Tools},
+ year = {2013-2017},
+ note = {https://github.com/tamu-cpt/galaxy-tools/}
+ }
+
+
+
+
+
+
+ 10.1371/journal.pcbi.1008214
+
+ @unpublished{galaxyTools,
+ author = {C. Ross},
+ title = {CPT Galaxy Tools},
+ year = {2020-},
+ note = {https://github.com/tamu-cpt/galaxy-tools/}
+ }
+
+
+
+
+
+
+ 10.1371/journal.pcbi.1008214
+
+ @unpublished{galaxyTools,
+ author = {E. Mijalis, H. Rasche},
+ title = {CPT Galaxy Tools},
+ year = {2013-2017},
+ note = {https://github.com/tamu-cpt/galaxy-tools/}
+ }
+
+
+ @unpublished{galaxyTools,
+ author = {A. Criscione},
+ title = {CPT Galaxy Tools},
+ year = {2019-2021},
+ note = {https://github.com/tamu-cpt/galaxy-tools/}
+ }
+
+
+
+
+
+
+ 10.1371/journal.pcbi.1008214
+
+ @unpublished{galaxyTools,
+ author = {A. Criscione},
+ title = {CPT Galaxy Tools},
+ year = {2019-2021},
+ note = {https://github.com/tamu-cpt/galaxy-tools/}
+ }
+
+
+
+
+
+
+ 10.1371/journal.pcbi.1008214
+
+ @unpublished{galaxyTools,
+ author = {C. Maughmer},
+ title = {CPT Galaxy Tools},
+ year = {2017-2020},
+ note = {https://github.com/tamu-cpt/galaxy-tools/}
+ }
+
+
+
+
+
+
+ @unpublished{galaxyTools,
+ author = {C. Maughmer},
+ title = {CPT Galaxy Tools},
+ year = {2017-2020},
+ note = {https://github.com/tamu-cpt/galaxy-tools/}
+ }
+
+
+
+
diff -r 000000000000 -r 9f62910edcc9 cpt_sar_finder/file_operations.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/cpt_sar_finder/file_operations.py Fri Jun 17 13:15:55 2022 +0000
@@ -0,0 +1,98 @@
+
+def fasta_from_SAR_dict(sar_dict,fa_file):
+ """ makes a multi fasta with candidates from SAR dictionary """
+ with fa_file as f:
+ for data in sar_dict.values():
+ f.writelines(">{}\n".format(data["description"]))
+ f.writelines("{}\n".format(data["sequence"]))
+
+def gff3_from_SAR_dict(sar_dict,gff3_file):
+ """ make a multi gff3 with candidates from SAR dictionary """
+ gff3_cols = ["Seqid","Source","Type","Start","End","Score","Strand","Phase","Attributes"]
+ with gff3_file as f:
+ f.writelines(f"{gff3_cols[0]}\t{gff3_cols[1]}\t{gff3_cols[2]}\t{gff3_cols[3]}\t{gff3_cols[4]}\t{gff3_cols[5]}\t{gff3_cols[6]}\t{gff3_cols[7]}\t{gff3_cols[8]}\n")
+ if sar_dict:
+ #print(sar_dict)
+ for name, data in sar_dict.items():
+ min_idx = 0
+ f.writelines("##gff-version 3\n")
+ f.writelines(f"##sequence-region {name}\n")
+ n_start, n_end = split_seq_string(data["TMD_"+str(data["biggest_sar"])][min_idx][4])
+ sar_start, sar_end = split_seq_string(data["TMD_"+str(data["biggest_sar"])][min_idx][5])
+ c_start, c_end = split_seq_string(data["TMD_"+str(data["biggest_sar"])][min_idx][6])
+ f.writelines(f'{name}\tSAR_finder\tTopological domain\t{n_start}\t{n_end}\t.\t.\t.\tNote=N-terminal net charge is {data["TMD_"+str(data["biggest_sar"])][min_idx][2]}\n')
+ f.writelines(f'{name}\tSAR_finder\tSAR domain\t{sar_start}\t{sar_end}\t.\t.\t.\tNote=residue % in SAR {[perc for perc in data["TMD_"+str(data["biggest_sar"])][min_idx][3]]},Total % is {round(sum(j for i,j in data["TMD_"+str(data["biggest_sar"])][min_idx][3]),2)}\n')
+ f.writelines(f'{name}\tSAR_finder\tTopological domain\t{c_start}\t{c_end}\t.\t.\t.\tNote=C-terminus\n')
+ else:
+ f.writelines("##gff-version 3\n")
+ f.writelines(f"##sequence-region\n")
+
+
+def tab_from_SAR_dict(sar_dict,stat_file,hydrophillic_res, sar_min, sar_max):
+ """ convert SAR dict to a dataframe """
+ columns = ["Name","Protein Sequence","Protein Length","SAR Length","SAR Start","Putative SAR Sequence","SAR End",[f"{res}%" for res in hydrophillic_res],"% Total","N-term Sequence","N-term net Charge"] # using different residues for percent calc: [f"{res}%" for res in hydrophillic_res]
+ with stat_file as f:
+ f.writelines(f"{columns[0]}\t{columns[1]}\t{columns[2]}\t{columns[3]}\t{columns[4]}\t{columns[5]}\t{columns[6]}\t{columns[7]}\t{columns[8]}\t{columns[9]}\t{columns[10]}\n")
+ if sar_dict:
+ #print(sar_dict)
+ for name, data in sar_dict.items():
+ for tmd_size in range(sar_max, sar_min-1, -1):
+ if "TMD_"+str(tmd_size) in data:
+ for each_match in data["TMD_"+str(tmd_size)]:
+ if each_match != [""]:
+ #print(f"{name} - {data}")
+ #print(each_match)
+ #for perc in each_match[3]:
+ # print(perc)
+ try:
+ f.writelines(f'{name}\t{data["sequence"]}\t{data["size"]}\t{tmd_size}\t{int(each_match[7])+1}\t{each_match[0]}\t{int(each_match[8])+1}\t{[perc for perc in each_match[3]]}\t{round(sum(j for i,j in each_match[3]),2)}\t{each_match[1]}\t{each_match[2]}\n')
+ except IndexError:
+ f.writelines(f'ERROR\tERROR\tERROR\tERROR\tERROR\tERROR\tERROR\tERROR\tERROR\tERROR\tERROR\n')
+ else:
+ continue
+
+def stat_file_from_SAR_dict(sar_dict, stat_file, sar_min, sar_max):
+ """ summary statistics from SAR finder function """
+ with stat_file as f:
+ f.writelines("..........:::::: Candidate SAR Proteins ::::::..........\n\n")
+ if sar_dict:
+ for data in sar_dict.values():
+ f.writelines("Protein Description and Name: {}\n".format(data["description"]))
+ f.writelines("Protein Sequence: {}\n".format(data["sequence"]))
+ f.writelines("Protein Length: {}\n".format(data["size"]))
+ f.writelines("SAR Criteria matching region(s)\n")
+ for tmd_size in range(sar_max, sar_min-1, -1):
+ if "TMD_"+str(tmd_size) in data:
+ f.writelines("\nSAR length of {}:\n".format(tmd_size))
+ for each_match in data["TMD_"+str(tmd_size)]:
+ if each_match != ['']:
+ f.writelines("\nPotential SAR domain sequence: {}\n".format(each_match[0]))
+ f.writelines("N-term sequence: {}\n".format(each_match[1]))
+ f.writelines("N-term net charge: {}\n".format(each_match[2]))
+ for each_perc_calc in each_match[3]:
+ f.writelines("Percent {} content: {}%\n".format(each_perc_calc[0],each_perc_calc[1]))
+ f.writelines("N-term coords: {}\n".format(each_match[4]))
+ f.writelines("SAR coords: {}\n".format(each_match[5]))
+ f.writelines("C-term coords: {}\n".format(each_match[6]))
+ f.writelines("SAR start: {}\n".format(each_match[7]))
+ else:
+ continue
+ f.writelines("========================================================\n\n")
+ else:
+ f.writelines("No candidate SAR Proteins found")
+
+def split_seq_string(input_range, python_indexing=True):
+ """ splits a #..# sequence into the two respective starts and ends, if python indexing, adds 1, otherwise keeps """
+ if python_indexing:
+ values = input_range.split("..")
+ start = int(values[0]) + 1
+ end = int(values[1]) + 1
+ else:
+ values = input_range.split("..")
+ start = values[0]
+ end = values[1]
+
+ return start, end
+
+if __name__ == "__main__":
+ pass
\ No newline at end of file
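A small worked example of split_seq_string above, which gff3_from_SAR_dict uses to convert the 0-based "start..end" strings built in rep_funcs into the 1-based coordinates written to the GFF3 output (a sketch; run from the cpt_sar_finder directory):

from file_operations import split_seq_string

# 0-based "8..25" from rep_funcs becomes 1-based (9, 26) for reporting.
print(split_seq_string("8..25"))                         # -> (9, 26)

# With python_indexing=False the values are returned unchanged, as strings.
print(split_seq_string("8..25", python_indexing=False))  # -> ('8', '25')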
diff -r 000000000000 -r 9f62910edcc9 cpt_sar_finder/macros.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/cpt_sar_finder/macros.xml Fri Jun 17 13:15:55 2022 +0000
@@ -0,0 +1,15 @@
+
+
+
+
+ python
+ biopython
+
+
+
+
+
+
+
\ No newline at end of file
diff -r 000000000000 -r 9f62910edcc9 cpt_sar_finder/test-data/candidate_SAR.fa
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/cpt_sar_finder/test-data/candidate_SAR.fa Fri Jun 17 13:15:55 2022 +0000
@@ -0,0 +1,2 @@
+>SAR-endolysin
+MAGIPKKLKAALLAVTIAGGGVGGYQEMTRQSLIHLENIAYMPYRDIAGVLTVCVGHTGPDIEMRRYSHAECMALLDSDLKPVYAAIDRLVRVPLTPYQKTALATFIFNTGVTAFSKSTLLKKLNAGDYAGARDQMARWVFAAGHKWKGLMNRREVEMAIWNIRGADDLRQ
diff -r 000000000000 -r 9f62910edcc9 cpt_sar_finder/test-data/candidate_SAR.gff3
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/cpt_sar_finder/test-data/candidate_SAR.gff3 Fri Jun 17 13:15:55 2022 +0000
@@ -0,0 +1,6 @@
+Seqid Source Type Start End Score Strand Phase Attributes
+##gff-version 3
+##sequence-region SAR-endolysin
+SAR-endolysin SAR_finder Topological domain 1 8 . . . Note=N-terminal net charge is 2
+SAR-endolysin SAR_finder SAR domain 9 26 . . . Note=residue % in SAR [('S', 0.0), ('G', 29.41), ('A', 23.53), ('T', 5.88)],Total % is 58.82
+SAR-endolysin SAR_finder Topological domain 27 172 . . . Note=C-terminus
diff -r 000000000000 -r 9f62910edcc9 cpt_sar_finder/test-data/candidate_SAR_stats.tsv
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/cpt_sar_finder/test-data/candidate_SAR_stats.tsv Fri Jun 17 13:15:55 2022 +0000
@@ -0,0 +1,6 @@
+Name Protein Sequence Protein Length SAR Length SAR Start Putative SAR Sequence SAR End ['S%', 'G%', 'A%', 'T%'] % Total N-term Sequence N-term net Charge
+SAR-endolysin MAGIPKKLKAALLAVTIAGGGVGGYQEMTRQSLIHLENIAYMPYRDIAGVLTVCVGHTGPDIEMRRYSHAECMALLDSDLKPVYAAIDRLVRVPLTPYQKTALATFIFNTGVTAFSKSTLLKKLNAGDYAGARDQMARWVFAAGHKWKGLMNRREVEMAIWNIRGADDLRQ 171 17 9 KAALLAVTIAGGGVGGY 26 [('S', 0.0), ('G', 29.41), ('A', 23.53), ('T', 5.88)] 58.82 MAGIPKKL 2
+SAR-endolysin MAGIPKKLKAALLAVTIAGGGVGGYQEMTRQSLIHLENIAYMPYRDIAGVLTVCVGHTGPDIEMRRYSHAECMALLDSDLKPVYAAIDRLVRVPLTPYQKTALATFIFNTGVTAFSKSTLLKKLNAGDYAGARDQMARWVFAAGHKWKGLMNRREVEMAIWNIRGADDLRQ 171 16 10 AALLAVTIAGGGVGGY 26 [('S', 0.0), ('G', 31.25), ('A', 25.0), ('T', 6.25)] 62.5 MAGIPKKLK 3
+SAR-endolysin MAGIPKKLKAALLAVTIAGGGVGGYQEMTRQSLIHLENIAYMPYRDIAGVLTVCVGHTGPDIEMRRYSHAECMALLDSDLKPVYAAIDRLVRVPLTPYQKTALATFIFNTGVTAFSKSTLLKKLNAGDYAGARDQMARWVFAAGHKWKGLMNRREVEMAIWNIRGADDLRQ 171 15 9 KAALLAVTIAGGGVG 24 [('S', 0.0), ('G', 26.67), ('A', 26.67), ('T', 6.67)] 60.01 MAGIPKKL 2
+SAR-endolysin MAGIPKKLKAALLAVTIAGGGVGGYQEMTRQSLIHLENIAYMPYRDIAGVLTVCVGHTGPDIEMRRYSHAECMALLDSDLKPVYAAIDRLVRVPLTPYQKTALATFIFNTGVTAFSKSTLLKKLNAGDYAGARDQMARWVFAAGHKWKGLMNRREVEMAIWNIRGADDLRQ 171 15 10 AALLAVTIAGGGVGG 25 [('S', 0.0), ('G', 33.33), ('A', 26.67), ('T', 6.67)] 66.67 MAGIPKKLK 3
+SAR-endolysin MAGIPKKLKAALLAVTIAGGGVGGYQEMTRQSLIHLENIAYMPYRDIAGVLTVCVGHTGPDIEMRRYSHAECMALLDSDLKPVYAAIDRLVRVPLTPYQKTALATFIFNTGVTAFSKSTLLKKLNAGDYAGARDQMARWVFAAGHKWKGLMNRREVEMAIWNIRGADDLRQ 171 15 11 ALLAVTIAGGGVGGY 26 [('S', 0.0), ('G', 33.33), ('A', 20.0), ('T', 6.67)] 60.0 MAGIPKKLKA 3
diff -r 000000000000 -r 9f62910edcc9 cpt_sar_finder/test-data/simple-proteins.fa
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/cpt_sar_finder/test-data/simple-proteins.fa Fri Jun 17 13:15:55 2022 +0000
@@ -0,0 +1,7 @@
+>SAR-endolysin
+MAGIPKKLKAALLAVTIAGGGVGGYQEMTRQSLIHLENIAYMPYRDIAGVLTVCVGHTGP
+DIEMRRYSHAECMALLDSDLKPVYAAIDRLVRVPLTPYQKTALATFIFNTGVTAFSKSTL
+LKKLNAGDYAGARDQMARWVFAAGHKWKGLMNRREVEMAIWNIRGADDLRQ
+>CPT_NC_000929.1_038
+MLKIKPAAGKAIRDPLTMKLLASEGEEKPRNSFWIRRLAAGDVVEVGSTENTADDTDAAP
+KKRSKSK
\ No newline at end of file