# HG changeset patch
# User cpt
# Date 1685933577 0
# Node ID 1127518233232e2f0a7a3278e30a9935fc833f73
# Parent 9f62910edcc9e8d673fb092d6c24ba3e3eef4a83
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
diff -r 9f62910edcc9 -r 112751823323 SAR_finder.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/SAR_finder.py Mon Jun 05 02:52:57 2023 +0000
@@ -0,0 +1,76 @@
+import sys
+import argparse
+import os
+import re
+from biopython_parsing import FASTA_parser
+from file_operations import fasta_from_SAR_dict, gff3_from_SAR_dict, tab_from_SAR_dict
+from SAR_functions import CheckSequence
+
+if __name__ == "__main__":
+ parser = argparse.ArgumentParser(description="SAR Finder")
+
+ parser.add_argument(
+ "fa", type=argparse.FileType("r"), help="organism's multi fasta file"
+ )
+
+ parser.add_argument(
+ "--min", type=int, default=20, help="minimum size of candidate peptide"
+ )
+
+ parser.add_argument(
+ "--max", type=int, default=200, help="maximum size of candidate peptide"
+ )
+
+ parser.add_argument(
+ "--sar_min",
+ type=int,
+ default=15,
+ help="minimum size of candidate peptide TMD domain",
+ )
+
+ parser.add_argument(
+ "--sar_max",
+ type=int,
+ default=24,
+ help="maximum size of candidate peptide TMD domain",
+ )
+
+ parser.add_argument(
+ "--out_fa",
+ type=argparse.FileType("w"),
+ help="multifasta output of candidate SAR proteins",
+ default="candidate_SAR.fa",
+ )
+
+ parser.add_argument(
+ "--out_stat",
+ type=argparse.FileType("w"),
+ help="summary statistic file for candidate SAR proteins, tab separated",
+ default="candidate_SAR_stats.tsv",
+ )
+
+ parser.add_argument(
+ "--out_gff3",
+ type=argparse.FileType("w"),
+ help="multigff3 file for candidate SAR proteins",
+ default="candidate_SAR.gff3",
+ )
+
+ args = parser.parse_args()
+
+ fa_dict = FASTA_parser(fa=args.fa).multifasta_dict()
+
+ sars = {}
+
+ for protein_name, protein_data in fa_dict.items():
+ sar = CheckSequence(protein_name, protein_data)
+ # sar.check_sizes(min=args.min,max=args.max)
+ hydros = sar.shrink_results(sar_min=args.sar_min, sar_max=args.sar_max)
+ sars.update(hydros)
+
+ gff3_from_SAR_dict(sars, args.out_gff3)
+ tab_from_SAR_dict(
+ sars, args.out_stat, "SGAT", sar_min=args.sar_min, sar_max=args.sar_max
+ )
+ fasta_from_SAR_dict(sars, args.out_fa)
+ # stat_file_from_SAR_dict(sars,args.out_stat,sar_min=args.sar_min,sar_max=args.sar_max) # fix this whenever ready.
diff -r 9f62910edcc9 -r 112751823323 SAR_finder.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/SAR_finder.xml Mon Jun 05 02:52:57 2023 +0000
@@ -0,0 +1,73 @@
+
+ SAR Domain Finder
+
+ macros.xml
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ 50-60% Gly/Ala/Ser/Thr content. SAR endolysins are expected to have a net positive
+charge on the N-terminus by the positive-inside rule.
+
+**INPUT** --> Protein Multi FASTA
+
+**OUTPUT** -->
+
+* Multi FASTA with candidate proteins that pass the SAR domain criteria
+
+* Tabular summary file that lists every subdomain fitting the criteria for each potential SAR domain-containing protein with the following: protein name/sequence/length, SAR length/start/sequence/end, individual and total GAST% content in SAR, and N-terminal sequence/net charge
+
+* Multi GFF3 for unique candidate SAR domain-containing proteins
+
+ ]]>
+
+ 10.1371/journal.pcbi.1008214
+ https://dx.doi.org/10.1016/bs.aivir.2018.09.003
+
+ @unpublished{galaxyTools,
+ author = {C. Ross},
+ title = {CPT Galaxy Tools},
+ year = {2020-},
+ note = {https://github.com/tamu-cpt/galaxy-tools/}
+ }
+
+
+
diff -r 9f62910edcc9 -r 112751823323 SAR_functions.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/SAR_functions.py Mon Jun 05 02:52:57 2023 +0000
@@ -0,0 +1,348 @@
+#!/usr/bin/env python
+
+import sys
+import argparse
+import os
+import re
+from Bio import SeqIO
+
+
+class CheckSequence:
+ """
+ SAR endolysin Verification class, which starts with complete FA file, and is shrunk by each function to reveal best candidates of SAR endolysin proteins
+ """
+
+ def __init__(self, protein_name, protein_data):
+ self.name = protein_name
+ self.seq = protein_data.seq
+ self.description = protein_data.description
+ self.size = len(self.seq)
+ self.store = {}
+
+ def check_sizes(self, min, max):
+ """check the minimum and maximum peptide lengths"""
+ if self.size < min:
+ print("too small")
+ elif self.size > max:
+ print("too large")
+ else:
+ print(f"{self.name} : {self.seq}")
+ return True
+
+ def check_hydrophobicity_and_charge(
+ self, sar_min=15, sar_max=20, perc_residues="SGAT"
+ ):
+ """verifies the existence of a hydrophobic region within the sequence"""
+ hydrophobic_residues = "['FIWLVMYCATGSP']" # fed through regex
+ hits = self.store
+ pos_res = "RK"
+ neg_res = "DE"
+
+ if self.size > 50:
+ seq = self.seq[0:50]
+ else:
+ seq = self.seq
+ for sar_size in range(sar_min, sar_max, 1):
+ for i in range(0, len(seq) - sar_size, 1):
+ sar_seq = str(seq[i : i + sar_size])
+ if re.search(
+ (hydrophobic_residues + "{" + str(sar_size) + "}"), sar_seq
+ ):
+ (
+ charge_seq,
+ charge,
+ perc_cont,
+ sar_coords,
+ nterm_coords,
+ cterm_coords,
+ sar_start,
+ sar_end,
+ ) = rep_funcs(
+ self, seq, i, pos_res, neg_res, sar_seq, perc_residues, sar_size
+ )
+ storage_dict(
+ self=self,
+ sar_size=sar_size,
+ sar_seq=sar_seq,
+ hits=hits,
+ charge_seq=charge_seq,
+ charge=charge,
+ perc_cont=perc_cont,
+ nterm_coords=nterm_coords,
+ sar_coords=sar_coords,
+ cterm_coords=cterm_coords,
+ sar_start=sar_start,
+ sar_end=sar_end,
+ )
+ # print("TMDSIZE: {}\tINDEX: {}".format(sar_size,i+1))
+ elif "K" in sar_seq[0] and re.search(
+ (hydrophobic_residues + "{" + str(sar_size - 1) + "}"), sar_seq[1:]
+ ): # check frontend snorkels
+ (
+ charge_seq,
+ charge,
+ perc_cont,
+ sar_coords,
+ nterm_coords,
+ cterm_coords,
+ sar_start,
+ sar_end,
+ ) = rep_funcs(
+ self, seq, i, pos_res, neg_res, sar_seq, perc_residues, sar_size
+ )
+ storage_dict(
+ self=self,
+ sar_size=sar_size,
+ sar_seq=sar_seq,
+ hits=hits,
+ charge_seq=charge_seq,
+ charge=charge,
+ perc_cont=perc_cont,
+ nterm_coords=nterm_coords,
+ sar_coords=sar_coords,
+ cterm_coords=cterm_coords,
+ sar_start=sar_start,
+ sar_end=sar_end,
+ )
+ # print("TMDSIZE: {}\tINDEX: {}".format(sar_size,i+1))
+ elif "K" in sar_seq[-1] and re.search(
+ (hydrophobic_residues + "{" + str(sar_size - 1) + "}"), sar_seq[:-1]
+ ): # check backend snorkels
+ (
+ charge_seq,
+ charge,
+ perc_cont,
+ sar_coords,
+ nterm_coords,
+ cterm_coords,
+ sar_start,
+ sar_end,
+ ) = rep_funcs(
+ self, seq, i, pos_res, neg_res, sar_seq, perc_residues, sar_size
+ )
+ storage_dict(
+ self=self,
+ sar_size=sar_size,
+ sar_seq=sar_seq,
+ hits=hits,
+ charge_seq=charge_seq,
+ charge=charge,
+ perc_cont=perc_cont,
+ nterm_coords=nterm_coords,
+ sar_coords=sar_coords,
+ cterm_coords=cterm_coords,
+ sar_start=sar_start,
+ sar_end=sar_end,
+ )
+ # print("TMDSIZE: {}\tINDEX: {}".format(sar_size,i+1))
+ continue
+
+ return hits
+
+ def shrink_results(self, sar_min=15, sar_max=20, perc_residues="SGAT"):
+ """removes repetiive hits, keeps only the shortest and longest of each SAR domain"""
+ compare_candidates = {}
+ hits = self.check_hydrophobicity_and_charge(sar_min=sar_min, sar_max=sar_max)
+ for sar_name, data in hits.items():
+ # print(sar_name)
+ compare_candidates[sar_name] = {}
+ # print("\nThese are the values: {}".format(v))
+ # count_of_times = 0
+ tmd_log = []
+ for sar_size in range(sar_max, sar_min - 1, -1):
+ if "TMD_" + str(sar_size) in data:
+ tmd_log.append(sar_size)
+ # print(tmd_log)
+ for idx, the_data in enumerate(data["TMD_" + str(sar_size)]):
+ # print(the_data[7])
+ # print(the_data)
+ # print(f"This is the index: {idx}")
+ # print(f"This is the list of data at this index: {the_data}")
+ if (
+ the_data[7] in compare_candidates[sar_name]
+ ): # index to start
+ compare_candidates[sar_name][the_data[7]]["count"] += 1
+ compare_candidates[sar_name][the_data[7]]["size"].append(
+ sar_size
+ )
+ compare_candidates[sar_name][the_data[7]]["index"].append(
+ idx
+ )
+ else:
+ compare_candidates[sar_name][the_data[7]] = {}
+ compare_candidates[sar_name][the_data[7]]["count"] = 1
+ compare_candidates[sar_name][the_data[7]]["size"] = [
+ sar_size
+ ]
+ compare_candidates[sar_name][the_data[7]]["index"] = [idx]
+ hits[sar_name]["biggest_sar"] = tmd_log[0]
+ for sar_name, compare_data in compare_candidates.items():
+ for data in compare_data.values():
+ if len(data["size"]) >= 3:
+ # print(f"{each_size} --> {data}")
+ minmax = [min(data["size"]), max(data["size"])]
+ nonminmax = [x for x in data["size"] if x not in minmax]
+ nonminmax_index = []
+ for each_nonminmax in nonminmax:
+ v = data["size"].index(each_nonminmax)
+ x = data["index"][v]
+ nonminmax_index.append(x)
+ nons = zip(nonminmax, nonminmax_index)
+ for value in nons:
+ # hits[sar_name]["TMD_"+str(value[0])] = hits[sar_name]["TMD_"+str(value[0])].pop(value[1])
+ hits[sar_name]["TMD_" + str(value[0])][value[1]] = [""]
+
+ return hits
+
+
+def rep_funcs(self, seq, loc, pos_res, neg_res, sar_seq, perc_residues, sar_size):
+ """run a set of functions together before sending the results to the storage dictionary"""
+
+ charge_seq = str(seq[:loc])
+ charge = charge_check(charge_seq, pos_res, neg_res)
+ perc_cont = percent_calc(sar_seq, perc_residues, int(sar_size))
+ sar_start = loc
+ sar_end = loc + sar_size
+ sar_coords = "{}..{}".format(loc, loc + sar_size)
+ nterm_coords = "{}..{}".format("0", loc - 1)
+ cterm_coords = "{}..{}".format(loc + sar_size + 1, self.size)
+
+ return (
+ charge_seq,
+ charge,
+ perc_cont,
+ sar_coords,
+ nterm_coords,
+ cterm_coords,
+ sar_start,
+ sar_end,
+ )
+
+
+### Extra "helper" functions
+def storage_dict(
+ self,
+ sar_size,
+ sar_seq,
+ hits,
+ charge_seq,
+ charge,
+ perc_cont,
+ nterm_coords,
+ sar_coords,
+ cterm_coords,
+ sar_start,
+ sar_end,
+): # probably not good to call "self" a param here...definitley not PEP approved...
+ """organize dictionary for hydrophobicity check"""
+ if self.name not in hits:
+ hits[self.name] = {}
+ hits[self.name]["description"] = str(self.description)
+ hits[self.name]["sequence"] = str(self.seq)
+ hits[self.name]["size"] = str(self.size)
+ # GAcont = str((str(self.seq).count("G")+str(self.seq).count("A"))/int(self.size)*100)
+ # hits[self.name]["GAcont"] = "{:.2f}%".format(float(GAcont))
+ if "TMD_" + str(sar_size) not in hits[self.name]:
+ hits[self.name]["TMD_" + str(sar_size)] = []
+ hits[self.name]["TMD_" + str(sar_size)].append(
+ [
+ sar_seq,
+ charge_seq,
+ charge,
+ perc_cont,
+ nterm_coords,
+ sar_coords,
+ cterm_coords,
+ sar_start,
+ sar_end,
+ ]
+ )
+ else:
+ hits[self.name]["TMD_" + str(sar_size)].append(
+ [
+ sar_seq,
+ charge_seq,
+ charge,
+ perc_cont,
+ nterm_coords,
+ sar_coords,
+ cterm_coords,
+ sar_start,
+ sar_end,
+ ]
+ )
+ else:
+ if "TMD_" + str(sar_size) not in hits[self.name]:
+ hits[self.name]["TMD_" + str(sar_size)] = []
+ hits[self.name]["TMD_" + str(sar_size)].append(
+ [
+ sar_seq,
+ charge_seq,
+ charge,
+ perc_cont,
+ nterm_coords,
+ sar_coords,
+ cterm_coords,
+ sar_start,
+ sar_end,
+ ]
+ )
+ else:
+ hits[self.name]["TMD_" + str(sar_size)].append(
+ [
+ sar_seq,
+ charge_seq,
+ charge,
+ perc_cont,
+ nterm_coords,
+ sar_coords,
+ cterm_coords,
+ sar_start,
+ sar_end,
+ ]
+ )
+
+
+def percent_calc(sequence, residues, size):
+ """Calculate the percent of a set of residues within an input sequence"""
+ counted = {}
+ for aa in sequence:
+ # print(aa)
+ if aa in counted:
+ counted[aa] += 1
+ else:
+ counted[aa] = 1
+ residue_amt = 0
+ my_ratios = []
+ for res_of_interest in residues:
+ try:
+ residue_amt = counted[res_of_interest]
+ except KeyError:
+ residue_amt = 0
+ ratio = residue_amt / size
+ my_ratios.append((round(ratio * 100, 2)))
+
+ res_rat = list(zip(residues, my_ratios))
+
+ return res_rat
+
+
+def charge_check(charge_seq, pos_res, neg_res):
+ charge = 0
+ for aa in charge_seq:
+ if aa in pos_res:
+ charge += 1
+ if aa in neg_res:
+ charge -= 1
+ return charge
+
+
+if __name__ == "__main__":
+ sequence = "MAGBYYYTRLCVRKLRKGGGHP"
+ residues = "YL"
+ size = len(sequence)
+ print(size)
+ v = percent_calc(sequence, residues, size)
+ print(v)
+ for i in v:
+ print(i)
diff -r 9f62910edcc9 -r 112751823323 biopython_parsing.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/biopython_parsing.py Mon Jun 05 02:52:57 2023 +0000
@@ -0,0 +1,24 @@
+#!/usr/bin/env python
+# Biopython parsing module. Uses in conjunction with the sar_finder script, and potential future scripts down the line.
+
+from Bio import SeqIO
+
+
+class FASTA_parser:
+ """Parses multi fasta file, and zips together header with sequence"""
+
+ def __init__(self, fa):
+ self.fa = fa
+
+ def multifasta_dict(self):
+ """parses the input multi fasta, and puts results into dictionary"""
+
+ return SeqIO.to_dict(SeqIO.parse(self.fa, "fasta"))
+
+
+if __name__ == "__main__":
+ fa_file = "test-data/mu-proteins.fa"
+ d = FASTA_parser(fa_file).multifasta_dict()
+ print(d)
+ for k, v in d.items():
+ print(v.description)
diff -r 9f62910edcc9 -r 112751823323 cpt-macros.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/cpt-macros.xml Mon Jun 05 02:52:57 2023 +0000
@@ -0,0 +1,115 @@
+
+
+
+ python
+ biopython
+ requests
+ cpt_gffparser
+
+
+
+
+
+
+
+ 10.1371/journal.pcbi.1008214
+ @unpublished{galaxyTools,
+ author = {E. Mijalis, H. Rasche},
+ title = {CPT Galaxy Tools},
+ year = {2013-2017},
+ note = {https://github.com/tamu-cpt/galaxy-tools/}
+ }
+
+
+
+
+ 10.1371/journal.pcbi.1008214
+
+ @unpublished{galaxyTools,
+ author = {E. Mijalis, H. Rasche},
+ title = {CPT Galaxy Tools},
+ year = {2013-2017},
+ note = {https://github.com/tamu-cpt/galaxy-tools/}
+ }
+
+
+
+
+
+
+ 10.1371/journal.pcbi.1008214
+
+ @unpublished{galaxyTools,
+ author = {C. Ross},
+ title = {CPT Galaxy Tools},
+ year = {2020-},
+ note = {https://github.com/tamu-cpt/galaxy-tools/}
+ }
+
+
+
+
+
+
+ 10.1371/journal.pcbi.1008214
+
+ @unpublished{galaxyTools,
+ author = {E. Mijalis, H. Rasche},
+ title = {CPT Galaxy Tools},
+ year = {2013-2017},
+ note = {https://github.com/tamu-cpt/galaxy-tools/}
+ }
+
+
+ @unpublished{galaxyTools,
+ author = {A. Criscione},
+ title = {CPT Galaxy Tools},
+ year = {2019-2021},
+ note = {https://github.com/tamu-cpt/galaxy-tools/}
+ }
+
+
+
+
+
+
+ 10.1371/journal.pcbi.1008214
+
+ @unpublished{galaxyTools,
+ author = {A. Criscione},
+ title = {CPT Galaxy Tools},
+ year = {2019-2021},
+ note = {https://github.com/tamu-cpt/galaxy-tools/}
+ }
+
+
+
+
+
+
+ 10.1371/journal.pcbi.1008214
+
+ @unpublished{galaxyTools,
+ author = {C. Maughmer},
+ title = {CPT Galaxy Tools},
+ year = {2017-2020},
+ note = {https://github.com/tamu-cpt/galaxy-tools/}
+ }
+
+
+
+
+
+
+ @unpublished{galaxyTools,
+ author = {C. Maughmer},
+ title = {CPT Galaxy Tools},
+ year = {2017-2020},
+ note = {https://github.com/tamu-cpt/galaxy-tools/}
+ }
+
+
+
+
diff -r 9f62910edcc9 -r 112751823323 cpt_sar_finder/SAR_finder.py
--- a/cpt_sar_finder/SAR_finder.py Fri Jun 17 13:15:55 2022 +0000
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,44 +0,0 @@
-import sys
-import argparse
-import os
-import re
-from biopython_parsing import FASTA_parser
-from file_operations import fasta_from_SAR_dict, gff3_from_SAR_dict, tab_from_SAR_dict
-from SAR_functions import CheckSequence
-
-if __name__ == "__main__":
- parser = argparse.ArgumentParser(description="SAR Finder")
-
- parser.add_argument("fa",type=argparse.FileType("r"),help="organism's multi fasta file")
-
- parser.add_argument("--min",type=int,default=20,help="minimum size of candidate peptide")
-
- parser.add_argument("--max",type=int,default=200,help="maximum size of candidate peptide")
-
- parser.add_argument("--sar_min",type=int,default=15,help="minimum size of candidate peptide TMD domain")
-
- parser.add_argument("--sar_max",type=int,default=24,help="maximum size of candidate peptide TMD domain")
-
- parser.add_argument("--out_fa",type=argparse.FileType("w"),help="multifasta output of candidate SAR proteins",default="candidate_SAR.fa")
-
- parser.add_argument("--out_stat",type=argparse.FileType("w"),help="summary statistic file for candidate SAR proteins, tab separated",default="candidate_SAR_stats.tsv")
-
- parser.add_argument("--out_gff3",type=argparse.FileType("w"),help="multigff3 file for candidate SAR proteins",default="candidate_SAR.gff3")
-
- args = parser.parse_args()
-
- fa_dict = FASTA_parser(fa=args.fa).multifasta_dict()
-
- sars = {}
-
- for protein_name, protein_data in fa_dict.items():
- sar = CheckSequence(protein_name, protein_data)
- #sar.check_sizes(min=args.min,max=args.max)
- hydros = sar.shrink_results(sar_min=args.sar_min, sar_max=args.sar_max)
- sars.update(hydros)
-
-
- gff3_from_SAR_dict(sars, args.out_gff3)
- tab_from_SAR_dict(sars,args.out_stat,"SGAT",sar_min=args.sar_min, sar_max=args.sar_max)
- fasta_from_SAR_dict(sars,args.out_fa)
- #stat_file_from_SAR_dict(sars,args.out_stat,sar_min=args.sar_min,sar_max=args.sar_max) # fix this whenever ready.
\ No newline at end of file
diff -r 9f62910edcc9 -r 112751823323 cpt_sar_finder/SAR_finder.xml
--- a/cpt_sar_finder/SAR_finder.xml Fri Jun 17 13:15:55 2022 +0000
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,73 +0,0 @@
-
- SAR Domain Finder
-
- macros.xml
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
- 50-60% Gly/Ala/Ser/Thr content. SAR endolysins are expected to have a net positive
-charge on the N-terminus by the positive-inside rule.
-
-**INPUT** --> Protein Multi FASTA
-
-**OUTPUT** -->
-
-* Multi FASTA with candidate proteins that pass the SAR domain criteria
-
-* Tabular summary file that lists every subdomain fitting the criteria for each potential SAR domain-containing protein with the following: protein name/sequence/length, SAR length/start/sequence/end, individual and total GAST% content in SAR, and N-terminal sequence/net charge
-
-* Multi GFF3 for unique candidate SAR domain-containing proteins
-
- ]]>
-
- 10.1371/journal.pcbi.1008214
- https://dx.doi.org/10.1016/bs.aivir.2018.09.003
-
- @unpublished{galaxyTools,
- author = {C. Ross},
- title = {CPT Galaxy Tools},
- year = {2020-},
- note = {https://github.com/tamu-cpt/galaxy-tools/}
- }
-
-
-
diff -r 9f62910edcc9 -r 112751823323 cpt_sar_finder/SAR_functions.py
--- a/cpt_sar_finder/SAR_functions.py Fri Jun 17 13:15:55 2022 +0000
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,193 +0,0 @@
-#!/usr/bin/env python
-
-import sys
-import argparse
-import os
-import re
-from Bio import SeqIO
-
-
-class CheckSequence:
- """
- SAR endolysin Verification class, which starts with complete FA file, and is shrunk by each function to reveal best candidates of SAR endolysin proteins
- """
-
-
- def __init__(self, protein_name, protein_data):
- self.name = protein_name
- self.seq = protein_data.seq
- self.description = protein_data.description
- self.size = len(self.seq)
- self.store = {}
-
-
- def check_sizes(self,min,max):
- """ check the minimum and maximum peptide lengths """
- if self.size < min:
- print("too small")
- elif self.size > max:
- print("too large")
- else:
- print(f"{self.name} : {self.seq}")
- return True
-
-
- def check_hydrophobicity_and_charge(self,sar_min=15,sar_max=20,perc_residues="SGAT"):
- """ verifies the existence of a hydrophobic region within the sequence """
- hydrophobic_residues = "['FIWLVMYCATGSP']" # fed through regex
- hits = self.store
- pos_res = "RK"
- neg_res = "DE"
-
- if self.size > 50:
- seq = self.seq[0:50]
- else:
- seq = self.seq
- for sar_size in range(sar_min, sar_max, 1):
- for i in range(0,len(seq)-sar_size,1):
- sar_seq = str(seq[i:i+sar_size])
- if re.search((hydrophobic_residues+"{"+str(sar_size)+"}"),sar_seq):
- charge_seq, charge, perc_cont, sar_coords, nterm_coords, cterm_coords, sar_start, sar_end = rep_funcs(self,seq,i,pos_res,neg_res,sar_seq,perc_residues,sar_size)
- storage_dict(self=self,sar_size=sar_size,sar_seq=sar_seq,hits=hits,charge_seq=charge_seq,charge=charge,perc_cont=perc_cont,nterm_coords=nterm_coords,sar_coords=sar_coords,cterm_coords=cterm_coords,sar_start=sar_start,sar_end=sar_end)
- #print("TMDSIZE: {}\tINDEX: {}".format(sar_size,i+1))
- elif "K" in sar_seq[0] and re.search((hydrophobic_residues+"{"+str(sar_size-1)+"}"),sar_seq[1:]): # check frontend snorkels
- charge_seq, charge, perc_cont, sar_coords, nterm_coords, cterm_coords, sar_start, sar_end = rep_funcs(self,seq,i,pos_res,neg_res,sar_seq,perc_residues,sar_size)
- storage_dict(self=self,sar_size=sar_size,sar_seq=sar_seq,hits=hits,charge_seq=charge_seq,charge=charge,perc_cont=perc_cont,nterm_coords=nterm_coords,sar_coords=sar_coords,cterm_coords=cterm_coords,sar_start=sar_start,sar_end=sar_end)
- #print("TMDSIZE: {}\tINDEX: {}".format(sar_size,i+1))
- elif "K" in sar_seq[-1] and re.search((hydrophobic_residues+"{"+str(sar_size-1)+"}"),sar_seq[:-1]): # check backend snorkels
- charge_seq, charge, perc_cont, sar_coords, nterm_coords, cterm_coords, sar_start, sar_end = rep_funcs(self,seq,i,pos_res,neg_res,sar_seq,perc_residues,sar_size)
- storage_dict(self=self,sar_size=sar_size,sar_seq=sar_seq,hits=hits,charge_seq=charge_seq,charge=charge,perc_cont=perc_cont,nterm_coords=nterm_coords,sar_coords=sar_coords,cterm_coords=cterm_coords,sar_start=sar_start,sar_end=sar_end)
- #print("TMDSIZE: {}\tINDEX: {}".format(sar_size,i+1))
- continue
-
- return hits
-
- def shrink_results(self,sar_min=15,sar_max=20,perc_residues="SGAT"):
- """ removes repetiive hits, keeps only the shortest and longest of each SAR domain """
- compare_candidates = {}
- hits = self.check_hydrophobicity_and_charge(sar_min=sar_min,sar_max=sar_max)
- for sar_name, data in hits.items():
- #print(sar_name)
- compare_candidates[sar_name] = {}
- #print("\nThese are the values: {}".format(v))
- #count_of_times = 0
- tmd_log = []
- for sar_size in range(sar_max,sar_min-1,-1):
- if "TMD_"+str(sar_size) in data:
- tmd_log.append(sar_size)
- #print(tmd_log)
- for idx,the_data in enumerate(data["TMD_"+str(sar_size)]):
- #print(the_data[7])
- #print(the_data)
- #print(f"This is the index: {idx}")
- #print(f"This is the list of data at this index: {the_data}")
- if the_data[7] in compare_candidates[sar_name]: # index to start
- compare_candidates[sar_name][the_data[7]]["count"] += 1
- compare_candidates[sar_name][the_data[7]]["size"].append(sar_size)
- compare_candidates[sar_name][the_data[7]]["index"].append(idx)
- else:
- compare_candidates[sar_name][the_data[7]] = {}
- compare_candidates[sar_name][the_data[7]]["count"] = 1
- compare_candidates[sar_name][the_data[7]]["size"] = [sar_size]
- compare_candidates[sar_name][the_data[7]]["index"] = [idx]
- hits[sar_name]["biggest_sar"] = tmd_log[0]
- for sar_name, compare_data in compare_candidates.items():
- for data in compare_data.values():
- if len(data["size"]) >= 3:
- #print(f"{each_size} --> {data}")
- minmax = [min(data["size"]),max(data["size"])]
- nonminmax = [x for x in data["size"] if x not in minmax]
- nonminmax_index = []
- for each_nonminmax in nonminmax:
- v = data["size"].index(each_nonminmax)
- x = data["index"][v]
- nonminmax_index.append(x)
- nons = zip(nonminmax,nonminmax_index)
- for value in nons:
- #hits[sar_name]["TMD_"+str(value[0])] = hits[sar_name]["TMD_"+str(value[0])].pop(value[1])
- hits[sar_name]["TMD_"+str(value[0])][value[1]] = [""]
-
- return hits
-
-
-def rep_funcs(self,seq,loc,pos_res,neg_res,sar_seq,perc_residues,sar_size):
- """ run a set of functions together before sending the results to the storage dictionary """
-
- charge_seq = str(seq[:loc])
- charge = charge_check(charge_seq,pos_res,neg_res)
- perc_cont = percent_calc(sar_seq,perc_residues,int(sar_size))
- sar_start = loc
- sar_end = loc + sar_size
- sar_coords = "{}..{}".format(loc,loc+sar_size)
- nterm_coords = "{}..{}".format("0",loc-1)
- cterm_coords = "{}..{}".format(loc+sar_size+1,self.size)
-
- return charge_seq, charge, perc_cont, sar_coords, nterm_coords, cterm_coords, sar_start, sar_end
-
-
-### Extra "helper" functions
-def storage_dict(self,sar_size,sar_seq,hits,charge_seq,charge,perc_cont,nterm_coords,sar_coords,cterm_coords,sar_start,sar_end): # probably not good to call "self" a param here...definitley not PEP approved...
- """ organize dictionary for hydrophobicity check """
- if self.name not in hits:
- hits[self.name] = {}
- hits[self.name]["description"] = str(self.description)
- hits[self.name]["sequence"] = str(self.seq)
- hits[self.name]["size"] = str(self.size)
- #GAcont = str((str(self.seq).count("G")+str(self.seq).count("A"))/int(self.size)*100)
- #hits[self.name]["GAcont"] = "{:.2f}%".format(float(GAcont))
- if "TMD_"+str(sar_size) not in hits[self.name]:
- hits[self.name]["TMD_"+str(sar_size)] = []
- hits[self.name]["TMD_"+str(sar_size)].append([sar_seq,charge_seq,charge,perc_cont,nterm_coords,sar_coords,cterm_coords,sar_start,sar_end])
- else:
- hits[self.name]["TMD_"+str(sar_size)].append([sar_seq,charge_seq,charge,perc_cont,nterm_coords,sar_coords,cterm_coords,sar_start,sar_end])
- else:
- if "TMD_"+str(sar_size) not in hits[self.name]:
- hits[self.name]["TMD_"+str(sar_size)] = []
- hits[self.name]["TMD_"+str(sar_size)].append([sar_seq,charge_seq,charge,perc_cont,nterm_coords,sar_coords,cterm_coords,sar_start,sar_end])
- else:
- hits[self.name]["TMD_"+str(sar_size)].append([sar_seq,charge_seq,charge,perc_cont,nterm_coords,sar_coords,cterm_coords,sar_start,sar_end])
-
-
-def percent_calc(sequence,residues,size):
- """ Calculate the percent of a set of residues within an input sequence """
- counted = {}
- for aa in sequence:
- #print(aa)
- if aa in counted:
- counted[aa] += 1
- else:
- counted[aa] = 1
- residue_amt = 0
- my_ratios = []
- for res_of_interest in residues:
- try:
- residue_amt = counted[res_of_interest]
- except KeyError:
- residue_amt = 0
- ratio = residue_amt/size
- my_ratios.append((round(ratio*100,2)))
-
- res_rat = list(zip(residues,my_ratios))
-
- return res_rat
-
-
-def charge_check(charge_seq,pos_res,neg_res):
- charge = 0
- for aa in charge_seq:
- if aa in pos_res:
- charge += 1
- if aa in neg_res:
- charge -= 1
- return charge
-
-if __name__ == "__main__":
- sequence = "MAGBYYYTRLCVRKLRKGGGHP"
- residues = "YL"
- size = len(sequence)
- print(size)
- v = percent_calc(sequence,residues,size)
- print(v)
- for i in v:
- print(i)
-
diff -r 9f62910edcc9 -r 112751823323 cpt_sar_finder/biopython_parsing.py
--- a/cpt_sar_finder/biopython_parsing.py Fri Jun 17 13:15:55 2022 +0000
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,24 +0,0 @@
-#!/usr/bin/env python
-# Biopython parsing module. Uses in conjunction with the sar_finder script, and potential future scripts down the line.
-
-from Bio import SeqIO
-
-class FASTA_parser:
- """ Parses multi fasta file, and zips together header with sequence """
-
- def __init__(self, fa):
- self.fa = fa
-
- def multifasta_dict(self):
- """ parses the input multi fasta, and puts results into dictionary """
-
- return SeqIO.to_dict(SeqIO.parse(self.fa,"fasta"))
-
-
-if __name__ == "__main__":
- fa_file = "test-data/mu-proteins.fa"
- d = FASTA_parser(fa_file).multifasta_dict()
- print(d)
- for k, v in d.items():
- print(v.description)
-
diff -r 9f62910edcc9 -r 112751823323 cpt_sar_finder/cpt-macros.xml
--- a/cpt_sar_finder/cpt-macros.xml Fri Jun 17 13:15:55 2022 +0000
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,115 +0,0 @@
-
-
-
-
- python
- biopython
- requests
-
-
-
-
-
-
-
- 10.1371/journal.pcbi.1008214
- @unpublished{galaxyTools,
- author = {E. Mijalis, H. Rasche},
- title = {CPT Galaxy Tools},
- year = {2013-2017},
- note = {https://github.com/tamu-cpt/galaxy-tools/}
- }
-
-
-
-
- 10.1371/journal.pcbi.1008214
-
- @unpublished{galaxyTools,
- author = {E. Mijalis, H. Rasche},
- title = {CPT Galaxy Tools},
- year = {2013-2017},
- note = {https://github.com/tamu-cpt/galaxy-tools/}
- }
-
-
-
-
-
-
- 10.1371/journal.pcbi.1008214
-
- @unpublished{galaxyTools,
- author = {C. Ross},
- title = {CPT Galaxy Tools},
- year = {2020-},
- note = {https://github.com/tamu-cpt/galaxy-tools/}
- }
-
-
-
-
-
-
- 10.1371/journal.pcbi.1008214
-
- @unpublished{galaxyTools,
- author = {E. Mijalis, H. Rasche},
- title = {CPT Galaxy Tools},
- year = {2013-2017},
- note = {https://github.com/tamu-cpt/galaxy-tools/}
- }
-
-
- @unpublished{galaxyTools,
- author = {A. Criscione},
- title = {CPT Galaxy Tools},
- year = {2019-2021},
- note = {https://github.com/tamu-cpt/galaxy-tools/}
- }
-
-
-
-
-
-
- 10.1371/journal.pcbi.1008214
-
- @unpublished{galaxyTools,
- author = {A. Criscione},
- title = {CPT Galaxy Tools},
- year = {2019-2021},
- note = {https://github.com/tamu-cpt/galaxy-tools/}
- }
-
-
-
-
-
-
- 10.1371/journal.pcbi.1008214
-
- @unpublished{galaxyTools,
- author = {C. Maughmer},
- title = {CPT Galaxy Tools},
- year = {2017-2020},
- note = {https://github.com/tamu-cpt/galaxy-tools/}
- }
-
-
-
-
-
-
- @unpublished{galaxyTools,
- author = {C. Maughmer},
- title = {CPT Galaxy Tools},
- year = {2017-2020},
- note = {https://github.com/tamu-cpt/galaxy-tools/}
- }
-
-
-
-
diff -r 9f62910edcc9 -r 112751823323 cpt_sar_finder/file_operations.py
--- a/cpt_sar_finder/file_operations.py Fri Jun 17 13:15:55 2022 +0000
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,98 +0,0 @@
-
-def fasta_from_SAR_dict(sar_dict,fa_file):
- """ makes a multi fasta with candidates from SAR dictionary """
- with fa_file as f:
- for data in sar_dict.values():
- f.writelines(">{}\n".format(data["description"]))
- f.writelines("{}\n".format(data["sequence"]))
-
-def gff3_from_SAR_dict(sar_dict,gff3_file):
- """ make a multi gff3 with candidates from SAR dictionary """
- gff3_cols = ["Seqid","Source","Type","Start","End","Score","Strand","Phase","Attributes"]
- with gff3_file as f:
- f.writelines(f"{gff3_cols[0]}\t{gff3_cols[1]}\t{gff3_cols[2]}\t{gff3_cols[3]}\t{gff3_cols[4]}\t{gff3_cols[5]}\t{gff3_cols[6]}\t{gff3_cols[7]}\t{gff3_cols[8]}\n")
- if sar_dict:
- #print(sar_dict)
- for name, data in sar_dict.items():
- min_idx = 0
- f.writelines("##gff-version 3\n")
- f.writelines(f"##sequence-region {name}\n")
- n_start, n_end = split_seq_string(data["TMD_"+str(data["biggest_sar"])][min_idx][4])
- sar_start, sar_end = split_seq_string(data["TMD_"+str(data["biggest_sar"])][min_idx][5])
- c_start, c_end = split_seq_string(data["TMD_"+str(data["biggest_sar"])][min_idx][6])
- f.writelines(f'{name}\tSAR_finder\tTopological domain\t{n_start}\t{n_end}\t.\t.\t.\tNote=N-terminal net charge is {data["TMD_"+str(data["biggest_sar"])][min_idx][2]}\n')
- f.writelines(f'{name}\tSAR_finder\tSAR domain\t{sar_start}\t{sar_end}\t.\t.\t.\tNote=residue % in SAR {[perc for perc in data["TMD_"+str(data["biggest_sar"])][min_idx][3]]},Total % is {round(sum(j for i,j in data["TMD_"+str(data["biggest_sar"])][min_idx][3]),2)}\n')
- f.writelines(f'{name}\tSAR_finder\tTopological domain\t{c_start}\t{c_end}\t.\t.\t.\tNote=C-terminus\n')
- else:
- f.writelines("##gff-version 3\n")
- f.writelines(f"##sequence-region\n")
-
-
-def tab_from_SAR_dict(sar_dict,stat_file,hydrophillic_res, sar_min, sar_max):
- """ convert SAR dict to a dataframe """
- columns = ["Name","Protein Sequence","Protein Length","SAR Length","SAR Start","Putative SAR Sequence","SAR End",[f"{res}%" for res in hydrophillic_res],"% Total","N-term Sequence","N-term net Charge"] # using different residues for percent calc: [f"{res}%" for res in hydrophillic_res]
- with stat_file as f:
- f.writelines(f"{columns[0]}\t{columns[1]}\t{columns[2]}\t{columns[3]}\t{columns[4]}\t{columns[5]}\t{columns[6]}\t{columns[7]}\t{columns[8]}\t{columns[9]}\t{columns[10]}\n")
- if sar_dict:
- #print(sar_dict)
- for name, data in sar_dict.items():
- for tmd_size in range(sar_max, sar_min-1, -1):
- if "TMD_"+str(tmd_size) in data:
- for each_match in data["TMD_"+str(tmd_size)]:
- if each_match != [""]:
- #print(f"{name} - {data}")
- #print(each_match)
- #for perc in each_match[3]:
- # print(perc)
- try:
- f.writelines(f'{name}\t{data["sequence"]}\t{data["size"]}\t{tmd_size}\t{int(each_match[7])+1}\t{each_match[0]}\t{int(each_match[8])+1}\t{[perc for perc in each_match[3]]}\t{round(sum(j for i,j in each_match[3]),2)}\t{each_match[1]}\t{each_match[2]}\n')
- except IndexError:
- f.writelines(f'ERROR\tERROR\tERROR\tERROR\tERROR\tERROR\tERROR\tERROR\tERROR\tERROR\tERROR\n')
- else:
- continue
-
-def stat_file_from_SAR_dict(sar_dict, stat_file, sar_min, sar_max):
- """ summary statistics from SAR finder function """
- with stat_file as f:
- f.writelines("..........:::::: Candidate SAR Proteins ::::::..........\n\n")
- if sar_dict:
- for data in sar_dict.values():
- f.writelines("Protein Description and Name: {}\n".format(data["description"]))
- f.writelines("Protein Sequence: {}\n".format(data["sequence"]))
- f.writelines("Protein Length: {}\n".format(data["size"]))
- f.writelines("SAR Criteria matching region(s)\n")
- for tmd_size in range(sar_max, sar_min-1, -1):
- if "TMD_"+str(tmd_size) in data:
- f.writelines("\nSAR length of {}:\n".format(tmd_size))
- for each_match in data["TMD_"+str(tmd_size)]:
- if each_match != ['']:
- f.writelines("\nPotential SAR domain sequence: {}\n".format(each_match[0]))
- f.writelines("N-term sequence: {}\n".format(each_match[1]))
- f.writelines("N-term net charge: {}\n".format(each_match[2]))
- for each_perc_calc in each_match[3]:
- f.writelines("Percent {} content: {}%\n".format(each_perc_calc[0],each_perc_calc[1]))
- f.writelines("N-term coords: {}\n".format(each_match[4]))
- f.writelines("SAR coords: {}\n".format(each_match[5]))
- f.writelines("C-term coords: {}\n".format(each_match[6]))
- f.writelines("SAR start: {}\n".format(each_match[7]))
- else:
- continue
- f.writelines("========================================================\n\n")
- else:
- f.writelines("No candidate SAR Proteins found")
-
-def split_seq_string(input_range, python_indexing=True):
- """ splits a #..# sequence into the two respective starts and ends, if python indexing, adds 1, otherwise keeps """
- if python_indexing:
- values = input_range.split("..")
- start =int(values[0]) + 1
- end = int(values[1]) + 1
- else:
- values = input_range.split("..")
- start = values[0]
- end = values[1]
-
- return start, end
-
-if __name__ == "__main__":
- pass
\ No newline at end of file
diff -r 9f62910edcc9 -r 112751823323 cpt_sar_finder/macros.xml
--- a/cpt_sar_finder/macros.xml Fri Jun 17 13:15:55 2022 +0000
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,15 +0,0 @@
-
-
-
-
- python
- biopython
-
-
-
-
-
-
-
\ No newline at end of file
diff -r 9f62910edcc9 -r 112751823323 cpt_sar_finder/test-data/candidate_SAR.fa
--- a/cpt_sar_finder/test-data/candidate_SAR.fa Fri Jun 17 13:15:55 2022 +0000
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,2 +0,0 @@
->SAR-endolysin
-MAGIPKKLKAALLAVTIAGGGVGGYQEMTRQSLIHLENIAYMPYRDIAGVLTVCVGHTGPDIEMRRYSHAECMALLDSDLKPVYAAIDRLVRVPLTPYQKTALATFIFNTGVTAFSKSTLLKKLNAGDYAGARDQMARWVFAAGHKWKGLMNRREVEMAIWNIRGADDLRQ
diff -r 9f62910edcc9 -r 112751823323 cpt_sar_finder/test-data/candidate_SAR.gff3
--- a/cpt_sar_finder/test-data/candidate_SAR.gff3 Fri Jun 17 13:15:55 2022 +0000
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,6 +0,0 @@
-Seqid Source Type Start End Score Strand Phase Attributes
-##gff-version 3
-##sequence-region SAR-endolysin
-SAR-endolysin SAR_finder Topological domain 1 8 . . . Note=N-terminal net charge is 2
-SAR-endolysin SAR_finder SAR domain 9 26 . . . Note=residue % in SAR [('S', 0.0), ('G', 29.41), ('A', 23.53), ('T', 5.88)],Total % is 58.82
-SAR-endolysin SAR_finder Topological domain 27 172 . . . Note=C-terminus
diff -r 9f62910edcc9 -r 112751823323 cpt_sar_finder/test-data/candidate_SAR_stats.tsv
--- a/cpt_sar_finder/test-data/candidate_SAR_stats.tsv Fri Jun 17 13:15:55 2022 +0000
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,6 +0,0 @@
-Name Protein Sequence Protein Length SAR Length SAR Start Putative SAR Sequence SAR End ['S%', 'G%', 'A%', 'T%'] % Total N-term Sequence N-term net Charge
-SAR-endolysin MAGIPKKLKAALLAVTIAGGGVGGYQEMTRQSLIHLENIAYMPYRDIAGVLTVCVGHTGPDIEMRRYSHAECMALLDSDLKPVYAAIDRLVRVPLTPYQKTALATFIFNTGVTAFSKSTLLKKLNAGDYAGARDQMARWVFAAGHKWKGLMNRREVEMAIWNIRGADDLRQ 171 17 9 KAALLAVTIAGGGVGGY 26 [('S', 0.0), ('G', 29.41), ('A', 23.53), ('T', 5.88)] 58.82 MAGIPKKL 2
-SAR-endolysin MAGIPKKLKAALLAVTIAGGGVGGYQEMTRQSLIHLENIAYMPYRDIAGVLTVCVGHTGPDIEMRRYSHAECMALLDSDLKPVYAAIDRLVRVPLTPYQKTALATFIFNTGVTAFSKSTLLKKLNAGDYAGARDQMARWVFAAGHKWKGLMNRREVEMAIWNIRGADDLRQ 171 16 10 AALLAVTIAGGGVGGY 26 [('S', 0.0), ('G', 31.25), ('A', 25.0), ('T', 6.25)] 62.5 MAGIPKKLK 3
-SAR-endolysin MAGIPKKLKAALLAVTIAGGGVGGYQEMTRQSLIHLENIAYMPYRDIAGVLTVCVGHTGPDIEMRRYSHAECMALLDSDLKPVYAAIDRLVRVPLTPYQKTALATFIFNTGVTAFSKSTLLKKLNAGDYAGARDQMARWVFAAGHKWKGLMNRREVEMAIWNIRGADDLRQ 171 15 9 KAALLAVTIAGGGVG 24 [('S', 0.0), ('G', 26.67), ('A', 26.67), ('T', 6.67)] 60.01 MAGIPKKL 2
-SAR-endolysin MAGIPKKLKAALLAVTIAGGGVGGYQEMTRQSLIHLENIAYMPYRDIAGVLTVCVGHTGPDIEMRRYSHAECMALLDSDLKPVYAAIDRLVRVPLTPYQKTALATFIFNTGVTAFSKSTLLKKLNAGDYAGARDQMARWVFAAGHKWKGLMNRREVEMAIWNIRGADDLRQ 171 15 10 AALLAVTIAGGGVGG 25 [('S', 0.0), ('G', 33.33), ('A', 26.67), ('T', 6.67)] 66.67 MAGIPKKLK 3
-SAR-endolysin MAGIPKKLKAALLAVTIAGGGVGGYQEMTRQSLIHLENIAYMPYRDIAGVLTVCVGHTGPDIEMRRYSHAECMALLDSDLKPVYAAIDRLVRVPLTPYQKTALATFIFNTGVTAFSKSTLLKKLNAGDYAGARDQMARWVFAAGHKWKGLMNRREVEMAIWNIRGADDLRQ 171 15 11 ALLAVTIAGGGVGGY 26 [('S', 0.0), ('G', 33.33), ('A', 20.0), ('T', 6.67)] 60.0 MAGIPKKLKA 3
diff -r 9f62910edcc9 -r 112751823323 cpt_sar_finder/test-data/simple-proteins.fa
--- a/cpt_sar_finder/test-data/simple-proteins.fa Fri Jun 17 13:15:55 2022 +0000
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,7 +0,0 @@
->SAR-endolysin
-MAGIPKKLKAALLAVTIAGGGVGGYQEMTRQSLIHLENIAYMPYRDIAGVLTVCVGHTGP
-DIEMRRYSHAECMALLDSDLKPVYAAIDRLVRVPLTPYQKTALATFIFNTGVTAFSKSTL
-LKKLNAGDYAGARDQMARWVFAAGHKWKGLMNRREVEMAIWNIRGADDLRQ
->CPT_NC_000929.1_038
-MLKIKPAAGKAIRDPLTMKLLASEGEEKPRNSFWIRRLAAGDVVEVGSTENTADDTDAAP
-KKRSKSK
\ No newline at end of file
diff -r 9f62910edcc9 -r 112751823323 file_operations.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/file_operations.py Mon Jun 05 02:52:57 2023 +0000
@@ -0,0 +1,163 @@
+def fasta_from_SAR_dict(sar_dict, fa_file):
+ """makes a multi fasta with candidates from SAR dictionary"""
+ with fa_file as f:
+ for data in sar_dict.values():
+ f.writelines(">{}\n".format(data["description"]))
+ f.writelines("{}\n".format(data["sequence"]))
+
+
+def gff3_from_SAR_dict(sar_dict, gff3_file):
+ """make a multi gff3 with candidates from SAR dictionary"""
+ gff3_cols = [
+ "Seqid",
+ "Source",
+ "Type",
+ "Start",
+ "End",
+ "Score",
+ "Strand",
+ "Phase",
+ "Attributes",
+ ]
+ with gff3_file as f:
+ f.writelines(
+ f"{gff3_cols[0]}\t{gff3_cols[1]}\t{gff3_cols[2]}\t{gff3_cols[3]}\t{gff3_cols[4]}\t{gff3_cols[5]}\t{gff3_cols[6]}\t{gff3_cols[7]}\t{gff3_cols[8]}\n"
+ )
+ if sar_dict:
+ # print(sar_dict)
+ for name, data in sar_dict.items():
+ min_idx = 0
+ f.writelines("##gff-version 3\n")
+ f.writelines(f"##sequence-region {name}\n")
+ n_start, n_end = split_seq_string(
+ data["TMD_" + str(data["biggest_sar"])][min_idx][4]
+ )
+ sar_start, sar_end = split_seq_string(
+ data["TMD_" + str(data["biggest_sar"])][min_idx][5]
+ )
+ c_start, c_end = split_seq_string(
+ data["TMD_" + str(data["biggest_sar"])][min_idx][6]
+ )
+ f.writelines(
+ f'{name}\tSAR_finder\tTopological domain\t{n_start}\t{n_end}\t.\t.\t.\tNote=N-terminal net charge is {data["TMD_"+str(data["biggest_sar"])][min_idx][2]}\n'
+ )
+ f.writelines(
+ f'{name}\tSAR_finder\tSAR domain\t{sar_start}\t{sar_end}\t.\t.\t.\tNote=residue % in SAR {[perc for perc in data["TMD_"+str(data["biggest_sar"])][min_idx][3]]},Total % is {round(sum(j for i,j in data["TMD_"+str(data["biggest_sar"])][min_idx][3]),2)}\n'
+ )
+ f.writelines(
+ f"{name}\tSAR_finder\tTopological domain\t{c_start}\t{c_end}\t.\t.\t.\tNote=C-terminus\n"
+ )
+ else:
+ f.writelines("##gff-version 3\n")
+ f.writelines(f"##sequence-region\n")
+
+
+def tab_from_SAR_dict(sar_dict, stat_file, hydrophillic_res, sar_min, sar_max):
+ """convert SAR dict to a dataframe"""
+ columns = [
+ "Name",
+ "Protein Sequence",
+ "Protein Length",
+ "SAR Length",
+ "SAR Start",
+ "Putative SAR Sequence",
+ "SAR End",
+ [f"{res}%" for res in hydrophillic_res],
+ "% Total",
+ "N-term Sequence",
+ "N-term net Charge",
+ ] # using different residues for percent calc: [f"{res}%" for res in hydrophillic_res]
+ with stat_file as f:
+ f.writelines(
+ f"{columns[0]}\t{columns[1]}\t{columns[2]}\t{columns[3]}\t{columns[4]}\t{columns[5]}\t{columns[6]}\t{columns[7]}\t{columns[8]}\t{columns[9]}\t{columns[10]}\n"
+ )
+ if sar_dict:
+ # print(sar_dict)
+ for name, data in sar_dict.items():
+ for tmd_size in range(sar_max, sar_min - 1, -1):
+ if "TMD_" + str(tmd_size) in data:
+ for each_match in data["TMD_" + str(tmd_size)]:
+ if each_match != [""]:
+ # print(f"{name} - {data}")
+ # print(each_match)
+ # for perc in each_match[3]:
+ # print(perc)
+ try:
+ f.writelines(
+ f'{name}\t{data["sequence"]}\t{data["size"]}\t{tmd_size}\t{int(each_match[7])+1}\t{each_match[0]}\t{int(each_match[8])+1}\t{[perc for perc in each_match[3]]}\t{round(sum(j for i,j in each_match[3]),2)}\t{each_match[1]}\t{each_match[2]}\n'
+ )
+ except IndexError:
+ f.writelines(
+ f"ERROR\tERROR\tERROR\tERROR\tERROR\tERROR\tERROR\tERROR\tERROR\tERROR\tERROR\n"
+ )
+ else:
+ continue
+
+
+def stat_file_from_SAR_dict(sar_dict, stat_file, sar_min, sar_max):
+ """summary statistics from SAR finder function"""
+ with stat_file as f:
+ f.writelines("..........:::::: Candidate SAR Proteins ::::::..........\n\n")
+ if sar_dict:
+ for data in sar_dict.values():
+ f.writelines(
+ "Protein Description and Name: {}\n".format(data["description"])
+ )
+ f.writelines("Protein Sequence: {}\n".format(data["sequence"]))
+ f.writelines("Protein Length: {}\n".format(data["size"]))
+ f.writelines("SAR Criteria matching region(s)\n")
+ for tmd_size in range(sar_max, sar_min - 1, -1):
+ if "TMD_" + str(tmd_size) in data:
+ f.writelines("\nSAR length of {}:\n".format(tmd_size))
+ for each_match in data["TMD_" + str(tmd_size)]:
+ if each_match != [""]:
+ f.writelines(
+ "\nPotential SAR domain sequence: {}\n".format(
+ each_match[0]
+ )
+ )
+ f.writelines(
+ "N-term sequence: {}\n".format(each_match[1])
+ )
+ f.writelines(
+ "N-term net charge: {}\n".format(each_match[2])
+ )
+ for each_perc_calc in each_match[3]:
+ f.writelines(
+ "Percent {} content: {}%\n".format(
+ each_perc_calc[0], each_perc_calc[1]
+ )
+ )
+ f.writelines(
+ "N-term coords: {}\n".format(each_match[4])
+ )
+ f.writelines("SAR coords: {}\n".format(each_match[5]))
+ f.writelines(
+ "C-term coords: {}\n".format(each_match[6])
+ )
+ f.writelines("SAR start: {}\n".format(each_match[7]))
+ else:
+ continue
+ f.writelines(
+ "========================================================\n\n"
+ )
+ else:
+ f.writelines("No candidate SAR Proteins found")
+
+
+def split_seq_string(input_range, python_indexing=True):
+ """splits a #..# sequence into the two respective starts and ends, if python indexing, adds 1, otherwise keeps"""
+ if python_indexing:
+ values = input_range.split("..")
+ start = int(values[0]) + 1
+ end = int(values[1]) + 1
+ else:
+ values = input_range.split("..")
+ start = values[0]
+ end = values[1]
+
+ return start, end
+
+
+if __name__ == "__main__":
+ pass
diff -r 9f62910edcc9 -r 112751823323 macros.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/macros.xml Mon Jun 05 02:52:57 2023 +0000
@@ -0,0 +1,74 @@
+
+
+
+ progressivemauve
+
+ bcbiogff
+
+
+
+ 2.4.0
+
+ 10.1371/journal.pone.0011147
+
+
+ 10.1093/bioinformatics/btm039
+
+
+ '$xmfa'
+
+
+
+
+
+ '$sequences'
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ '$gff3_data'
+
+
+ #if str($reference_genome.reference_genome_source) == 'cached':
+ '${reference_genome.fasta_indexes.fields.path}'
+ #else if str($reference_genome.reference_genome_source) == 'history':
+ genomeref.fa
+ #end if
+
+
+ #if $reference_genome.reference_genome_source == 'history':
+ ln -s '$reference_genome.genome_fasta' genomeref.fa;
+ #end if
+
+
+ #if str($reference_genome.reference_genome_source) == 'cached':
+ '${reference_genome.fasta_indexes.fields.path}'
+ #else if str($reference_genome.reference_genome_source) == 'history':
+ genomeref.fa
+ #end if
+
+
diff -r 9f62910edcc9 -r 112751823323 test-data/candidate_SAR.fa
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/candidate_SAR.fa Mon Jun 05 02:52:57 2023 +0000
@@ -0,0 +1,2 @@
+>SAR-endolysin
+MAGIPKKLKAALLAVTIAGGGVGGYQEMTRQSLIHLENIAYMPYRDIAGVLTVCVGHTGPDIEMRRYSHAECMALLDSDLKPVYAAIDRLVRVPLTPYQKTALATFIFNTGVTAFSKSTLLKKLNAGDYAGARDQMARWVFAAGHKWKGLMNRREVEMAIWNIRGADDLRQ
diff -r 9f62910edcc9 -r 112751823323 test-data/candidate_SAR.gff3
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/candidate_SAR.gff3 Mon Jun 05 02:52:57 2023 +0000
@@ -0,0 +1,6 @@
+Seqid Source Type Start End Score Strand Phase Attributes
+##gff-version 3
+##sequence-region SAR-endolysin
+SAR-endolysin SAR_finder Topological domain 1 8 . . . Note=N-terminal net charge is 2
+SAR-endolysin SAR_finder SAR domain 9 26 . . . Note=residue % in SAR [('S', 0.0), ('G', 29.41), ('A', 23.53), ('T', 5.88)],Total % is 58.82
+SAR-endolysin SAR_finder Topological domain 27 172 . . . Note=C-terminus
diff -r 9f62910edcc9 -r 112751823323 test-data/candidate_SAR_stats.tsv
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/candidate_SAR_stats.tsv Mon Jun 05 02:52:57 2023 +0000
@@ -0,0 +1,6 @@
+Name Protein Sequence Protein Length SAR Length SAR Start Putative SAR Sequence SAR End ['S%', 'G%', 'A%', 'T%'] % Total N-term Sequence N-term net Charge
+SAR-endolysin MAGIPKKLKAALLAVTIAGGGVGGYQEMTRQSLIHLENIAYMPYRDIAGVLTVCVGHTGPDIEMRRYSHAECMALLDSDLKPVYAAIDRLVRVPLTPYQKTALATFIFNTGVTAFSKSTLLKKLNAGDYAGARDQMARWVFAAGHKWKGLMNRREVEMAIWNIRGADDLRQ 171 17 9 KAALLAVTIAGGGVGGY 26 [('S', 0.0), ('G', 29.41), ('A', 23.53), ('T', 5.88)] 58.82 MAGIPKKL 2
+SAR-endolysin MAGIPKKLKAALLAVTIAGGGVGGYQEMTRQSLIHLENIAYMPYRDIAGVLTVCVGHTGPDIEMRRYSHAECMALLDSDLKPVYAAIDRLVRVPLTPYQKTALATFIFNTGVTAFSKSTLLKKLNAGDYAGARDQMARWVFAAGHKWKGLMNRREVEMAIWNIRGADDLRQ 171 16 10 AALLAVTIAGGGVGGY 26 [('S', 0.0), ('G', 31.25), ('A', 25.0), ('T', 6.25)] 62.5 MAGIPKKLK 3
+SAR-endolysin MAGIPKKLKAALLAVTIAGGGVGGYQEMTRQSLIHLENIAYMPYRDIAGVLTVCVGHTGPDIEMRRYSHAECMALLDSDLKPVYAAIDRLVRVPLTPYQKTALATFIFNTGVTAFSKSTLLKKLNAGDYAGARDQMARWVFAAGHKWKGLMNRREVEMAIWNIRGADDLRQ 171 15 9 KAALLAVTIAGGGVG 24 [('S', 0.0), ('G', 26.67), ('A', 26.67), ('T', 6.67)] 60.01 MAGIPKKL 2
+SAR-endolysin MAGIPKKLKAALLAVTIAGGGVGGYQEMTRQSLIHLENIAYMPYRDIAGVLTVCVGHTGPDIEMRRYSHAECMALLDSDLKPVYAAIDRLVRVPLTPYQKTALATFIFNTGVTAFSKSTLLKKLNAGDYAGARDQMARWVFAAGHKWKGLMNRREVEMAIWNIRGADDLRQ 171 15 10 AALLAVTIAGGGVGG 25 [('S', 0.0), ('G', 33.33), ('A', 26.67), ('T', 6.67)] 66.67 MAGIPKKLK 3
+SAR-endolysin MAGIPKKLKAALLAVTIAGGGVGGYQEMTRQSLIHLENIAYMPYRDIAGVLTVCVGHTGPDIEMRRYSHAECMALLDSDLKPVYAAIDRLVRVPLTPYQKTALATFIFNTGVTAFSKSTLLKKLNAGDYAGARDQMARWVFAAGHKWKGLMNRREVEMAIWNIRGADDLRQ 171 15 11 ALLAVTIAGGGVGGY 26 [('S', 0.0), ('G', 33.33), ('A', 20.0), ('T', 6.67)] 60.0 MAGIPKKLKA 3
diff -r 9f62910edcc9 -r 112751823323 test-data/simple-proteins.fa
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/simple-proteins.fa Mon Jun 05 02:52:57 2023 +0000
@@ -0,0 +1,7 @@
+>SAR-endolysin
+MAGIPKKLKAALLAVTIAGGGVGGYQEMTRQSLIHLENIAYMPYRDIAGVLTVCVGHTGP
+DIEMRRYSHAECMALLDSDLKPVYAAIDRLVRVPLTPYQKTALATFIFNTGVTAFSKSTL
+LKKLNAGDYAGARDQMARWVFAAGHKWKGLMNRREVEMAIWNIRGADDLRQ
+>CPT_NC_000929.1_038
+MLKIKPAAGKAIRDPLTMKLLASEGEEKPRNSFWIRRLAAGDVVEVGSTENTADDTDAAP
+KKRSKSK
\ No newline at end of file