Mercurial > repos > cpt > cpt_sar_finder

--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/SAR_finder.py	Mon Jun 05 02:52:57 2023 +0000
@@ -0,0 +1,76 @@
+import sys
+import argparse
+import os
+import re
+from biopython_parsing import FASTA_parser
+from file_operations import fasta_from_SAR_dict, gff3_from_SAR_dict, tab_from_SAR_dict
+from SAR_functions import CheckSequence
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description="SAR Finder")
+
+    parser.add_argument(
+        "fa", type=argparse.FileType("r"), help="organism's multi fasta file"
+    )
+
+    parser.add_argument(
+        "--min", type=int, default=20, help="minimum size of candidate peptide"
+    )
+
+    parser.add_argument(
+        "--max", type=int, default=200, help="maximum size of candidate peptide"
+    )
+
+    parser.add_argument(
+        "--sar_min",
+        type=int,
+        default=15,
+        help="minimum size of candidate peptide TMD domain",
+    )
+
+    parser.add_argument(
+        "--sar_max",
+        type=int,
+        default=24,
+        help="maximum size of candidate peptide TMD domain",
+    )
+
+    parser.add_argument(
+        "--out_fa",
+        type=argparse.FileType("w"),
+        help="multifasta output of candidate SAR proteins",
+        default="candidate_SAR.fa",
+    )
+
+    parser.add_argument(
+        "--out_stat",
+        type=argparse.FileType("w"),
+        help="summary statistic file for candidate SAR proteins, tab separated",
+        default="candidate_SAR_stats.tsv",
+    )
+
+    parser.add_argument(
+        "--out_gff3",
+        type=argparse.FileType("w"),
+        help="multigff3 file for candidate SAR proteins",
+        default="candidate_SAR.gff3",
+    )
+
+    args = parser.parse_args()
+
+    fa_dict = FASTA_parser(fa=args.fa).multifasta_dict()
+
+    sars = {}
+
+    for protein_name, protein_data in fa_dict.items():
+        sar = CheckSequence(protein_name, protein_data)
+        # sar.check_sizes(min=args.min,max=args.max)
+        hydros = sar.shrink_results(sar_min=args.sar_min, sar_max=args.sar_max)
+        sars.update(hydros)
+
+    gff3_from_SAR_dict(sars, args.out_gff3)
+    tab_from_SAR_dict(
+        sars, args.out_stat, "SGAT", sar_min=args.sar_min, sar_max=args.sar_max
+    )
+    fasta_from_SAR_dict(sars, args.out_fa)
+    # stat_file_from_SAR_dict(sars,args.out_stat,sar_min=args.sar_min,sar_max=args.sar_max) # fix this whenever ready.
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/SAR_finder.xml	Mon Jun 05 02:52:57 2023 +0000
@@ -0,0 +1,73 @@
+<tool id="edu.tamu.cpt.sar.sar_finder" name="SAR Finder" version="1.0">
+    <description>SAR Domain Finder</description>
+    <macros>
+        <import>macros.xml</import>
+    </macros>
+    <expand macro="requirements">
+    </expand>
+    <command detect_errors="aggressive"><![CDATA[
+python '$__tool_directory__/SAR_finder.py'
+'$fa'
+--sar_min '$sar_min'
+--sar_max '$sar_max'
+--out_fa '$out_fa'
+--out_gff3 '$out_gff3'
+--out_stat '$out_stat'
+    ]]></command>
+    <inputs>
+        <param label="Multi FASTA File" name="fa" type="data" format="fasta"/>
+        <param label="SAR domain minimal size" name="sar_min" type="integer" value="15"/>
+        <param label="SAR domain maximum size" name="sar_max" type="integer" value="20"/>
+    </inputs>
+    <outputs>
+        <data format="tabular" name="out_stat" label="candidate_SAR_stats.tsv"/>
+        <data format="fasta" name="out_fa" label="candidate_SAR.fa"/>
+        <data format="gff3" name="out_gff3" label="candidate_SAR.gff3"/>
+    </outputs>
+    <tests>
+        <test>
+            <param name="fa" value="simple-proteins.fa"/>
+            <param name="sar_min" value="15"/>
+            <param name="sar_max" value="20"/>
+            <output name="out_stat" file="candidate_SAR_stats.tsv"/>
+            <output name="out_fa" file="candidate_SAR.fa"/>
+            <output name="out_gff3" file="candidate_SAR.gff3"/>
+        </test>
+    </tests>
+    <help><![CDATA[
+
+**What it does**
+A tool that analyzes the sequence within the first 50 residues of a protein for a weakly hydrophobic domain called Signal-Anchor-Release (aka SAR).
+The tool finds proteins that contain a stretch (default 15-20 residues) of hydrophobic residues (Ile, Leu, Val, Phe, Tyr, Trp, Met, Gly, Ala, Ser) and
+calculates the % Gly/Ala/Ser/Thr residues in the hydrophobic stretch. The net charge on the N-terminus is also displayed to aid in determining the
+SAR orientation in the membrane.[2]
+
+Definition: A Signal-Anchor-Release (SAR) domain is an N-terminal, weakly hydrophobic transmembrane region rich in Gly/Ala and/or Ser (and sometimes Thr) residues.
+The SAR domain is sometimes found in phage lysis proteins, including endolysins and holins. The SAR domain can be released from the membrane in a proton
+motive force-dependent manner. Known SAR domains in phage endolysins often have >50-60% Gly/Ala/Ser/Thr content. SAR endolysins are expected to have a net positive
+charge on the N-terminus by the positive-inside rule.
+
+**INPUT** --> Protein Multi FASTA
+
+**OUTPUT** -->
+
+* Multi FASTA with candidate proteins that pass the SAR domain criteria
+
+* Tabular summary file that lists every subdomain fitting the criteria for each potential SAR domain-containing protein with the following: protein name/sequence/length, SAR length/start/sequence/end, individual and total GAST% content in SAR, and N-terminal sequence/net charge
+
+* Multi GFF3 for unique candidate SAR domain-containing proteins
+
+    ]]></help>
+    <citations>
+        <citation type="doi">10.1371/journal.pcbi.1008214</citation>
+        <citation type="doi">https://dx.doi.org/10.1016/bs.aivir.2018.09.003</citation>
+        <citation type="bibtex">
+            @unpublished{galaxyTools,
+            author = {C. Ross},
+            title = {CPT Galaxy Tools},
+            year = {2020-},
+            note = {https://github.com/tamu-cpt/galaxy-tools/}
+            }
+        </citation>
+    </citations>
+</tool>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/SAR_functions.py	Mon Jun 05 02:52:57 2023 +0000
@@ -0,0 +1,348 @@
+#!/usr/bin/env python
+
+import sys
+import argparse
+import os
+import re
+from Bio import SeqIO
+
+
+class CheckSequence:
+    """
+    SAR endolysin Verification class, which starts with complete FA file, and is shrunk by each function to reveal best candidates of SAR endolysin proteins
+    """
+
+    def __init__(self, protein_name, protein_data):
+        self.name = protein_name
+        self.seq = protein_data.seq
+        self.description = protein_data.description
+        self.size = len(self.seq)
+        self.store = {}
+
+    def check_sizes(self, min, max):
+        """check the minimum and maximum peptide lengths"""
+        if self.size < min:
+            print("too small")
+        elif self.size > max:
+            print("too large")
+        else:
+            print(f"{self.name} : {self.seq}")
+            return True
+
+    def check_hydrophobicity_and_charge(
+        self, sar_min=15, sar_max=20, perc_residues="SGAT"
+    ):
+        """verifies the existence of a hydrophobic region within the sequence"""
+        hydrophobic_residues = "['FIWLVMYCATGSP']"  # fed through regex
+        hits = self.store
+        pos_res = "RK"
+        neg_res = "DE"
+
+        if self.size > 50:
+            seq = self.seq[0:50]
+        else:
+            seq = self.seq
+        for sar_size in range(sar_min, sar_max, 1):
+            for i in range(0, len(seq) - sar_size, 1):
+                sar_seq = str(seq[i : i + sar_size])
+                if re.search(
+                    (hydrophobic_residues + "{" + str(sar_size) + "}"), sar_seq
+                ):
+                    (
+                        charge_seq,
+                        charge,
+                        perc_cont,
+                        sar_coords,
+                        nterm_coords,
+                        cterm_coords,
+                        sar_start,
+                        sar_end,
+                    ) = rep_funcs(
+                        self, seq, i, pos_res, neg_res, sar_seq, perc_residues, sar_size
+                    )
+                    storage_dict(
+                        self=self,
+                        sar_size=sar_size,
+                        sar_seq=sar_seq,
+                        hits=hits,
+                        charge_seq=charge_seq,
+                        charge=charge,
+                        perc_cont=perc_cont,
+                        nterm_coords=nterm_coords,
+                        sar_coords=sar_coords,
+                        cterm_coords=cterm_coords,
+                        sar_start=sar_start,
+                        sar_end=sar_end,
+                    )
+                    # print("TMDSIZE: {}\tINDEX: {}".format(sar_size,i+1))
+                elif "K" in sar_seq[0] and re.search(
+                    (hydrophobic_residues + "{" + str(sar_size - 1) + "}"), sar_seq[1:]
+                ):  # check frontend snorkels
+                    (
+                        charge_seq,
+                        charge,
+                        perc_cont,
+                        sar_coords,
+                        nterm_coords,
+                        cterm_coords,
+                        sar_start,
+                        sar_end,
+                    ) = rep_funcs(
+                        self, seq, i, pos_res, neg_res, sar_seq, perc_residues, sar_size
+                    )
+                    storage_dict(
+                        self=self,
+                        sar_size=sar_size,
+                        sar_seq=sar_seq,
+                        hits=hits,
+                        charge_seq=charge_seq,
+                        charge=charge,
+                        perc_cont=perc_cont,
+                        nterm_coords=nterm_coords,
+                        sar_coords=sar_coords,
+                        cterm_coords=cterm_coords,
+                        sar_start=sar_start,
+                        sar_end=sar_end,
+                    )
+                    # print("TMDSIZE: {}\tINDEX: {}".format(sar_size,i+1))
+                elif "K" in sar_seq[-1] and re.search(
+                    (hydrophobic_residues + "{" + str(sar_size - 1) + "}"), sar_seq[:-1]
+                ):  # check backend snorkels
+                    (
+                        charge_seq,
+                        charge,
+                        perc_cont,
+                        sar_coords,
+                        nterm_coords,
+                        cterm_coords,
+                        sar_start,
+                        sar_end,
+                    ) = rep_funcs(
+                        self, seq, i, pos_res, neg_res, sar_seq, perc_residues, sar_size
+                    )
+                    storage_dict(
+                        self=self,
+                        sar_size=sar_size,
+                        sar_seq=sar_seq,
+                        hits=hits,
+                        charge_seq=charge_seq,
+                        charge=charge,
+                        perc_cont=perc_cont,
+                        nterm_coords=nterm_coords,
+                        sar_coords=sar_coords,
+                        cterm_coords=cterm_coords,
+                        sar_start=sar_start,
+                        sar_end=sar_end,
+                    )
+                    # print("TMDSIZE: {}\tINDEX: {}".format(sar_size,i+1))
+                continue
+
+        return hits
+
+    def shrink_results(self, sar_min=15, sar_max=20, perc_residues="SGAT"):
+        """removes repetiive hits, keeps only the shortest and longest of each SAR domain"""
+        compare_candidates = {}
+        hits = self.check_hydrophobicity_and_charge(sar_min=sar_min, sar_max=sar_max)
+        for sar_name, data in hits.items():
+            # print(sar_name)
+            compare_candidates[sar_name] = {}
+            # print("\nThese are the values: {}".format(v))
+            # count_of_times = 0
+            tmd_log = []
+            for sar_size in range(sar_max, sar_min - 1, -1):
+                if "TMD_" + str(sar_size) in data:
+                    tmd_log.append(sar_size)
+                    # print(tmd_log)
+                    for idx, the_data in enumerate(data["TMD_" + str(sar_size)]):
+                        # print(the_data[7])
+                        # print(the_data)
+                        # print(f"This is the index: {idx}")
+                        # print(f"This is the list of data at this index: {the_data}")
+                        if (
+                            the_data[7] in compare_candidates[sar_name]
+                        ):  # index to start
+                            compare_candidates[sar_name][the_data[7]]["count"] += 1
+                            compare_candidates[sar_name][the_data[7]]["size"].append(
+                                sar_size
+                            )
+                            compare_candidates[sar_name][the_data[7]]["index"].append(
+                                idx
+                            )
+                        else:
+                            compare_candidates[sar_name][the_data[7]] = {}
+                            compare_candidates[sar_name][the_data[7]]["count"] = 1
+                            compare_candidates[sar_name][the_data[7]]["size"] = [
+                                sar_size
+                            ]
+                            compare_candidates[sar_name][the_data[7]]["index"] = [idx]
+            hits[sar_name]["biggest_sar"] = tmd_log[0]
+        for sar_name, compare_data in compare_candidates.items():
+            for data in compare_data.values():
+                if len(data["size"]) >= 3:
+                    # print(f"{each_size} --> {data}")
+                    minmax = [min(data["size"]), max(data["size"])]
+                    nonminmax = [x for x in data["size"] if x not in minmax]
+                    nonminmax_index = []
+                    for each_nonminmax in nonminmax:
+                        v = data["size"].index(each_nonminmax)
+                        x = data["index"][v]
+                        nonminmax_index.append(x)
+                    nons = zip(nonminmax, nonminmax_index)
+                    for value in nons:
+                        # hits[sar_name]["TMD_"+str(value[0])] = hits[sar_name]["TMD_"+str(value[0])].pop(value[1])
+                        hits[sar_name]["TMD_" + str(value[0])][value[1]] = [""]
+
+        return hits
+
+
+def rep_funcs(self, seq, loc, pos_res, neg_res, sar_seq, perc_residues, sar_size):
+    """run a set of functions together before sending the results to the storage dictionary"""
+
+    charge_seq = str(seq[:loc])
+    charge = charge_check(charge_seq, pos_res, neg_res)
+    perc_cont = percent_calc(sar_seq, perc_residues, int(sar_size))
+    sar_start = loc
+    sar_end = loc + sar_size
+    sar_coords = "{}..{}".format(loc, loc + sar_size)
+    nterm_coords = "{}..{}".format("0", loc - 1)
+    cterm_coords = "{}..{}".format(loc + sar_size + 1, self.size)
+
+    return (
+        charge_seq,
+        charge,
+        perc_cont,
+        sar_coords,
+        nterm_coords,
+        cterm_coords,
+        sar_start,
+        sar_end,
+    )
+
+
+### Extra "helper" functions
+def storage_dict(
+    self,
+    sar_size,
+    sar_seq,
+    hits,
+    charge_seq,
+    charge,
+    perc_cont,
+    nterm_coords,
+    sar_coords,
+    cterm_coords,
+    sar_start,
+    sar_end,
+):  # probably not good to call "self" a param here...definitley not PEP approved...
+    """organize dictionary for hydrophobicity check"""
+    if self.name not in hits:
+        hits[self.name] = {}
+        hits[self.name]["description"] = str(self.description)
+        hits[self.name]["sequence"] = str(self.seq)
+        hits[self.name]["size"] = str(self.size)
+        # GAcont = str((str(self.seq).count("G")+str(self.seq).count("A"))/int(self.size)*100)
+        # hits[self.name]["GAcont"] = "{:.2f}%".format(float(GAcont))
+        if "TMD_" + str(sar_size) not in hits[self.name]:
+            hits[self.name]["TMD_" + str(sar_size)] = []
+            hits[self.name]["TMD_" + str(sar_size)].append(
+                [
+                    sar_seq,
+                    charge_seq,
+                    charge,
+                    perc_cont,
+                    nterm_coords,
+                    sar_coords,
+                    cterm_coords,
+                    sar_start,
+                    sar_end,
+                ]
+            )
+        else:
+            hits[self.name]["TMD_" + str(sar_size)].append(
+                [
+                    sar_seq,
+                    charge_seq,
+                    charge,
+                    perc_cont,
+                    nterm_coords,
+                    sar_coords,
+                    cterm_coords,
+                    sar_start,
+                    sar_end,
+                ]
+            )
+    else:
+        if "TMD_" + str(sar_size) not in hits[self.name]:
+            hits[self.name]["TMD_" + str(sar_size)] = []
+            hits[self.name]["TMD_" + str(sar_size)].append(
+                [
+                    sar_seq,
+                    charge_seq,
+                    charge,
+                    perc_cont,
+                    nterm_coords,
+                    sar_coords,
+                    cterm_coords,
+                    sar_start,
+                    sar_end,
+                ]
+            )
+        else:
+            hits[self.name]["TMD_" + str(sar_size)].append(
+                [
+                    sar_seq,
+                    charge_seq,
+                    charge,
+                    perc_cont,
+                    nterm_coords,
+                    sar_coords,
+                    cterm_coords,
+                    sar_start,
+                    sar_end,
+                ]
+            )
+
+
+def percent_calc(sequence, residues, size):
+    """Calculate the percent of a set of residues within an input sequence"""
+    counted = {}
+    for aa in sequence:
+        # print(aa)
+        if aa in counted:
+            counted[aa] += 1
+        else:
+            counted[aa] = 1
+    residue_amt = 0
+    my_ratios = []
+    for res_of_interest in residues:
+        try:
+            residue_amt = counted[res_of_interest]
+        except KeyError:
+            residue_amt = 0
+        ratio = residue_amt / size
+        my_ratios.append((round(ratio * 100, 2)))
+
+    res_rat = list(zip(residues, my_ratios))
+
+    return res_rat
+
+
+def charge_check(charge_seq, pos_res, neg_res):
+    charge = 0
+    for aa in charge_seq:
+        if aa in pos_res:
+            charge += 1
+        if aa in neg_res:
+            charge -= 1
+    return charge
+
+
+if __name__ == "__main__":
+    sequence = "MAGBYYYTRLCVRKLRKGGGHP"
+    residues = "YL"
+    size = len(sequence)
+    print(size)
+    v = percent_calc(sequence, residues, size)
+    print(v)
+    for i in v:
+        print(i)
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/biopython_parsing.py	Mon Jun 05 02:52:57 2023 +0000
@@ -0,0 +1,24 @@
+#!/usr/bin/env python
+# Biopython parsing module. Uses in conjunction with the sar_finder script, and potential future scripts down the line.
+
+from Bio import SeqIO
+
+
+class FASTA_parser:
+    """Parses multi fasta file, and zips together header with sequence"""
+
+    def __init__(self, fa):
+        self.fa = fa
+
+    def multifasta_dict(self):
+        """parses the input multi fasta, and puts results into dictionary"""
+
+        return SeqIO.to_dict(SeqIO.parse(self.fa, "fasta"))
+
+
+if __name__ == "__main__":
+    fa_file = "test-data/mu-proteins.fa"
+    d = FASTA_parser(fa_file).multifasta_dict()
+    print(d)
+    for k, v in d.items():
+        print(v.description)
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/cpt-macros.xml	Mon Jun 05 02:52:57 2023 +0000
@@ -0,0 +1,115 @@
+<macros>
+    <xml name="gff_requirements">
+        <requirements>
+            <requirement type="package" version="2.7">python</requirement>
+            <requirement type="package" version="1.65">biopython</requirement>
+            <requirement type="package" version="2.12.1">requests</requirement>
+			<requirement type="package" version="1.2.2">cpt_gffparser</requirement>
+            <yield/>
+        </requirements>
+        <version_command>
+		<![CDATA[
+			cd '$__tool_directory__' && git rev-parse HEAD
+		]]>
+		</version_command>
+    </xml>
+    <xml name="citation/mijalisrasche">
+        <citation type="doi">10.1371/journal.pcbi.1008214</citation>
+        <citation type="bibtex">@unpublished{galaxyTools,
+		author = {E. Mijalis, H. Rasche},
+		title = {CPT Galaxy Tools},
+		year = {2013-2017},
+		note = {https://github.com/tamu-cpt/galaxy-tools/}
+		}
+		</citation>
+    </xml>
+    <xml name="citations">
+        <citations>
+            <citation type="doi">10.1371/journal.pcbi.1008214</citation>
+            <citation type="bibtex">
+			@unpublished{galaxyTools,
+				author = {E. Mijalis, H. Rasche},
+				title = {CPT Galaxy Tools},
+				year = {2013-2017},
+				note = {https://github.com/tamu-cpt/galaxy-tools/}
+			}
+			</citation>
+            <yield/>
+        </citations>
+    </xml>
+    <xml name="citations-crr">
+        <citations>
+            <citation type="doi">10.1371/journal.pcbi.1008214</citation>
+            <citation type="bibtex">
+			@unpublished{galaxyTools,
+				author = {C. Ross},
+				title = {CPT Galaxy Tools},
+				year = {2020-},
+				note = {https://github.com/tamu-cpt/galaxy-tools/}
+			}
+			</citation>
+            <yield/>
+        </citations>
+    </xml>
+    <xml name="citations-2020">
+        <citations>
+            <citation type="doi">10.1371/journal.pcbi.1008214</citation>
+            <citation type="bibtex">
+			@unpublished{galaxyTools,
+				author = {E. Mijalis, H. Rasche},
+				title = {CPT Galaxy Tools},
+				year = {2013-2017},
+				note = {https://github.com/tamu-cpt/galaxy-tools/}
+			}
+			</citation>
+            <citation type="bibtex">
+			@unpublished{galaxyTools,
+				author = {A. Criscione},
+				title = {CPT Galaxy Tools},
+				year = {2019-2021},
+				note = {https://github.com/tamu-cpt/galaxy-tools/}
+			}
+                        </citation>
+            <yield/>
+        </citations>
+    </xml>
+    <xml name="citations-2020-AJC-solo">
+        <citations>
+            <citation type="doi">10.1371/journal.pcbi.1008214</citation>
+            <citation type="bibtex">
+			@unpublished{galaxyTools,
+				author = {A. Criscione},
+				title = {CPT Galaxy Tools},
+				year = {2019-2021},
+				note = {https://github.com/tamu-cpt/galaxy-tools/}
+			}
+                        </citation>
+            <yield/>
+        </citations>
+    </xml>
+    <xml name="citations-clm">
+        <citations>
+            <citation type="doi">10.1371/journal.pcbi.1008214</citation>
+            <citation type="bibtex">
+			@unpublished{galaxyTools,
+				author = {C. Maughmer},
+				title = {CPT Galaxy Tools},
+				year = {2017-2020},
+				note = {https://github.com/tamu-cpt/galaxy-tools/}
+			}
+			</citation>
+            <yield/>
+        </citations>
+    </xml>
+    <xml name="sl-citations-clm">
+        <citation type="bibtex">
+			@unpublished{galaxyTools,
+				author = {C. Maughmer},
+				title = {CPT Galaxy Tools},
+				year = {2017-2020},
+				note = {https://github.com/tamu-cpt/galaxy-tools/}
+			}
+			</citation>
+        <yield/>
+    </xml>
+</macros>
--- a/cpt_sar_finder/SAR_finder.py	Fri Jun 17 13:15:55 2022 +0000
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,44 +0,0 @@
-import sys
-import argparse
-import os
-import re
-from biopython_parsing import FASTA_parser
-from file_operations import fasta_from_SAR_dict, gff3_from_SAR_dict, tab_from_SAR_dict
-from SAR_functions import CheckSequence
-
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser(description="SAR Finder")
-
-    parser.add_argument("fa",type=argparse.FileType("r"),help="organism's multi fasta file")
-
-    parser.add_argument("--min",type=int,default=20,help="minimum size of candidate peptide")
-
-    parser.add_argument("--max",type=int,default=200,help="maximum size of candidate peptide")
-
-    parser.add_argument("--sar_min",type=int,default=15,help="minimum size of candidate peptide TMD domain")
-
-    parser.add_argument("--sar_max",type=int,default=24,help="maximum size of candidate peptide TMD domain")
-
-    parser.add_argument("--out_fa",type=argparse.FileType("w"),help="multifasta output of candidate SAR proteins",default="candidate_SAR.fa")
-
-    parser.add_argument("--out_stat",type=argparse.FileType("w"),help="summary statistic file for candidate SAR proteins, tab separated",default="candidate_SAR_stats.tsv")
-
-    parser.add_argument("--out_gff3",type=argparse.FileType("w"),help="multigff3 file for candidate SAR proteins",default="candidate_SAR.gff3")
-
-    args = parser.parse_args()
-
-    fa_dict = FASTA_parser(fa=args.fa).multifasta_dict()
-
-    sars = {}
-
-    for protein_name, protein_data in fa_dict.items():
-        sar = CheckSequence(protein_name, protein_data)
-        #sar.check_sizes(min=args.min,max=args.max)
-        hydros = sar.shrink_results(sar_min=args.sar_min, sar_max=args.sar_max)
-        sars.update(hydros)
-
-
-    gff3_from_SAR_dict(sars, args.out_gff3)
-    tab_from_SAR_dict(sars,args.out_stat,"SGAT",sar_min=args.sar_min, sar_max=args.sar_max)
-    fasta_from_SAR_dict(sars,args.out_fa)
-    #stat_file_from_SAR_dict(sars,args.out_stat,sar_min=args.sar_min,sar_max=args.sar_max) # fix this whenever ready.
\ No newline at end of file
--- a/cpt_sar_finder/SAR_finder.xml	Fri Jun 17 13:15:55 2022 +0000
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,73 +0,0 @@
-<tool id="edu.tamu.cpt.sar.sar_finder" name="SAR Finder" version="1.0">
-    <description>SAR Domain Finder</description>
-    <macros>
-        <import>macros.xml</import>
-    </macros>
-    <expand macro="requirements">
-    </expand>
-    <command detect_errors="aggressive"><![CDATA[
-python $__tool_directory__/SAR_finder.py
-$fa
---sar_min $sar_min
---sar_max $sar_max
---out_fa $out_fa
---out_gff3 $out_gff3
---out_stat $out_stat
-    ]]></command>
-    <inputs>
-        <param label="Multi FASTA File" name="fa" type="data" format="fasta" />
-        <param label="SAR domain minimal size" name="sar_min" type="integer" value="15" />
-        <param label="SAR domain maximum size" name="sar_max" type="integer" value="20" />
-    </inputs>
-    <outputs>
-        <data format="tabular" name="out_stat" label="candidate_SAR_stats.tsv"/>
-        <data format="fasta" name="out_fa" label="candidate_SAR.fa"/>
-        <data format="gff3" name="out_gff3" label="candidate_SAR.gff3"/>
-    </outputs>
-        <tests>
-            <test>
-                <param name="fa" value="simple-proteins.fa"/>
-                <param name="sar_min" value="15"/>
-                <param name="sar_max" value="20"/>
-                <output name="out_stat" file="candidate_SAR_stats.tsv"/>
-                <output name="out_fa" file="candidate_SAR.fa"/>
-                <output name="out_gff3" file="candidate_SAR.gff3"/>
-            </test>
-        </tests>
-    <help><![CDATA[
-
-**What it does**
-A tool that analyzes the sequence within the first 50 residues of a protein for a weakly hydrophobic domain called Signal-Anchor-Release (aka SAR).
-The tool finds proteins that contain a stretch (default 15-20 residues) of hydrophobic residues (Ile, Leu, Val, Phe, Tyr, Trp, Met, Gly, Ala, Ser) and
-calculates the % Gly/Ala/Ser/Thr residues in the hydrophobic stretch. The net charge on the N-terminus is also displayed to aid in determining the
-SAR orientation in the membrane.[2]
-
-Definition: A Signal-Anchor-Release (SAR) domain is an N-terminal, weakly hydrophobic transmembrane region rich in Gly/Ala and/or Ser (and sometimes Thr) residues.
-The SAR domain is sometimes found in phage lysis proteins, including endolysins and holins. The SAR domain can be released from the membrane in a proton
-motive force-dependent manner. Known SAR domains in phage endolysins often have >50-60% Gly/Ala/Ser/Thr content. SAR endolysins are expected to have a net positive
-charge on the N-terminus by the positive-inside rule.
-
-**INPUT** --> Protein Multi FASTA
-
-**OUTPUT** -->
-
-* Multi FASTA with candidate proteins that pass the SAR domain criteria
-
-* Tabular summary file that lists every subdomain fitting the criteria for each potential SAR domain-containing protein with the following: protein name/sequence/length, SAR length/start/sequence/end, individual and total GAST% content in SAR, and N-terminal sequence/net charge
-
-* Multi GFF3 for unique candidate SAR domain-containing proteins
-
-    ]]></help>
-    <citations>
-        <citation type="doi">10.1371/journal.pcbi.1008214</citation>
-        <citation type="doi">https://dx.doi.org/10.1016/bs.aivir.2018.09.003</citation>
-        <citation type="bibtex">
-            @unpublished{galaxyTools,
-            author = {C. Ross},
-            title = {CPT Galaxy Tools},
-            year = {2020-},
-            note = {https://github.com/tamu-cpt/galaxy-tools/}
-            }
-        </citation>
-    </citations>
-</tool>
--- a/cpt_sar_finder/SAR_functions.py	Fri Jun 17 13:15:55 2022 +0000
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,193 +0,0 @@
-#!/usr/bin/env python
-
-import sys
-import argparse
-import os
-import re
-from Bio import SeqIO
-
-
-class CheckSequence:
-    """
-    SAR endolysin Verification class, which starts with complete FA file, and is shrunk by each function to reveal best candidates of SAR endolysin proteins
-    """
-
-
-    def __init__(self, protein_name, protein_data):
-        self.name = protein_name
-        self.seq = protein_data.seq
-        self.description = protein_data.description
-        self.size = len(self.seq)
-        self.store = {}
-
-
-    def check_sizes(self,min,max):
-        """ check the minimum and maximum peptide lengths """
-        if self.size < min:
-            print("too small")
-        elif self.size > max:
-            print("too large")
-        else:
-            print(f"{self.name} : {self.seq}")
-            return True
-
-
-    def check_hydrophobicity_and_charge(self,sar_min=15,sar_max=20,perc_residues="SGAT"):
-        """ verifies the existence of a hydrophobic region within the sequence """
-        hydrophobic_residues = "['FIWLVMYCATGSP']" # fed through regex
-        hits = self.store
-        pos_res = "RK"
-        neg_res = "DE"
-
-        if self.size > 50:
-            seq = self.seq[0:50]
-        else:
-            seq = self.seq
-        for sar_size in range(sar_min, sar_max, 1):
-            for i in range(0,len(seq)-sar_size,1):
-                sar_seq = str(seq[i:i+sar_size])
-                if re.search((hydrophobic_residues+"{"+str(sar_size)+"}"),sar_seq):
-                    charge_seq, charge, perc_cont, sar_coords, nterm_coords, cterm_coords, sar_start, sar_end = rep_funcs(self,seq,i,pos_res,neg_res,sar_seq,perc_residues,sar_size)
-                    storage_dict(self=self,sar_size=sar_size,sar_seq=sar_seq,hits=hits,charge_seq=charge_seq,charge=charge,perc_cont=perc_cont,nterm_coords=nterm_coords,sar_coords=sar_coords,cterm_coords=cterm_coords,sar_start=sar_start,sar_end=sar_end)
-                    #print("TMDSIZE: {}\tINDEX: {}".format(sar_size,i+1))
-                elif "K" in sar_seq[0] and re.search((hydrophobic_residues+"{"+str(sar_size-1)+"}"),sar_seq[1:]): # check frontend snorkels
-                    charge_seq, charge, perc_cont, sar_coords, nterm_coords, cterm_coords, sar_start, sar_end = rep_funcs(self,seq,i,pos_res,neg_res,sar_seq,perc_residues,sar_size)
-                    storage_dict(self=self,sar_size=sar_size,sar_seq=sar_seq,hits=hits,charge_seq=charge_seq,charge=charge,perc_cont=perc_cont,nterm_coords=nterm_coords,sar_coords=sar_coords,cterm_coords=cterm_coords,sar_start=sar_start,sar_end=sar_end)
-                    #print("TMDSIZE: {}\tINDEX: {}".format(sar_size,i+1))
-                elif "K" in sar_seq[-1] and re.search((hydrophobic_residues+"{"+str(sar_size-1)+"}"),sar_seq[:-1]): # check backend snorkels
-                    charge_seq, charge, perc_cont, sar_coords, nterm_coords, cterm_coords, sar_start, sar_end = rep_funcs(self,seq,i,pos_res,neg_res,sar_seq,perc_residues,sar_size)
-                    storage_dict(self=self,sar_size=sar_size,sar_seq=sar_seq,hits=hits,charge_seq=charge_seq,charge=charge,perc_cont=perc_cont,nterm_coords=nterm_coords,sar_coords=sar_coords,cterm_coords=cterm_coords,sar_start=sar_start,sar_end=sar_end)
-                    #print("TMDSIZE: {}\tINDEX: {}".format(sar_size,i+1))
-                continue
-
-        return hits
-
-    def shrink_results(self,sar_min=15,sar_max=20,perc_residues="SGAT"):
-        """ removes repetiive hits, keeps only the shortest and longest of each SAR domain """
-        compare_candidates = {}
-        hits = self.check_hydrophobicity_and_charge(sar_min=sar_min,sar_max=sar_max)
-        for sar_name, data in hits.items():
-            #print(sar_name)
-            compare_candidates[sar_name] = {}
-            #print("\nThese are the values: {}".format(v))
-            #count_of_times = 0
-            tmd_log = []
-            for sar_size in range(sar_max,sar_min-1,-1):
-                if "TMD_"+str(sar_size) in data:
-                    tmd_log.append(sar_size)
-                    #print(tmd_log)
-                    for idx,the_data in enumerate(data["TMD_"+str(sar_size)]):
-                        #print(the_data[7])
-                        #print(the_data)
-                        #print(f"This is the index: {idx}")
-                        #print(f"This is the list of data at this index: {the_data}")
-                        if the_data[7] in compare_candidates[sar_name]: # index to start
-                            compare_candidates[sar_name][the_data[7]]["count"] += 1
-                            compare_candidates[sar_name][the_data[7]]["size"].append(sar_size)
-                            compare_candidates[sar_name][the_data[7]]["index"].append(idx)
-                        else:
-                            compare_candidates[sar_name][the_data[7]] = {}
-                            compare_candidates[sar_name][the_data[7]]["count"] = 1
-                            compare_candidates[sar_name][the_data[7]]["size"] = [sar_size]
-                            compare_candidates[sar_name][the_data[7]]["index"] = [idx]
-            hits[sar_name]["biggest_sar"] = tmd_log[0]
-        for sar_name, compare_data in compare_candidates.items():
-            for data in compare_data.values():
-                if len(data["size"]) >= 3:
-                    #print(f"{each_size} --> {data}")
-                    minmax = [min(data["size"]),max(data["size"])]
-                    nonminmax = [x for x in data["size"] if x not in minmax]
-                    nonminmax_index = []
-                    for each_nonminmax in nonminmax:
-                        v = data["size"].index(each_nonminmax)
-                        x = data["index"][v]
-                        nonminmax_index.append(x)
-                    nons = zip(nonminmax,nonminmax_index)
-                    for value in nons:
-                        #hits[sar_name]["TMD_"+str(value[0])] = hits[sar_name]["TMD_"+str(value[0])].pop(value[1])
-                        hits[sar_name]["TMD_"+str(value[0])][value[1]] = [""]
-
-        return hits
-
-
-def rep_funcs(self,seq,loc,pos_res,neg_res,sar_seq,perc_residues,sar_size):
-    """ run a set of functions together before sending the results to the storage dictionary """
-
-    charge_seq = str(seq[:loc])
-    charge = charge_check(charge_seq,pos_res,neg_res)
-    perc_cont = percent_calc(sar_seq,perc_residues,int(sar_size))
-    sar_start = loc
-    sar_end = loc + sar_size
-    sar_coords = "{}..{}".format(loc,loc+sar_size)
-    nterm_coords = "{}..{}".format("0",loc-1)
-    cterm_coords = "{}..{}".format(loc+sar_size+1,self.size)
-
-    return charge_seq, charge, perc_cont, sar_coords, nterm_coords, cterm_coords, sar_start, sar_end
-
-
-### Extra "helper" functions
-def storage_dict(self,sar_size,sar_seq,hits,charge_seq,charge,perc_cont,nterm_coords,sar_coords,cterm_coords,sar_start,sar_end): # probably not good to call "self" a param here...definitley not PEP approved...
-    """ organize dictionary for hydrophobicity check """
-    if self.name not in hits:
-        hits[self.name] = {}
-        hits[self.name]["description"] = str(self.description)
-        hits[self.name]["sequence"] = str(self.seq)
-        hits[self.name]["size"] = str(self.size)
-        #GAcont = str((str(self.seq).count("G")+str(self.seq).count("A"))/int(self.size)*100)
-        #hits[self.name]["GAcont"] = "{:.2f}%".format(float(GAcont))
-        if "TMD_"+str(sar_size) not in hits[self.name]:
-            hits[self.name]["TMD_"+str(sar_size)] = []
-            hits[self.name]["TMD_"+str(sar_size)].append([sar_seq,charge_seq,charge,perc_cont,nterm_coords,sar_coords,cterm_coords,sar_start,sar_end])
-        else:
-            hits[self.name]["TMD_"+str(sar_size)].append([sar_seq,charge_seq,charge,perc_cont,nterm_coords,sar_coords,cterm_coords,sar_start,sar_end])
-    else:
-        if "TMD_"+str(sar_size) not in hits[self.name]:
-            hits[self.name]["TMD_"+str(sar_size)] = []
-            hits[self.name]["TMD_"+str(sar_size)].append([sar_seq,charge_seq,charge,perc_cont,nterm_coords,sar_coords,cterm_coords,sar_start,sar_end])
-        else:
-            hits[self.name]["TMD_"+str(sar_size)].append([sar_seq,charge_seq,charge,perc_cont,nterm_coords,sar_coords,cterm_coords,sar_start,sar_end])
-
-
-def percent_calc(sequence,residues,size):
-    """ Calculate the percent of a set of residues within an input sequence """
-    counted = {}
-    for aa in sequence:
-        #print(aa)
-        if aa in counted:
-            counted[aa] += 1
-        else:
-            counted[aa] = 1
-    residue_amt = 0
-    my_ratios = []
-    for res_of_interest in residues:
-        try:
-            residue_amt = counted[res_of_interest]
-        except KeyError:
-            residue_amt = 0
-        ratio = residue_amt/size
-        my_ratios.append((round(ratio*100,2)))
-
-    res_rat = list(zip(residues,my_ratios))
-
-    return res_rat
-
-
-def charge_check(charge_seq,pos_res,neg_res):
-    charge = 0
-    for aa in charge_seq:
-        if aa in pos_res:
-            charge += 1
-        if aa in neg_res:
-            charge -= 1
-    return charge
-
-if __name__ == "__main__":
-    sequence = "MAGBYYYTRLCVRKLRKGGGHP"
-    residues = "YL"
-    size = len(sequence)
-    print(size)
-    v = percent_calc(sequence,residues,size)
-    print(v)
-    for i in v:
-        print(i)
-
--- a/cpt_sar_finder/biopython_parsing.py	Fri Jun 17 13:15:55 2022 +0000
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,24 +0,0 @@
-#!/usr/bin/env python
-# Biopython parsing module. Uses in conjunction with the sar_finder script, and potential future scripts down the line.
-
-from Bio import SeqIO
-
-class FASTA_parser:
-    """ Parses multi fasta file, and zips together header with sequence """
-
-    def __init__(self, fa):
-        self.fa = fa
-
-    def multifasta_dict(self):
-        """ parses the input multi fasta, and puts results into dictionary """
-
-        return SeqIO.to_dict(SeqIO.parse(self.fa,"fasta"))
-
-
-if __name__ == "__main__":
-    fa_file = "test-data/mu-proteins.fa"
-    d = FASTA_parser(fa_file).multifasta_dict()
-    print(d)
-    for k, v in d.items():
-        print(v.description)
-
--- a/cpt_sar_finder/cpt-macros.xml	Fri Jun 17 13:15:55 2022 +0000
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,115 +0,0 @@
-<?xml version="1.0"?>
-<macros>
-	<xml name="gff_requirements">
-		<requirements>
-			<requirement type="package" version="2.7">python</requirement>
-			<requirement type="package" version="1.65">biopython</requirement>
-			<requirement type="package" version="2.12.1">requests</requirement>
-			<yield/>
-		</requirements>
-		<version_command>
-		<![CDATA[
-			cd $__tool_directory__ && git rev-parse HEAD
-		]]>
-		</version_command>
-	</xml>
-	<xml name="citation/mijalisrasche">
-		<citation type="doi">10.1371/journal.pcbi.1008214</citation>
-		<citation type="bibtex">@unpublished{galaxyTools,
-		author = {E. Mijalis, H. Rasche},
-		title = {CPT Galaxy Tools},
-		year = {2013-2017},
-		note = {https://github.com/tamu-cpt/galaxy-tools/}
-		}
-		</citation>
-	</xml>
-	<xml name="citations">
-		<citations>
-			<citation type="doi">10.1371/journal.pcbi.1008214</citation>
-			<citation type="bibtex">
-			@unpublished{galaxyTools,
-				author = {E. Mijalis, H. Rasche},
-				title = {CPT Galaxy Tools},
-				year = {2013-2017},
-				note = {https://github.com/tamu-cpt/galaxy-tools/}
-			}
-			</citation>
-		<yield/>
-		</citations>
-	</xml>
-    	<xml name="citations-crr">
-		<citations>
-			<citation type="doi">10.1371/journal.pcbi.1008214</citation>
-			<citation type="bibtex">
-			@unpublished{galaxyTools,
-				author = {C. Ross},
-				title = {CPT Galaxy Tools},
-				year = {2020-},
-				note = {https://github.com/tamu-cpt/galaxy-tools/}
-			}
-			</citation>
-		<yield/>
-		</citations>
-	</xml>
-        <xml name="citations-2020">
-		<citations>
-			<citation type="doi">10.1371/journal.pcbi.1008214</citation>
-			<citation type="bibtex">
-			@unpublished{galaxyTools,
-				author = {E. Mijalis, H. Rasche},
-				title = {CPT Galaxy Tools},
-				year = {2013-2017},
-				note = {https://github.com/tamu-cpt/galaxy-tools/}
-			}
-			</citation>
-                        <citation type="bibtex">
-			@unpublished{galaxyTools,
-				author = {A. Criscione},
-				title = {CPT Galaxy Tools},
-				year = {2019-2021},
-				note = {https://github.com/tamu-cpt/galaxy-tools/}
-			}
-                        </citation>
-                        <yield/>
-		</citations>
-	</xml>
-        <xml name="citations-2020-AJC-solo">
-		<citations>
-			<citation type="doi">10.1371/journal.pcbi.1008214</citation>
-                        <citation type="bibtex">
-			@unpublished{galaxyTools,
-				author = {A. Criscione},
-				title = {CPT Galaxy Tools},
-				year = {2019-2021},
-				note = {https://github.com/tamu-cpt/galaxy-tools/}
-			}
-                        </citation>
-                        <yield/>
-		</citations>
-	</xml>
-        <xml name="citations-clm">
-		<citations>
-			<citation type="doi">10.1371/journal.pcbi.1008214</citation>
-			<citation type="bibtex">
-			@unpublished{galaxyTools,
-				author = {C. Maughmer},
-				title = {CPT Galaxy Tools},
-				year = {2017-2020},
-				note = {https://github.com/tamu-cpt/galaxy-tools/}
-			}
-			</citation>
-                        <yield/>
-		</citations>
-	</xml>
-        <xml name="sl-citations-clm">
-			<citation type="bibtex">
-			@unpublished{galaxyTools,
-				author = {C. Maughmer},
-				title = {CPT Galaxy Tools},
-				year = {2017-2020},
-				note = {https://github.com/tamu-cpt/galaxy-tools/}
-			}
-			</citation>
-                        <yield/>
-	</xml>
-</macros>
--- a/cpt_sar_finder/file_operations.py	Fri Jun 17 13:15:55 2022 +0000
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,98 +0,0 @@
-
-def fasta_from_SAR_dict(sar_dict,fa_file):
-    """ makes a multi fasta with candidates from SAR dictionary """
-    with fa_file as f:
-        for data in sar_dict.values():
-            f.writelines(">{}\n".format(data["description"]))
-            f.writelines("{}\n".format(data["sequence"]))
-
-def gff3_from_SAR_dict(sar_dict,gff3_file):
-    """ make a multi gff3 with candidates from SAR dictionary """
-    gff3_cols = ["Seqid","Source","Type","Start","End","Score","Strand","Phase","Attributes"]
-    with gff3_file as f:
-        f.writelines(f"{gff3_cols[0]}\t{gff3_cols[1]}\t{gff3_cols[2]}\t{gff3_cols[3]}\t{gff3_cols[4]}\t{gff3_cols[5]}\t{gff3_cols[6]}\t{gff3_cols[7]}\t{gff3_cols[8]}\n")
-        if sar_dict:
-            #print(sar_dict)
-            for name, data in sar_dict.items():
-                min_idx = 0
-                f.writelines("##gff-version 3\n")
-                f.writelines(f"##sequence-region {name}\n")
-                n_start, n_end = split_seq_string(data["TMD_"+str(data["biggest_sar"])][min_idx][4])
-                sar_start, sar_end = split_seq_string(data["TMD_"+str(data["biggest_sar"])][min_idx][5])
-                c_start, c_end = split_seq_string(data["TMD_"+str(data["biggest_sar"])][min_idx][6])
-                f.writelines(f'{name}\tSAR_finder\tTopological domain\t{n_start}\t{n_end}\t.\t.\t.\tNote=N-terminal net charge is {data["TMD_"+str(data["biggest_sar"])][min_idx][2]}\n')
-                f.writelines(f'{name}\tSAR_finder\tSAR domain\t{sar_start}\t{sar_end}\t.\t.\t.\tNote=residue % in SAR {[perc for perc in data["TMD_"+str(data["biggest_sar"])][min_idx][3]]},Total % is {round(sum(j for i,j in data["TMD_"+str(data["biggest_sar"])][min_idx][3]),2)}\n')
-                f.writelines(f'{name}\tSAR_finder\tTopological domain\t{c_start}\t{c_end}\t.\t.\t.\tNote=C-terminus\n')
-        else:
-            f.writelines("##gff-version 3\n")
-            f.writelines(f"##sequence-region\n")
-
-
-def tab_from_SAR_dict(sar_dict,stat_file,hydrophillic_res, sar_min, sar_max):
-    """ convert SAR dict to a dataframe """
-    columns = ["Name","Protein Sequence","Protein Length","SAR Length","SAR Start","Putative SAR Sequence","SAR End",[f"{res}%" for res in hydrophillic_res],"% Total","N-term Sequence","N-term net Charge"] # using different residues for percent calc: [f"{res}%" for res in hydrophillic_res]
-    with stat_file as f:
-        f.writelines(f"{columns[0]}\t{columns[1]}\t{columns[2]}\t{columns[3]}\t{columns[4]}\t{columns[5]}\t{columns[6]}\t{columns[7]}\t{columns[8]}\t{columns[9]}\t{columns[10]}\n")
-        if sar_dict:
-            #print(sar_dict)
-            for name, data in sar_dict.items():
-                for tmd_size in range(sar_max, sar_min-1, -1):
-                    if "TMD_"+str(tmd_size) in data:
-                        for each_match in data["TMD_"+str(tmd_size)]:
-                            if each_match != [""]:
-                                #print(f"{name} - {data}")
-                                #print(each_match)
-                                #for perc in each_match[3]:
-                                #    print(perc)
-                                try:
-                                    f.writelines(f'{name}\t{data["sequence"]}\t{data["size"]}\t{tmd_size}\t{int(each_match[7])+1}\t{each_match[0]}\t{int(each_match[8])+1}\t{[perc for perc in each_match[3]]}\t{round(sum(j for i,j in each_match[3]),2)}\t{each_match[1]}\t{each_match[2]}\n')
-                                except IndexError:
-                                    f.writelines(f'ERROR\tERROR\tERROR\tERROR\tERROR\tERROR\tERROR\tERROR\tERROR\tERROR\tERROR\n')
-                            else:
-                                continue
-
-def stat_file_from_SAR_dict(sar_dict, stat_file, sar_min, sar_max):
-    """ summary statistics from SAR finder function """
-    with stat_file as f:
-        f.writelines("..........:::::: Candidate SAR Proteins ::::::..........\n\n")
-        if sar_dict:
-            for data in sar_dict.values():
-                f.writelines("Protein Description and Name: {}\n".format(data["description"]))
-                f.writelines("Protein Sequence: {}\n".format(data["sequence"]))
-                f.writelines("Protein Length: {}\n".format(data["size"]))
-                f.writelines("SAR Criteria matching region(s)\n")
-                for tmd_size in range(sar_max, sar_min-1, -1):
-                    if "TMD_"+str(tmd_size) in data:
-                        f.writelines("\nSAR length of {}:\n".format(tmd_size))
-                        for each_match in data["TMD_"+str(tmd_size)]:
-                            if each_match != ['']:
-                                f.writelines("\nPotential SAR domain sequence: {}\n".format(each_match[0]))
-                                f.writelines("N-term sequence: {}\n".format(each_match[1]))
-                                f.writelines("N-term net charge: {}\n".format(each_match[2]))
-                                for each_perc_calc in each_match[3]:
-                                    f.writelines("Percent {} content: {}%\n".format(each_perc_calc[0],each_perc_calc[1]))
-                                f.writelines("N-term coords: {}\n".format(each_match[4]))
-                                f.writelines("SAR coords: {}\n".format(each_match[5]))
-                                f.writelines("C-term coords: {}\n".format(each_match[6]))
-                                f.writelines("SAR start: {}\n".format(each_match[7]))
-                            else:
-                                continue
-                f.writelines("========================================================\n\n")
-        else:
-            f.writelines("No candidate SAR Proteins found")
-
-def split_seq_string(input_range, python_indexing=True):
-    """ splits a #..# sequence into the two respective starts and ends, if python indexing, adds 1, otherwise keeps """
-    if python_indexing:
-        values = input_range.split("..")
-        start =int(values[0]) + 1
-        end = int(values[1]) + 1
-    else:
-        values = input_range.split("..")
-        start = values[0]
-        end = values[1]
-
-    return start, end
-
-if __name__ == "__main__":
-    pass
\ No newline at end of file
--- a/cpt_sar_finder/macros.xml	Fri Jun 17 13:15:55 2022 +0000
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,15 +0,0 @@
-<?xml version="1.0"?>
-<macros>
-	<xml name="requirements">
-		<requirements>
-			<requirement type="package" version="3.6.1">python</requirement>
-			<requirement type="package" version="1.67">biopython</requirement>
-			<yield/>
-		</requirements>
-        <version_command>
-		<![CDATA[
-			cd $__tool_directory__ && git rev-parse HEAD
-		]]>
-		</version_command>
-	</xml>
-</macros>
\ No newline at end of file
--- a/cpt_sar_finder/test-data/candidate_SAR.fa	Fri Jun 17 13:15:55 2022 +0000
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,2 +0,0 @@
->SAR-endolysin
-MAGIPKKLKAALLAVTIAGGGVGGYQEMTRQSLIHLENIAYMPYRDIAGVLTVCVGHTGPDIEMRRYSHAECMALLDSDLKPVYAAIDRLVRVPLTPYQKTALATFIFNTGVTAFSKSTLLKKLNAGDYAGARDQMARWVFAAGHKWKGLMNRREVEMAIWNIRGADDLRQ
--- a/cpt_sar_finder/test-data/candidate_SAR.gff3	Fri Jun 17 13:15:55 2022 +0000
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,6 +0,0 @@
-Seqid	Source	Type	Start	End	Score	Strand	Phase	Attributes
-##gff-version 3
-##sequence-region SAR-endolysin
-SAR-endolysin	SAR_finder	Topological domain	1	8	.	.	.	Note=N-terminal net charge is 2
-SAR-endolysin	SAR_finder	SAR domain	9	26	.	.	.	Note=residue % in SAR [('S', 0.0), ('G', 29.41), ('A', 23.53), ('T', 5.88)],Total % is 58.82
-SAR-endolysin	SAR_finder	Topological domain	27	172	.	.	.	Note=C-terminus
--- a/cpt_sar_finder/test-data/candidate_SAR_stats.tsv	Fri Jun 17 13:15:55 2022 +0000
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,6 +0,0 @@
-Name	Protein Sequence	Protein Length	SAR Length	SAR Start	Putative SAR Sequence	SAR End	['S%', 'G%', 'A%', 'T%']	% Total	N-term Sequence	N-term net Charge
-SAR-endolysin	MAGIPKKLKAALLAVTIAGGGVGGYQEMTRQSLIHLENIAYMPYRDIAGVLTVCVGHTGPDIEMRRYSHAECMALLDSDLKPVYAAIDRLVRVPLTPYQKTALATFIFNTGVTAFSKSTLLKKLNAGDYAGARDQMARWVFAAGHKWKGLMNRREVEMAIWNIRGADDLRQ	171	17	9	KAALLAVTIAGGGVGGY	26	[('S', 0.0), ('G', 29.41), ('A', 23.53), ('T', 5.88)]	58.82	MAGIPKKL	2
-SAR-endolysin	MAGIPKKLKAALLAVTIAGGGVGGYQEMTRQSLIHLENIAYMPYRDIAGVLTVCVGHTGPDIEMRRYSHAECMALLDSDLKPVYAAIDRLVRVPLTPYQKTALATFIFNTGVTAFSKSTLLKKLNAGDYAGARDQMARWVFAAGHKWKGLMNRREVEMAIWNIRGADDLRQ	171	16	10	AALLAVTIAGGGVGGY	26	[('S', 0.0), ('G', 31.25), ('A', 25.0), ('T', 6.25)]	62.5	MAGIPKKLK	3
-SAR-endolysin	MAGIPKKLKAALLAVTIAGGGVGGYQEMTRQSLIHLENIAYMPYRDIAGVLTVCVGHTGPDIEMRRYSHAECMALLDSDLKPVYAAIDRLVRVPLTPYQKTALATFIFNTGVTAFSKSTLLKKLNAGDYAGARDQMARWVFAAGHKWKGLMNRREVEMAIWNIRGADDLRQ	171	15	9	KAALLAVTIAGGGVG	24	[('S', 0.0), ('G', 26.67), ('A', 26.67), ('T', 6.67)]	60.01	MAGIPKKL	2
-SAR-endolysin	MAGIPKKLKAALLAVTIAGGGVGGYQEMTRQSLIHLENIAYMPYRDIAGVLTVCVGHTGPDIEMRRYSHAECMALLDSDLKPVYAAIDRLVRVPLTPYQKTALATFIFNTGVTAFSKSTLLKKLNAGDYAGARDQMARWVFAAGHKWKGLMNRREVEMAIWNIRGADDLRQ	171	15	10	AALLAVTIAGGGVGG	25	[('S', 0.0), ('G', 33.33), ('A', 26.67), ('T', 6.67)]	66.67	MAGIPKKLK	3
-SAR-endolysin	MAGIPKKLKAALLAVTIAGGGVGGYQEMTRQSLIHLENIAYMPYRDIAGVLTVCVGHTGPDIEMRRYSHAECMALLDSDLKPVYAAIDRLVRVPLTPYQKTALATFIFNTGVTAFSKSTLLKKLNAGDYAGARDQMARWVFAAGHKWKGLMNRREVEMAIWNIRGADDLRQ	171	15	11	ALLAVTIAGGGVGGY	26	[('S', 0.0), ('G', 33.33), ('A', 20.0), ('T', 6.67)]	60.0	MAGIPKKLKA	3
--- a/cpt_sar_finder/test-data/simple-proteins.fa	Fri Jun 17 13:15:55 2022 +0000
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,7 +0,0 @@
->SAR-endolysin
-MAGIPKKLKAALLAVTIAGGGVGGYQEMTRQSLIHLENIAYMPYRDIAGVLTVCVGHTGP
-DIEMRRYSHAECMALLDSDLKPVYAAIDRLVRVPLTPYQKTALATFIFNTGVTAFSKSTL
-LKKLNAGDYAGARDQMARWVFAAGHKWKGLMNRREVEMAIWNIRGADDLRQ
->CPT_NC_000929.1_038
-MLKIKPAAGKAIRDPLTMKLLASEGEEKPRNSFWIRRLAAGDVVEVGSTENTADDTDAAP
-KKRSKSK
\ No newline at end of file
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/file_operations.py	Mon Jun 05 02:52:57 2023 +0000
@@ -0,0 +1,163 @@
+def fasta_from_SAR_dict(sar_dict, fa_file):
+    """makes a multi fasta with candidates from SAR dictionary"""
+    with fa_file as f:
+        for data in sar_dict.values():
+            f.writelines(">{}\n".format(data["description"]))
+            f.writelines("{}\n".format(data["sequence"]))
+
+
+def gff3_from_SAR_dict(sar_dict, gff3_file):
+    """make a multi gff3 with candidates from SAR dictionary"""
+    gff3_cols = [
+        "Seqid",
+        "Source",
+        "Type",
+        "Start",
+        "End",
+        "Score",
+        "Strand",
+        "Phase",
+        "Attributes",
+    ]
+    with gff3_file as f:
+        f.writelines(
+            f"{gff3_cols[0]}\t{gff3_cols[1]}\t{gff3_cols[2]}\t{gff3_cols[3]}\t{gff3_cols[4]}\t{gff3_cols[5]}\t{gff3_cols[6]}\t{gff3_cols[7]}\t{gff3_cols[8]}\n"
+        )
+        if sar_dict:
+            # print(sar_dict)
+            for name, data in sar_dict.items():
+                min_idx = 0
+                f.writelines("##gff-version 3\n")
+                f.writelines(f"##sequence-region {name}\n")
+                n_start, n_end = split_seq_string(
+                    data["TMD_" + str(data["biggest_sar"])][min_idx][4]
+                )
+                sar_start, sar_end = split_seq_string(
+                    data["TMD_" + str(data["biggest_sar"])][min_idx][5]
+                )
+                c_start, c_end = split_seq_string(
+                    data["TMD_" + str(data["biggest_sar"])][min_idx][6]
+                )
+                f.writelines(
+                    f'{name}\tSAR_finder\tTopological domain\t{n_start}\t{n_end}\t.\t.\t.\tNote=N-terminal net charge is {data["TMD_"+str(data["biggest_sar"])][min_idx][2]}\n'
+                )
+                f.writelines(
+                    f'{name}\tSAR_finder\tSAR domain\t{sar_start}\t{sar_end}\t.\t.\t.\tNote=residue % in SAR {[perc for perc in data["TMD_"+str(data["biggest_sar"])][min_idx][3]]},Total % is {round(sum(j for i,j in data["TMD_"+str(data["biggest_sar"])][min_idx][3]),2)}\n'
+                )
+                f.writelines(
+                    f"{name}\tSAR_finder\tTopological domain\t{c_start}\t{c_end}\t.\t.\t.\tNote=C-terminus\n"
+                )
+        else:
+            f.writelines("##gff-version 3\n")
+            f.writelines(f"##sequence-region\n")
+
+
+def tab_from_SAR_dict(sar_dict, stat_file, hydrophillic_res, sar_min, sar_max):
+    """convert SAR dict to a dataframe"""
+    columns = [
+        "Name",
+        "Protein Sequence",
+        "Protein Length",
+        "SAR Length",
+        "SAR Start",
+        "Putative SAR Sequence",
+        "SAR End",
+        [f"{res}%" for res in hydrophillic_res],
+        "% Total",
+        "N-term Sequence",
+        "N-term net Charge",
+    ]  # using different residues for percent calc: [f"{res}%" for res in hydrophillic_res]
+    with stat_file as f:
+        f.writelines(
+            f"{columns[0]}\t{columns[1]}\t{columns[2]}\t{columns[3]}\t{columns[4]}\t{columns[5]}\t{columns[6]}\t{columns[7]}\t{columns[8]}\t{columns[9]}\t{columns[10]}\n"
+        )
+        if sar_dict:
+            # print(sar_dict)
+            for name, data in sar_dict.items():
+                for tmd_size in range(sar_max, sar_min - 1, -1):
+                    if "TMD_" + str(tmd_size) in data:
+                        for each_match in data["TMD_" + str(tmd_size)]:
+                            if each_match != [""]:
+                                # print(f"{name} - {data}")
+                                # print(each_match)
+                                # for perc in each_match[3]:
+                                #    print(perc)
+                                try:
+                                    f.writelines(
+                                        f'{name}\t{data["sequence"]}\t{data["size"]}\t{tmd_size}\t{int(each_match[7])+1}\t{each_match[0]}\t{int(each_match[8])+1}\t{[perc for perc in each_match[3]]}\t{round(sum(j for i,j in each_match[3]),2)}\t{each_match[1]}\t{each_match[2]}\n'
+                                    )
+                                except IndexError:
+                                    f.writelines(
+                                        f"ERROR\tERROR\tERROR\tERROR\tERROR\tERROR\tERROR\tERROR\tERROR\tERROR\tERROR\n"
+                                    )
+                            else:
+                                continue
+
+
+def stat_file_from_SAR_dict(sar_dict, stat_file, sar_min, sar_max):
+    """summary statistics from SAR finder function"""
+    with stat_file as f:
+        f.writelines("..........:::::: Candidate SAR Proteins ::::::..........\n\n")
+        if sar_dict:
+            for data in sar_dict.values():
+                f.writelines(
+                    "Protein Description and Name: {}\n".format(data["description"])
+                )
+                f.writelines("Protein Sequence: {}\n".format(data["sequence"]))
+                f.writelines("Protein Length: {}\n".format(data["size"]))
+                f.writelines("SAR Criteria matching region(s)\n")
+                for tmd_size in range(sar_max, sar_min - 1, -1):
+                    if "TMD_" + str(tmd_size) in data:
+                        f.writelines("\nSAR length of {}:\n".format(tmd_size))
+                        for each_match in data["TMD_" + str(tmd_size)]:
+                            if each_match != [""]:
+                                f.writelines(
+                                    "\nPotential SAR domain sequence: {}\n".format(
+                                        each_match[0]
+                                    )
+                                )
+                                f.writelines(
+                                    "N-term sequence: {}\n".format(each_match[1])
+                                )
+                                f.writelines(
+                                    "N-term net charge: {}\n".format(each_match[2])
+                                )
+                                for each_perc_calc in each_match[3]:
+                                    f.writelines(
+                                        "Percent {} content: {}%\n".format(
+                                            each_perc_calc[0], each_perc_calc[1]
+                                        )
+                                    )
+                                f.writelines(
+                                    "N-term coords: {}\n".format(each_match[4])
+                                )
+                                f.writelines("SAR coords: {}\n".format(each_match[5]))
+                                f.writelines(
+                                    "C-term coords: {}\n".format(each_match[6])
+                                )
+                                f.writelines("SAR start: {}\n".format(each_match[7]))
+                            else:
+                                continue
+                f.writelines(
+                    "========================================================\n\n"
+                )
+        else:
+            f.writelines("No candidate SAR Proteins found")
+
+
+def split_seq_string(input_range, python_indexing=True):
+    """splits a #..# sequence into the two respective starts and ends, if python indexing, adds 1, otherwise keeps"""
+    if python_indexing:
+        values = input_range.split("..")
+        start = int(values[0]) + 1
+        end = int(values[1]) + 1
+    else:
+        values = input_range.split("..")
+        start = values[0]
+        end = values[1]
+
+    return start, end
+
+
+if __name__ == "__main__":
+    pass
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/macros.xml	Mon Jun 05 02:52:57 2023 +0000
@@ -0,0 +1,74 @@
+<macros>
+    <xml name="requirements">
+        <requirements>
+            <requirement type="package">progressivemauve</requirement>
+            <!--<requirement type="package" version="2.7">python</requirement>-->
+            <requirement type="package" version="0.6.4">bcbiogff</requirement>
+            <yield/>
+        </requirements>
+    </xml>
+    <token name="@WRAPPER_VERSION@">2.4.0</token>
+    <xml name="citation/progressive_mauve">
+        <citation type="doi">10.1371/journal.pone.0011147</citation>
+    </xml>
+    <xml name="citation/gepard">
+        <citation type="doi">10.1093/bioinformatics/btm039</citation>
+    </xml>
+    <token name="@XMFA_INPUT@">
+		'$xmfa'
+	</token>
+    <xml name="xmfa_input" token_formats="xmfa">
+        <param type="data" format="@FORMATS@" name="xmfa" label="XMFA MSA"/>
+    </xml>
+    <token name="@XMFA_FA_INPUT@">
+		'$sequences'
+	</token>
+    <xml name="xmfa_fa_input">
+        <param type="data" format="fasta" name="sequences" label="Sequences in alignment" help="These sequences should be the SAME DATASET that was used in the progressiveMauve run. Failing that, they should be provided in the same order as in original progressiveMauve run"/>
+    </xml>
+    <xml name="genome_selector">
+        <conditional name="reference_genome">
+            <param name="reference_genome_source" type="select" label="Reference Genome">
+                <option value="history" selected="True">From History</option>
+                <option value="cached">Locally Cached</option>
+            </param>
+            <when value="cached">
+                <param name="fasta_indexes" type="select" label="Source FASTA Sequence">
+                    <options from_data_table="all_fasta"/>
+                </param>
+            </when>
+            <when value="history">
+                <param name="genome_fasta" type="data" format="fasta" label="Source FASTA Sequence"/>
+            </when>
+        </conditional>
+    </xml>
+    <xml name="gff3_input">
+        <param label="GFF3 Annotations" name="gff3_data" type="data" format="gff3"/>
+    </xml>
+    <xml name="input/gff3+fasta">
+        <expand macro="gff3_input"/>
+        <expand macro="genome_selector"/>
+    </xml>
+    <token name="@INPUT_GFF@">
+	    '$gff3_data'
+	</token>
+    <token name="@INPUT_FASTA@">
+    #if str($reference_genome.reference_genome_source) == 'cached':
+            '${reference_genome.fasta_indexes.fields.path}'
+    #else if str($reference_genome.reference_genome_source) == 'history':
+            genomeref.fa
+    #end if
+	</token>
+    <token name="@GENOME_SELECTOR_PRE@">
+    #if $reference_genome.reference_genome_source == 'history':
+            ln -s '$reference_genome.genome_fasta' genomeref.fa;
+    #end if
+	</token>
+    <token name="@GENOME_SELECTOR@">
+    #if str($reference_genome.reference_genome_source) == 'cached':
+            '${reference_genome.fasta_indexes.fields.path}'
+    #else if str($reference_genome.reference_genome_source) == 'history':
+            genomeref.fa
+    #end if
+	</token>
+</macros>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/candidate_SAR.fa	Mon Jun 05 02:52:57 2023 +0000
@@ -0,0 +1,2 @@
+>SAR-endolysin
+MAGIPKKLKAALLAVTIAGGGVGGYQEMTRQSLIHLENIAYMPYRDIAGVLTVCVGHTGPDIEMRRYSHAECMALLDSDLKPVYAAIDRLVRVPLTPYQKTALATFIFNTGVTAFSKSTLLKKLNAGDYAGARDQMARWVFAAGHKWKGLMNRREVEMAIWNIRGADDLRQ
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/candidate_SAR.gff3	Mon Jun 05 02:52:57 2023 +0000
@@ -0,0 +1,6 @@
+Seqid	Source	Type	Start	End	Score	Strand	Phase	Attributes
+##gff-version 3
+##sequence-region SAR-endolysin
+SAR-endolysin	SAR_finder	Topological domain	1	8	.	.	.	Note=N-terminal net charge is 2
+SAR-endolysin	SAR_finder	SAR domain	9	26	.	.	.	Note=residue % in SAR [('S', 0.0), ('G', 29.41), ('A', 23.53), ('T', 5.88)],Total % is 58.82
+SAR-endolysin	SAR_finder	Topological domain	27	172	.	.	.	Note=C-terminus
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/candidate_SAR_stats.tsv	Mon Jun 05 02:52:57 2023 +0000
@@ -0,0 +1,6 @@
+Name	Protein Sequence	Protein Length	SAR Length	SAR Start	Putative SAR Sequence	SAR End	['S%', 'G%', 'A%', 'T%']	% Total	N-term Sequence	N-term net Charge
+SAR-endolysin	MAGIPKKLKAALLAVTIAGGGVGGYQEMTRQSLIHLENIAYMPYRDIAGVLTVCVGHTGPDIEMRRYSHAECMALLDSDLKPVYAAIDRLVRVPLTPYQKTALATFIFNTGVTAFSKSTLLKKLNAGDYAGARDQMARWVFAAGHKWKGLMNRREVEMAIWNIRGADDLRQ	171	17	9	KAALLAVTIAGGGVGGY	26	[('S', 0.0), ('G', 29.41), ('A', 23.53), ('T', 5.88)]	58.82	MAGIPKKL	2
+SAR-endolysin	MAGIPKKLKAALLAVTIAGGGVGGYQEMTRQSLIHLENIAYMPYRDIAGVLTVCVGHTGPDIEMRRYSHAECMALLDSDLKPVYAAIDRLVRVPLTPYQKTALATFIFNTGVTAFSKSTLLKKLNAGDYAGARDQMARWVFAAGHKWKGLMNRREVEMAIWNIRGADDLRQ	171	16	10	AALLAVTIAGGGVGGY	26	[('S', 0.0), ('G', 31.25), ('A', 25.0), ('T', 6.25)]	62.5	MAGIPKKLK	3
+SAR-endolysin	MAGIPKKLKAALLAVTIAGGGVGGYQEMTRQSLIHLENIAYMPYRDIAGVLTVCVGHTGPDIEMRRYSHAECMALLDSDLKPVYAAIDRLVRVPLTPYQKTALATFIFNTGVTAFSKSTLLKKLNAGDYAGARDQMARWVFAAGHKWKGLMNRREVEMAIWNIRGADDLRQ	171	15	9	KAALLAVTIAGGGVG	24	[('S', 0.0), ('G', 26.67), ('A', 26.67), ('T', 6.67)]	60.01	MAGIPKKL	2
+SAR-endolysin	MAGIPKKLKAALLAVTIAGGGVGGYQEMTRQSLIHLENIAYMPYRDIAGVLTVCVGHTGPDIEMRRYSHAECMALLDSDLKPVYAAIDRLVRVPLTPYQKTALATFIFNTGVTAFSKSTLLKKLNAGDYAGARDQMARWVFAAGHKWKGLMNRREVEMAIWNIRGADDLRQ	171	15	10	AALLAVTIAGGGVGG	25	[('S', 0.0), ('G', 33.33), ('A', 26.67), ('T', 6.67)]	66.67	MAGIPKKLK	3
+SAR-endolysin	MAGIPKKLKAALLAVTIAGGGVGGYQEMTRQSLIHLENIAYMPYRDIAGVLTVCVGHTGPDIEMRRYSHAECMALLDSDLKPVYAAIDRLVRVPLTPYQKTALATFIFNTGVTAFSKSTLLKKLNAGDYAGARDQMARWVFAAGHKWKGLMNRREVEMAIWNIRGADDLRQ	171	15	11	ALLAVTIAGGGVGGY	26	[('S', 0.0), ('G', 33.33), ('A', 20.0), ('T', 6.67)]	60.0	MAGIPKKLKA	3
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/simple-proteins.fa	Mon Jun 05 02:52:57 2023 +0000
@@ -0,0 +1,7 @@
+>SAR-endolysin
+MAGIPKKLKAALLAVTIAGGGVGGYQEMTRQSLIHLENIAYMPYRDIAGVLTVCVGHTGP
+DIEMRRYSHAECMALLDSDLKPVYAAIDRLVRVPLTPYQKTALATFIFNTGVTAFSKSTL
+LKKLNAGDYAGARDQMARWVFAAGHKWKGLMNRREVEMAIWNIRGADDLRQ
+>CPT_NC_000929.1_038
+MLKIKPAAGKAIRDPLTMKLLASEGEEKPRNSFWIRRLAAGDVVEVGSTENTADDTDAAP
+KKRSKSK
\ No newline at end of file