changeset 3:fd70980a516b draft

planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
author cpt
date Mon, 05 Jun 2023 02:42:01 +0000
parents 1a7fef71aee3
children 673d1776d3b9
files cpt-macros.xml cpt.py cpt_find_spanins/cpt-macros.xml cpt_find_spanins/cpt.py cpt_find_spanins/findSpanin.py cpt_find_spanins/findSpanin.xml cpt_find_spanins/macros.xml cpt_find_spanins/spaninFuncs.py findSpanin.py findSpanin.xml macros.xml spaninFuncs.py
diffstat 12 files changed, 1683 insertions(+), 1509 deletions(-)
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/cpt-macros.xml	Mon Jun 05 02:42:01 2023 +0000
@@ -0,0 +1,114 @@
+<macros>
+    <xml name="gff_requirements">
+        <requirements>
+            <requirement type="package" version="2.7">python</requirement>
+            <requirement type="package" version="1.65">biopython</requirement>
+            <requirement type="package" version="2.12.1">requests</requirement>
+            <yield/>
+        </requirements>
+        <version_command>
+		<![CDATA[
+			cd '$__tool_directory__' && git rev-parse HEAD
+		]]>
+		</version_command>
+    </xml>
+    <xml name="citation/mijalisrasche">
+        <citation type="doi">10.1371/journal.pcbi.1008214</citation>
+        <citation type="bibtex">@unpublished{galaxyTools,
+		author = {E. Mijalis, H. Rasche},
+		title = {CPT Galaxy Tools},
+		year = {2013-2017},
+		note = {https://github.com/tamu-cpt/galaxy-tools/}
+		}
+		</citation>
+    </xml>
+    <xml name="citations">
+        <citations>
+            <citation type="doi">10.1371/journal.pcbi.1008214</citation>
+            <citation type="bibtex">
+			@unpublished{galaxyTools,
+				author = {E. Mijalis, H. Rasche},
+				title = {CPT Galaxy Tools},
+				year = {2013-2017},
+				note = {https://github.com/tamu-cpt/galaxy-tools/}
+			}
+			</citation>
+            <yield/>
+        </citations>
+    </xml>
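+    <!-- Illustrative usage from a tool wrapper (the nested DOI is a
+         hypothetical placeholder; citations supplied in the body replace
+         the <yield/> above):
+         <expand macro="citations">
+             <citation type="doi">10.1000/example.doi</citation>
+         </expand> -->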
+    <xml name="citations-crr">
+        <citations>
+            <citation type="doi">10.1371/journal.pcbi.1008214</citation>
+            <citation type="bibtex">
+			@unpublished{galaxyTools,
+				author = {C. Ross},
+				title = {CPT Galaxy Tools},
+				year = {2020-},
+				note = {https://github.com/tamu-cpt/galaxy-tools/}
+			}
+			</citation>
+            <yield/>
+        </citations>
+    </xml>
+    <xml name="citations-2020">
+        <citations>
+            <citation type="doi">10.1371/journal.pcbi.1008214</citation>
+            <citation type="bibtex">
+			@unpublished{galaxyTools,
+				author = {E. Mijalis, H. Rasche},
+				title = {CPT Galaxy Tools},
+				year = {2013-2017},
+				note = {https://github.com/tamu-cpt/galaxy-tools/}
+			}
+			</citation>
+            <citation type="bibtex">
+			@unpublished{galaxyTools,
+				author = {A. Criscione},
+				title = {CPT Galaxy Tools},
+				year = {2019-2021},
+				note = {https://github.com/tamu-cpt/galaxy-tools/}
+			}
+			</citation>
+            <yield/>
+        </citations>
+    </xml>
+    <xml name="citations-2020-AJC-solo">
+        <citations>
+            <citation type="doi">10.1371/journal.pcbi.1008214</citation>
+            <citation type="bibtex">
+			@unpublished{galaxyTools,
+				author = {A. Criscione},
+				title = {CPT Galaxy Tools},
+				year = {2019-2021},
+				note = {https://github.com/tamu-cpt/galaxy-tools/}
+			}
+			</citation>
+            <yield/>
+        </citations>
+    </xml>
+    <xml name="citations-clm">
+        <citations>
+            <citation type="doi">10.1371/journal.pcbi.1008214</citation>
+            <citation type="bibtex">
+			@unpublished{galaxyTools,
+				author = {C. Maughmer},
+				title = {CPT Galaxy Tools},
+				year = {2017-2020},
+				note = {https://github.com/tamu-cpt/galaxy-tools/}
+			}
+			</citation>
+            <yield/>
+        </citations>
+    </xml>
+    <xml name="sl-citations-clm">
+        <citation type="bibtex">
+			@unpublished{galaxyTools,
+				author = {C. Maughmer},
+				title = {CPT Galaxy Tools},
+				year = {2017-2020},
+				note = {https://github.com/tamu-cpt/galaxy-tools/}
+			}
+			</citation>
+        <yield/>
+    </xml>
+</macros>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/cpt.py	Mon Jun 05 02:42:01 2023 +0000
@@ -0,0 +1,341 @@
+#!/usr/bin/env python
+from Bio.Seq import Seq, reverse_complement, translate
+from Bio.SeqRecord import SeqRecord
+from Bio import SeqIO
+from Bio.Data import CodonTable
+import logging
+
+# NOTE: finditer(..., overlapped=True) below is a feature of the third-party
+# "regex" package (a declared tool dependency), not the stdlib "re" module.
+import regex as re
+
+logging.basicConfig()
+log = logging.getLogger()
+
+PHAGE_IN_MIDDLE = re.compile(r"^(?P<host>.*)\s*phage (?P<phage>.*)$")
+BACTERIOPHAGE_IN_MIDDLE = re.compile(r"^(?P<host>.*)\s*bacteriophage (?P<phage>.*)$")
+STARTS_WITH_PHAGE = re.compile(
+    r"^(bacterio|vibrio|Bacterio|Vibrio|)?[Pp]hage (?P<phage>.*)$"
+)
+NEW_STYLE_NAMES = re.compile(r"(?P<phage>v[A-Z]_[A-Z][a-z]{2}_.*)")
+
+
+def phage_name_parser(name):
+    host = None
+    phage = None
+    name = name.replace(", complete genome.", "")
+    name = name.replace(", complete genome", "")
+
+    m = BACTERIOPHAGE_IN_MIDDLE.match(name)
+    if m:
+        host = m.group("host")
+        phage = m.group("phage")
+        return (host, phage)
+
+    m = PHAGE_IN_MIDDLE.match(name)
+    if m:
+        host = m.group("host")
+        phage = m.group("phage")
+        return (host, phage)
+
+    m = STARTS_WITH_PHAGE.match(name)
+    if m:
+        phage = m.group("phage")
+        return (host, phage)
+
+    m = NEW_STYLE_NAMES.match(name)
+    if m:
+        phage = m.group("phage")
+        return (host, phage)
+
+    return (host, phage)
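+
+# Illustrative results (hypothetical inputs; note the greedy host group can
+# retain a trailing space):
+#   phage_name_parser("Escherichia coli phage T4") -> ("Escherichia coli ", "T4")
+#   phage_name_parser("some unrelated title")      -> (None, None)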
+
+
+class OrfFinder(object):
+    def __init__(self, table, ftype, ends, min_len, strand):
+        self.table = table
+        self.table_obj = CodonTable.ambiguous_generic_by_id[table]
+        self.ends = ends
+        self.ftype = ftype
+        self.min_len = min_len
+        self.starts = sorted(self.table_obj.start_codons)
+        self.stops = sorted(self.table_obj.stop_codons)
+        self.re_starts = re.compile("|".join(self.starts))
+        self.re_stops = re.compile("|".join(self.stops))
+        self.strand = strand
+
+    def locate(self, fasta_file, out_nuc, out_prot, out_bed, out_gff3):
+        seq_format = "fasta"
+        log.debug("Genetic code table %i" % self.table)
+        log.debug("Minimum length %i aa" % self.min_len)
+
+        out_count = 0
+
+        out_gff3.write("##gff-version 3\n")
+
+        for idx, record in enumerate(SeqIO.parse(fasta_file, seq_format)):
+            for i, (f_start, f_end, f_strand, n, t) in enumerate(
+                self.get_all_peptides(str(record.seq).upper())
+            ):
+                out_count += 1
+
+                descr = "length %i aa, %i bp, from %s..%s[%s] of %s" % (
+                    len(t),
+                    len(n),
+                    f_start,
+                    f_end,
+                    f_strand,
+                    record.description,
+                )
+                fid = record.id + "|%s%i" % (self.ftype, i + 1)
+
+                r = SeqRecord(Seq(n), id=fid, name="", description=descr)
+                t = SeqRecord(Seq(t), id=fid, name="", description=descr)
+
+                SeqIO.write(r, out_nuc, "fasta")
+                SeqIO.write(t, out_prot, "fasta")
+
+                nice_strand = "+" if f_strand == +1 else "-"
+
+                out_bed.write(
+                    "\t".join(
+                        map(str, [record.id, f_start, f_end, fid, 0, nice_strand])
+                    )
+                    + "\n"
+                )
+
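+                # The emitted GFF3 row is tab-separated with the start
+                # converted to 1-based; e.g. (hypothetical values):
+                #   seq1  getOrfsOrCds  CDS  10  318  .  +  0  ID=ORF.0.1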
+                out_gff3.write(
+                    "\t".join(
+                        map(
+                            str,
+                            [
+                                record.id,
+                                "getOrfsOrCds",
+                                "CDS",
+                                f_start + 1,
+                                f_end,
+                                ".",
+                                nice_strand,
+                                0,
+                                "ID=%s.%s.%s" % (self.ftype, idx, i + 1),
+                            ],
+                        )
+                    )
+                    + "\n"
+                )
+        log.info("Found %i %ss", out_count, self.ftype)
+
+    def start_chop_and_trans(self, s, strict=True):
+        """Returns offset, trimmed nuc, protein."""
+        if strict:
+            assert s[-3:] in self.stops, s
+        assert len(s) % 3 == 0
+        for match in self.re_starts.finditer(s, overlapped=True):
+            # Must check the start is in frame
+            start = match.start()
+            if start % 3 == 0:
+                n = s[start:]
+                assert len(n) % 3 == 0, "%s is len %i" % (n, len(n))
+                if strict:
+                    t = translate(n, self.table)
+                else:
+                    # Non-strict mode: the ORF may lack a stop codon, so
+                    # force an initial Met and translate up to any stop.
+                    t = "M" + translate(n[3:], self.table, to_stop=True)
+                yield start, n, t  # Edited by CPT to be a generator
+
+    def break_up_frame(self, s):
+        """Returns offset, nuc, protein."""
+        start = 0
+        for match in self.re_stops.finditer(s, overlapped=True):
+            index = match.start() + 3
+            if index % 3 != 0:
+                continue
+            n = s[start:index]
+            for (offset, n, t) in self.start_chop_and_trans(n):
+                if n and len(t) >= self.min_len:
+                    yield start + offset, n, t
+            start = index
+
+    def putative_genes_in_sequence(self, nuc_seq):
+        """Returns start, end, strand, nucleotides, protein.
+        Co-ordinates are Python style zero-based.
+        """
+        nuc_seq = nuc_seq.upper()
+        # TODO - Refactor to use a generator function (in start order)
+        # rather than making a list and sorting?
+        answer = []
+        full_len = len(nuc_seq)
+
+        for frame in range(0, 3):
+            for offset, n, t in self.break_up_frame(nuc_seq[frame:]):
+                start = frame + offset  # zero based
+                answer.append((start, start + len(n), +1, n, t))
+
+        rc = reverse_complement(nuc_seq)
+        for frame in range(0, 3):
+            for offset, n, t in self.break_up_frame(rc[frame:]):
+                start = full_len - frame - offset  # zero based
+                answer.append((start, start - len(n), -1, n, t))
+        answer.sort()
+        return answer
+
+    def get_all_peptides(self, nuc_seq):
+        """Returns start, end, strand, nucleotides, protein.
+
+        Co-ordinates are Python style zero-based.
+        """
+        # Refactored into generator by CPT
+        full_len = len(nuc_seq)
+        if self.strand != "reverse":
+            for frame in range(0, 3):
+                for offset, n, t in self.break_up_frame(nuc_seq[frame:]):
+                    start = frame + offset  # zero based
+                    yield (start, start + len(n), +1, n, t)
+        if self.strand != "forward":
+            rc = reverse_complement(nuc_seq)
+            for frame in range(0, 3):
+                for offset, n, t in self.break_up_frame(rc[frame:]):
+                    start = full_len - frame - offset  # zero based
+                    yield (start - len(n), start, -1, n, t)
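+
+# Minimal usage sketch (hypothetical handles and parameter values; the Galaxy
+# wrapper normally opens the output files and supplies the parameters):
+#   finder = OrfFinder(table=11, ftype="ORF", ends="open", min_len=30,
+#                      strand="both")
+#   finder.locate("input.fasta", out_nuc, out_prot, out_bed, out_gff3)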
+
+
+class MGAFinder(object):
+    def __init__(self, table, ftype, ends, min_len):
+        self.table = table
+        self.table_obj = CodonTable.ambiguous_generic_by_id[table]
+        self.ends = ends
+        self.ftype = ftype
+        self.min_len = min_len
+        self.starts = sorted(self.table_obj.start_codons)
+        self.stops = sorted(self.table_obj.stop_codons)
+        self.re_starts = re.compile("|".join(self.starts))
+        self.re_stops = re.compile("|".join(self.stops))
+
+    def locate(self, fasta_file, out_nuc, out_prot, out_bed, out_gff3):
+        seq_format = "fasta"
+        log.debug("Genetic code table %i" % self.table)
+        log.debug("Minimum length %i aa" % self.min_len)
+
+        out_count = 0
+
+        out_gff3.write("##gff-version 3\n")
+
+        for idx, record in enumerate(SeqIO.parse(fasta_file, seq_format)):
+            for i, (f_start, f_end, f_strand, n, t) in enumerate(
+                self.get_all_peptides(str(record.seq).upper())
+            ):
+                out_count += 1
+
+                descr = "length %i aa, %i bp, from %s..%s[%s] of %s" % (
+                    len(t),
+                    len(n),
+                    f_start,
+                    f_end,
+                    f_strand,
+                    record.description,
+                )
+                fid = record.id + "|%s%i" % (self.ftype, i + 1)
+
+                r = SeqRecord(Seq(n), id=fid, name="", description=descr)
+                t = SeqRecord(Seq(t), id=fid, name="", description=descr)
+
+                SeqIO.write(r, out_nuc, "fasta")
+                SeqIO.write(t, out_prot, "fasta")
+
+                nice_strand = "+" if f_strand == +1 else "-"
+
+                out_bed.write(
+                    "\t".join(
+                        map(str, [record.id, f_start, f_end, fid, 0, nice_strand])
+                    )
+                    + "\n"
+                )
+
+                out_gff3.write(
+                    "\t".join(
+                        map(
+                            str,
+                            [
+                                record.id,
+                                "getOrfsOrCds",
+                                "CDS",
+                                f_start + 1,
+                                f_end,
+                                ".",
+                                nice_strand,
+                                0,
+                                "ID=%s.%s.%s" % (self.ftype, idx, i + 1),
+                            ],
+                        )
+                    )
+                    + "\n"
+                )
+        log.info("Found %i %ss", out_count, self.ftype)
+
+    def start_chop_and_trans(self, s, strict=True):
+        """Returns offset, trimmed nuc, protein."""
+        if strict:
+            assert s[-3:] in self.stops, s
+        assert len(s) % 3 == 0
+        for match in self.re_starts.finditer(s, overlapped=True):
+            # Must check the start is in frame
+            start = match.start()
+            if start % 3 == 0:
+                n = s[start:]
+                assert len(n) % 3 == 0, "%s is len %i" % (n, len(n))
+                if strict:
+                    t = translate(n, self.table)
+                else:
+                    # Non-strict mode: the ORF may lack a stop codon, so
+                    # force an initial Met and translate up to any stop.
+                    t = "M" + translate(n[3:], self.table, to_stop=True)
+                yield start, n, t
+
+    def break_up_frame(self, s):
+        """Returns offset, nuc, protein."""
+        start = 0
+        for match in self.re_stops.finditer(s, overlapped=True):
+            index = match.start() + 3
+            if index % 3 != 0:
+                continue
+            n = s[start:index]
+            for (offset, n, t) in self.start_chop_and_trans(n):
+                if n and len(t) >= self.min_len:
+                    yield start + offset, n, t
+            start = index
+
+    def putative_genes_in_sequence(self, nuc_seq):
+        """Returns start, end, strand, nucleotides, protein.
+        Co-ordinates are Python style zero-based.
+        """
+        nuc_seq = nuc_seq.upper()
+        # TODO - Refactor to use a generator function (in start order)
+        # rather than making a list and sorting?
+        answer = []
+        full_len = len(nuc_seq)
+
+        for frame in range(0, 3):
+            for offset, n, t in self.break_up_frame(nuc_seq[frame:]):
+                start = frame + offset  # zero based
+                answer.append((start, start + len(n), +1, n, t))
+
+        rc = reverse_complement(nuc_seq)
+        for frame in range(0, 3):
+            for offset, n, t in self.break_up_frame(rc[frame:]):
+                start = full_len - frame - offset  # zero based
+                answer.append((start, start - len(n), -1, n, t))
+        answer.sort()
+        return answer
+
+    def get_all_peptides(self, nuc_seq):
+        """Returns start, end, strand, nucleotides, protein.
+
+        Co-ordinates are Python style zero-based.
+        """
+        # Refactored into generator by CPT
+
+        full_len = len(nuc_seq)
+        for frame in range(0, 3):
+            for offset, n, t in self.break_up_frame(nuc_seq[frame:]):
+                start = frame + offset  # zero based
+                yield (start, start + len(n), +1, n, t)
+        rc = reverse_complement(nuc_seq)
+        for frame in range(0, 3):
+            for offset, n, t in self.break_up_frame(rc[frame:]):
+                start = full_len - frame - offset  # zero based
+                yield (start - len(n), start, -1, n, t)
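+
+# MGAFinder mirrors OrfFinder but always scans both strands; usage follows
+# the same sketch as above, minus the strand argument (hypothetical values):
+#   MGAFinder(table=11, ftype="CDS", ends="open", min_len=30).locate(
+#       "input.fasta", out_nuc, out_prot, out_bed, out_gff3)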
--- a/cpt_find_spanins/cpt-macros.xml	Fri Jun 17 12:40:59 2022 +0000
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,115 +0,0 @@
-<?xml version="1.0"?>
-<macros>
-	<xml name="gff_requirements">
-		<requirements>
-			<requirement type="package" version="2.7">python</requirement>
-			<requirement type="package" version="1.65">biopython</requirement>
-			<requirement type="package" version="2.12.1">requests</requirement>
-			<yield/>
-		</requirements>
-		<version_command>
-		<![CDATA[
-			cd $__tool_directory__ && git rev-parse HEAD
-		]]>
-		</version_command>
-	</xml>
-	<xml name="citation/mijalisrasche">
-		<citation type="doi">10.1371/journal.pcbi.1008214</citation>
-		<citation type="bibtex">@unpublished{galaxyTools,
-		author = {E. Mijalis, H. Rasche},
-		title = {CPT Galaxy Tools},
-		year = {2013-2017},
-		note = {https://github.com/tamu-cpt/galaxy-tools/}
-		}
-		</citation>
-	</xml>
-	<xml name="citations">
-		<citations>
-			<citation type="doi">10.1371/journal.pcbi.1008214</citation>
-			<citation type="bibtex">
-			@unpublished{galaxyTools,
-				author = {E. Mijalis, H. Rasche},
-				title = {CPT Galaxy Tools},
-				year = {2013-2017},
-				note = {https://github.com/tamu-cpt/galaxy-tools/}
-			}
-			</citation> 
-		<yield/>
-		</citations>
-	</xml>
-    	<xml name="citations-crr">
-		<citations>
-			<citation type="doi">10.1371/journal.pcbi.1008214</citation>
-			<citation type="bibtex">
-			@unpublished{galaxyTools,
-				author = {C. Ross},
-				title = {CPT Galaxy Tools},
-				year = {2020-},
-				note = {https://github.com/tamu-cpt/galaxy-tools/}
-			}
-			</citation>
-		<yield/>
-		</citations>
-	</xml>
-        <xml name="citations-2020">
-		<citations>
-			<citation type="doi">10.1371/journal.pcbi.1008214</citation>
-			<citation type="bibtex">
-			@unpublished{galaxyTools,
-				author = {E. Mijalis, H. Rasche},
-				title = {CPT Galaxy Tools},
-				year = {2013-2017},
-				note = {https://github.com/tamu-cpt/galaxy-tools/}
-			}
-			</citation>
-                        <citation type="bibtex">
-			@unpublished{galaxyTools,
-				author = {A. Criscione},
-				title = {CPT Galaxy Tools},
-				year = {2019-2021},
-				note = {https://github.com/tamu-cpt/galaxy-tools/}
-			}
-                        </citation>
-                        <yield/>
-		</citations>
-	</xml>
-        <xml name="citations-2020-AJC-solo">
-		<citations>
-			<citation type="doi">10.1371/journal.pcbi.1008214</citation>
-                        <citation type="bibtex">
-			@unpublished{galaxyTools,
-				author = {A. Criscione},
-				title = {CPT Galaxy Tools},
-				year = {2019-2021},
-				note = {https://github.com/tamu-cpt/galaxy-tools/}
-			}
-                        </citation>
-                        <yield/>
-		</citations>
-	</xml>
-        <xml name="citations-clm">
-		<citations>
-			<citation type="doi">10.1371/journal.pcbi.1008214</citation>
-			<citation type="bibtex">
-			@unpublished{galaxyTools,
-				author = {C. Maughmer},
-				title = {CPT Galaxy Tools},
-				year = {2017-2020},
-				note = {https://github.com/tamu-cpt/galaxy-tools/}
-			}
-			</citation>
-                        <yield/>
-		</citations>
-	</xml>
-        <xml name="sl-citations-clm">
-			<citation type="bibtex">
-			@unpublished{galaxyTools,
-				author = {C. Maughmer},
-				title = {CPT Galaxy Tools},
-				year = {2017-2020},
-				note = {https://github.com/tamu-cpt/galaxy-tools/}
-			}
-			</citation>
-                        <yield/>
-	</xml>
-</macros>
--- a/cpt_find_spanins/cpt.py	Fri Jun 17 12:40:59 2022 +0000
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,341 +0,0 @@
-#!/usr/bin/env python
-from Bio.Seq import Seq, reverse_complement, translate
-from Bio.SeqRecord import SeqRecord
-from Bio import SeqIO
-from Bio.Data import CodonTable
-import logging
-
-logging.basicConfig()
-log = logging.getLogger()
-
-PHAGE_IN_MIDDLE = re.compile("^(?P<host>.*)\s*phage (?P<phage>.*)$")
-BACTERIOPHAGE_IN_MIDDLE = re.compile("^(?P<host>.*)\s*bacteriophage (?P<phage>.*)$")
-STARTS_WITH_PHAGE = re.compile(
-    "^(bacterio|vibrio|Bacterio|Vibrio|)?[Pp]hage (?P<phage>.*)$"
-)
-NEW_STYLE_NAMES = re.compile("(?P<phage>v[A-Z]_[A-Z][a-z]{2}_.*)")
-
-
-def phage_name_parser(name):
-    host = None
-    phage = None
-    name = name.replace(", complete genome.", "")
-    name = name.replace(", complete genome", "")
-
-    m = BACTERIOPHAGE_IN_MIDDLE.match(name)
-    if m:
-        host = m.group("host")
-        phage = m.group("phage")
-        return (host, phage)
-
-    m = PHAGE_IN_MIDDLE.match(name)
-    if m:
-        host = m.group("host")
-        phage = m.group("phage")
-        return (host, phage)
-
-    m = STARTS_WITH_PHAGE.match(name)
-    if m:
-        phage = m.group("phage")
-        return (host, phage)
-
-    m = NEW_STYLE_NAMES.match(name)
-    if m:
-        phage = m.group("phage")
-        return (host, phage)
-
-    return (host, phage)
-
-
-class OrfFinder(object):
-    def __init__(self, table, ftype, ends, min_len, strand):
-        self.table = table
-        self.table_obj = CodonTable.ambiguous_generic_by_id[table]
-        self.ends = ends
-        self.ftype = ftype
-        self.min_len = min_len
-        self.starts = sorted(self.table_obj.start_codons)
-        self.stops = sorted(self.table_obj.stop_codons)
-        self.re_starts = re.compile("|".join(self.starts))
-        self.re_stops = re.compile("|".join(self.stops))
-        self.strand = strand
-
-    def locate(self, fasta_file, out_nuc, out_prot, out_bed, out_gff3):
-        seq_format = "fasta"
-        log.debug("Genetic code table %i" % self.table)
-        log.debug("Minimum length %i aa" % self.min_len)
-
-        out_count = 0
-
-        out_gff3.write("##gff-version 3\n")
-
-        for idx, record in enumerate(SeqIO.parse(fasta_file, seq_format)):
-            for i, (f_start, f_end, f_strand, n, t) in enumerate(
-                self.get_all_peptides(str(record.seq).upper())
-            ):
-                out_count += 1
-
-                descr = "length %i aa, %i bp, from %s..%s[%s] of %s" % (
-                    len(t),
-                    len(n),
-                    f_start,
-                    f_end,
-                    f_strand,
-                    record.description,
-                )
-                fid = record.id + "|%s%i" % (self.ftype, i + 1)
-
-                r = SeqRecord(Seq(n), id=fid, name="", description=descr)
-                t = SeqRecord(Seq(t), id=fid, name="", description=descr)
-
-                SeqIO.write(r, out_nuc, "fasta")
-                SeqIO.write(t, out_prot, "fasta")
-
-                nice_strand = "+" if f_strand == +1 else "-"
-
-                out_bed.write(
-                    "\t".join(
-                        map(str, [record.id, f_start, f_end, fid, 0, nice_strand])
-                    )
-                    + "\n"
-                )
-
-                out_gff3.write(
-                    "\t".join(
-                        map(
-                            str,
-                            [
-                                record.id,
-                                "getOrfsOrCds",
-                                "CDS",
-                                f_start + 1,
-                                f_end,
-                                ".",
-                                nice_strand,
-                                0,
-                                "ID=%s.%s.%s" % (self.ftype, idx, i + 1),
-                            ],
-                        )
-                    )
-                    + "\n"
-                )
-        log.info("Found %i %ss", out_count, self.ftype)
-
-    def start_chop_and_trans(self, s, strict=True):
-        """Returns offset, trimmed nuc, protein."""
-        if strict:
-            assert s[-3:] in self.stops, s
-        assert len(s) % 3 == 0
-        for match in self.re_starts.finditer(s, overlapped=True):
-            # Must check the start is in frame
-            start = match.start()
-            if start % 3 == 0:
-                n = s[start:]
-                assert len(n) % 3 == 0, "%s is len %i" % (n, len(n))
-                if strict:
-                    t = translate(n, self.table)
-                else:
-                    # Use when missing stop codon,
-                    t = "M" + translate(n[3:], self.table, to_stop=True)
-                yield start, n, t  # Edited by CPT to be a generator
-
-    def break_up_frame(self, s):
-        """Returns offset, nuc, protein."""
-        start = 0
-        for match in self.re_stops.finditer(s, overlapped=True):
-            index = match.start() + 3
-            if index % 3 != 0:
-                continue
-            n = s[start:index]
-            for (offset, n, t) in self.start_chop_and_trans(n):
-                if n and len(t) >= self.min_len:
-                    yield start + offset, n, t
-            start = index
-
-    def putative_genes_in_sequence(self, nuc_seq):
-        """Returns start, end, strand, nucleotides, protein.
-        Co-ordinates are Python style zero-based.
-        """
-        nuc_seq = nuc_seq.upper()
-        # TODO - Refactor to use a generator function (in start order)
-        # rather than making a list and sorting?
-        answer = []
-        full_len = len(nuc_seq)
-
-        for frame in range(0, 3):
-            for offset, n, t in self.break_up_frame(nuc_seq[frame:]):
-                start = frame + offset  # zero based
-                answer.append((start, start + len(n), +1, n, t))
-
-        rc = reverse_complement(nuc_seq)
-        for frame in range(0, 3):
-            for offset, n, t in self.break_up_frame(rc[frame:]):
-                start = full_len - frame - offset  # zero based
-                answer.append((start, start - len(n), -1, n, t))
-        answer.sort()
-        return answer
-
-    def get_all_peptides(self, nuc_seq):
-        """Returns start, end, strand, nucleotides, protein.
-
-        Co-ordinates are Python style zero-based.
-        """
-        # Refactored into generator by CPT
-        full_len = len(nuc_seq)
-        if self.strand != "reverse":
-            for frame in range(0, 3):
-                for offset, n, t in self.break_up_frame(nuc_seq[frame:]):
-                    start = frame + offset  # zero based
-                    yield (start, start + len(n), +1, n, t)
-        if self.strand != "forward":
-            rc = reverse_complement(nuc_seq)
-            for frame in range(0, 3):
-                for offset, n, t in self.break_up_frame(rc[frame:]):
-                    start = full_len - frame - offset  # zero based
-                    yield (start - len(n), start, -1, n, t)
-
-
-class MGAFinder(object):
-    def __init__(self, table, ftype, ends, min_len):
-        self.table = table
-        self.table_obj = CodonTable.ambiguous_generic_by_id[table]
-        self.ends = ends
-        self.ftype = ftype
-        self.min_len = min_len
-        self.starts = sorted(self.table_obj.start_codons)
-        self.stops = sorted(self.table_obj.stop_codons)
-        self.re_starts = re.compile("|".join(self.starts))
-        self.re_stops = re.compile("|".join(self.stops))
-
-    def locate(self, fasta_file, out_nuc, out_prot, out_bed, out_gff3):
-        seq_format = "fasta"
-        log.debug("Genetic code table %i" % self.table)
-        log.debug("Minimum length %i aa" % self.min_len)
-
-        out_count = 0
-
-        out_gff3.write("##gff-version 3\n")
-
-        for idx, record in enumerate(SeqIO.parse(fasta_file, seq_format)):
-            for i, (f_start, f_end, f_strand, n, t) in enumerate(
-                self.get_all_peptides(str(record.seq).upper())
-            ):
-                out_count += 1
-
-                descr = "length %i aa, %i bp, from %s..%s[%s] of %s" % (
-                    len(t),
-                    len(n),
-                    f_start,
-                    f_end,
-                    f_strand,
-                    record.description,
-                )
-                fid = record.id + "|%s%i" % (self.ftype, i + 1)
-
-                r = SeqRecord(Seq(n), id=fid, name="", description=descr)
-                t = SeqRecord(Seq(t), id=fid, name="", description=descr)
-
-                SeqIO.write(r, out_nuc, "fasta")
-                SeqIO.write(t, out_prot, "fasta")
-
-                nice_strand = "+" if f_strand == +1 else "-"
-
-                out_bed.write(
-                    "\t".join(
-                        map(str, [record.id, f_start, f_end, fid, 0, nice_strand])
-                    )
-                    + "\n"
-                )
-
-                out_gff3.write(
-                    "\t".join(
-                        map(
-                            str,
-                            [
-                                record.id,
-                                "getOrfsOrCds",
-                                "CDS",
-                                f_start + 1,
-                                f_end,
-                                ".",
-                                nice_strand,
-                                0,
-                                "ID=%s.%s.%s" % (self.ftype, idx, i + 1),
-                            ],
-                        )
-                    )
-                    + "\n"
-                )
-        log.info("Found %i %ss", out_count, self.ftype)
-
-    def start_chop_and_trans(self, s, strict=True):
-        """Returns offset, trimmed nuc, protein."""
-        if strict:
-            assert s[-3:] in self.stops, s
-        assert len(s) % 3 == 0
-        for match in self.re_starts.finditer(s, overlapped=True):
-            # Must check the start is in frame
-            start = match.start()
-            if start % 3 == 0:
-                n = s[start:]
-                assert len(n) % 3 == 0, "%s is len %i" % (n, len(n))
-                if strict:
-                    t = translate(n, self.table)
-                else:
-                    # Use when missing stop codon,
-                    t = "M" + translate(n[3:], self.table, to_stop=True)
-                yield start, n, t
-
-    def break_up_frame(self, s):
-        """Returns offset, nuc, protein."""
-        start = 0
-        for match in self.re_stops.finditer(s, overlapped=True):
-            index = match.start() + 3
-            if index % 3 != 0:
-                continue
-            n = s[start:index]
-            for (offset, n, t) in self.start_chop_and_trans(n):
-                if n and len(t) >= self.min_len:
-                    yield start + offset, n, t
-            start = index
-
-    def putative_genes_in_sequence(self, nuc_seq):
-        """Returns start, end, strand, nucleotides, protein.
-        Co-ordinates are Python style zero-based.
-        """
-        nuc_seq = nuc_seq.upper()
-        # TODO - Refactor to use a generator function (in start order)
-        # rather than making a list and sorting?
-        answer = []
-        full_len = len(nuc_seq)
-
-        for frame in range(0, 3):
-            for offset, n, t in self.break_up_frame(nuc_seq[frame:]):
-                start = frame + offset  # zero based
-                answer.append((start, start + len(n), +1, n, t))
-
-        rc = reverse_complement(nuc_seq)
-        for frame in range(0, 3):
-            for offset, n, t in self.break_up_frame(rc[frame:]):
-                start = full_len - frame - offset  # zero based
-                answer.append((start, start - len(n), -1, n, t))
-        answer.sort()
-        return answer
-
-    def get_all_peptides(self, nuc_seq):
-        """Returns start, end, strand, nucleotides, protein.
-
-        Co-ordinates are Python style zero-based.
-        """
-        # Refactored into generator by CPT
-
-        full_len = len(nuc_seq)
-        for frame in range(0, 3):
-            for offset, n, t in self.break_up_frame(nuc_seq[frame:]):
-                start = frame + offset  # zero based
-                yield (start, start + len(n), +1, n, t)
-        rc = reverse_complement(nuc_seq)
-        for frame in range(0, 3):
-            for offset, n, t in self.break_up_frame(rc[frame:]):
-                start = full_len - frame - offset  # zero based
-                yield (start - len(n), start, -1, n, t)
--- a/cpt_find_spanins/findSpanin.py	Fri Jun 17 12:40:59 2022 +0000
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,477 +0,0 @@
-##### findSpanin.pl --> findSpanin.py
-######### Much of this code is very "blocked", in the sense that one thing happens...then a function happens on the return...then another function...etc...etc...
-
-import argparse
-import os
-import re # new
-import itertools # new
-from collections import Counter, OrderedDict
-from spaninFuncs import getDescriptions, grabLocs, spaninProximity, splitStrands, tuple_fasta, lineWrapper
-
-### Requirement Inputs
-#### INPUT : putative_isp.fa & putative_osp.fa (in that order)
-#### PARAMETERS :
-
-###############################################################################
-def write_output(candidates):
-    """ output file function...maybe not needed """
-    pass
-
-def reconfigure_dict(spanins):
-    """
-    re organizes dictionary to be more friendly for checks
-    """
-
-    new_spanin_dict = {}
-
-    for each_spanin_type, data_dict in spanins.items():
-        #print(f"{each_spanin_type} == {data_dict}")
-        new_spanin_dict[each_spanin_type] = {}
-        new_spanin_dict[each_spanin_type]['positive'] = {}
-        new_spanin_dict[each_spanin_type]['negative'] = {}
-        new_spanin_dict[each_spanin_type]['positive']['coords'] = []
-        new_spanin_dict[each_spanin_type]['negative']['coords'] = []
-        for outter_orf, inner_data in data_dict.items():
-            list_of_hits = []
-            for data_content in inner_data:
-                #print(data_content)
-                data_content.insert(0, outter_orf)
-                #print(f"new data_content -> {data_content}")
-                #print(data_content)
-                #list_of_hits += [data_content]
-                #new_spanin_dict[each_spanin_type] += [data_content]
-                if data_content[6] == "+":
-                    #print(f"{each_spanin_type} @ POSITIVE")
-                    new_spanin_dict[each_spanin_type]['positive']['coords'] += [data_content]
-                elif data_content[6] == "-":
-                    #print(f"{each_spanin_type} @ NEGATIVE")
-                    new_spanin_dict[each_spanin_type]['negative']['coords'] += [data_content]
-        #print(new_spanin_dict[each_spanin_type])
-            #print(reorganized)
-            #print(f"{outter_orf} => {inner_data}")
-            #print(new_spanin_dict)
-        
-        #print('\n')
-    #for k, v in new_spanin_dict.items():
-        #print(k)
-        #print(v)
-    return new_spanin_dict
-
-
-
-def check_for_uniques(spanins):
-    """
-    Checks for unique spanins based on spanin_type.
-    If the positive strand end site is _the same_ for a i-spanin, we would group that as "1".
-    i.e. if ORF1, ORF2, and ORF3 all ended with location 4231, they would not be unique.
-    """
-    pair_dict = {}
-    pair_dict = {
-        'pairs' : {
-            'location_amount' : [],
-            'pair_number' : {},
-        }
-    }
-    for each_spanin_type, spanin_data in spanins.items():
-        #print(f"{each_spanin_type} ===> {spanin_data}")
-        # early declarations for cases of no results
-        pos_check = [] # end checks
-        pos_uniques = []
-        neg_check = [] # start checks
-        neg_uniques = []
-        unique_ends = []
-        pos_amt_unique = 0
-        neg_amt_unique = 0
-        amt_positive = 0
-        amt_negative = 0
-        spanin_data['uniques'] = 0
-        spanin_data['amount'] = 0
-        #spanin_data['positive']['amt_positive'] = 0
-        #spanin_data['positive']['pos_amt_unique'] = 0
-        #spanin_data['positive']['isp_match'] = []
-        #spanin_data['negative']['amt_negative'] = 0
-        #spanin_data['negative']['neg_amt_unique'] = 0
-        #spanin_data['negative']['isp_match'] = []
-        #print(spanin_data)
-        if spanin_data['positive']['coords']:
-            # do something...
-            #print('in other function')
-            #print(spanin_data['positive']['coords'])
-            for each_hit in spanin_data['positive']['coords']:
-                pos_check.append(each_hit[2])
-                pair_dict['pairs']['location_amount'].append(each_hit[2])
-                pos_uniques = list(set([end_site for end_site in pos_check if pos_check.count(end_site) >= 1]))
-            #print(pos_check)
-            #print(pos_uniques)
-            amt_positive = len(spanin_data['positive']['coords'])
-            pos_amt_unique = len(pos_uniques)
-            if amt_positive:
-                spanin_data['positive']['amt_positive'] = amt_positive
-                spanin_data['positive']['pos_amt_unique'] = pos_amt_unique
-            #pair_dict['pairs']['locations'].extend(pos_uniques)
-        else:
-            spanin_data['positive']['amt_positive'] = 0
-            spanin_data['positive']['pos_amt_unique'] = 0
-        if spanin_data['negative']['coords']:
-
-            # do something else...
-            #print('in other function')
-            #print(spanin_data['negative']['coords'])
-            for each_hit in spanin_data['negative']['coords']:
-                neg_check.append(each_hit[1])
-                pair_dict['pairs']['location_amount'].append(each_hit[1])
-                neg_uniques = list(set([start_site for start_site in neg_check if neg_check.count(start_site) >= 1]))
-            #print(neg_uniques)
-            amt_negative = len(spanin_data['negative']['coords'])
-            neg_amt_unique = len(neg_uniques)
-            if amt_negative:
-                spanin_data['negative']['amt_negative'] = amt_negative
-                spanin_data['negative']['neg_amt_unique'] = neg_amt_unique
-            #pair_dict['pairs']['locations'].extend(neg_uniques)
-        else:
-            spanin_data['negative']['amt_negative'] = 0
-            spanin_data['negative']['neg_amt_unique'] = 0
-        spanin_data['uniques'] += (spanin_data['positive']['pos_amt_unique'] + spanin_data['negative']['neg_amt_unique'])
-        spanin_data['amount'] += (spanin_data['positive']['amt_positive'] + spanin_data['negative']['amt_negative'])
-        #print(spanin_data['uniques'])
-    list(set(pair_dict['pairs']['location_amount']))
-    pair_dict['pairs']['location_amount'] = dict(Counter(pair_dict['pairs']['location_amount']))
-    for data in pair_dict.values():
-        #print(data['locations'])
-        #print(type(data['locations']))
-        v = 0
-        for loc, count in data['location_amount'].items():
-            #data['pair_number'] = {loc
-            v += 1
-            data['pair_number'][loc] = v
-    #print(dict(Counter(pair_dict['pairs']['locations'])))
-    #print(pair_dict)
-    spanins['total_amount'] = spanins['EMBEDDED']['amount'] + spanins['SEPARATED']['amount'] + spanins['OVERLAPPED']['amount']
-    spanins['total_unique'] = spanins['EMBEDDED']['uniques'] + spanins['SEPARATED']['uniques'] + spanins['OVERLAPPED']['uniques']
-    #spanins['total_unique'] = len(pair_dict['pairs']['pair_number'])
-    return spanins, pair_dict
-
-if __name__ == "__main__":
-
-    # Common parameters for both ISP / OSP portion of script
-
-    parser = argparse.ArgumentParser(
-        description="Trim the putative protein candidates and find potential i-spanin / o-spanin pairs"
-    )
-
-    parser.add_argument(
-        "putative_isp_fasta_file",
-        type=argparse.FileType("r"),
-        help='Putative i-spanin FASTA file, output of "generate-putative-isp"',
-    )  # the "input" argument
-
-    parser.add_argument(
-        "putative_osp_fasta_file",
-        type=argparse.FileType("r"),
-        help='Putative o-spanin FASTA file, output of "generate-putative-osp"',
-    )
-
-    parser.add_argument(
-        "--max_isp_osp_distance",
-        dest="max_isp_osp_distance",
-        default=10,
-        type=int,
-        help="max distance from end of i-spanin to start of o-spanin, measured in AAs",
-    )
-
-    parser.add_argument(
-        "--embedded_txt",
-        dest="embedded_txt",
-        type=argparse.FileType("w"),
-        default="_findSpanin_embedded_results.txt",
-        help="Results of potential embedded spanins",
-    )
-    parser.add_argument(
-        "--overlap_txt",
-        dest="overlap_txt",
-        type=argparse.FileType("w"),
-        default="_findSpanin_overlap_results.txt",
-        help="Results of potential overlapping spanins",
-    )
-    parser.add_argument(
-        "--separate_txt",
-        dest="separate_txt",
-        type=argparse.FileType("w"),
-        default="_findSpanin_separated_results.txt",
-        help="Results of potential separated spanins",
-    )
-
-    parser.add_argument(
-        "--summary_txt",
-        dest="summary_txt",
-        type=argparse.FileType("w"),
-        default="_findSpanin_summary.txt",
-        help="Results of potential spanin pairs",
-    )
-    parser.add_argument(
-        "-v", action="version", version="0.3.0"
-    )  # Is this manually updated?
-    args = parser.parse_args()
-
-
-    #### RE-WRITE
-    SPANIN_TYPES = {}
-    SPANIN_TYPES['EMBEDDED'] = {}
-    SPANIN_TYPES['OVERLAPPED'] = {}
-    SPANIN_TYPES['SEPARATED'] = {}
-    #SPANIN_TYPES = {
-    #    'EMBEDDED' : {},
-    #    'OVERLAPPED' : {},
-    #    'SEPARATED' : {},
-    #}
-
-    isp = getDescriptions(args.putative_isp_fasta_file)
-    args.putative_isp_fasta_file = open(args.putative_isp_fasta_file.name, "r")
-    isp_full = tuple_fasta(args.putative_isp_fasta_file)
-
-    osp = getDescriptions(args.putative_osp_fasta_file)
-    args.putative_osp_fasta_file = open(args.putative_osp_fasta_file.name, "r")
-    osp_full = tuple_fasta(args.putative_osp_fasta_file)
-
-    #### location data
-    location_data = {
-        'isp' : [],
-        'osp' : []
-    }
-    spanins = [isp, osp]
-    for idx, each_spanin_type in enumerate(spanins):
-        for description in each_spanin_type:
-            locations = grabLocs(description)
-            if idx == 0: # i-spanin
-                location_data['isp'].append(locations)
-            elif idx == 1: # o-spanin
-                location_data['osp'].append(locations)
-
-    #### Check for types of spanins
-    embedded, overlap, separate = spaninProximity(
-                                        isp=location_data['isp'],
-                                        osp=location_data['osp'], 
-                                        max_dist=args.max_isp_osp_distance * 3
-                                    )
-    
-    SPANIN_TYPES['EMBEDDED'] = embedded
-    SPANIN_TYPES['OVERLAPPED'] = overlap
-    SPANIN_TYPES['SEPARATED'] = separate
-
-    #for spanin_type, spanin in SPANIN_TYPES.items():
-    #    s = 0
-    #    for sequence in spanin.values():
-    #        s += len(sequence)
-    #    SPANIN_TYPES[spanin_type]['amount'] = s
-    #    SPANIN_TYPES[spanin_type]['unique'] = len(spanin.keys())
-    
-    #check_for_unique_spanins(SPANIN_TYPES)
-    spanins = reconfigure_dict(SPANIN_TYPES)
-    spanins, pair_dict = check_for_uniques(spanins)
-    #print(pair_dict)
-    with args.summary_txt as f:
-        for each_spanin_type, spanin_data in spanins.items():
-            try:
-                if each_spanin_type not in ["total_amount","total_unique"]:
-                    #print(each_spanin_type)
-                    #print(each_spanin_type)
-                    f.write("=~~~~~= "+str(each_spanin_type) +" Spanin Candidate Statistics =~~~~~=\n")
-                    f.writelines("Total Candidate Pairs = "+str(spanin_data['amount'])+"\n")
-                    f.writelines("Total Unique Pairs = "+str(spanin_data['uniques'])+"\n")
-                    if each_spanin_type == "EMBEDDED":
-                        for k, v in SPANIN_TYPES['EMBEDDED'].items():
-                            #print(k)
-                            f.writelines(""+str(k)+" ==> Amount of corresponding candidate o-spanins(s): "+str(len(v))+"\n")
-                    if each_spanin_type == "SEPARATED":
-                        for k, v in SPANIN_TYPES['SEPARATED'].items():
-                            f.writelines(""+str(k)+ " ==> Amount of corresponding candidate o-spanins(s): "+str(len(v))+"\n")
-                    if each_spanin_type == "OVERLAPPED":
-                        for k, v in SPANIN_TYPES['OVERLAPPED'].items():
-                            f.writelines(""+str(k)+" ==> Amount of corresponding candidate o-spanins(s): "+str(len(v))+"\n")
-            except TypeError:
-                continue
-        f.write("\n=~~~~~= Tally from ALL spanin types =~~~~~=\n")
-        f.writelines("Total Candidates = "+str(spanins['total_amount'])+"\n")
-        f.writelines("Total Unique Candidate Pairs = "+str(spanins['total_unique'])+"\n")
-
-    args.putative_isp_fasta_file = open(args.putative_isp_fasta_file.name, "r")
-    isp_full = tuple_fasta(args.putative_isp_fasta_file)
-
-    args.putative_osp_fasta_file = open(args.putative_osp_fasta_file.name, "r")
-    osp_full = tuple_fasta(args.putative_osp_fasta_file)
-
-    #print(isp_full)
-    isp_seqs = []
-    osp_seqs = []
-    for isp_tupe in isp_full:
-        #print(isp_tupe)
-        for pisp, posp in embedded.items():
-            #print(f"ISP = searching for {pisp} in {isp_tupe[0]}")
-            if re.search(("("+str(pisp)+")\D"), isp_tupe[0]):
-                #print(isp_tupe[0])
-                #print(peri_count)
-                peri_count = str.split(isp_tupe[0],"~=")[1]
-                isp_seqs.append((pisp,isp_tupe[1],peri_count))
-    #print(isp_seqs)
-    for osp_tupe in osp_full:
-        for pisp, posp in embedded.items():
-            for data in posp:
-                #print(f"OSP = searching for {data[3]} in {osp_tupe[0]}, coming from this object: {data}")
-                if re.search(("("+str(data[3])+")\D"), osp_tupe[0]):
-                    peri_count = str.split(osp_tupe[0],"~=")[1]
-                    osp_seqs.append((data[3],osp_tupe[1],peri_count))
-
-    with args.embedded_txt as f:
-        f.write("================ embedded spanin candidates =================\n")
-        f.write("isp\tisp_start\tisp_end\tosp\tosp_start\tosp_end\tstrand\tpair_number\n")
-        if embedded != {}:
-            #print(embedded)
-            for pisp, posp in embedded.items():
-                #print(f"{pisp} - {posp}")
-                f.write(pisp + "\n")
-                for each_posp in posp:
-                    #print(posp)
-                    f.write(
-                        "\t{}\t{}\t{}\t{}\t{}\t{}\t".format(
-                            each_posp[1],
-                            each_posp[2],
-                            each_posp[3],
-                            each_posp[4],
-                            each_posp[5],
-                            each_posp[6],
-                        )
-                    )
-                    if each_posp[6] == "+":
-                        if each_posp[2] in pair_dict['pairs']['pair_number'].keys():
-                            f.write(""+str(pair_dict['pairs']['pair_number'][each_posp[2]])+"\n")
-                    elif each_posp[6] == "-":
-                        if each_posp[1] in pair_dict['pairs']['pair_number'].keys():
-                            f.write(""+str(pair_dict['pairs']['pair_number'][each_posp[1]])+"\n")
-        else:
-            f.write("nothing found")
-
-    with open(args.embedded_txt.name, "a") as f:
-        f.write("\n================= embedded candidate sequences ================\n")
-        f.write("======================= isp ==========================\n\n")
-        for isp_data in isp_seqs:
-            #print(isp_data)
-            f.write(">isp_orf::{}-peri_count~={}\n{}\n".format(isp_data[0],isp_data[2],lineWrapper(isp_data[1])))
-        f.write("\n======================= osp ========================\n\n")
-        for osp_data in osp_seqs:
-            f.write(">osp_orf::{}-peri_count~={}\n{}\n".format(osp_data[0],osp_data[2],lineWrapper(osp_data[1])))
-
-    args.putative_isp_fasta_file = open(args.putative_isp_fasta_file.name, "r")
-    isp_full = tuple_fasta(args.putative_isp_fasta_file)
-
-    args.putative_osp_fasta_file = open(args.putative_osp_fasta_file.name, "r")
-    osp_full = tuple_fasta(args.putative_osp_fasta_file)
-
-    isp_seqs = []
-    osp_seqs = []
-    for isp_tupe in isp_full:
-        peri_count = str.split(isp_tupe[0],"~=")[1]
-        for pisp, posp in overlap.items():
-            if re.search(("("+str(pisp)+")\D"), isp_tupe[0]):
-                peri_count = str.split(isp_tupe[0],"~=")[1]
-                isp_seqs.append((pisp,isp_tupe[1],peri_count))
-
-    for osp_tupe in osp_full:
-        for pisp, posp in overlap.items():
-            for data in posp:
-                if re.search(("("+str(data[3])+")\D"), osp_tupe[0]):
-                    peri_count = str.split(osp_tupe[0],"~=")[1]
-                    osp_seqs.append((data[3],osp_tupe[1],peri_count))
-
-
-    
-    with args.overlap_txt as f:
-        f.write("================ overlap spanin candidates =================\n")
-        f.write("isp\tisp_start\tisp_end\tosp\tosp_start\tosp_end\tstrand\tpair_number\n")
-        if overlap != {}:
-            for pisp, posp in overlap.items():
-                f.write(pisp + "\n")
-                for each_posp in posp:
-                    f.write(
-                        "\t{}\t{}\t{}\t{}\t{}\t{}\t".format(
-                            each_posp[1],
-                            each_posp[2],
-                            each_posp[3],
-                            each_posp[4],
-                            each_posp[5],
-                            each_posp[6],
-                        )
-                    )
-                    if each_posp[6] == "+":
-                        if each_posp[2] in pair_dict['pairs']['pair_number'].keys():
-                            #print('ovl ; +')
-                            f.write(""+str(pair_dict['pairs']['pair_number'][each_posp[2]])+"\n")
-                    elif each_posp[6] == "-":
-                        if each_posp[1] in pair_dict['pairs']['pair_number'].keys():
-                            f.write(""+str(pair_dict['pairs']['pair_number'][each_posp[1]])+"\n")
-        else:
-            f.write("nothing found")
-
-    with open(args.overlap_txt.name, "a") as f:
-        #print(isp_seqs)
-        f.write("\n================= overlap candidate sequences ================\n")
-        f.write("======================= isp ==========================\n\n")
-        for isp_data in isp_seqs:
-            f.write(">isp_orf::{}-pericount~={}\n{}\n".format(isp_data[0],isp_data[2],lineWrapper(isp_data[1])))
-        f.write("\n======================= osp ========================\n\n")
-        for osp_data in osp_seqs:
-            f.write(">osp_orf::{}-pericount~={}\n{}\n".format(osp_data[0],osp_data[2],lineWrapper(osp_data[1])))
-
-    args.putative_isp_fasta_file = open(args.putative_isp_fasta_file.name, "r")
-    isp_full = tuple_fasta(args.putative_isp_fasta_file)
-    args.putative_osp_fasta_file = open(args.putative_osp_fasta_file.name, "r")
-    osp_full = tuple_fasta(args.putative_osp_fasta_file)
-
-    isp_seqs = []
-    osp_seqs = []
-    for isp_tupe in isp_full:
-        for pisp, posp in separate.items():
-            if re.search(("("+str(pisp)+")\D"), isp_tupe[0]):
-                peri_count = str.split(isp_tupe[0],"~=")[1]
-                isp_seqs.append((pisp,isp_tupe[1],peri_count))
-    #print(isp_seqs)
-    for osp_tupe in osp_full:
-        for pisp, posp in separate.items():
-            for data in posp:
-                if re.search(("("+str(data[3])+")\D"), osp_tupe[0]):
-                    peri_count = str.split(osp_tupe[0],"~=")[1]
-                    osp_seqs.append((data[3],osp_tupe[1],peri_count))
-
-    with args.separate_txt as f:
-        f.write("================ separated spanin candidates =================\n")
-        f.write("isp\tisp_start\tisp_end\tosp\tosp_start\tosp_end\tstrand\tpair_number\n")
-        if separate != {}:
-            for pisp, posp in separate.items():
-                f.write(pisp + "\n")
-                for each_posp in posp:
-                    f.write(
-                        "\t{}\t{}\t{}\t{}\t{}\t{}\t".format(
-                            each_posp[1],
-                            each_posp[2],
-                            each_posp[3],
-                            each_posp[4],
-                            each_posp[5],
-                            each_posp[6],
-                        )
-                    )
-                    if each_posp[6] == "+":
-                        if each_posp[2] in pair_dict['pairs']['pair_number'].keys():
-                            f.write(""+str(pair_dict['pairs']['pair_number'][each_posp[2]])+"\n")
-                    elif each_posp[6] == "-":
-                        if each_posp[1] in pair_dict['pairs']['pair_number'].keys():
-                            f.write(""+str(pair_dict['pairs']['pair_number'][each_posp[1]])+"\n")
-        else:
-            f.write("nothing found")
-
-    with open(args.separate_txt.name, "a") as f:
-        f.write("\n================= separated candidate sequences ================\n")
-        f.write("======================= isp ==========================\n\n")
-        for isp_data in isp_seqs:
-            f.write(">isp_orf::{}-pericount~={}\n{}\n".format(isp_data[0],isp_data[2],lineWrapper(isp_data[1])))
-        f.write("\n======================= osp ========================\n\n")
-        for osp_data in osp_seqs:
-            f.write(">osp_orf::{}-pericount~={}\n{}\n".format(osp_data[0],osp_data[2],lineWrapper(osp_data[1])))
--- a/cpt_find_spanins/findSpanin.xml	Fri Jun 17 12:40:59 2022 +0000
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,44 +0,0 @@
-<?xml version="1.1"?>
-<tool id="edu.tamu.cpt2.spanin.findSpanin" name="Find Spanin" version="1.0">
-    <description>With the outputs from the ISP and OSP candidate tools, cull the list down to candidate pairs</description>
-    <macros>
-		<import>macros.xml</import>
-		<import>cpt-macros.xml</import>
-    </macros>
-    <expand macro="requirements">
-    </expand>
-    <command detect_errors="aggressive"><![CDATA[
-python $__tool_directory__/findSpanin.py
-$putative_isp_fasta_file
-$putative_osp_fasta_file
---max_isp_osp_distance $max_isp_osp_distance
---embedded_txt $embedded_txt
---overlap_txt $overlap_txt
---separate_txt $separate_txt
---summary_txt $summary_txt
-]]></command>
-    <inputs>
-        <param label="(putative) i-spanin FASTA file" name="putative_isp_fasta_file" type="data" format="fasta"/>
-        <param label="(putative) o-spanin FASTA file" name="putative_osp_fasta_file" type="data" format="fasta"/>
-        <param label="max distance from end of i-spanin to beginning of o-spanin (measured in AAs)" name="max_isp_osp_distance" type="integer" value="10" />
-    </inputs>
-    <outputs>
-          <data format="txt" name="summary_txt" label="FindSpanin_summary.txt"/>
-          <data format="tabular" name="embedded_txt" label="embedded_results.txt"/>
-          <data format="tabular" name="overlap_txt" label="overlap_results.txt"/>
-          <data format="tabular" name="separate_txt" label="separate_results.txt"/>
-    </outputs>
-    <help><![CDATA[
-**What it does**
-Compares the protein FASTA files with candidate i-spanins and o-spanins from a genome and matches them into candidate pairs based on position in a strand-aware fashion.
-
-**INPUT** --> Putative i-spanin and o-spanin protein multiFASTAs (generated from the ISP/OSP Candidate Tools). 
-
-**METHODOLOGY** 
-Does a pairwise comparison between candidate i-spanins and o-spanins based on their genomic location, and classifies them into the known bimolecular spanin genetic architectures. Classes are: embedded, overlapping, separated, or NOT a potential pair.
-
-**OUTPUT** --> File with candidate pairs for each bimolecular spanin class and a basic summary statistics file. 
-
-]]></help>
-        <expand macro="citations-2020" />
-</tool>
--- a/cpt_find_spanins/macros.xml	Fri Jun 17 12:40:59 2022 +0000
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,63 +0,0 @@
-<?xml version="1.0"?>
-<macros>
-	<xml name="requirements">
-		<requirements>
-                        <requirement type="package" version="2019.06.05">regex</requirement>
-			<requirement type="package" version="3.6">python</requirement>
-			<requirement type="package" version="1.77">biopython</requirement>
-			<requirement type="package" version="1.1.7">cpt_gffparser</requirement>  
-			<yield/>
-		</requirements>
-	</xml>
-	<token name="@BLAST_TSV@">
-		"$blast_tsv"
-	</token>
-	<xml name="blast_tsv">
-		<param label="Blast Results" help="TSV/tabular (25 Column)"
-			name="blast_tsv" type="data" format="tabular" />
-	</xml>
-
-	<token name="@BLAST_XML@">
-		"$blast_xml"
-	</token>
-	<xml name="blast_xml">
-		<param label="Blast Results" help="XML format"
-			name="blast_xml" type="data" format="blastxml" />
-	</xml>
-	<xml name="gff3_with_fasta">
-	<param label="Genome Sequences" name="fasta" type="data" format="fasta" />
-	<param label="Genome Annotations" name="gff3" type="data" format="gff3" />
-	</xml>
-	<xml name="genome_selector">
-	    <param name="genome_fasta" type="data" format="fasta" label="Source FASTA Sequence"/>
-	</xml>
-	<xml name="gff3_input">
-		<param label="GFF3 Annotations" name="gff3_data" type="data" format="gff3"/>
-	</xml>
-	<xml name="input/gff3+fasta">
-		<expand macro="gff3_input" />
-		<expand macro="genome_selector" />
-	</xml>
-	<token name="@INPUT_GFF@">
-	"$gff3_data"
-	</token>
-	<token name="@INPUT_FASTA@">
-		genomeref.fa
-	</token>
-	<token name="@GENOME_SELECTOR_PRE@">
-		ln -s $genome_fasta genomeref.fa;
-	</token>
-	<token name="@GENOME_SELECTOR@">
-		genomeref.fa
-	</token>
-        <xml name="input/fasta">
-		<param label="Fasta file" name="sequences" type="data" format="fasta"/>
-	</xml>
-
-	<token name="@SEQUENCE@">
-		"$sequences"
-	</token>
-	<xml name="input/fasta/protein">
-		<param label="Protein fasta file" name="sequences" type="data" format="fasta"/>
-	</xml>
-</macros>
--- a/cpt_find_spanins/spaninFuncs.py	Fri Jun 17 12:40:59 2022 +0000
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,469 +0,0 @@
-"""
-PREMISE
-### Functions/Classes that are used in both generate-putative-osp.py and generate-putative-isp.py
-###### Main premise here is to make the above scripts a little more DRY, as well as easily readable for execution.
-###### Documentation will ATTEMPT to be thorough here
-"""
-
-import re
-from Bio import SeqIO
-from Bio import Seq
-from collections import OrderedDict
-
-# Not written in OOP for a LITTLE bit of trying to keep the complication down in case adjustments are needed by someone else.
-# Much of the manipulation is string based; so it should be straightforward as well as moderately quick
-################## Global Variables
-Lys = "K"
-
-
-def check_back_end_snorkels(seq, tmsize):
-    """
-        Searches through the backend of a potential TMD snorkel. This is the 2nd part of a TMD snorkel lysine match.
-        --> seq : should be the sequence fed from the "search_region" portion of the sequence
-        --> tmsize : size of the potential TMD being investigated
-    """
-    found = []
-    if seq[tmsize - 4] == Lys and re.search(("[FIWLVMYCATGS]"), seq[tmsize - 5]):
-        found = "match"
-        return found
-    elif seq[tmsize - 3] == Lys and re.search(("[FIWLVMYCATGS]"), seq[tmsize - 4]):
-        found = "match"
-        return found
-    elif seq[tmsize - 2] == Lys and re.search(("[FIWLVMYCATGS]"), seq[tmsize - 3]):
-        found = "match"
-        return found
-    elif seq[tmsize - 1] == Lys and re.search(("[FIWLVMYCATGS]"), seq[tmsize - 2]):
-        found = "match"
-        return found
-    else:
-        found = "NOTmatch"
-        return found
-
-
-def prep_a_gff3(fa, spanin_type, org):
-    """
-        Function parses an input detailed 'fa' file and outputs a 'gff3' file
-        ---> fa = input .fa file
-        ---> output = output a returned list of data, easily portable to a gff3 next
-        ---> spanin_type = 'isp' or 'osp'
-    """
-    with org as f:
-        header = f.readline()
-        orgacc = header.split(" ")
-        orgacc = orgacc[0].split(">")[1].strip()
-    fa_zip = tuple_fasta(fa)
-    data = []
-    for a_pair in fa_zip:
-        # print(a_pair)
-        if re.search(("(\[1\])"), a_pair[0]):
-            strand = "+"
-        elif re.search(("(\[-1\])"), a_pair[0]):
-            strand = "-"  # column 7
-        start = re.search(("[\d]+\.\."), a_pair[0]).group(0).split("..")[0]  # column 4
-        end = re.search(("\.\.[\d]+"), a_pair[0]).group(0).split("..")[1]  # column 5
-        orfid = re.search(("(ORF)[\d]+"), a_pair[0]).group(0)  # column 1
-        if spanin_type == "isp":
-            methodtype = "CDS"  # column 3
-            spanin = "isp"
-        elif spanin_type == "osp":
-            methodtype = "CDS"  # column 3
-            spanin = "osp"
-        elif spanin_type == "usp":
-            methodtype = "CDS"
-            spanin = "usp"
-        else:
-            raise "need to input spanin type"
-        source = "cpt.py|putative-*.py"  # column 2
-        score = "."  # column 6
-        phase = "."  # column 8
-        attributes = "ID=" +orgacc+ "|"+ orfid + ";ALIAS=" + spanin + ";SEQ="+a_pair[1]  # column 9
-        sequence = [[orgacc, source, methodtype, start, end, score, strand, phase, attributes]]
-        data += sequence
-    return data
-
-
-def write_gff3(data, output="results.gff3"):
-    """
-        Parses results from prep_a_gff3 into a gff3 file
-        ---> input : list from prep_a_gff3
-        ---> output : gff3 file
-    """
-    data = data
-    filename = output
-    with filename as f:
-        f.write("#gff-version 3\n")
-        for value in data:
-            f.write(
-                "{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\n".format(
-                    value[0],
-                    value[1],
-                    value[2],
-                    value[3],
-                    value[4],
-                    value[5],
-                    value[6],
-                    value[7],
-                    value[8],
-                )
-            )
-    f.close()
-
-
-def find_tmd(pair, minimum=10, maximum=30, TMDmin=10, TMDmax=20, isp_mode=False, peri_min=18, peri_max=206):
-    """ 
-        Function that searches for lysine snorkels and then for a spanning hydrophobic region that indicates a potential TMD
-        ---> pair : Input of tuple with description and AA sequence (str)
-        ---> minimum : How close from the initial start codon a TMD can be within
-        ---> maximum : How far from the initial start codon a TMD can be within
-        ---> TMDmin : The minimum size that a transmembrane can be (default = 10)
-        ---> TMDmax : The maximum size that a transmembrane can be (default = 20)
-    """
-    # hydrophobicAAs = ['P', 'F', 'I', 'W', 'L', 'V', 'M', 'Y', 'C', 'A', 'T', 'G', 'S']
-    tmd = []
-    s = str(pair[1])  # sequence being analyzed
-    # print(s) # for trouble shooting
-    if maximum > len(s):
-        maximum = len(s)
-    search_region = s[minimum - 1 : maximum + 1]
-    #print(f"this is the search region: {search_region}")
-    # print(search_region) # for trouble shooting
-
-    for tmsize in range(TMDmin, TMDmax+1, 1):
-        #print(f"this is the current tmsize we're trying: {tmsize}")
-        # print('==============='+str(tmsize)+'================') # print for troubleshooting
-        pattern = "[PFIWLVMYCATGS]{"+str(tmsize)+"}"  # searches for these hydrophobic residues tmsize total times
-        #print(pattern)
-        #print(f"sending to regex: {search_region}")
-        if re.search(
-            ("[K]"), search_region[1:8]):  # grabbing one below with search region, so I want to grab one ahead here when I query.
-            store_search = re.search(("[K]"), search_region[1:8])  # storing regex object
-            where_we_are = store_search.start()  # finding where we got the hit
-            if re.search(
-                ("[PFIWLVMYCATGS]"), search_region[where_we_are + 1]
-            ) and re.search(
-                ("[PFIWLVMYCATGS]"), search_region[where_we_are - 1]
-            ):  # hydrophobic neighbor
-                #try:
-                g = re.search(("[PFIWLVMYCATGS]"), search_region[where_we_are + 1]).group()
-                backend = check_back_end_snorkels(search_region, tmsize)
-                if backend == "match":
-                    if isp_mode:
-                        g = re.search((pattern), search_region).group()
-                        end_of_tmd = re.search((g), s).end()+1
-                        amt_peri = len(s) - end_of_tmd
-                        if peri_min <= amt_peri <= peri_max:
-                            pair_desc = pair[0] + ", peri_count~="+str(amt_peri)
-                            new_pair = (pair_desc,pair[1])
-                            tmd.append(new_pair)
-                    else:
-                        tmd.append(pair)
-                else:
-                    continue
-        #else:
-            #print("I'm continuing out of snorkel loop")
-            #print(f"{search_region}")
-            #continue
-        if re.search((pattern), search_region):
-            #print(f"found match: {}")
-            #print("I AM HEREEEEEEEEEEEEEEEEEEEEEEE")
-            #try:
-            if isp_mode:
-                g = re.search((pattern), search_region).group()
-                end_of_tmd = re.search((g), s).end()+1
-                amt_peri = len(s) - end_of_tmd
-                if peri_min <= amt_peri <= peri_max:
-                    pair_desc = pair[0] + ", peri_count~="+str(amt_peri)
-                    new_pair = (pair_desc,pair[1])
-                    tmd.append(new_pair)
-            else:
-                tmd.append(pair)
-        else:
-            continue
-
-        return tmd
-
-
-def find_lipobox(pair, minimum=10, maximum=50, min_after=30, max_after=185, regex=1, osp_mode=False):
-    """
-        Function that takes an input tuple and returns description/sequence pairs that contain a lipobox
-        ---> minimum - min distance from start codon to first AA of lipobox
-        ---> maximum - max distance from start codon to first AA of lipobox
-        ---> regex - option 1 (default) => more strict regular expression ; option 2 => looser selection, imported from LipoRy
-        
-    """
-    if regex == 1:
-        pattern = "[ILMFTV][^REKD][GAS]C"  # regex for Lipobox from findSpanin.pl
-    elif regex == 2:
-        pattern = "[ACGSILMFTV][^REKD][GAS]C"  # regex for Lipobox from LipoRy
-
-    candidates = []
-    s = str(pair[1])
-    # print(s) # trouble shooting
-    search_region = s[minimum-1 : maximum + 5] # properly slice the input... add 4 to catch if it hangs off at max input
-    # print(search_region) # trouble shooting
-    patterns = ["[ILMFTV][^REKD][GAS]C","AW[AGS]C"]
-    for pattern in patterns:
-        #print(pattern)  # trouble shooting
-        if re.search((pattern), search_region):  # lipobox must be WITHIN the range...
-            # searches the sequence with the input RegEx AND omits if
-            g = re.search((pattern), search_region).group() # find the exact group match
-            amt_peri = len(s) - re.search((g), s).end() + 1
-            if min_after <= amt_peri <= max_after: # find the lipobox end region
-                if osp_mode:
-                    pair_desc = pair[0] + ", peri_count~="+str(amt_peri)
-                    new_pair = (pair_desc,pair[1])
-                    candidates.append(new_pair)
-                else:
-                    candidates.append(pair)
-
-                return candidates
-
-
-def tuple_fasta(fasta_file):
-    """
-        #### INPUT: Fasta File
-        #### OUTPUT: zipped (zip) : pairwise relationship of description to sequence
-        #### 
-    """
-    fasta = SeqIO.parse(fasta_file, "fasta")
-    descriptions = []
-    sequences = []
-    for r in fasta:  # iterates and stores each description and sequence
-        description = r.description
-        sequence = str(r.seq)
-        if (
-            sequence[0] != "I"
-        ):  # the translation table currently has I as a potential start codon ==> this will remove all ORFs that start with I
-            descriptions.append(description)
-            sequences.append(sequence)
-        else:
-            continue
-
-    return zip(descriptions, sequences)
-
-
-def lineWrapper(text, charactersize=60):
-
-    if len(text) <= charactersize:
-        return text
-    else:
-        return (
-            text[:charactersize]
-            + "\n"
-            + lineWrapper(text[charactersize:], charactersize)
-        )
-
-
-def getDescriptions(fasta):
-    """
-    Takes an output FASTA file and retrieves the description headers. These headers contain the information
-    needed to locate potential i-spanin and o-spanin candidates relative to one another.
-    """
-    desc = []
-    with fasta as f:
-        for line in f:
-            if line.startswith(">"):
-                desc.append(line)
-    return desc
-
-
-def splitStrands(text, strand="+"):
-    # positive_strands = []
-    # negative_strands = []
-    if strand == "+":
-        if re.search(("(\[1\])"), text):
-            return text
-    elif strand == "-":
-        if re.search(("(\[-1\])"), text):
-            return text
-    # return positive_strands, negative_strands
-
-
-def parse_a_range(pair, start, end):
-    """
-        Takes an input data tuple from a fasta tuple pair and keeps only those within the input sequence range
-        ---> data : fasta tuple data
-        ---> start : start range to keep
-        ---> end : end range to keep (will need to + 1)
-    """
-    matches = []
-    for each_pair in pair:
-
-        s = re.search(("[\d]+\.\."), each_pair[0]).group(0)  # Start of the sequence
-        s = int(s.split("..")[0])
-        e = re.search(("\.\.[\d]+"), each_pair[0]).group(0)
-        e = int(e.split("..")[1])
-        if start - 1 <= s and e <= end + 1:
-            matches.append(each_pair)
-        else:
-            continue
-    # else:
-    # continue
-    # if matches != []:
-    return matches
-    # else:
-    # print('no candidates within selected range')
-
-
-def grabLocs(text):
-    """
-    Grabs the locations of the spanin based on NT location (seen from ORF). Grabs the ORF name, as per named from the ORF class/module
-    from cpt.py
-    """
-    start = re.search(("[\d]+\.\."), text).group(0)  # Start of the sequence ; looks for [numbers]..
-    end = re.search(("\.\.[\d]+"), text).group(0)  # End of the sequence ; Looks for ..[numbers]
-    orf = re.search(("(ORF)[\d]+"), text).group(0)  # Looks for ORF and the numbers that are after it
-    if re.search(("(\[1\])"), text): # stores strand
-        strand = "+"
-    elif re.search(("(\[-1\])"), text): # stores strand
-        strand = "-"
-    start = int(start.split("..")[0])
-    end = int(end.split("..")[1])
-    vals = [start, end, orf, strand]
-
-    return vals
-
-
-def spaninProximity(isp, osp, max_dist=30):
-    """
-    _NOTE THIS FUNCTION COULD BE MODIFIED TO RETURN SEQUENCES_
-    Compares the locations of i-spanins and o-spanins. max_dist is the distance in NT measurement from i-spanin END site
-    to o-spanin START. The user will be inputting AA distance, so a conversion will be necessary (<user_input> * 3)
-    I modified this on 07.30.2020 to bypass picking the + or - strand explicitly.
-    INPUT: list of OSP and ISP candidates
-    OUTPUT: Return (improved) candidates for overlapping, embedded, and separate list
-    """
-
-    embedded = {}
-    overlap = {}
-    separate = {}
-    for iseq in isp:
-        embedded[iseq[2]] = []
-        overlap[iseq[2]] = []
-        separate[iseq[2]] = []
-        for oseq in osp:
-            if iseq[3] == "+":
-                if oseq[3] == "+":
-                    if iseq[0] < oseq[0] < iseq[1] and oseq[1] < iseq[1]:
-                        ### EMBEDDED ###
-                        combo = [
-                            iseq[0],
-                            iseq[1],
-                            oseq[2],
-                            oseq[0],
-                            oseq[1],
-                            iseq[3],
-                        ]  # ordering a return for dic
-                        embedded[iseq[2]] += [combo]
-                    elif iseq[0] < oseq[0] <= iseq[1] and oseq[1] > iseq[1]:
-                        ### OVERLAP / SEPARATE ###
-                        if (iseq[1] - oseq[0]) < 6:
-                            combo = [iseq[0], iseq[1], oseq[2], oseq[0], oseq[1],iseq[3]]
-                            separate[iseq[2]] += [combo]
-                        else:
-                            combo = [iseq[0], iseq[1], oseq[2], oseq[0], oseq[1],iseq[3]]
-                            overlap[iseq[2]] += [combo]
-                    elif iseq[1] <= oseq[0] <= iseq[1] + max_dist:
-                        combo = [iseq[0], iseq[1], oseq[2], oseq[0], oseq[1],iseq[3]]
-                        separate[iseq[2]] += [combo]
-                    else:
-                        continue
-            if iseq[3] == "-":
-                if oseq[3] == "-":
-                    if iseq[0] <= oseq[1] <= iseq[1] and oseq[0] > iseq[0]:
-                        ### EMBEDDED ###
-                        combo = [
-                            iseq[0],
-                            iseq[1],
-                            oseq[2],
-                            oseq[0],
-                            oseq[1],
-                            iseq[3],
-                        ]  # ordering a return for dict
-                        embedded[iseq[2]] += [combo]
-                    elif iseq[0] <= oseq[1] <= iseq[1] and oseq[0] < iseq[0]:
-                        if (oseq[1] - iseq[0]) < 6:
-                            combo = [iseq[0], iseq[1], oseq[2], oseq[0], oseq[1],iseq[3]]
-                            separate[iseq[2]] += [combo]
-                        else:
-                            combo = [iseq[0], iseq[1], oseq[2], oseq[0], oseq[1],iseq[3]]
-                            overlap[iseq[2]] += [combo]
-                    elif iseq[0] - 10 < oseq[1] < iseq[0]:
-                        combo = [iseq[0], iseq[1], oseq[2], oseq[0], oseq[1],iseq[3]]
-                        separate[iseq[2]] += [combo]
-                    else:
-                        continue
-
-    embedded = {k: embedded[k] for k in embedded if embedded[k]}
-    overlap = {k: overlap[k] for k in overlap if overlap[k]}
-    separate = {k: separate[k] for k in separate if separate[k]}
-
-    return embedded, overlap, separate
-
-
-def check_for_usp():
-    " pass "
-
-############################################### TEST RANGE #########################################################################
-####################################################################################################################################
-if __name__ == "__main__":
-
-    #### TMD TEST
-    test_desc = ["one", "two", "three", "four", "five"]
-    test_seq = [
-        "XXXXXXXXXXXXXXXFMCFMCFMCFMCFMCXXXXXXXXXXXXXXXXXXXXXXXXXX",
-        "XXXXXXXXAAKKKKKKKKKKKKKKKXXXXXXXXXXXXX",
-        "XXXXXXX",
-        "XXXXXXXXXXXKXXXXXXXXXX",
-        "XXXXXXXXXXAKXXXXXXXXXXAKXXXXXXXX",
-    ]
-    # for l in
-    # combo = zip(test_desc,test_seq)
-    pairs = zip(test_desc, test_seq)
-    tmd = []
-    for each_pair in pairs:
-        # print(each_pair)
-        try:
-            tmd += find_tmd(pair=each_pair)
-        except (IndexError, TypeError):
-            continue
-        # try:s = each_pair[1]
-        # tmd += find_tmd(seq=s, tmsize=15)
-    # print('\n+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\n')
-    # print(tmd)
-    # print('\n+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\n')
-
-    #### tuple-fasta TEST
-    # fasta_file = 'out_isp.fa'
-    # ret = tuple_fasta(fasta_file)
-    # print('=============')
-    # for i in ret:
-    # print(i[1])
-
-    #### LipoBox TEST
-    test_desc = ["one", "two", "three", "four", "five", "six", "seven"]
-    test_seq = [
-        "XXXXXXXXXTGGCXXXXXXXXXXXXXXXX",
-        "XXXXXXXXAAKKKKKKKKKKKKKKKXXXXXXXXXXXXX",
-        "XXXXXXX",
-        "AGGCXXXXXXXXXXXXXXXXXXXXTT",
-        "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXTGGC",
-        "XXXXXXXXXXXXXXXXXXXXXXXXXXTGGC",
-        "MSTLRELRLRRALKEQSMRYLLSIKKTLPRWKGALIGLFLICVATISGCASESKLPEPPMVSVDSSLMVEPNLTTEMLNVFSQ*",
-    ]
-    pairs = zip(test_desc, test_seq)
-    lipo = []
-    for each_pair in pairs:
-        #print(each_pair)
-        # try:
-        try:
-            lipo += find_lipobox(pair=each_pair, regex=2)  # , minimum=8)
-        except TypeError:  # catches if something doesn't meet the min/max requirements (something is too small)
-            continue
-        # except:
-        # continue
-    # print('\n+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\n')
-    #############################3
-    # g = prep_a_gff3(fa='putative_isp.fa', spanin_type='isp')
-    # print(g)
-    # write_gff3(data=g)
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/findSpanin.py	Mon Jun 05 02:42:01 2023 +0000
@@ -0,0 +1,597 @@
+##### findSpanin.pl --> findSpanin.py
+######### Much of this code is very "blocked": one step runs, a function transforms its return value, then another function, and so on.
+
+import argparse
+import os
+import re  # new
+import itertools  # new
+from collections import Counter, OrderedDict
+from spaninFuncs import (
+    getDescriptions,
+    grabLocs,
+    spaninProximity,
+    splitStrands,
+    tuple_fasta,
+    lineWrapper,
+)
+
+### Requirement Inputs
+#### INPUT : putative_isp.fa & putative_osp.fa (in that order)
+#### PARAMETERS :
+
+###############################################################################
+def write_output(candidates):
+    """output file function...maybe not needed"""
+    pass
+
+
+def reconfigure_dict(spanins):
+    """
+    Reorganizes the spanin dictionary into per-strand lists that are easier to check.
+    """
+
+    new_spanin_dict = {}
+
+    for each_spanin_type, data_dict in spanins.items():
+        # print(f"{each_spanin_type} == {data_dict}")
+        new_spanin_dict[each_spanin_type] = {}
+        new_spanin_dict[each_spanin_type]["positive"] = {}
+        new_spanin_dict[each_spanin_type]["negative"] = {}
+        new_spanin_dict[each_spanin_type]["positive"]["coords"] = []
+        new_spanin_dict[each_spanin_type]["negative"]["coords"] = []
+        for outer_orf, inner_data in data_dict.items():
+            list_of_hits = []
+            for data_content in inner_data:
+                # print(data_content)
+                data_content.insert(0, outer_orf)
+                # print(f"new data_content -> {data_content}")
+                # print(data_content)
+                # list_of_hits += [data_content]
+                # new_spanin_dict[each_spanin_type] += [data_content]
+                if data_content[6] == "+":
+                    # print(f"{each_spanin_type} @ POSITIVE")
+                    new_spanin_dict[each_spanin_type]["positive"]["coords"] += [
+                        data_content
+                    ]
+                elif data_content[6] == "-":
+                    # print(f"{each_spanin_type} @ NEGATIVE")
+                    new_spanin_dict[each_spanin_type]["negative"]["coords"] += [
+                        data_content
+                    ]
+        # print(new_spanin_dict[each_spanin_type])
+        # print(reorganized)
+        # print(f"{outter_orf} => {inner_data}")
+        # print(new_spanin_dict)
+
+        # print('\n')
+    # for k, v in new_spanin_dict.items():
+    # print(k)
+    # print(v)
+    return new_spanin_dict
+
+
+def check_for_uniques(spanins):
+    """
+    Checks for unique spanins based on spanin_type.
+    If the positive strand end site is _the same_ for an i-spanin, we group that as "1";
+    i.e. if ORF1, ORF2, and ORF3 all ended at location 4231, they would count as one unique end site.
+    """
+    pair_dict = {}
+    pair_dict = {
+        "pairs": {
+            "location_amount": [],
+            "pair_number": {},
+        }
+    }
+    for each_spanin_type, spanin_data in spanins.items():
+        # print(f"{each_spanin_type} ===> {spanin_data}")
+        # early declarations for cases of no results
+        pos_check = []  # end checks
+        pos_uniques = []
+        neg_check = []  # start checks
+        neg_uniques = []
+        unique_ends = []
+        pos_amt_unique = 0
+        neg_amt_unique = 0
+        amt_positive = 0
+        amt_negative = 0
+        spanin_data["uniques"] = 0
+        spanin_data["amount"] = 0
+        # spanin_data['positive']['amt_positive'] = 0
+        # spanin_data['positive']['pos_amt_unique'] = 0
+        # spanin_data['positive']['isp_match'] = []
+        # spanin_data['negative']['amt_negative'] = 0
+        # spanin_data['negative']['neg_amt_unique'] = 0
+        # spanin_data['negative']['isp_match'] = []
+        # print(spanin_data)
+        if spanin_data["positive"]["coords"]:
+            # do something...
+            # print('in other function')
+            # print(spanin_data['positive']['coords'])
+            for each_hit in spanin_data["positive"]["coords"]:
+                pos_check.append(each_hit[2])
+                pair_dict["pairs"]["location_amount"].append(each_hit[2])
+                pos_uniques = list(
+                    set(
+                        [
+                            end_site
+                            for end_site in pos_check
+                            if pos_check.count(end_site) >= 1
+                        ]
+                    )
+                )
+            # print(pos_check)
+            # print(pos_uniques)
+            amt_positive = len(spanin_data["positive"]["coords"])
+            pos_amt_unique = len(pos_uniques)
+            if amt_positive:
+                spanin_data["positive"]["amt_positive"] = amt_positive
+                spanin_data["positive"]["pos_amt_unique"] = pos_amt_unique
+            # pair_dict['pairs']['locations'].extend(pos_uniques)
+        else:
+            spanin_data["positive"]["amt_positive"] = 0
+            spanin_data["positive"]["pos_amt_unique"] = 0
+        if spanin_data["negative"]["coords"]:
+
+            # do something else...
+            # print('in other function')
+            # print(spanin_data['negative']['coords'])
+            for each_hit in spanin_data["negative"]["coords"]:
+                neg_check.append(each_hit[1])
+                pair_dict["pairs"]["location_amount"].append(each_hit[1])
+                neg_uniques = list(
+                    set(
+                        [
+                            start_site
+                            for start_site in neg_check
+                            if neg_check.count(start_site) >= 1
+                        ]
+                    )
+                )
+            # print(neg_uniques)
+            amt_negative = len(spanin_data["negative"]["coords"])
+            neg_amt_unique = len(neg_uniques)
+            if amt_negative:
+                spanin_data["negative"]["amt_negative"] = amt_negative
+                spanin_data["negative"]["neg_amt_unique"] = neg_amt_unique
+            # pair_dict['pairs']['locations'].extend(neg_uniques)
+        else:
+            spanin_data["negative"]["amt_negative"] = 0
+            spanin_data["negative"]["neg_amt_unique"] = 0
+        spanin_data["uniques"] += (
+            spanin_data["positive"]["pos_amt_unique"]
+            + spanin_data["negative"]["neg_amt_unique"]
+        )
+        spanin_data["amount"] += (
+            spanin_data["positive"]["amt_positive"]
+            + spanin_data["negative"]["amt_negative"]
+        )
+        # print(spanin_data['uniques'])
+    pair_dict["pairs"]["location_amount"] = dict(
+        Counter(pair_dict["pairs"]["location_amount"])
+    )
+    for data in pair_dict.values():
+        # print(data['locations'])
+        # print(type(data['locations']))
+        v = 0
+        for loc, count in data["location_amount"].items():
+            # data['pair_number'] = {loc
+            v += 1
+            data["pair_number"][loc] = v
+    # print(dict(Counter(pair_dict['pairs']['locations'])))
+    # print(pair_dict)
+    spanins["total_amount"] = (
+        spanins["EMBEDDED"]["amount"]
+        + spanins["SEPARATED"]["amount"]
+        + spanins["OVERLAPPED"]["amount"]
+    )
+    spanins["total_unique"] = (
+        spanins["EMBEDDED"]["uniques"]
+        + spanins["SEPARATED"]["uniques"]
+        + spanins["OVERLAPPED"]["uniques"]
+    )
+    # spanins['total_unique'] = len(pair_dict['pairs']['pair_number'])
+    return spanins, pair_dict
+
+
+if __name__ == "__main__":
+
+    # Common parameters for both ISP / OSP portion of script
+
+    parser = argparse.ArgumentParser(
+        description="Trim the putative protein candidates and find potential i-spanin / o-spanin pairs"
+    )
+
+    parser.add_argument(
+        "putative_isp_fasta_file",
+        type=argparse.FileType("r"),
+        help='Putative i-spanin FASTA file, output of "generate-putative-isp"',
+    )  # the "input" argument
+
+    parser.add_argument(
+        "putative_osp_fasta_file",
+        type=argparse.FileType("r"),
+        help='Putative o-spanin FASTA file, output of "generate-putative-osp"',
+    )
+
+    parser.add_argument(
+        "--max_isp_osp_distance",
+        dest="max_isp_osp_distance",
+        default=10,
+        type=int,
+        help="max distance from end of i-spanin to start of o-spanin, measured in AAs",
+    )
+
+    parser.add_argument(
+        "--embedded_txt",
+        dest="embedded_txt",
+        type=argparse.FileType("w"),
+        default="_findSpanin_embedded_results.txt",
+        help="Results of potential embedded spanins",
+    )
+    parser.add_argument(
+        "--overlap_txt",
+        dest="overlap_txt",
+        type=argparse.FileType("w"),
+        default="_findSpanin_overlap_results.txt",
+        help="Results of potential overlapping spanins",
+    )
+    parser.add_argument(
+        "--separate_txt",
+        dest="separate_txt",
+        type=argparse.FileType("w"),
+        default="_findSpanin_separated_results.txt",
+        help="Results of potential separated spanins",
+    )
+
+    parser.add_argument(
+        "--summary_txt",
+        dest="summary_txt",
+        type=argparse.FileType("w"),
+        default="_findSpanin_summary.txt",
+        help="Results of potential spanin pairs",
+    )
+    parser.add_argument(
+        "-v", action="version", version="0.3.0"
+    )  # Is this manually updated?
+    args = parser.parse_args()
+
+    #### RE-WRITE
+    SPANIN_TYPES = {}
+    SPANIN_TYPES["EMBEDDED"] = {}
+    SPANIN_TYPES["OVERLAPPED"] = {}
+    SPANIN_TYPES["SEPARATED"] = {}
+    # SPANIN_TYPES = {
+    #    'EMBEDDED' : {},
+    #    'OVERLAPPED' : {},
+    #    'SEPARATED' : {},
+    # }
+
+    isp = getDescriptions(args.putative_isp_fasta_file)
+    args.putative_isp_fasta_file = open(args.putative_isp_fasta_file.name, "r")
+    isp_full = tuple_fasta(args.putative_isp_fasta_file)
+
+    osp = getDescriptions(args.putative_osp_fasta_file)
+    args.putative_osp_fasta_file = open(args.putative_osp_fasta_file.name, "r")
+    osp_full = tuple_fasta(args.putative_osp_fasta_file)
+
+    #### location data
+    location_data = {"isp": [], "osp": []}
+    spanins = [isp, osp]
+    for idx, each_spanin_type in enumerate(spanins):
+        for description in each_spanin_type:
+            locations = grabLocs(description)
+            if idx == 0:  # i-spanin
+                location_data["isp"].append(locations)
+            elif idx == 1:  # o-spanin
+                location_data["osp"].append(locations)
+
+    #### Check for types of spanins
+    embedded, overlap, separate = spaninProximity(
+        isp=location_data["isp"],
+        osp=location_data["osp"],
+        max_dist=args.max_isp_osp_distance * 3,
+    )
+
+    SPANIN_TYPES["EMBEDDED"] = embedded
+    SPANIN_TYPES["OVERLAPPED"] = overlap
+    SPANIN_TYPES["SEPARATED"] = separate
+
+    # for spanin_type, spanin in SPANIN_TYPES.items():
+    #    s = 0
+    #    for sequence in spanin.values():
+    #        s += len(sequence)
+    #    SPANIN_TYPES[spanin_type]['amount'] = s
+    #    SPANIN_TYPES[spanin_type]['unique'] = len(spanin.keys())
+
+    # check_for_unique_spanins(SPANIN_TYPES)
+    spanins = reconfigure_dict(SPANIN_TYPES)
+    spanins, pair_dict = check_for_uniques(spanins)
+    # print(pair_dict)
+    with args.summary_txt as f:
+        for each_spanin_type, spanin_data in spanins.items():
+            try:
+                if each_spanin_type not in ["total_amount", "total_unique"]:
+                    # print(each_spanin_type)
+                    # print(each_spanin_type)
+                    f.write(
+                        "=~~~~~= "
+                        + str(each_spanin_type)
+                        + " Spanin Candidate Statistics =~~~~~=\n"
+                    )
+                    f.writelines(
+                        "Total Candidate Pairs = " + str(spanin_data["amount"]) + "\n"
+                    )
+                    f.writelines(
+                        "Total Unique Pairs = " + str(spanin_data["uniques"]) + "\n"
+                    )
+                    if each_spanin_type == "EMBEDDED":
+                        for k, v in SPANIN_TYPES["EMBEDDED"].items():
+                            # print(k)
+                            f.writelines(
+                                ""
+                                + str(k)
+                                + " ==> Amount of corresponding candidate o-spanins(s): "
+                                + str(len(v))
+                                + "\n"
+                            )
+                    if each_spanin_type == "SEPARATED":
+                        for k, v in SPANIN_TYPES["SEPARATED"].items():
+                            f.writelines(
+                                ""
+                                + str(k)
+                                + " ==> Amount of corresponding candidate o-spanins(s): "
+                                + str(len(v))
+                                + "\n"
+                            )
+                    if each_spanin_type == "OVERLAPPED":
+                        for k, v in SPANIN_TYPES["OVERLAPPED"].items():
+                            f.writelines(
+                                ""
+                                + str(k)
+                                + " ==> Amount of corresponding candidate o-spanins(s): "
+                                + str(len(v))
+                                + "\n"
+                            )
+            except TypeError:
+                continue
+        f.write("\n=~~~~~= Tally from ALL spanin types =~~~~~=\n")
+        f.writelines("Total Candidates = " + str(spanins["total_amount"]) + "\n")
+        f.writelines(
+            "Total Unique Candidate Pairs = " + str(spanins["total_unique"]) + "\n"
+        )
+
+    args.putative_isp_fasta_file = open(args.putative_isp_fasta_file.name, "r")
+    isp_full = tuple_fasta(args.putative_isp_fasta_file)
+
+    args.putative_osp_fasta_file = open(args.putative_osp_fasta_file.name, "r")
+    osp_full = tuple_fasta(args.putative_osp_fasta_file)
+
+    # print(isp_full)
+    isp_seqs = []
+    osp_seqs = []
+    for isp_tupe in isp_full:
+        # print(isp_tupe)
+        for pisp, posp in embedded.items():
+            # print(f"ISP = searching for {pisp} in {isp_tupe[0]}")
+            if re.search(("(" + str(pisp) + ")\D"), isp_tupe[0]):
+                # print(isp_tupe[0])
+                # print(peri_count)
+                peri_count = str.split(isp_tupe[0], "~=")[1]
+                isp_seqs.append((pisp, isp_tupe[1], peri_count))
+    # print(isp_seqs)
+    for osp_tupe in osp_full:
+        for pisp, posp in embedded.items():
+            for data in posp:
+                # print(f"OSP = searching for {data[3]} in {osp_tupe[0]}, coming from this object: {data}")
+                if re.search(("(" + str(data[3]) + ")\D"), osp_tupe[0]):
+                    peri_count = str.split(osp_tupe[0], "~=")[1]
+                    osp_seqs.append((data[3], osp_tupe[1], peri_count))
+
+    with args.embedded_txt as f:
+        f.write("================ embedded spanin candidates =================\n")
+        f.write(
+            "isp\tisp_start\tisp_end\tosp\tosp_start\tosp_end\tstrand\tpair_number\n"
+        )
+        if embedded != {}:
+            # print(embedded)
+            for pisp, posp in embedded.items():
+                # print(f"{pisp} - {posp}")
+                f.write(pisp + "\n")
+                for each_posp in posp:
+                    # print(posp)
+                    f.write(
+                        "\t{}\t{}\t{}\t{}\t{}\t{}\t".format(
+                            each_posp[1],
+                            each_posp[2],
+                            each_posp[3],
+                            each_posp[4],
+                            each_posp[5],
+                            each_posp[6],
+                        )
+                    )
+                    if each_posp[6] == "+":
+                        if each_posp[2] in pair_dict["pairs"]["pair_number"].keys():
+                            f.write(
+                                ""
+                                + str(pair_dict["pairs"]["pair_number"][each_posp[2]])
+                                + "\n"
+                            )
+                    elif each_posp[6] == "-":
+                        if each_posp[1] in pair_dict["pairs"]["pair_number"].keys():
+                            f.write(
+                                ""
+                                + str(pair_dict["pairs"]["pair_number"][each_posp[1]])
+                                + "\n"
+                            )
+        else:
+            f.write("nothing found")
+
+    with open(args.embedded_txt.name, "a") as f:
+        f.write("\n================= embedded candidate sequences ================\n")
+        f.write("======================= isp ==========================\n\n")
+        for isp_data in isp_seqs:
+            # print(isp_data)
+            f.write(
+                ">isp_orf::{}-peri_count~={}\n{}\n".format(
+                    isp_data[0], isp_data[2], lineWrapper(isp_data[1])
+                )
+            )
+        f.write("\n======================= osp ========================\n\n")
+        for osp_data in osp_seqs:
+            f.write(
+                ">osp_orf::{}-peri_count~={}\n{}\n".format(
+                    osp_data[0], osp_data[2], lineWrapper(osp_data[1])
+                )
+            )
+
+    args.putative_isp_fasta_file = open(args.putative_isp_fasta_file.name, "r")
+    isp_full = tuple_fasta(args.putative_isp_fasta_file)
+
+    args.putative_osp_fasta_file = open(args.putative_osp_fasta_file.name, "r")
+    osp_full = tuple_fasta(args.putative_osp_fasta_file)
+
+    isp_seqs = []
+    osp_seqs = []
+    for isp_tupe in isp_full:
+        for pisp, posp in overlap.items():
+            if re.search(("(" + str(pisp) + ")\D"), isp_tupe[0]):
+                peri_count = str.split(isp_tupe[0], "~=")[1]
+                isp_seqs.append((pisp, isp_tupe[1], peri_count))
+
+    for osp_tupe in osp_full:
+        for pisp, posp in overlap.items():
+            for data in posp:
+                if re.search(("(" + str(data[3]) + ")\D"), osp_tupe[0]):
+                    peri_count = str.split(osp_tupe[0], "~=")[1]
+                    osp_seqs.append((data[3], osp_tupe[1], peri_count))
+
+    with args.overlap_txt as f:
+        f.write("================ overlap spanin candidates =================\n")
+        f.write(
+            "isp\tisp_start\tisp_end\tosp\tosp_start\tosp_end\tstrand\tpair_number\n"
+        )
+        if overlap != {}:
+            for pisp, posp in overlap.items():
+                f.write(pisp + "\n")
+                for each_posp in posp:
+                    f.write(
+                        "\t{}\t{}\t{}\t{}\t{}\t{}\t".format(
+                            each_posp[1],
+                            each_posp[2],
+                            each_posp[3],
+                            each_posp[4],
+                            each_posp[5],
+                            each_posp[6],
+                        )
+                    )
+                    if each_posp[6] == "+":
+                        if each_posp[2] in pair_dict["pairs"]["pair_number"].keys():
+                            # print('ovl ; +')
+                            f.write(
+                                ""
+                                + str(pair_dict["pairs"]["pair_number"][each_posp[2]])
+                                + "\n"
+                            )
+                    elif each_posp[6] == "-":
+                        if each_posp[1] in pair_dict["pairs"]["pair_number"].keys():
+                            f.write(
+                                ""
+                                + str(pair_dict["pairs"]["pair_number"][each_posp[1]])
+                                + "\n"
+                            )
+        else:
+            f.write("nothing found")
+
+    with open(args.overlap_txt.name, "a") as f:
+        # print(isp_seqs)
+        f.write("\n================= overlap candidate sequences ================\n")
+        f.write("======================= isp ==========================\n\n")
+        for isp_data in isp_seqs:
+            f.write(
+                ">isp_orf::{}-pericount~={}\n{}\n".format(
+                    isp_data[0], isp_data[2], lineWrapper(isp_data[1])
+                )
+            )
+        f.write("\n======================= osp ========================\n\n")
+        for osp_data in osp_seqs:
+            f.write(
+                ">osp_orf::{}-pericount~={}\n{}\n".format(
+                    osp_data[0], osp_data[2], lineWrapper(osp_data[1])
+                )
+            )
+
+    args.putative_isp_fasta_file = open(args.putative_isp_fasta_file.name, "r")
+    isp_full = tuple_fasta(args.putative_isp_fasta_file)
+    args.putative_osp_fasta_file = open(args.putative_osp_fasta_file.name, "r")
+    osp_full = tuple_fasta(args.putative_osp_fasta_file)
+
+    isp_seqs = []
+    osp_seqs = []
+    for isp_tupe in isp_full:
+        for pisp, posp in separate.items():
+            if re.search(("(" + str(pisp) + ")\D"), isp_tupe[0]):
+                peri_count = str.split(isp_tupe[0], "~=")[1]
+                isp_seqs.append((pisp, isp_tupe[1], peri_count))
+    # print(isp_seqs)
+    for osp_tupe in osp_full:
+        for pisp, posp in separate.items():
+            for data in posp:
+                if re.search(("(" + str(data[3]) + ")\D"), osp_tupe[0]):
+                    peri_count = str.split(osp_tupe[0], "~=")[1]
+                    osp_seqs.append((data[3], osp_tupe[1], peri_count))
+
+    with args.separate_txt as f:
+        f.write("================ separated spanin candidates =================\n")
+        f.write(
+            "isp\tisp_start\tisp_end\tosp\tosp_start\tosp_end\tstrand\tpair_number\n"
+        )
+        if separate != {}:
+            for pisp, posp in separate.items():
+                f.write(pisp + "\n")
+                for each_posp in posp:
+                    f.write(
+                        "\t{}\t{}\t{}\t{}\t{}\t{}\t".format(
+                            each_posp[1],
+                            each_posp[2],
+                            each_posp[3],
+                            each_posp[4],
+                            each_posp[5],
+                            each_posp[6],
+                        )
+                    )
+                    if each_posp[6] == "+":
+                        if each_posp[2] in pair_dict["pairs"]["pair_number"].keys():
+                            f.write(
+                                ""
+                                + str(pair_dict["pairs"]["pair_number"][each_posp[2]])
+                                + "\n"
+                            )
+                    elif each_posp[6] == "-":
+                        if each_posp[1] in pair_dict["pairs"]["pair_number"].keys():
+                            f.write(
+                                ""
+                                + str(pair_dict["pairs"]["pair_number"][each_posp[1]])
+                                + "\n"
+                            )
+        else:
+            f.write("nothing found")
+
+    with open(args.separate_txt.name, "a") as f:
+        f.write("\n================= separated candidate sequences ================\n")
+        f.write("======================= isp ==========================\n\n")
+        for isp_data in isp_seqs:
+            f.write(
+                ">isp_orf::{}-pericount~={}\n{}\n".format(
+                    isp_data[0], isp_data[2], lineWrapper(isp_data[1])
+                )
+            )
+        f.write("\n======================= osp ========================\n\n")
+        for osp_data in osp_seqs:
+            f.write(
+                ">osp_orf::{}-pericount~={}\n{}\n".format(
+                    osp_data[0], osp_data[2], lineWrapper(osp_data[1])
+                )
+            )
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/findSpanin.xml	Mon Jun 05 02:42:01 2023 +0000
@@ -0,0 +1,43 @@
+<tool id="edu.tamu.cpt2.spanin.findSpanin" name="Find Spanin" version="1.0">
+    <description>With the outputs from the ISP and OSP candidate tools, cull the list down to candidate pairs</description>
+    <macros>
+        <import>macros.xml</import>
+        <import>cpt-macros.xml</import>
+    </macros>
+    <expand macro="requirements">
+    </expand>
+    <command detect_errors="aggressive"><![CDATA[
+python '$__tool_directory__/findSpanin.py'
+'$putative_isp_fasta_file'
+'$putative_osp_fasta_file'
+--max_isp_osp_distance '$max_isp_osp_distance'
+--embedded_txt '$embedded_txt'
+--overlap_txt '$overlap_txt'
+--separate_txt '$separate_txt'
+--summary_txt '$summary_txt'
+]]></command>
+    <inputs>
+        <param label="(putative) i-spanin FASTA file" name="putative_isp_fasta_file" type="data" format="fasta"/>
+        <param label="(putative) o-spanin FASTA file" name="putative_osp_fasta_file" type="data" format="fasta"/>
+        <param label="max distance from end of i-spanin to beginning of o-spanin (measured in AAs)" name="max_isp_osp_distance" type="integer" value="10"/>
+    </inputs>
+    <outputs>
+        <data format="txt" name="summary_txt" label="FindSpanin_summary.txt"/>
+        <data format="tabular" name="embedded_txt" label="embedded_results.txt"/>
+        <data format="tabular" name="overlap_txt" label="overlap_results.txt"/>
+        <data format="tabular" name="separate_txt" label="separate_results.txt"/>
+    </outputs>
+    <help><![CDATA[
+**What it does**
+Compares the protein FASTA files with candidate i-spanins and o-spanins from a genome and matches them into candidate pairs based on position in a strand-aware fashion.
+
+**INPUT** --> Putative i-spanin and o-spanin protein multiFASTAs (generated from the ISP/OSP Candidate Tools). 
+
+**METHODOLOGY** 
+Does a pairwise comparison between candidate i-spanins and o-spanins based on their genomic location, and classifies them into the known bimolecular spanin genetic architectures. Classes are: embedded, overlapping, separated, or NOT a potential pair.
+
+**OUTPUT** --> File with candidate pairs for each bimolecular spanin class and a basic summary statistics file. 
+
+]]></help>
+    <expand macro="citations-2020"/>
+</tool>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/macros.xml	Mon Jun 05 02:42:01 2023 +0000
@@ -0,0 +1,58 @@
+<macros>
+    <xml name="requirements">
+        <requirements>
+            <requirement type="package" version="2019.06.05">regex</requirement>
+            <requirement type="package" version="3.6">python</requirement>
+            <requirement type="package" version="1.77">biopython</requirement>
+            <requirement type="package" version="1.1.7">cpt_gffparser</requirement>
+            <yield/>
+        </requirements>
+    </xml>
+    <token name="@BLAST_TSV@">
+		$blast_tsv
+	</token>
+    <xml name="blast_tsv">
+        <param label="Blast Results" help="TSV/tabular (25 Column)" name="blast_tsv" type="data" format="tabular"/>
+    </xml>
+    <token name="@BLAST_XML@">
+		$blast_xml
+	</token>
+    <xml name="blast_xml">
+        <param label="Blast Results" help="XML format" name="blast_xml" type="data" format="blastxml"/>
+    </xml>
+    <xml name="gff3_with_fasta">
+        <param label="Genome Sequences" name="fasta" type="data" format="fasta"/>
+        <param label="Genome Annotations" name="gff3" type="data" format="gff3"/>
+    </xml>
+    <xml name="genome_selector">
+        <param name="genome_fasta" type="data" format="fasta" label="Source FASTA Sequence"/>
+    </xml>
+    <xml name="gff3_input">
+        <param label="GFF3 Annotations" name="gff3_data" type="data" format="gff3"/>
+    </xml>
+    <xml name="input/gff3+fasta">
+        <expand macro="gff3_input"/>
+        <expand macro="genome_selector"/>
+    </xml>
+    <token name="@INPUT_GFF@">
+	    $gff3_data
+	</token>
+    <token name="@INPUT_FASTA@">
+		genomeref.fa
+	</token>
+    <token name="@GENOME_SELECTOR_PRE@">
+		ln -s $genome_fasta genomeref.fa;
+	</token>
+    <token name="@GENOME_SELECTOR@">
+		genomeref.fa
+	</token>
+    <xml name="input/fasta">
+        <param label="Fasta file" name="sequences" type="data" format="fasta"/>
+    </xml>
+    <token name="@SEQUENCE@">
+		$sequences
+	</token>
+    <xml name="input/fasta/protein">
+        <param label="Protein fasta file" name="sequences" type="data" format="fasta"/>
+    </xml>
+</macros>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/spaninFuncs.py	Mon Jun 05 02:42:01 2023 +0000
@@ -0,0 +1,530 @@
+"""
+PREMISE
+### Functions/Classes that are used in both generate-putative-osp.py and generate-putative-isp.py
+###### Main premise here is to make the above scripts a little more DRY, as well as easily readable for execution.
+###### Documentation will ATTEMPT to be thorough here
+"""
+
+import re
+from Bio import SeqIO
+from Bio import Seq
+from collections import OrderedDict
+
+# Not written in OOP for a LITTLE bit of trying to keep the complication down in case adjustments are needed by someone else.
+# Much of the manipulation is string based; so it should be straightforward as well as moderately quick
+################## Global Variables
+Lys = "K"
+
+
+def check_back_end_snorkels(seq, tmsize):
+    """
+    Searches through the backend of a potential TMD snorkel. This is the 2nd part of a TMD snorkel lysine match.
+    --> seq : should be the sequence fed from the "search_region" portion of the sequence
+    --> tmsize : size of the potential TMD being investigated
+    """
+    found = []
+    if seq[tmsize - 4] == Lys and re.search(("[FIWLVMYCATGS]"), seq[tmsize - 5]):
+        found = "match"
+        return found
+    elif seq[tmsize - 3] == Lys and re.search(("[FIWLVMYCATGS]"), seq[tmsize - 4]):
+        found = "match"
+        return found
+    elif seq[tmsize - 2] == Lys and re.search(("[FIWLVMYCATGS]"), seq[tmsize - 3]):
+        found = "match"
+        return found
+    elif seq[tmsize - 1] == Lys and re.search(("[FIWLVMYCATGS]"), seq[tmsize - 2]):
+        found = "match"
+        return found
+    else:
+        found = "NOTmatch"
+        return found
+
+
+def prep_a_gff3(fa, spanin_type, org):
+    """
+    Function parses an input detailed 'fa' file and outputs a 'gff3' file
+    ---> fa = input .fa file
+    ---> output = output a returned list of data, easily portable to a gff3 next
+    ---> spanin_type = 'isp' or 'osp'
+    """
+    with org as f:
+        header = f.readline()
+        orgacc = header.split(" ")
+        orgacc = orgacc[0].split(">")[1].strip()
+    fa_zip = tuple_fasta(fa)
+    data = []
+    for a_pair in fa_zip:
+        # print(a_pair)
+        if re.search(("(\[1\])"), a_pair[0]):
+            strand = "+"
+        elif re.search(("(\[-1\])"), a_pair[0]):
+            strand = "-"  # column 7
+        start = re.search(("[\d]+\.\."), a_pair[0]).group(0).split("..")[0]  # column 4
+        end = re.search(("\.\.[\d]+"), a_pair[0]).group(0).split("..")[1]  # column 5
+        orfid = re.search(("(ORF)[\d]+"), a_pair[0]).group(0)  # column 1
+        if spanin_type == "isp":
+            methodtype = "CDS"  # column 3
+            spanin = "isp"
+        elif spanin_type == "osp":
+            methodtype = "CDS"  # column 3
+            spanin = "osp"
+        elif spanin_type == "usp":
+            methodtype = "CDS"
+            spanin = "usp"
+        else:
+            raise "need to input spanin type"
+        source = "cpt.py|putative-*.py"  # column 2
+        score = "."  # column 6
+        phase = "."  # column 8
+        attributes = (
+            "ID=" + orgacc + "|" + orfid + ";ALIAS=" + spanin + ";SEQ=" + a_pair[1]
+        )  # column 9
+        sequence = [
+            [orgacc, source, methodtype, start, end, score, strand, phase, attributes]
+        ]
+        data += sequence
+    return data
+
+
+def write_gff3(data, output="results.gff3"):
+    """
+    Parses results from prep_a_gff3 into a gff3 file
+    ---> input : list from prep_a_gff3
+    ---> output : gff3 file
+    """
+    # `output` may be an already-open handle (e.g. argparse FileType) or a path
+    with (output if hasattr(output, "write") else open(output, "w")) as f:
+        f.write("#gff-version 3\n")
+        for value in data:
+            f.write(
+                "{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\n".format(
+                    value[0],
+                    value[1],
+                    value[2],
+                    value[3],
+                    value[4],
+                    value[5],
+                    value[6],
+                    value[7],
+                    value[8],
+                )
+            )
+
+
+def find_tmd(
+    pair,
+    minimum=10,
+    maximum=30,
+    TMDmin=10,
+    TMDmax=20,
+    isp_mode=False,
+    peri_min=18,
+    peri_max=206,
+):
+    """
+    Function that searches for lysine snorkels and then for a spanning hydrophobic region that indicates a potential TMD
+    ---> pair : Input of tuple with description and AA sequence (str)
+    ---> minimum : How close from the initial start codon a TMD can be within
+    ---> maximum : How far from the initial start codon a TMD can be within
+    ---> TMDmin : The minimum size that a transmembrane can be (default = 10)
+    ---> TMDmax : The maximum size that a transmembrane can be (default = 20)
+    """
+    # hydrophobicAAs = ['P', 'F', 'I', 'W', 'L', 'V', 'M', 'Y', 'C', 'A', 'T', 'G', 'S']
+    tmd = []
+    s = str(pair[1])  # sequence being analyzed
+    # print(s) # for trouble shooting
+    if maximum > len(s):
+        maximum = len(s)
+    search_region = s[minimum - 1 : maximum + 1]
+    # print(f"this is the search region: {search_region}")
+    # print(search_region) # for trouble shooting
+
+    for tmsize in range(TMDmin, TMDmax + 1, 1):
+        # print(f"this is the current tmsize we're trying: {tmsize}")
+        # print('==============='+str(tmsize)+'================') # print for troubleshooting
+        pattern = (
+            "[PFIWLVMYCATGS]{" + str(tmsize) + "}"
+        )  # searches for these hydrophobic residues tmsize total times
+        # print(pattern)
+        # print(f"sending to regex: {search_region}")
+        if re.search(
+            ("[K]"), search_region[1:8]
+        ):  # grabbing one below with search region, so I want to grab one ahead here when I query.
+            store_search = re.search(
+                ("[K]"), search_region[1:8]
+            )  # storing regex object
+            where_we_are = store_search.start()  # finding where we got the hit
+            if re.search(
+                ("[PFIWLVMYCATGS]"), search_region[where_we_are + 1]
+            ) and re.search(
+                ("[PFIWLVMYCATGS]"), search_region[where_we_are - 1]
+            ):  # hydrophobic neighbor
+                backend = check_back_end_snorkels(search_region, tmsize)
+                if backend == "match":
+                    if isp_mode:
+                        g = re.search((pattern), search_region).group()
+                        end_of_tmd = re.search((g), s).end() + 1
+                        amt_peri = len(s) - end_of_tmd
+                        if peri_min <= amt_peri <= peri_max:
+                            pair_desc = pair[0] + ", peri_count~=" + str(amt_peri)
+                            new_pair = (pair_desc, pair[1])
+                            tmd.append(new_pair)
+                    else:
+                        tmd.append(pair)
+                else:
+                    continue
+        if re.search((pattern), search_region):
+            # print(f"found match: {}")
+            # print("I AM HEREEEEEEEEEEEEEEEEEEEEEEE")
+            # try:
+            if isp_mode:
+                g = re.search((pattern), search_region).group()
+                end_of_tmd = re.search((g), s).end() + 1
+                amt_peri = len(s) - end_of_tmd
+                if peri_min <= amt_peri <= peri_max:
+                    pair_desc = pair[0] + ", peri_count~=" + str(amt_peri)
+                    new_pair = (pair_desc, pair[1])
+                    tmd.append(new_pair)
+            else:
+                tmd.append(pair)
+        else:
+            continue
+
+        # NOTE: returns after the first tmsize that produces a result; if no
+        # tmsize matches, the function falls through and implicitly returns None
+        return tmd
+
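+# Usage sketch (mirrors the TMD test in __main__ below; the sequence is a toy
+# example containing a 15-residue hydrophobic run):
+#   hit = find_tmd(pair=("one", "XXXXXXXXXXXXXXXFMCFMCFMCFMCFMCXXXXXXXXXX"))
+#   # -> [(description, sequence)] on a hit; None when no qualifying run of
+#   # TMDmin..TMDmax hydrophobic residues is found in the search window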
+
+def find_lipobox(
+    pair, minimum=10, maximum=50, min_after=30, max_after=185, regex=1, osp_mode=False
+):
+    """
+    Function that takes an input tuple, and will return pairs of sequences to their description that have a lipoobox
+    ---> minimum - min distance from start codon to first AA of lipobox
+    ---> maximum - max distance from start codon to first AA of lipobox
+    ---> regex - option 1 (default) => more strict regular expression ; option 2 => looser selection, imported from LipoRy
+
+    """
+    if regex == 1:
+        # stricter Lipobox regex from findSpanin.pl, plus the AW[AGS]C variant
+        patterns = ["[ILMFTV][^REKD][GAS]C", "AW[AGS]C"]
+    elif regex == 2:
+        # looser Lipobox regex from LipoRy, plus the AW[AGS]C variant
+        patterns = ["[ACGSILMFTV][^REKD][GAS]C", "AW[AGS]C"]
+    else:
+        raise ValueError("regex must be 1 or 2")
+
+    candidates = []
+    s = str(pair[1])
+    # print(s) # trouble shooting
+    search_region = s[
+        minimum - 1 : maximum + 5
+    ]  # slice the input; pad the right edge by 5 so a lipobox starting at `maximum` is still fully captured
+    # print(search_region) # trouble shooting
+    patterns = ["[ILMFTV][^REKD][GAS]C", "AW[AGS]C"]
+    for pattern in patterns:
+        # print(pattern)  # trouble shooting
+        if re.search((pattern), search_region):  # lipobox must be WITHIN the range...
+            g = re.search(
+                (pattern), search_region
+            ).group()  # find the exact matching subsequence
+            amt_peri = len(s) - re.search((g), s).end() + 1
+            if min_after <= amt_peri <= max_after:  # periplasmic region after the lipobox must be within range
+                if osp_mode:
+                    pair_desc = pair[0] + ", peri_count~=" + str(amt_peri)
+                    new_pair = (pair_desc, pair[1])
+                    candidates.append(new_pair)
+                else:
+                    candidates.append(pair)
+                # returns on the first pattern that yields a candidate; if none
+                # do, the function implicitly returns None
+                return candidates
+
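+# Usage sketch (test "seven" in the __main__ block below is a true positive:
+# its ISGC lipobox falls inside the search window and leaves a periplasmic
+# tail within min_after..max_after; `seq` stands for that test sequence):
+#   hits = find_lipobox(pair=("seven", seq), regex=2)  # -> [("seven", seq)]
+#   # returns None when no lipobox (or qualifying periplasmic tail) is found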
+
+def tuple_fasta(fasta_file):
+    """
+    #### INPUT: Fasta File
+    #### OUTPUT: zipped (zip) : pairwise relationship of description to sequence
+    ####
+    """
+    fasta = SeqIO.parse(fasta_file, "fasta")
+    descriptions = []
+    sequences = []
+    for r in fasta:  # iterates and stores each description and sequence
+        description = r.description
+        sequence = str(r.seq)
+        if (
+            sequence[0] != "I"
+        ):  # the upstream translation table allows I as a start codon, so drop ORFs whose translation starts with I
+            descriptions.append(description)
+            sequences.append(sequence)
+        else:
+            continue
+
+    return zip(descriptions, sequences)
+
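+# Usage sketch ("out_isp.fa" is a hypothetical upstream FASTA, as in the
+# commented test below):
+#   for description, sequence in tuple_fasta("out_isp.fa"):
+#       print(description, len(sequence))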
+
+def lineWrapper(text, charactersize=60):
+    """Recursively wraps a sequence string to `charactersize` characters per line"""
+    if len(text) <= charactersize:
+        return text
+    else:
+        return (
+            text[:charactersize]
+            + "\n"
+            + lineWrapper(text[charactersize:], charactersize)
+        )
+
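+# Example (illustrative): lineWrapper("A" * 130) returns three newline-joined
+# lines of 60, 60, and 10 characters.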
+
+def getDescriptions(fasta):
+    """
+    Takes an output FASTA file, and parses retrieves the description headers. These headers contain information needed
+    for finding locations of a potential i-spanin and o-spanin proximity to one another.
+    """
+    desc = []
+    with fasta as f:
+        for line in f:
+            if line.startswith(">"):
+                desc.append(line)
+    return desc
+
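+# Usage sketch (expects an open handle, since the function enters it as a
+# context manager; "out_isp.fa" is a hypothetical path):
+#   headers = getDescriptions(open("out_isp.fa"))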
+
+def splitStrands(text, strand="+"):
+    """Returns `text` if its strand tag ([1] or [-1]) matches `strand`; otherwise returns None"""
+    if strand == "+":
+        if re.search(r"(\[1\])", text):
+            return text
+    elif strand == "-":
+        if re.search(r"(\[-1\])", text):
+            return text
+
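+# Example (illustrative header fragments):
+#   splitStrands("ORF5 ... 100..250 ... [1]", "+")  # -> returns the text
+#   splitStrands("ORF5 ... 100..250 ... [1]", "-")  # -> returns None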
+
+def parse_a_range(pair, start, end):
+    """
+    Takes an input data tuple from a fasta tuple pair and keeps only those within the input sequence range
+    ---> data : fasta tuple data
+    ---> start : start range to keep
+    ---> end : end range to keep (will need to + 1)
+    """
+    matches = []
+    for each_pair in pair:
+        s = re.search(r"[\d]+\.\.", each_pair[0]).group(0)  # start of the sequence
+        s = int(s.split("..")[0])
+        e = re.search(r"\.\.[\d]+", each_pair[0]).group(0)  # end of the sequence
+        e = int(e.split("..")[1])
+        if start - 1 <= s and e <= end + 1:
+            matches.append(each_pair)
+    return matches
+
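+# Usage sketch (headers must carry "start..end" coordinates, as produced by
+# the upstream ORF caller; "out_isp.fa" is a hypothetical input):
+#   pairs = tuple_fasta("out_isp.fa")
+#   in_window = parse_a_range(pairs, start=100, end=500)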
+
+def grabLocs(text):
+    """
+    Grabs the locations of the spanin based on NT location (seen from ORF). Grabs the ORF name, as per named from the ORF class/module
+    from cpt.py
+    """
+    start = re.search(r"[\d]+\.\.", text).group(
+        0
+    )  # Start of the sequence ; looks for [numbers]..
+    end = re.search(r"\.\.[\d]+", text).group(
+        0
+    )  # End of the sequence ; looks for ..[numbers]
+    orf = re.search(r"(ORF)[\d]+", text).group(
+        0
+    )  # Looks for ORF and the numbers that are after it
+    if re.search(r"(\[1\])", text):  # stores strand
+        strand = "+"
+    elif re.search(r"(\[-1\])", text):  # stores strand
+        strand = "-"
+    else:
+        raise ValueError("could not determine strand: no [1] or [-1] tag in " + repr(text))
+    start = int(start.split("..")[0])
+    end = int(end.split("..")[1])
+    vals = [start, end, orf, strand]
+
+    return vals
+
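+# Example (illustrative header):
+#   grabLocs("PhageAcc [100..250] ORF5 [1]")  # -> [100, 250, "ORF5", "+"]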
+
+def spaninProximity(isp, osp, max_dist=30):
+    """
+    _NOTE THIS FUNCTION COULD BE MODIFIED TO RETURN SEQUENCES_
+    Compares the locations of i-spanins and o-spanins. max_dist is the distance in NT measurement from i-spanin END site
+    to o-spanin START. The user will be inputting AA distance, so a conversion will be necessary (<user_input> * 3)
+    I modified this on 07.30.2020 to bypass the pick + or - strand. To
+    INPUT: list of OSP and ISP candidates
+    OUTPUT: Return (improved) candidates for overlapping, embedded, and separate list
+    """
+
+    embedded = {}
+    overlap = {}
+    separate = {}
+    for iseq in isp:
+        embedded[iseq[2]] = []
+        overlap[iseq[2]] = []
+        separate[iseq[2]] = []
+        for oseq in osp:
+            if iseq[3] == "+":
+                if oseq[3] == "+":
+                    if iseq[0] < oseq[0] < iseq[1] and oseq[1] < iseq[1]:
+                        ### EMBEDDED ###
+                        combo = [
+                            iseq[0],
+                            iseq[1],
+                            oseq[2],
+                            oseq[0],
+                            oseq[1],
+                            iseq[3],
+                        ]  # ordering a return for dic
+                        embedded[iseq[2]] += [combo]
+                    elif iseq[0] < oseq[0] <= iseq[1] and oseq[1] > iseq[1]:
+                        ### OVERLAP / SEPARATE ###
+                        if (iseq[1] - oseq[0]) < 6:
+                            combo = [
+                                iseq[0],
+                                iseq[1],
+                                oseq[2],
+                                oseq[0],
+                                oseq[1],
+                                iseq[3],
+                            ]
+                            separate[iseq[2]] += [combo]
+                        else:
+                            combo = [
+                                iseq[0],
+                                iseq[1],
+                                oseq[2],
+                                oseq[0],
+                                oseq[1],
+                                iseq[3],
+                            ]
+                            overlap[iseq[2]] += [combo]
+                    elif iseq[1] <= oseq[0] <= iseq[1] + max_dist:
+                        combo = [iseq[0], iseq[1], oseq[2], oseq[0], oseq[1], iseq[3]]
+                        separate[iseq[2]] += [combo]
+                    else:
+                        continue
+            if iseq[3] == "-":
+                if oseq[3] == "-":
+                    if iseq[0] <= oseq[1] <= iseq[1] and oseq[0] > iseq[0]:
+                        ### EMBEDDED ###
+                        combo = [
+                            iseq[0],
+                            iseq[1],
+                            oseq[2],
+                            oseq[0],
+                            oseq[1],
+                            iseq[3],
+                        ]  # ordering a return for dict
+                        embedded[iseq[2]] += [combo]
+                    elif iseq[0] <= oseq[1] <= iseq[1] and oseq[0] < iseq[0]:
+                        if (oseq[1] - iseq[0]) < 6:
+                            combo = [
+                                iseq[0],
+                                iseq[1],
+                                oseq[2],
+                                oseq[0],
+                                oseq[1],
+                                iseq[3],
+                            ]
+                            separate[iseq[2]] += [combo]
+                        else:
+                            combo = [
+                                iseq[0],
+                                iseq[1],
+                                oseq[2],
+                                oseq[0],
+                                oseq[1],
+                                iseq[3],
+                            ]
+                            overlap[iseq[2]] += [combo]
+                    elif iseq[0] - 10 < oseq[1] < iseq[0]:  # NOTE: fixed 10 NT window here, not max_dist
+                        combo = [iseq[0], iseq[1], oseq[2], oseq[0], oseq[1], iseq[3]]
+                        separate[iseq[2]] += [combo]
+                    else:
+                        continue
+
+    embedded = {k: embedded[k] for k in embedded if embedded[k]}
+    overlap = {k: overlap[k] for k in overlap if overlap[k]}
+    separate = {k: separate[k] for k in separate if separate[k]}
+
+    return embedded, overlap, separate
+
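+# Usage sketch (each candidate is [start, end, orf, strand], as returned by
+# grabLocs; the coordinates here are hypothetical):
+#   isp = [[100, 250, "ORF5", "+"]]
+#   osp = [[240, 400, "ORF6", "+"]]
+#   embedded, overlap, separate = spaninProximity(isp, osp, max_dist=30)
+#   # ORF6 starts 10 NT before ORF5 ends (>= 6), so it lands in `overlap`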
+
+def check_for_usp():
+    pass  # placeholder; u-spanin checking is not implemented yet
+
+
+############################################### TEST RANGE #########################################################################
+####################################################################################################################################
+if __name__ == "__main__":
+
+    #### TMD TEST
+    test_desc = ["one", "two", "three", "four", "five"]
+    test_seq = [
+        "XXXXXXXXXXXXXXXFMCFMCFMCFMCFMCXXXXXXXXXXXXXXXXXXXXXXXXXX",
+        "XXXXXXXXAAKKKKKKKKKKKKKKKXXXXXXXXXXXXX",
+        "XXXXXXX",
+        "XXXXXXXXXXXKXXXXXXXXXX",
+        "XXXXXXXXXXAKXXXXXXXXXXAKXXXXXXXX",
+    ]
+    pairs = zip(test_desc, test_seq)
+    tmd = []
+    for each_pair in pairs:
+        # print(each_pair)
+        try:
+            tmd += find_tmd(pair=each_pair)
+        except (IndexError, TypeError):
+            continue
+    # print('\n+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\n')
+    # print(tmd)
+    # print('\n+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\n')
+
+    #### tuple-fasta TEST
+    # fasta_file = 'out_isp.fa'
+    # ret = tuple_fasta(fasta_file)
+    # print('=============')
+    # for i in ret:
+    # print(i[1])
+
+    #### LipoBox TEST
+    test_desc = ["one", "two", "three", "four", "five", "six", "seven"]
+    test_seq = [
+        "XXXXXXXXXTGGCXXXXXXXXXXXXXXXX",
+        "XXXXXXXXAAKKKKKKKKKKKKKKKXXXXXXXXXXXXX",
+        "XXXXXXX",
+        "AGGCXXXXXXXXXXXXXXXXXXXXTT",
+        "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXTGGC",
+        "XXXXXXXXXXXXXXXXXXXXXXXXXXTGGC",
+        "MSTLRELRLRRALKEQSMRYLLSIKKTLPRWKGALIGLFLICVATISGCASESKLPEPPMVSVDSSLMVEPNLTTEMLNVFSQ*",
+    ]
+    pairs = zip(test_desc, test_seq)
+    lipo = []
+    for each_pair in pairs:
+        # print(each_pair)
+        try:
+            lipo += find_lipobox(pair=each_pair, regex=2)  # , minimum=8)
+        except TypeError:  # catches anything that doesn't meet the min/max requirements (i.e., is too small)
+            continue
+    # print('\n+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\n')
+    #### GFF3 TEST
+    # g = prep_a_gff3(fa='putative_isp.fa', spanin_type='isp')
+    # print(g)
+    # write_gff3(data=g)