# HG changeset patch
# User cpt
# Date 1685932921 0
# Node ID fd70980a516b2db8b444cc15577cbb06234cc421
# Parent 1a7fef71aee3ae232349b37f3a3ce5f76a03523c
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
diff -r 1a7fef71aee3 -r fd70980a516b cpt-macros.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/cpt-macros.xml Mon Jun 05 02:42:01 2023 +0000
@@ -0,0 +1,114 @@
+<?xml version="1.0"?>
+<macros>
+    <xml name="gff_requirements">
+        <requirements>
+            <requirement type="package">python</requirement>
+            <requirement type="package">biopython</requirement>
+            <requirement type="package">requests</requirement>
+            <yield/>
+        </requirements>
+    </xml>
+    <xml name="citation/mijalisrasche">
+        <citations>
+            <citation type="doi">10.1371/journal.pcbi.1008214</citation>
+            <citation type="bibtex">@unpublished{galaxyTools,
+                author = {E. Mijalis, H. Rasche},
+                title = {CPT Galaxy Tools},
+                year = {2013-2017},
+                note = {https://github.com/tamu-cpt/galaxy-tools/}
+            }
+            </citation>
+        </citations>
+    </xml>
+    <xml name="citations">
+        <citations>
+            <citation type="doi">10.1371/journal.pcbi.1008214</citation>
+            <citation type="bibtex">
+            @unpublished{galaxyTools,
+                author = {E. Mijalis, H. Rasche},
+                title = {CPT Galaxy Tools},
+                year = {2013-2017},
+                note = {https://github.com/tamu-cpt/galaxy-tools/}
+            }
+            </citation>
+            <yield/>
+        </citations>
+    </xml>
+    <xml name="citations-crr">
+        <citations>
+            <citation type="doi">10.1371/journal.pcbi.1008214</citation>
+            <citation type="bibtex">
+            @unpublished{galaxyTools,
+                author = {C. Ross},
+                title = {CPT Galaxy Tools},
+                year = {2020-},
+                note = {https://github.com/tamu-cpt/galaxy-tools/}
+            }
+            </citation>
+            <yield/>
+        </citations>
+    </xml>
+    <xml name="citations-2020">
+        <citations>
+            <citation type="doi">10.1371/journal.pcbi.1008214</citation>
+            <citation type="bibtex">
+            @unpublished{galaxyTools,
+                author = {E. Mijalis, H. Rasche},
+                title = {CPT Galaxy Tools},
+                year = {2013-2017},
+                note = {https://github.com/tamu-cpt/galaxy-tools/}
+            }
+            </citation>
+            <citation type="bibtex">
+            @unpublished{galaxyTools,
+                author = {A. Criscione},
+                title = {CPT Galaxy Tools},
+                year = {2019-2021},
+                note = {https://github.com/tamu-cpt/galaxy-tools/}
+            }
+            </citation>
+            <yield/>
+        </citations>
+    </xml>
+    <xml name="citations-2020-AJC-solo">
+        <citations>
+            <citation type="doi">10.1371/journal.pcbi.1008214</citation>
+            <citation type="bibtex">
+            @unpublished{galaxyTools,
+                author = {A. Criscione},
+                title = {CPT Galaxy Tools},
+                year = {2019-2021},
+                note = {https://github.com/tamu-cpt/galaxy-tools/}
+            }
+            </citation>
+            <yield/>
+        </citations>
+    </xml>
+    <xml name="citations-clm">
+        <citations>
+            <citation type="doi">10.1371/journal.pcbi.1008214</citation>
+            <citation type="bibtex">
+            @unpublished{galaxyTools,
+                author = {C. Maughmer},
+                title = {CPT Galaxy Tools},
+                year = {2017-2020},
+                note = {https://github.com/tamu-cpt/galaxy-tools/}
+            }
+            </citation>
+            <yield/>
+        </citations>
+    </xml>
+    <xml name="citations-clm-solo">
+        <citations>
+            <citation type="bibtex">
+            @unpublished{galaxyTools,
+                author = {C. Maughmer},
+                title = {CPT Galaxy Tools},
+                year = {2017-2020},
+                note = {https://github.com/tamu-cpt/galaxy-tools/}
+            }
+            </citation>
+            <yield/>
+        </citations>
+    </xml>
+</macros>
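+<!-- Hypothetical usage sketch (the macro names follow the reconstruction above
+     and may differ upstream): a tool wrapper imports this file and expands a
+     citation macro, e.g.
+         <macros><import>cpt-macros.xml</import></macros>
+         <expand macro="citations-2020"/>
+-->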
diff -r 1a7fef71aee3 -r fd70980a516b cpt.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/cpt.py Mon Jun 05 02:42:01 2023 +0000
@@ -0,0 +1,341 @@
+#!/usr/bin/env python
+from Bio.Seq import Seq, reverse_complement, translate
+from Bio.SeqRecord import SeqRecord
+from Bio import SeqIO
+from Bio.Data import CodonTable
+import regex as re  # third-party "regex" package: finditer(..., overlapped=True) below requires it; stdlib re does not support overlapped matching
+import logging
+
+logging.basicConfig()
+log = logging.getLogger()
+
+PHAGE_IN_MIDDLE = re.compile(r"^(?P<host>.*)\s*phage (?P<phage>.*)$")
+BACTERIOPHAGE_IN_MIDDLE = re.compile(r"^(?P<host>.*)\s*bacteriophage (?P<phage>.*)$")
+STARTS_WITH_PHAGE = re.compile(
+    r"^(bacterio|vibrio|Bacterio|Vibrio|)?[Pp]hage (?P<phage>.*)$"
+)
+NEW_STYLE_NAMES = re.compile(r"(?P<phage>v[A-Z]_[A-Z][a-z]{2}_.*)")
+
+
+def phage_name_parser(name):
+ host = None
+ phage = None
+ name = name.replace(", complete genome.", "")
+ name = name.replace(", complete genome", "")
+
+ m = BACTERIOPHAGE_IN_MIDDLE.match(name)
+ if m:
+ host = m.group("host")
+ phage = m.group("phage")
+ return (host, phage)
+
+ m = PHAGE_IN_MIDDLE.match(name)
+ if m:
+ host = m.group("host")
+ phage = m.group("phage")
+ return (host, phage)
+
+ m = STARTS_WITH_PHAGE.match(name)
+ if m:
+ phage = m.group("phage")
+ return (host, phage)
+
+ m = NEW_STYLE_NAMES.match(name)
+ if m:
+ phage = m.group("phage")
+ return (host, phage)
+
+ return (host, phage)
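+# Illustrative examples (hypothetical inputs, not from this changeset), showing
+# the match order above:
+#   phage_name_parser("Escherichia coli bacteriophage T4, complete genome")
+#     -> ("Escherichia coli ", "T4")  # host may keep a trailing space, since
+#                                     # the greedy (?P<host>.*) group absorbs it
+#   phage_name_parser("Phage P22") -> (None, "P22")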
+
+
+class OrfFinder(object):
+ def __init__(self, table, ftype, ends, min_len, strand):
+ self.table = table
+ self.table_obj = CodonTable.ambiguous_generic_by_id[table]
+ self.ends = ends
+ self.ftype = ftype
+ self.min_len = min_len
+ self.starts = sorted(self.table_obj.start_codons)
+ self.stops = sorted(self.table_obj.stop_codons)
+ self.re_starts = re.compile("|".join(self.starts))
+ self.re_stops = re.compile("|".join(self.stops))
+ self.strand = strand
+
+ def locate(self, fasta_file, out_nuc, out_prot, out_bed, out_gff3):
+ seq_format = "fasta"
+ log.debug("Genetic code table %i" % self.table)
+ log.debug("Minimum length %i aa" % self.min_len)
+
+ out_count = 0
+
+ out_gff3.write("##gff-version 3\n")
+
+ for idx, record in enumerate(SeqIO.parse(fasta_file, seq_format)):
+ for i, (f_start, f_end, f_strand, n, t) in enumerate(
+ self.get_all_peptides(str(record.seq).upper())
+ ):
+ out_count += 1
+
+ descr = "length %i aa, %i bp, from %s..%s[%s] of %s" % (
+ len(t),
+ len(n),
+ f_start,
+ f_end,
+ f_strand,
+ record.description,
+ )
+ fid = record.id + "|%s%i" % (self.ftype, i + 1)
+
+ r = SeqRecord(Seq(n), id=fid, name="", description=descr)
+ t = SeqRecord(Seq(t), id=fid, name="", description=descr)
+
+ SeqIO.write(r, out_nuc, "fasta")
+ SeqIO.write(t, out_prot, "fasta")
+
+ nice_strand = "+" if f_strand == +1 else "-"
+
+ out_bed.write(
+ "\t".join(
+ map(str, [record.id, f_start, f_end, fid, 0, nice_strand])
+ )
+ + "\n"
+ )
+
+ out_gff3.write(
+ "\t".join(
+ map(
+ str,
+ [
+ record.id,
+ "getOrfsOrCds",
+ "CDS",
+ f_start + 1,
+ f_end,
+ ".",
+ nice_strand,
+ 0,
+ "ID=%s.%s.%s" % (self.ftype, idx, i + 1),
+ ],
+ )
+ )
+ + "\n"
+ )
+ log.info("Found %i %ss", out_count, self.ftype)
+
+ def start_chop_and_trans(self, s, strict=True):
+        """Yields offset, trimmed nuc, protein."""
+ if strict:
+ assert s[-3:] in self.stops, s
+ assert len(s) % 3 == 0
+ for match in self.re_starts.finditer(s, overlapped=True):
+ # Must check the start is in frame
+ start = match.start()
+ if start % 3 == 0:
+ n = s[start:]
+ assert len(n) % 3 == 0, "%s is len %i" % (n, len(n))
+ if strict:
+ t = translate(n, self.table)
+ else:
+ # Use when missing stop codon,
+ t = "M" + translate(n[3:], self.table, to_stop=True)
+ yield start, n, t # Edited by CPT to be a generator
+
+ def break_up_frame(self, s):
+        """Yields offset, nuc, protein."""
+ start = 0
+ for match in self.re_stops.finditer(s, overlapped=True):
+ index = match.start() + 3
+ if index % 3 != 0:
+ continue
+ n = s[start:index]
+ for (offset, n, t) in self.start_chop_and_trans(n):
+ if n and len(t) >= self.min_len:
+ yield start + offset, n, t
+ start = index
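+    # Worked example (sketch, assuming table=11): for s = "ATGAAATAG", re_stops
+    # matches "TAG" ending at index 9 (in frame), so n = "ATGAAATAG" and
+    # start_chop_and_trans() yields offset 0 with protein "MK*" (strict mode
+    # keeps the trailing stop as "*").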
+
+ def putative_genes_in_sequence(self, nuc_seq):
+ """Returns start, end, strand, nucleotides, protein.
+ Co-ordinates are Python style zero-based.
+ """
+ nuc_seq = nuc_seq.upper()
+ # TODO - Refactor to use a generator function (in start order)
+ # rather than making a list and sorting?
+ answer = []
+ full_len = len(nuc_seq)
+
+ for frame in range(0, 3):
+ for offset, n, t in self.break_up_frame(nuc_seq[frame:]):
+ start = frame + offset # zero based
+ answer.append((start, start + len(n), +1, n, t))
+
+ rc = reverse_complement(nuc_seq)
+ for frame in range(0, 3):
+ for offset, n, t in self.break_up_frame(rc[frame:]):
+ start = full_len - frame - offset # zero based
+ answer.append((start, start - len(n), -1, n, t))
+ answer.sort()
+ return answer
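+    # Note: unlike get_all_peptides() below, this variant materializes every hit
+    # in a list and sorts it, rather than yielding results lazily.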
+
+ def get_all_peptides(self, nuc_seq):
+        """Yields start, end, strand, nucleotides, protein.
+
+ Co-ordinates are Python style zero-based.
+ """
+ # Refactored into generator by CPT
+ full_len = len(nuc_seq)
+ if self.strand != "reverse":
+ for frame in range(0, 3):
+ for offset, n, t in self.break_up_frame(nuc_seq[frame:]):
+ start = frame + offset # zero based
+ yield (start, start + len(n), +1, n, t)
+ if self.strand != "forward":
+ rc = reverse_complement(nuc_seq)
+ for frame in range(0, 3):
+ for offset, n, t in self.break_up_frame(rc[frame:]):
+ start = full_len - frame - offset # zero based
+ yield (start - len(n), start, -1, n, t)
+
+
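+# Hedged usage sketch (file names and argument values are illustrative, not part
+# of this changeset):
+#
+#     finder = OrfFinder(table=11, ftype="ORF", ends="open", min_len=30,
+#                        strand="both")
+#     with open("genome.fa") as fa, open("orfs.fna", "w") as nuc, \
+#          open("orfs.faa", "w") as prot, open("orfs.bed", "w") as bed, \
+#          open("orfs.gff3", "w") as gff3:
+#         finder.locate(fa, nuc, prot, bed, gff3)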
+class MGAFinder(object):
+ def __init__(self, table, ftype, ends, min_len):
+ self.table = table
+ self.table_obj = CodonTable.ambiguous_generic_by_id[table]
+ self.ends = ends
+ self.ftype = ftype
+ self.min_len = min_len
+ self.starts = sorted(self.table_obj.start_codons)
+ self.stops = sorted(self.table_obj.stop_codons)
+ self.re_starts = re.compile("|".join(self.starts))
+ self.re_stops = re.compile("|".join(self.stops))
+
+ def locate(self, fasta_file, out_nuc, out_prot, out_bed, out_gff3):
+ seq_format = "fasta"
+ log.debug("Genetic code table %i" % self.table)
+ log.debug("Minimum length %i aa" % self.min_len)
+
+ out_count = 0
+
+ out_gff3.write("##gff-version 3\n")
+
+ for idx, record in enumerate(SeqIO.parse(fasta_file, seq_format)):
+ for i, (f_start, f_end, f_strand, n, t) in enumerate(
+ self.get_all_peptides(str(record.seq).upper())
+ ):
+ out_count += 1
+
+ descr = "length %i aa, %i bp, from %s..%s[%s] of %s" % (
+ len(t),
+ len(n),
+ f_start,
+ f_end,
+ f_strand,
+ record.description,
+ )
+ fid = record.id + "|%s%i" % (self.ftype, i + 1)
+
+ r = SeqRecord(Seq(n), id=fid, name="", description=descr)
+ t = SeqRecord(Seq(t), id=fid, name="", description=descr)
+
+ SeqIO.write(r, out_nuc, "fasta")
+ SeqIO.write(t, out_prot, "fasta")
+
+ nice_strand = "+" if f_strand == +1 else "-"
+
+ out_bed.write(
+ "\t".join(
+ map(str, [record.id, f_start, f_end, fid, 0, nice_strand])
+ )
+ + "\n"
+ )
+
+ out_gff3.write(
+ "\t".join(
+ map(
+ str,
+ [
+ record.id,
+ "getOrfsOrCds",
+ "CDS",
+ f_start + 1,
+ f_end,
+ ".",
+ nice_strand,
+ 0,
+ "ID=%s.%s.%s" % (self.ftype, idx, i + 1),
+ ],
+ )
+ )
+ + "\n"
+ )
+ log.info("Found %i %ss", out_count, self.ftype)
+
+ def start_chop_and_trans(self, s, strict=True):
+        """Yields offset, trimmed nuc, protein."""
+ if strict:
+ assert s[-3:] in self.stops, s
+ assert len(s) % 3 == 0
+ for match in self.re_starts.finditer(s, overlapped=True):
+ # Must check the start is in frame
+ start = match.start()
+ if start % 3 == 0:
+ n = s[start:]
+ assert len(n) % 3 == 0, "%s is len %i" % (n, len(n))
+ if strict:
+ t = translate(n, self.table)
+ else:
+ # Use when missing stop codon,
+ t = "M" + translate(n[3:], self.table, to_stop=True)
+ yield start, n, t
+
+ def break_up_frame(self, s):
+        """Yields offset, nuc, protein."""
+ start = 0
+ for match in self.re_stops.finditer(s, overlapped=True):
+ index = match.start() + 3
+ if index % 3 != 0:
+ continue
+ n = s[start:index]
+ for (offset, n, t) in self.start_chop_and_trans(n):
+ if n and len(t) >= self.min_len:
+ yield start + offset, n, t
+ start = index
+
+ def putative_genes_in_sequence(self, nuc_seq):
+ """Returns start, end, strand, nucleotides, protein.
+ Co-ordinates are Python style zero-based.
+ """
+ nuc_seq = nuc_seq.upper()
+ # TODO - Refactor to use a generator function (in start order)
+ # rather than making a list and sorting?
+ answer = []
+ full_len = len(nuc_seq)
+
+ for frame in range(0, 3):
+ for offset, n, t in self.break_up_frame(nuc_seq[frame:]):
+ start = frame + offset # zero based
+ answer.append((start, start + len(n), +1, n, t))
+
+ rc = reverse_complement(nuc_seq)
+ for frame in range(0, 3):
+ for offset, n, t in self.break_up_frame(rc[frame:]):
+ start = full_len - frame - offset # zero based
+ answer.append((start, start - len(n), -1, n, t))
+ answer.sort()
+ return answer
+
+ def get_all_peptides(self, nuc_seq):
+        """Yields start, end, strand, nucleotides, protein.
+
+ Co-ordinates are Python style zero-based.
+ """
+ # Refactored into generator by CPT
+
+ full_len = len(nuc_seq)
+ for frame in range(0, 3):
+ for offset, n, t in self.break_up_frame(nuc_seq[frame:]):
+ start = frame + offset # zero based
+ yield (start, start + len(n), +1, n, t)
+ rc = reverse_complement(nuc_seq)
+ for frame in range(0, 3):
+ for offset, n, t in self.break_up_frame(rc[frame:]):
+ start = full_len - frame - offset # zero based
+ yield (start - len(n), start, -1, n, t)
diff -r 1a7fef71aee3 -r fd70980a516b cpt_find_spanins/cpt-macros.xml
--- a/cpt_find_spanins/cpt-macros.xml Fri Jun 17 12:40:59 2022 +0000
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,115 +0,0 @@
-<?xml version="1.0"?>
-<macros>
-    <xml name="gff_requirements">
-        <requirements>
-            <requirement type="package">python</requirement>
-            <requirement type="package">biopython</requirement>
-            <requirement type="package">requests</requirement>
-            <yield/>
-        </requirements>
-    </xml>
-    <xml name="citation/mijalisrasche">
-        <citations>
-            <citation type="doi">10.1371/journal.pcbi.1008214</citation>
-            <citation type="bibtex">@unpublished{galaxyTools,
-                author = {E. Mijalis, H. Rasche},
-                title = {CPT Galaxy Tools},
-                year = {2013-2017},
-                note = {https://github.com/tamu-cpt/galaxy-tools/}
-            }
-            </citation>
-        </citations>
-    </xml>
-    <xml name="citations">
-        <citations>
-            <citation type="doi">10.1371/journal.pcbi.1008214</citation>
-            <citation type="bibtex">
-            @unpublished{galaxyTools,
-                author = {E. Mijalis, H. Rasche},
-                title = {CPT Galaxy Tools},
-                year = {2013-2017},
-                note = {https://github.com/tamu-cpt/galaxy-tools/}
-            }
-            </citation>
-            <yield/>
-        </citations>
-    </xml>
-    <xml name="citations-crr">
-        <citations>
-            <citation type="doi">10.1371/journal.pcbi.1008214</citation>
-            <citation type="bibtex">
-            @unpublished{galaxyTools,
-                author = {C. Ross},
-                title = {CPT Galaxy Tools},
-                year = {2020-},
-                note = {https://github.com/tamu-cpt/galaxy-tools/}
-            }
-            </citation>
-            <yield/>
-        </citations>
-    </xml>
-    <xml name="citations-2020">
-        <citations>
-            <citation type="doi">10.1371/journal.pcbi.1008214</citation>
-            <citation type="bibtex">
-            @unpublished{galaxyTools,
-                author = {E. Mijalis, H. Rasche},
-                title = {CPT Galaxy Tools},
-                year = {2013-2017},
-                note = {https://github.com/tamu-cpt/galaxy-tools/}
-            }
-            </citation>
-            <citation type="bibtex">
-            @unpublished{galaxyTools,
-                author = {A. Criscione},
-                title = {CPT Galaxy Tools},
-                year = {2019-2021},
-                note = {https://github.com/tamu-cpt/galaxy-tools/}
-            }
-            </citation>
-            <yield/>
-        </citations>
-    </xml>
-    <xml name="citations-2020-AJC-solo">
-        <citations>
-            <citation type="doi">10.1371/journal.pcbi.1008214</citation>
-            <citation type="bibtex">
-            @unpublished{galaxyTools,
-                author = {A. Criscione},
-                title = {CPT Galaxy Tools},
-                year = {2019-2021},
-                note = {https://github.com/tamu-cpt/galaxy-tools/}
-            }
-            </citation>
-            <yield/>
-        </citations>
-    </xml>
-    <xml name="citations-clm">
-        <citations>
-            <citation type="doi">10.1371/journal.pcbi.1008214</citation>
-            <citation type="bibtex">
-            @unpublished{galaxyTools,
-                author = {C. Maughmer},
-                title = {CPT Galaxy Tools},
-                year = {2017-2020},
-                note = {https://github.com/tamu-cpt/galaxy-tools/}
-            }
-            </citation>
-            <yield/>
-        </citations>
-    </xml>
-    <xml name="citations-clm-solo">
-        <citations>
-            <citation type="bibtex">
-            @unpublished{galaxyTools,
-                author = {C. Maughmer},
-                title = {CPT Galaxy Tools},
-                year = {2017-2020},
-                note = {https://github.com/tamu-cpt/galaxy-tools/}
-            }
-            </citation>
-            <yield/>
-        </citations>
-    </xml>
-</macros>
diff -r 1a7fef71aee3 -r fd70980a516b cpt_find_spanins/cpt.py
--- a/cpt_find_spanins/cpt.py Fri Jun 17 12:40:59 2022 +0000
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,341 +0,0 @@
-#!/usr/bin/env python
-from Bio.Seq import Seq, reverse_complement, translate
-from Bio.SeqRecord import SeqRecord
-from Bio import SeqIO
-from Bio.Data import CodonTable
-import regex as re  # third-party "regex" package: finditer(..., overlapped=True) below requires it; stdlib re does not support overlapped matching
-import logging
-
-logging.basicConfig()
-log = logging.getLogger()
-
-PHAGE_IN_MIDDLE = re.compile(r"^(?P<host>.*)\s*phage (?P<phage>.*)$")
-BACTERIOPHAGE_IN_MIDDLE = re.compile(r"^(?P<host>.*)\s*bacteriophage (?P<phage>.*)$")
-STARTS_WITH_PHAGE = re.compile(
-    r"^(bacterio|vibrio|Bacterio|Vibrio|)?[Pp]hage (?P<phage>.*)$"
-)
-NEW_STYLE_NAMES = re.compile(r"(?P<phage>v[A-Z]_[A-Z][a-z]{2}_.*)")
-
-
-def phage_name_parser(name):
- host = None
- phage = None
- name = name.replace(", complete genome.", "")
- name = name.replace(", complete genome", "")
-
- m = BACTERIOPHAGE_IN_MIDDLE.match(name)
- if m:
- host = m.group("host")
- phage = m.group("phage")
- return (host, phage)
-
- m = PHAGE_IN_MIDDLE.match(name)
- if m:
- host = m.group("host")
- phage = m.group("phage")
- return (host, phage)
-
- m = STARTS_WITH_PHAGE.match(name)
- if m:
- phage = m.group("phage")
- return (host, phage)
-
- m = NEW_STYLE_NAMES.match(name)
- if m:
- phage = m.group("phage")
- return (host, phage)
-
- return (host, phage)
-
-
-class OrfFinder(object):
- def __init__(self, table, ftype, ends, min_len, strand):
- self.table = table
- self.table_obj = CodonTable.ambiguous_generic_by_id[table]
- self.ends = ends
- self.ftype = ftype
- self.min_len = min_len
- self.starts = sorted(self.table_obj.start_codons)
- self.stops = sorted(self.table_obj.stop_codons)
- self.re_starts = re.compile("|".join(self.starts))
- self.re_stops = re.compile("|".join(self.stops))
- self.strand = strand
-
- def locate(self, fasta_file, out_nuc, out_prot, out_bed, out_gff3):
- seq_format = "fasta"
- log.debug("Genetic code table %i" % self.table)
- log.debug("Minimum length %i aa" % self.min_len)
-
- out_count = 0
-
- out_gff3.write("##gff-version 3\n")
-
- for idx, record in enumerate(SeqIO.parse(fasta_file, seq_format)):
- for i, (f_start, f_end, f_strand, n, t) in enumerate(
- self.get_all_peptides(str(record.seq).upper())
- ):
- out_count += 1
-
- descr = "length %i aa, %i bp, from %s..%s[%s] of %s" % (
- len(t),
- len(n),
- f_start,
- f_end,
- f_strand,
- record.description,
- )
- fid = record.id + "|%s%i" % (self.ftype, i + 1)
-
- r = SeqRecord(Seq(n), id=fid, name="", description=descr)
- t = SeqRecord(Seq(t), id=fid, name="", description=descr)
-
- SeqIO.write(r, out_nuc, "fasta")
- SeqIO.write(t, out_prot, "fasta")
-
- nice_strand = "+" if f_strand == +1 else "-"
-
- out_bed.write(
- "\t".join(
- map(str, [record.id, f_start, f_end, fid, 0, nice_strand])
- )
- + "\n"
- )
-
- out_gff3.write(
- "\t".join(
- map(
- str,
- [
- record.id,
- "getOrfsOrCds",
- "CDS",
- f_start + 1,
- f_end,
- ".",
- nice_strand,
- 0,
- "ID=%s.%s.%s" % (self.ftype, idx, i + 1),
- ],
- )
- )
- + "\n"
- )
- log.info("Found %i %ss", out_count, self.ftype)
-
- def start_chop_and_trans(self, s, strict=True):
- """Returns offset, trimmed nuc, protein."""
- if strict:
- assert s[-3:] in self.stops, s
- assert len(s) % 3 == 0
- for match in self.re_starts.finditer(s, overlapped=True):
- # Must check the start is in frame
- start = match.start()
- if start % 3 == 0:
- n = s[start:]
- assert len(n) % 3 == 0, "%s is len %i" % (n, len(n))
- if strict:
- t = translate(n, self.table)
- else:
- # Use when missing stop codon,
- t = "M" + translate(n[3:], self.table, to_stop=True)
- yield start, n, t # Edited by CPT to be a generator
-
- def break_up_frame(self, s):
- """Returns offset, nuc, protein."""
- start = 0
- for match in self.re_stops.finditer(s, overlapped=True):
- index = match.start() + 3
- if index % 3 != 0:
- continue
- n = s[start:index]
- for (offset, n, t) in self.start_chop_and_trans(n):
- if n and len(t) >= self.min_len:
- yield start + offset, n, t
- start = index
-
- def putative_genes_in_sequence(self, nuc_seq):
- """Returns start, end, strand, nucleotides, protein.
- Co-ordinates are Python style zero-based.
- """
- nuc_seq = nuc_seq.upper()
- # TODO - Refactor to use a generator function (in start order)
- # rather than making a list and sorting?
- answer = []
- full_len = len(nuc_seq)
-
- for frame in range(0, 3):
- for offset, n, t in self.break_up_frame(nuc_seq[frame:]):
- start = frame + offset # zero based
- answer.append((start, start + len(n), +1, n, t))
-
- rc = reverse_complement(nuc_seq)
- for frame in range(0, 3):
- for offset, n, t in self.break_up_frame(rc[frame:]):
- start = full_len - frame - offset # zero based
- answer.append((start, start - len(n), -1, n, t))
- answer.sort()
- return answer
-
- def get_all_peptides(self, nuc_seq):
- """Returns start, end, strand, nucleotides, protein.
-
- Co-ordinates are Python style zero-based.
- """
- # Refactored into generator by CPT
- full_len = len(nuc_seq)
- if self.strand != "reverse":
- for frame in range(0, 3):
- for offset, n, t in self.break_up_frame(nuc_seq[frame:]):
- start = frame + offset # zero based
- yield (start, start + len(n), +1, n, t)
- if self.strand != "forward":
- rc = reverse_complement(nuc_seq)
- for frame in range(0, 3):
- for offset, n, t in self.break_up_frame(rc[frame:]):
- start = full_len - frame - offset # zero based
- yield (start - len(n), start, -1, n, t)
-
-
-class MGAFinder(object):
- def __init__(self, table, ftype, ends, min_len):
- self.table = table
- self.table_obj = CodonTable.ambiguous_generic_by_id[table]
- self.ends = ends
- self.ftype = ftype
- self.min_len = min_len
- self.starts = sorted(self.table_obj.start_codons)
- self.stops = sorted(self.table_obj.stop_codons)
- self.re_starts = re.compile("|".join(self.starts))
- self.re_stops = re.compile("|".join(self.stops))
-
- def locate(self, fasta_file, out_nuc, out_prot, out_bed, out_gff3):
- seq_format = "fasta"
- log.debug("Genetic code table %i" % self.table)
- log.debug("Minimum length %i aa" % self.min_len)
-
- out_count = 0
-
- out_gff3.write("##gff-version 3\n")
-
- for idx, record in enumerate(SeqIO.parse(fasta_file, seq_format)):
- for i, (f_start, f_end, f_strand, n, t) in enumerate(
- self.get_all_peptides(str(record.seq).upper())
- ):
- out_count += 1
-
- descr = "length %i aa, %i bp, from %s..%s[%s] of %s" % (
- len(t),
- len(n),
- f_start,
- f_end,
- f_strand,
- record.description,
- )
- fid = record.id + "|%s%i" % (self.ftype, i + 1)
-
- r = SeqRecord(Seq(n), id=fid, name="", description=descr)
- t = SeqRecord(Seq(t), id=fid, name="", description=descr)
-
- SeqIO.write(r, out_nuc, "fasta")
- SeqIO.write(t, out_prot, "fasta")
-
- nice_strand = "+" if f_strand == +1 else "-"
-
- out_bed.write(
- "\t".join(
- map(str, [record.id, f_start, f_end, fid, 0, nice_strand])
- )
- + "\n"
- )
-
- out_gff3.write(
- "\t".join(
- map(
- str,
- [
- record.id,
- "getOrfsOrCds",
- "CDS",
- f_start + 1,
- f_end,
- ".",
- nice_strand,
- 0,
- "ID=%s.%s.%s" % (self.ftype, idx, i + 1),
- ],
- )
- )
- + "\n"
- )
- log.info("Found %i %ss", out_count, self.ftype)
-
- def start_chop_and_trans(self, s, strict=True):
- """Returns offset, trimmed nuc, protein."""
- if strict:
- assert s[-3:] in self.stops, s
- assert len(s) % 3 == 0
- for match in self.re_starts.finditer(s, overlapped=True):
- # Must check the start is in frame
- start = match.start()
- if start % 3 == 0:
- n = s[start:]
- assert len(n) % 3 == 0, "%s is len %i" % (n, len(n))
- if strict:
- t = translate(n, self.table)
- else:
- # Use when missing stop codon,
- t = "M" + translate(n[3:], self.table, to_stop=True)
- yield start, n, t
-
- def break_up_frame(self, s):
- """Returns offset, nuc, protein."""
- start = 0
- for match in self.re_stops.finditer(s, overlapped=True):
- index = match.start() + 3
- if index % 3 != 0:
- continue
- n = s[start:index]
- for (offset, n, t) in self.start_chop_and_trans(n):
- if n and len(t) >= self.min_len:
- yield start + offset, n, t
- start = index
-
- def putative_genes_in_sequence(self, nuc_seq):
- """Returns start, end, strand, nucleotides, protein.
- Co-ordinates are Python style zero-based.
- """
- nuc_seq = nuc_seq.upper()
- # TODO - Refactor to use a generator function (in start order)
- # rather than making a list and sorting?
- answer = []
- full_len = len(nuc_seq)
-
- for frame in range(0, 3):
- for offset, n, t in self.break_up_frame(nuc_seq[frame:]):
- start = frame + offset # zero based
- answer.append((start, start + len(n), +1, n, t))
-
- rc = reverse_complement(nuc_seq)
- for frame in range(0, 3):
- for offset, n, t in self.break_up_frame(rc[frame:]):
- start = full_len - frame - offset # zero based
- answer.append((start, start - len(n), -1, n, t))
- answer.sort()
- return answer
-
- def get_all_peptides(self, nuc_seq):
- """Returns start, end, strand, nucleotides, protein.
-
- Co-ordinates are Python style zero-based.
- """
- # Refactored into generator by CPT
-
- full_len = len(nuc_seq)
- for frame in range(0, 3):
- for offset, n, t in self.break_up_frame(nuc_seq[frame:]):
- start = frame + offset # zero based
- yield (start, start + len(n), +1, n, t)
- rc = reverse_complement(nuc_seq)
- for frame in range(0, 3):
- for offset, n, t in self.break_up_frame(rc[frame:]):
- start = full_len - frame - offset # zero based
- yield (start - len(n), start, -1, n, t)
diff -r 1a7fef71aee3 -r fd70980a516b cpt_find_spanins/findSpanin.py
--- a/cpt_find_spanins/findSpanin.py Fri Jun 17 12:40:59 2022 +0000
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,477 +0,0 @@
-##### findSpanin.pl --> findSpanin.py
-######### Much of this code is very "blocked", in the sense that one thing happens...then a function happens on the return...then another function...etc...etc...
-
-import argparse
-import os
-import re # new
-import itertools # new
-from collections import Counter, OrderedDict
-from spaninFuncs import getDescriptions, grabLocs, spaninProximity, splitStrands, tuple_fasta, lineWrapper
-
-### Requirement Inputs
-#### INPUT : putative_isp.fa & putative_osp.fa (in that order)
-#### PARAMETERS :
-
-###############################################################################
-def write_output(candidates):
- """ output file function...maybe not needed """
- pass
-
-def reconfigure_dict(spanins):
- """
-    Reorganizes the dictionary to be friendlier for downstream checks.
- """
-
- new_spanin_dict = {}
-
- for each_spanin_type, data_dict in spanins.items():
- #print(f"{each_spanin_type} == {data_dict}")
- new_spanin_dict[each_spanin_type] = {}
- new_spanin_dict[each_spanin_type]['positive'] = {}
- new_spanin_dict[each_spanin_type]['negative'] = {}
- new_spanin_dict[each_spanin_type]['positive']['coords'] = []
- new_spanin_dict[each_spanin_type]['negative']['coords'] = []
- for outter_orf, inner_data in data_dict.items():
- list_of_hits = []
- for data_content in inner_data:
- #print(data_content)
- data_content.insert(0, outter_orf)
- #print(f"new data_content -> {data_content}")
- #print(data_content)
- #list_of_hits += [data_content]
- #new_spanin_dict[each_spanin_type] += [data_content]
- if data_content[6] == "+":
- #print(f"{each_spanin_type} @ POSITIVE")
- new_spanin_dict[each_spanin_type]['positive']['coords'] += [data_content]
- elif data_content[6] == "-":
- #print(f"{each_spanin_type} @ NEGATIVE")
- new_spanin_dict[each_spanin_type]['negative']['coords'] += [data_content]
- #print(new_spanin_dict[each_spanin_type])
- #print(reorganized)
- #print(f"{outter_orf} => {inner_data}")
- #print(new_spanin_dict)
-
- #print('\n')
- #for k, v in new_spanin_dict.items():
- #print(k)
- #print(v)
- return new_spanin_dict
-
-
-
-def check_for_uniques(spanins):
- """
- Checks for unique spanins based on spanin_type.
-    If the positive strand end site is _the same_ for an i-spanin, we would group that as "1".
- i.e. if ORF1, ORF2, and ORF3 all ended with location 4231, they would not be unique.
- """
- pair_dict = {}
- pair_dict = {
- 'pairs' : {
- 'location_amount' : [],
- 'pair_number' : {},
- }
- }
- for each_spanin_type, spanin_data in spanins.items():
- #print(f"{each_spanin_type} ===> {spanin_data}")
- # early declarations for cases of no results
- pos_check = [] # end checks
- pos_uniques = []
- neg_check = [] # start checks
- neg_uniques = []
- unique_ends = []
- pos_amt_unique = 0
- neg_amt_unique = 0
- amt_positive = 0
- amt_negative = 0
- spanin_data['uniques'] = 0
- spanin_data['amount'] = 0
- #spanin_data['positive']['amt_positive'] = 0
- #spanin_data['positive']['pos_amt_unique'] = 0
- #spanin_data['positive']['isp_match'] = []
- #spanin_data['negative']['amt_negative'] = 0
- #spanin_data['negative']['neg_amt_unique'] = 0
- #spanin_data['negative']['isp_match'] = []
- #print(spanin_data)
- if spanin_data['positive']['coords']:
- # do something...
- #print('in other function')
- #print(spanin_data['positive']['coords'])
- for each_hit in spanin_data['positive']['coords']:
- pos_check.append(each_hit[2])
- pair_dict['pairs']['location_amount'].append(each_hit[2])
- pos_uniques = list(set([end_site for end_site in pos_check if pos_check.count(end_site) >= 1]))
- #print(pos_check)
- #print(pos_uniques)
- amt_positive = len(spanin_data['positive']['coords'])
- pos_amt_unique = len(pos_uniques)
- if amt_positive:
- spanin_data['positive']['amt_positive'] = amt_positive
- spanin_data['positive']['pos_amt_unique'] = pos_amt_unique
- #pair_dict['pairs']['locations'].extend(pos_uniques)
- else:
- spanin_data['positive']['amt_positive'] = 0
- spanin_data['positive']['pos_amt_unique'] = 0
- if spanin_data['negative']['coords']:
-
- # do something else...
- #print('in other function')
- #print(spanin_data['negative']['coords'])
- for each_hit in spanin_data['negative']['coords']:
- neg_check.append(each_hit[1])
- pair_dict['pairs']['location_amount'].append(each_hit[1])
- neg_uniques = list(set([start_site for start_site in neg_check if neg_check.count(start_site) >= 1]))
- #print(neg_uniques)
- amt_negative = len(spanin_data['negative']['coords'])
- neg_amt_unique = len(neg_uniques)
- if amt_negative:
- spanin_data['negative']['amt_negative'] = amt_negative
- spanin_data['negative']['neg_amt_unique'] = neg_amt_unique
- #pair_dict['pairs']['locations'].extend(neg_uniques)
- else:
- spanin_data['negative']['amt_negative'] = 0
- spanin_data['negative']['neg_amt_unique'] = 0
- spanin_data['uniques'] += (spanin_data['positive']['pos_amt_unique'] + spanin_data['negative']['neg_amt_unique'])
- spanin_data['amount'] += (spanin_data['positive']['amt_positive'] + spanin_data['negative']['amt_negative'])
- #print(spanin_data['uniques'])
- list(set(pair_dict['pairs']['location_amount']))
- pair_dict['pairs']['location_amount'] = dict(Counter(pair_dict['pairs']['location_amount']))
- for data in pair_dict.values():
- #print(data['locations'])
- #print(type(data['locations']))
- v = 0
- for loc, count in data['location_amount'].items():
- #data['pair_number'] = {loc
- v += 1
- data['pair_number'][loc] = v
- #print(dict(Counter(pair_dict['pairs']['locations'])))
- #print(pair_dict)
- spanins['total_amount'] = spanins['EMBEDDED']['amount'] + spanins['SEPARATED']['amount'] + spanins['OVERLAPPED']['amount']
- spanins['total_unique'] = spanins['EMBEDDED']['uniques'] + spanins['SEPARATED']['uniques'] + spanins['OVERLAPPED']['uniques']
- #spanins['total_unique'] = len(pair_dict['pairs']['pair_number'])
- return spanins, pair_dict
-
-if __name__ == "__main__":
-
- # Common parameters for both ISP / OSP portion of script
-
- parser = argparse.ArgumentParser(
- description="Trim the putative protein candidates and find potential i-spanin / o-spanin pairs"
- )
-
- parser.add_argument(
- "putative_isp_fasta_file",
- type=argparse.FileType("r"),
- help='Putative i-spanin FASTA file, output of "generate-putative-isp"',
- ) # the "input" argument
-
- parser.add_argument(
- "putative_osp_fasta_file",
- type=argparse.FileType("r"),
- help='Putative o-spanin FASTA file, output of "generate-putative-osp"',
- )
-
- parser.add_argument(
- "--max_isp_osp_distance",
- dest="max_isp_osp_distance",
- default=10,
- type=int,
- help="max distance from end of i-spanin to start of o-spanin, measured in AAs",
- )
-
- parser.add_argument(
- "--embedded_txt",
- dest="embedded_txt",
- type=argparse.FileType("w"),
- default="_findSpanin_embedded_results.txt",
- help="Results of potential embedded spanins",
- )
- parser.add_argument(
- "--overlap_txt",
- dest="overlap_txt",
- type=argparse.FileType("w"),
- default="_findSpanin_overlap_results.txt",
- help="Results of potential overlapping spanins",
- )
- parser.add_argument(
- "--separate_txt",
- dest="separate_txt",
- type=argparse.FileType("w"),
- default="_findSpanin_separated_results.txt",
- help="Results of potential separated spanins",
- )
-
- parser.add_argument(
- "--summary_txt",
- dest="summary_txt",
- type=argparse.FileType("w"),
- default="_findSpanin_summary.txt",
- help="Results of potential spanin pairs",
- )
- parser.add_argument(
- "-v", action="version", version="0.3.0"
- ) # Is this manually updated?
- args = parser.parse_args()
-
-
- #### RE-WRITE
- SPANIN_TYPES = {}
- SPANIN_TYPES['EMBEDDED'] = {}
- SPANIN_TYPES['OVERLAPPED'] = {}
- SPANIN_TYPES['SEPARATED'] = {}
- #SPANIN_TYPES = {
- # 'EMBEDDED' : {},
- # 'OVERLAPPED' : {},
- # 'SEPARATED' : {},
- #}
-
- isp = getDescriptions(args.putative_isp_fasta_file)
- args.putative_isp_fasta_file = open(args.putative_isp_fasta_file.name, "r")
- isp_full = tuple_fasta(args.putative_isp_fasta_file)
-
- osp = getDescriptions(args.putative_osp_fasta_file)
- args.putative_osp_fasta_file = open(args.putative_osp_fasta_file.name, "r")
- osp_full = tuple_fasta(args.putative_osp_fasta_file)
-
- #### location data
- location_data = {
- 'isp' : [],
- 'osp' : []
- }
- spanins = [isp, osp]
- for idx, each_spanin_type in enumerate(spanins):
- for description in each_spanin_type:
- locations = grabLocs(description)
- if idx == 0: # i-spanin
- location_data['isp'].append(locations)
- elif idx == 1: # o-spanin
- location_data['osp'].append(locations)
-
- #### Check for types of spanins
- embedded, overlap, separate = spaninProximity(
- isp=location_data['isp'],
- osp=location_data['osp'],
- max_dist=args.max_isp_osp_distance * 3
- )
-
- SPANIN_TYPES['EMBEDDED'] = embedded
- SPANIN_TYPES['OVERLAPPED'] = overlap
- SPANIN_TYPES['SEPARATED'] = separate
-
- #for spanin_type, spanin in SPANIN_TYPES.items():
- # s = 0
- # for sequence in spanin.values():
- # s += len(sequence)
- # SPANIN_TYPES[spanin_type]['amount'] = s
- # SPANIN_TYPES[spanin_type]['unique'] = len(spanin.keys())
-
- #check_for_unique_spanins(SPANIN_TYPES)
- spanins = reconfigure_dict(SPANIN_TYPES)
- spanins, pair_dict = check_for_uniques(spanins)
- #print(pair_dict)
- with args.summary_txt as f:
- for each_spanin_type, spanin_data in spanins.items():
- try:
- if each_spanin_type not in ["total_amount","total_unique"]:
- #print(each_spanin_type)
- #print(each_spanin_type)
- f.write("=~~~~~= "+str(each_spanin_type) +" Spanin Candidate Statistics =~~~~~=\n")
- f.writelines("Total Candidate Pairs = "+str(spanin_data['amount'])+"\n")
- f.writelines("Total Unique Pairs = "+str(spanin_data['uniques'])+"\n")
-                    if each_spanin_type == "EMBEDDED":
-                        for k, v in SPANIN_TYPES['EMBEDDED'].items():
-                            #print(k)
-                            f.writelines(""+str(k)+" ==> Number of corresponding candidate o-spanin(s): "+str(len(v))+"\n")
-                    if each_spanin_type == "SEPARATED":
-                        for k, v in SPANIN_TYPES['SEPARATED'].items():
-                            f.writelines(""+str(k)+" ==> Number of corresponding candidate o-spanin(s): "+str(len(v))+"\n")
-                    if each_spanin_type == "OVERLAPPED":
-                        for k, v in SPANIN_TYPES['OVERLAPPED'].items():
-                            f.writelines(""+str(k)+" ==> Number of corresponding candidate o-spanin(s): "+str(len(v))+"\n")
- except TypeError:
- continue
- f.write("\n=~~~~~= Tally from ALL spanin types =~~~~~=\n")
- f.writelines("Total Candidates = "+str(spanins['total_amount'])+"\n")
- f.writelines("Total Unique Candidate Pairs = "+str(spanins['total_unique'])+"\n")
-
- args.putative_isp_fasta_file = open(args.putative_isp_fasta_file.name, "r")
- isp_full = tuple_fasta(args.putative_isp_fasta_file)
-
- args.putative_osp_fasta_file = open(args.putative_osp_fasta_file.name, "r")
- osp_full = tuple_fasta(args.putative_osp_fasta_file)
-
- #print(isp_full)
- isp_seqs = []
- osp_seqs = []
- for isp_tupe in isp_full:
- #print(isp_tupe)
- for pisp, posp in embedded.items():
- #print(f"ISP = searching for {pisp} in {isp_tupe[0]}")
- if re.search(("("+str(pisp)+")\D"), isp_tupe[0]):
- #print(isp_tupe[0])
- #print(peri_count)
- peri_count = str.split(isp_tupe[0],"~=")[1]
- isp_seqs.append((pisp,isp_tupe[1],peri_count))
- #print(isp_seqs)
- for osp_tupe in osp_full:
- for pisp, posp in embedded.items():
- for data in posp:
- #print(f"OSP = searching for {data[3]} in {osp_tupe[0]}, coming from this object: {data}")
- if re.search(("("+str(data[3])+")\D"), osp_tupe[0]):
- peri_count = str.split(osp_tupe[0],"~=")[1]
- osp_seqs.append((data[3],osp_tupe[1],peri_count))
-
- with args.embedded_txt as f:
- f.write("================ embedded spanin candidates =================\n")
- f.write("isp\tisp_start\tisp_end\tosp\tosp_start\tosp_end\tstrand\tpair_number\n")
- if embedded != {}:
- #print(embedded)
- for pisp, posp in embedded.items():
- #print(f"{pisp} - {posp}")
- f.write(pisp + "\n")
- for each_posp in posp:
- #print(posp)
- f.write(
- "\t{}\t{}\t{}\t{}\t{}\t{}\t".format(
- each_posp[1],
- each_posp[2],
- each_posp[3],
- each_posp[4],
- each_posp[5],
- each_posp[6],
- )
- )
- if each_posp[6] == "+":
- if each_posp[2] in pair_dict['pairs']['pair_number'].keys():
- f.write(""+str(pair_dict['pairs']['pair_number'][each_posp[2]])+"\n")
- elif each_posp[6] == "-":
- if each_posp[1] in pair_dict['pairs']['pair_number'].keys():
- f.write(""+str(pair_dict['pairs']['pair_number'][each_posp[1]])+"\n")
- else:
- f.write("nothing found")
-
- with open(args.embedded_txt.name, "a") as f:
- f.write("\n================= embedded candidate sequences ================\n")
- f.write("======================= isp ==========================\n\n")
- for isp_data in isp_seqs:
- #print(isp_data)
- f.write(">isp_orf::{}-peri_count~={}\n{}\n".format(isp_data[0],isp_data[2],lineWrapper(isp_data[1])))
- f.write("\n======================= osp ========================\n\n")
- for osp_data in osp_seqs:
- f.write(">osp_orf::{}-peri_count~={}\n{}\n".format(osp_data[0],osp_data[2],lineWrapper(osp_data[1])))
-
- args.putative_isp_fasta_file = open(args.putative_isp_fasta_file.name, "r")
- isp_full = tuple_fasta(args.putative_isp_fasta_file)
-
- args.putative_osp_fasta_file = open(args.putative_osp_fasta_file.name, "r")
- osp_full = tuple_fasta(args.putative_osp_fasta_file)
-
- isp_seqs = []
- osp_seqs = []
- for isp_tupe in isp_full:
- peri_count = str.split(isp_tupe[0],"~=")[1]
- for pisp, posp in overlap.items():
- if re.search(("("+str(pisp)+")\D"), isp_tupe[0]):
- peri_count = str.split(isp_tupe[0],"~=")[1]
- isp_seqs.append((pisp,isp_tupe[1],peri_count))
-
- for osp_tupe in osp_full:
- for pisp, posp in overlap.items():
- for data in posp:
- if re.search(("("+str(data[3])+")\D"), osp_tupe[0]):
- peri_count = str.split(osp_tupe[0],"~=")[1]
- osp_seqs.append((data[3],osp_tupe[1],peri_count))
-
-
-
- with args.overlap_txt as f:
- f.write("================ overlap spanin candidates =================\n")
- f.write("isp\tisp_start\tisp_end\tosp\tosp_start\tosp_end\tstrand\tpair_number\n")
- if overlap != {}:
- for pisp, posp in overlap.items():
- f.write(pisp + "\n")
- for each_posp in posp:
- f.write(
- "\t{}\t{}\t{}\t{}\t{}\t{}\t".format(
- each_posp[1],
- each_posp[2],
- each_posp[3],
- each_posp[4],
- each_posp[5],
- each_posp[6],
- )
- )
- if each_posp[6] == "+":
- if each_posp[2] in pair_dict['pairs']['pair_number'].keys():
- #print('ovl ; +')
- f.write(""+str(pair_dict['pairs']['pair_number'][each_posp[2]])+"\n")
- elif each_posp[6] == "-":
- if each_posp[1] in pair_dict['pairs']['pair_number'].keys():
- f.write(""+str(pair_dict['pairs']['pair_number'][each_posp[1]])+"\n")
- else:
- f.write("nothing found")
-
- with open(args.overlap_txt.name, "a") as f:
- #print(isp_seqs)
- f.write("\n================= overlap candidate sequences ================\n")
- f.write("======================= isp ==========================\n\n")
- for isp_data in isp_seqs:
- f.write(">isp_orf::{}-pericount~={}\n{}\n".format(isp_data[0],isp_data[2],lineWrapper(isp_data[1])))
- f.write("\n======================= osp ========================\n\n")
- for osp_data in osp_seqs:
- f.write(">osp_orf::{}-pericount~={}\n{}\n".format(osp_data[0],osp_data[2],lineWrapper(osp_data[1])))
-
- args.putative_isp_fasta_file = open(args.putative_isp_fasta_file.name, "r")
- isp_full = tuple_fasta(args.putative_isp_fasta_file)
- args.putative_osp_fasta_file = open(args.putative_osp_fasta_file.name, "r")
- osp_full = tuple_fasta(args.putative_osp_fasta_file)
-
- isp_seqs = []
- osp_seqs = []
- for isp_tupe in isp_full:
- for pisp, posp in separate.items():
- if re.search(("("+str(pisp)+")\D"), isp_tupe[0]):
- peri_count = str.split(isp_tupe[0],"~=")[1]
- isp_seqs.append((pisp,isp_tupe[1],peri_count))
- #print(isp_seqs)
- for osp_tupe in osp_full:
- for pisp, posp in separate.items():
- for data in posp:
- if re.search(("("+str(data[3])+")\D"), osp_tupe[0]):
- peri_count = str.split(osp_tupe[0],"~=")[1]
- osp_seqs.append((data[3],osp_tupe[1],peri_count))
-
- with args.separate_txt as f:
- f.write("================ separated spanin candidates =================\n")
- f.write("isp\tisp_start\tisp_end\tosp\tosp_start\tosp_end\tstrand\tpair_number\n")
- if separate != {}:
- for pisp, posp in separate.items():
- f.write(pisp + "\n")
- for each_posp in posp:
- f.write(
- "\t{}\t{}\t{}\t{}\t{}\t{}\t".format(
- each_posp[1],
- each_posp[2],
- each_posp[3],
- each_posp[4],
- each_posp[5],
- each_posp[6],
- )
- )
- if each_posp[6] == "+":
- if each_posp[2] in pair_dict['pairs']['pair_number'].keys():
- f.write(""+str(pair_dict['pairs']['pair_number'][each_posp[2]])+"\n")
- elif each_posp[6] == "-":
- if each_posp[1] in pair_dict['pairs']['pair_number'].keys():
- f.write(""+str(pair_dict['pairs']['pair_number'][each_posp[1]])+"\n")
- else:
- f.write("nothing found")
-
- with open(args.separate_txt.name, "a") as f:
- f.write("\n================= separated candidate sequences ================\n")
- f.write("======================= isp ==========================\n\n")
- for isp_data in isp_seqs:
- f.write(">isp_orf::{}-pericount~={}\n{}\n".format(isp_data[0],isp_data[2],lineWrapper(isp_data[1])))
- f.write("\n======================= osp ========================\n\n")
- for osp_data in osp_seqs:
- f.write(">osp_orf::{}-pericount~={}\n{}\n".format(osp_data[0],osp_data[2],lineWrapper(osp_data[1])))
diff -r 1a7fef71aee3 -r fd70980a516b cpt_find_spanins/findSpanin.xml
--- a/cpt_find_spanins/findSpanin.xml Fri Jun 17 12:40:59 2022 +0000
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,44 +0,0 @@
-<?xml version="1.0"?>
-<tool>
-    <description>With the outputs from the ISP and OSP candidate tools, cull the list down to candidate pairs</description>
-    <macros>
-        <import>macros.xml</import>
-        <import>cpt-macros.xml</import>
-    </macros>
-
-
-
-
-
-
-
-
-
-
-
-
-    <help><![CDATA[
-Putative i-spanin and o-spanin protein multiFASTAs (generated from the ISP/OSP Candidate Tools).
-
-**METHODOLOGY**
-Does a pairwise comparison between candidate i-spanins and o-spanins based on their genomic location, and classifies them into the known bimolecular spanin genetic architectures. Classes are: embedded, overlapping, separated, or NOT a potential pair.
-
-**OUTPUT** --> File with candidate pairs for each bimolecular spanin class and a basic summary statistics file.
-
-]]></help>
-</tool>
diff -r 1a7fef71aee3 -r fd70980a516b cpt_find_spanins/macros.xml
--- a/cpt_find_spanins/macros.xml Fri Jun 17 12:40:59 2022 +0000
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,63 +0,0 @@
-<?xml version="1.0"?>
-<macros>
-    <xml name="requirements">
-        <requirements>
-            <requirement type="package">regex</requirement>
-            <requirement type="package">python</requirement>
-            <requirement type="package">biopython</requirement>
-            <requirement type="package">cpt_gffparser</requirement>
-            <yield/>
-        </requirements>
-    </xml>
-    <token name="@BLAST_TSV@">
-        "$blast_tsv"
-    </token>
-    <xml name="blast_tsv">
-        <param label="Blast Results" name="blast_tsv" type="data" format="tabular"/>
-    </xml>
-    <token name="@BLAST_XML@">
-        "$blast_xml"
-    </token>
-    <xml name="blast_xml">
-        <param label="Blast Results" name="blast_xml" type="data" format="blastxml"/>
-    </xml>
-
-
-
-
-
-
-
-
-
-
-
-
-    <token name="@INPUT_GFF@">
-        "$gff3_data"
-    </token>
-    <token name="@INPUT_FASTA@">
-        genomeref.fa
-    </token>
-    <token name="@GENOME_SELECTOR_PRE@">
-        ln -s $genome_fasta genomeref.fa;
-    </token>
-    <token name="@GENOME_SELECTOR@">
-        genomeref.fa
-    </token>
-    <token name="@SEQUENCE@">
-        "$sequences"
-    </token>
-    <xml name="input_sequences">
-        <param label="Sequences" name="sequences" type="data" format="fasta"/>
-    </xml>
-</macros>
diff -r 1a7fef71aee3 -r fd70980a516b cpt_find_spanins/spaninFuncs.py
--- a/cpt_find_spanins/spaninFuncs.py Fri Jun 17 12:40:59 2022 +0000
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,469 +0,0 @@
-"""
-PREMISE
-### Functions/Classes that are used in both generate-putative-osp.py and generate-putative-isp.py
-###### Main premise here is to make the above scripts a little more DRY, as well as easily readable for execution.
-###### Documentation will ATTEMPT to be thorough here
-"""
-
-import re
-from Bio import SeqIO
-from Bio import Seq
-from collections import OrderedDict
-
-# Not written in OOP, to keep complexity down in case adjustments are needed by someone else.
-# Much of the manipulation is string based, so it should be straightforward and reasonably quick.
-################## Global Variables
-Lys = "K"
-
-
-def check_back_end_snorkels(seq, tmsize):
- """
- Searches through the backend of a potential TMD snorkel. This is the 2nd part of a TMD snorkel lysine match.
- --> seq : should be the sequence fed from the "search_region" portion of the sequence
- --> tmsize : size of the potential TMD being investigated
- """
- found = []
- if seq[tmsize - 4] == Lys and re.search(("[FIWLVMYCATGS]"), seq[tmsize - 5]):
- found = "match"
- return found
- elif seq[tmsize - 3] == Lys and re.search(("[FIWLVMYCATGS]"), seq[tmsize - 4]):
- found = "match"
- return found
- elif seq[tmsize - 2] == Lys and re.search(("[FIWLVMYCATGS]"), seq[tmsize - 3]):
- found = "match"
- return found
- elif seq[tmsize - 1] == Lys and re.search(("[FIWLVMYCATGS]"), seq[tmsize - 2]):
- found = "match"
- return found
- else:
- found = "NOTmatch"
- return found
-
-
-def prep_a_gff3(fa, spanin_type, org):
- """
- Function parses an input detailed 'fa' file and outputs a 'gff3' file
- ---> fa = input .fa file
- ---> output = output a returned list of data, easily portable to a gff3 next
- ---> spanin_type = 'isp' or 'osp'
- """
- with org as f:
- header = f.readline()
- orgacc = header.split(" ")
- orgacc = orgacc[0].split(">")[1].strip()
- fa_zip = tuple_fasta(fa)
- data = []
- for a_pair in fa_zip:
- # print(a_pair)
- if re.search(("(\[1\])"), a_pair[0]):
- strand = "+"
- elif re.search(("(\[-1\])"), a_pair[0]):
- strand = "-" # column 7
- start = re.search(("[\d]+\.\."), a_pair[0]).group(0).split("..")[0] # column 4
- end = re.search(("\.\.[\d]+"), a_pair[0]).group(0).split("..")[1] # column 5
- orfid = re.search(("(ORF)[\d]+"), a_pair[0]).group(0) # column 1
- if spanin_type == "isp":
- methodtype = "CDS" # column 3
- spanin = "isp"
- elif spanin_type == "osp":
- methodtype = "CDS" # column 3
- spanin = "osp"
- elif spanin_type == "usp":
- methodtype = "CDS"
- spanin = "usp"
- else:
-            raise ValueError("need to input spanin type")
- source = "cpt.py|putative-*.py" # column 2
- score = "." # column 6
- phase = "." # column 8
- attributes = "ID=" +orgacc+ "|"+ orfid + ";ALIAS=" + spanin + ";SEQ="+a_pair[1] # column 9
- sequence = [[orgacc, source, methodtype, start, end, score, strand, phase, attributes]]
- data += sequence
- return data
-
-
-def write_gff3(data, output="results.gff3"):
- """
- Parses results from prep_a_gff3 into a gff3 file
- ---> input : list from prep_a_gff3
- ---> output : gff3 file
- """
-    with output as f:
-        f.write("##gff-version 3\n")
- for value in data:
- f.write(
- "{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\n".format(
- value[0],
- value[1],
- value[2],
- value[3],
- value[4],
- value[5],
- value[6],
- value[7],
- value[8],
- )
- )
-
-
-def find_tmd(pair, minimum=10, maximum=30, TMDmin=10, TMDmax=20, isp_mode=False, peri_min=18, peri_max=206):
- """
- Function that searches for lysine snorkels and then for a spanning hydrophobic region that indicates a potential TMD
- ---> pair : Input of tuple with description and AA sequence (str)
-    ---> minimum : how close to the initial start codon a TMD may begin
-    ---> maximum : how far from the initial start codon a TMD may extend
-    ---> TMDmin : The minimum size that a transmembrane domain can be (default = 10)
-    ---> TMDmax : The maximum size that a transmembrane domain can be (default = 20)
- """
- # hydrophobicAAs = ['P', 'F', 'I', 'W', 'L', 'V', 'M', 'Y', 'C', 'A', 'T', 'G', 'S']
- tmd = []
- s = str(pair[1]) # sequence being analyzed
- # print(s) # for trouble shooting
- if maximum > len(s):
- maximum = len(s)
- search_region = s[minimum - 1 : maximum + 1]
- #print(f"this is the search region: {search_region}")
- # print(search_region) # for trouble shooting
-
- for tmsize in range(TMDmin, TMDmax+1, 1):
- #print(f"this is the current tmsize we're trying: {tmsize}")
- # print('==============='+str(tmsize)+'================') # print for troubleshooting
- pattern = "[PFIWLVMYCATGS]{"+str(tmsize)+"}" # searches for these hydrophobic residues tmsize total times
- #print(pattern)
- #print(f"sending to regex: {search_region}")
- if re.search(
- ("[K]"), search_region[1:8]): # grabbing one below with search region, so I want to grab one ahead here when I query.
- store_search = re.search(("[K]"), search_region[1:8]) # storing regex object
- where_we_are = store_search.start() # finding where we got the hit
- if re.search(
- ("[PFIWLVMYCATGS]"), search_region[where_we_are + 1]
- ) and re.search(
- ("[PFIWLVMYCATGS]"), search_region[where_we_are - 1]
- ): # hydrophobic neighbor
- #try:
- g = re.search(("[PFIWLVMYCATGS]"), search_region[where_we_are + 1]).group()
- backend = check_back_end_snorkels(search_region, tmsize)
- if backend == "match":
- if isp_mode:
- g = re.search((pattern), search_region).group()
- end_of_tmd = re.search((g), s).end()+1
- amt_peri = len(s) - end_of_tmd
- if peri_min <= amt_peri <= peri_max:
- pair_desc = pair[0] + ", peri_count~="+str(amt_peri)
- new_pair = (pair_desc,pair[1])
- tmd.append(new_pair)
- else:
- tmd.append(pair)
- else:
- continue
- #else:
- #print("I'm continuing out of snorkel loop")
- #print(f"{search_region}")
- #continue
- if re.search((pattern), search_region):
- #print(f"found match: {}")
- #print("I AM HEREEEEEEEEEEEEEEEEEEEEEEE")
- #try:
- if isp_mode:
- g = re.search((pattern), search_region).group()
- end_of_tmd = re.search((g), s).end()+1
- amt_peri = len(s) - end_of_tmd
- if peri_min <= amt_peri <= peri_max:
- pair_desc = pair[0] + ", peri_count~="+str(amt_peri)
- new_pair = (pair_desc,pair[1])
- tmd.append(new_pair)
- else:
- tmd.append(pair)
- else:
- continue
-
- return tmd
-
-
-def find_lipobox(pair, minimum=10, maximum=50, min_after=30, max_after=185, regex=1, osp_mode=False):
- """
-    Function that takes an input tuple and returns description/sequence pairs that have a lipobox
- ---> minimum - min distance from start codon to first AA of lipobox
- ---> maximum - max distance from start codon to first AA of lipobox
- ---> regex - option 1 (default) => more strict regular expression ; option 2 => looser selection, imported from LipoRy
-
- """
- if regex == 1:
- pattern = "[ILMFTV][^REKD][GAS]C" # regex for Lipobox from findSpanin.pl
- elif regex == 2:
- pattern = "[ACGSILMFTV][^REKD][GAS]C" # regex for Lipobox from LipoRy
-
- candidates = []
- s = str(pair[1])
- # print(s) # trouble shooting
- search_region = s[minimum-1 : maximum + 5] # properly slice the input... add 4 to catch if it hangs off at max input
- # print(search_region) # trouble shooting
- patterns = ["[ILMFTV][^REKD][GAS]C","AW[AGS]C"]
- for pattern in patterns:
- #print(pattern) # trouble shooting
- if re.search((pattern), search_region): # lipobox must be WITHIN the range...
- # searches the sequence with the input RegEx AND omits if
- g = re.search((pattern), search_region).group() # find the exact group match
- amt_peri = len(s) - re.search((g), s).end() + 1
- if min_after <= amt_peri <= max_after: # find the lipobox end region
- if osp_mode:
- pair_desc = pair[0] + ", peri_count~="+str(amt_peri)
- new_pair = (pair_desc,pair[1])
- candidates.append(new_pair)
- else:
- candidates.append(pair)
-
- return candidates
-
-
-def tuple_fasta(fasta_file):
- """
- #### INPUT: Fasta File
- #### OUTPUT: zipped (zip) : pairwise relationship of description to sequence
- ####
- """
- fasta = SeqIO.parse(fasta_file, "fasta")
- descriptions = []
- sequences = []
- for r in fasta: # iterates and stores each description and sequence
- description = r.description
- sequence = str(r.seq)
- if (
- sequence[0] != "I"
- ): # the translation table currently has I as a potential start codon ==> this will remove all ORFs that start with I
- descriptions.append(description)
- sequences.append(sequence)
- else:
- continue
-
- return zip(descriptions, sequences)
-
-
-def lineWrapper(text, charactersize=60):
-
- if len(text) <= charactersize:
- return text
- else:
- return (
- text[:charactersize]
- + "\n"
- + lineWrapper(text[charactersize:], charactersize)
- )
-
-
-def getDescriptions(fasta):
- """
-    Takes an output FASTA file and retrieves the description headers. These headers contain information needed
-    for locating a potential i-spanin and a potential o-spanin in proximity to one another.
- """
- desc = []
- with fasta as f:
- for line in f:
- if line.startswith(">"):
- desc.append(line)
- return desc
-
-
-def splitStrands(text, strand="+"):
- # positive_strands = []
- # negative_strands = []
- if strand == "+":
- if re.search(("(\[1\])"), text):
- return text
- elif strand == "-":
- if re.search(("(\[-1\])"), text):
- return text
- # return positive_strands, negative_strands
-
-
-def parse_a_range(pair, start, end):
- """
- Takes an input data tuple from a fasta tuple pair and keeps only those within the input sequence range
- ---> data : fasta tuple data
- ---> start : start range to keep
- ---> end : end range to keep (will need to + 1)
- """
- matches = []
- for each_pair in pair:
-
- s = re.search(("[\d]+\.\."), each_pair[0]).group(0) # Start of the sequence
- s = int(s.split("..")[0])
- e = re.search(("\.\.[\d]+"), each_pair[0]).group(0)
- e = int(e.split("..")[1])
- if start - 1 <= s and e <= end + 1:
- matches.append(each_pair)
- else:
- continue
- # else:
- # continue
- # if matches != []:
- return matches
- # else:
- # print('no candidates within selected range')
-
-
-def grabLocs(text):
- """
-    Grabs the locations of the spanin based on NT location (seen from the ORF). Grabs the ORF name, as named by the ORF class/module
- from cpt.py
- """
- start = re.search(("[\d]+\.\."), text).group(0) # Start of the sequence ; looks for [numbers]..
- end = re.search(("\.\.[\d]+"), text).group(0) # End of the sequence ; Looks for ..[numbers]
- orf = re.search(("(ORF)[\d]+"), text).group(0) # Looks for ORF and the numbers that are after it
- if re.search(("(\[1\])"), text): # stores strand
- strand = "+"
- elif re.search(("(\[-1\])"), text): # stores strand
- strand = "-"
- start = int(start.split("..")[0])
- end = int(end.split("..")[1])
- vals = [start, end, orf, strand]
-
- return vals
-
-
-def spaninProximity(isp, osp, max_dist=30):
- """
- _NOTE THIS FUNCTION COULD BE MODIFIED TO RETURN SEQUENCES_
- Compares the locations of i-spanins and o-spanins. max_dist is the distance in NT measurement from i-spanin END site
- to o-spanin START. The user will be inputting AA distance, so a conversion will be necessary ( * 3)
-    I modified this on 07.30.2020 to bypass picking the + or - strand.
- INPUT: list of OSP and ISP candidates
- OUTPUT: Return (improved) candidates for overlapping, embedded, and separate list
- """
-
- embedded = {}
- overlap = {}
- separate = {}
- for iseq in isp:
- embedded[iseq[2]] = []
- overlap[iseq[2]] = []
- separate[iseq[2]] = []
- for oseq in osp:
- if iseq[3] == "+":
- if oseq[3] == "+":
- if iseq[0] < oseq[0] < iseq[1] and oseq[1] < iseq[1]:
- ### EMBEDDED ###
- combo = [
- iseq[0],
- iseq[1],
- oseq[2],
- oseq[0],
- oseq[1],
- iseq[3],
- ] # ordering a return for dic
- embedded[iseq[2]] += [combo]
- elif iseq[0] < oseq[0] <= iseq[1] and oseq[1] > iseq[1]:
- ### OVERLAP / SEPARATE ###
- if (iseq[1] - oseq[0]) < 6:
- combo = [iseq[0], iseq[1], oseq[2], oseq[0], oseq[1],iseq[3]]
- separate[iseq[2]] += [combo]
- else:
- combo = [iseq[0], iseq[1], oseq[2], oseq[0], oseq[1],iseq[3]]
- overlap[iseq[2]] += [combo]
- elif iseq[1] <= oseq[0] <= iseq[1] + max_dist:
- combo = [iseq[0], iseq[1], oseq[2], oseq[0], oseq[1],iseq[3]]
- separate[iseq[2]] += [combo]
- else:
- continue
- if iseq[3] == "-":
- if oseq[3] == "-":
- if iseq[0] <= oseq[1] <= iseq[1] and oseq[0] > iseq[0]:
- ### EMBEDDED ###
- combo = [
- iseq[0],
- iseq[1],
- oseq[2],
- oseq[0],
- oseq[1],
- iseq[3],
- ] # ordering a return for dict
- embedded[iseq[2]] += [combo]
- elif iseq[0] <= oseq[1] <= iseq[1] and oseq[0] < iseq[0]:
- if (oseq[1] - iseq[0]) < 6:
- combo = [iseq[0], iseq[1], oseq[2], oseq[0], oseq[1],iseq[3]]
- separate[iseq[2]] += [combo]
- else:
- combo = [iseq[0], iseq[1], oseq[2], oseq[0], oseq[1],iseq[3]]
- overlap[iseq[2]] += [combo]
- elif iseq[0] - 10 < oseq[1] < iseq[0]:
- combo = [iseq[0], iseq[1], oseq[2], oseq[0], oseq[1],iseq[3]]
- separate[iseq[2]] += [combo]
- else:
- continue
-
- embedded = {k: embedded[k] for k in embedded if embedded[k]}
- overlap = {k: overlap[k] for k in overlap if overlap[k]}
- separate = {k: separate[k] for k in separate if separate[k]}
-
- return embedded, overlap, separate
-
-
-def check_for_usp():
-    pass
-
-############################################### TEST RANGE #########################################################################
-####################################################################################################################################
-if __name__ == "__main__":
-
- #### TMD TEST
- test_desc = ["one", "two", "three", "four", "five"]
- test_seq = [
- "XXXXXXXXXXXXXXXFMCFMCFMCFMCFMCXXXXXXXXXXXXXXXXXXXXXXXXXX",
- "XXXXXXXXAAKKKKKKKKKKKKKKKXXXXXXXXXXXXX",
- "XXXXXXX",
- "XXXXXXXXXXXKXXXXXXXXXX",
- "XXXXXXXXXXAKXXXXXXXXXXAKXXXXXXXX",
- ]
- # for l in
- # combo = zip(test_desc,test_seq)
- pairs = zip(test_desc, test_seq)
- tmd = []
- for each_pair in pairs:
- # print(each_pair)
- try:
- tmd += find_tmd(pair=each_pair)
- except (IndexError, TypeError):
- continue
- # try:s = each_pair[1]
- # tmd += find_tmd(seq=s, tmsize=15)
- # print('\n+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\n')
- # print(tmd)
- # print('\n+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\n')
-
- #### tuple-fasta TEST
- # fasta_file = 'out_isp.fa'
- # ret = tuple_fasta(fasta_file)
- # print('=============')
- # for i in ret:
- # print(i[1])
-
- #### LipoBox TEST
- test_desc = ["one", "two", "three", "four", "five", "six", "seven"]
- test_seq = [
- "XXXXXXXXXTGGCXXXXXXXXXXXXXXXX",
- "XXXXXXXXAAKKKKKKKKKKKKKKKXXXXXXXXXXXXX",
- "XXXXXXX",
- "AGGCXXXXXXXXXXXXXXXXXXXXTT",
- "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXTGGC",
- "XXXXXXXXXXXXXXXXXXXXXXXXXXTGGC",
- "MSTLRELRLRRALKEQSMRYLLSIKKTLPRWKGALIGLFLICVATISGCASESKLPEPPMVSVDSSLMVEPNLTTEMLNVFSQ*",
- ]
- pairs = zip(test_desc, test_seq)
- lipo = []
- for each_pair in pairs:
- #print(each_pair)
- # try:
- try:
- lipo += find_lipobox(pair=each_pair, regex=2) # , minimum=8)
-        except TypeError: # catches if something doesn't have the min/max requirements (something is too small)
- continue
- # except:
- # continue
- # print('\n+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\n')
- #############################3
- # g = prep_a_gff3(fa='putative_isp.fa', spanin_type='isp')
- # print(g)
- # write_gff3(data=g)
diff -r 1a7fef71aee3 -r fd70980a516b findSpanin.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/findSpanin.py Mon Jun 05 02:42:01 2023 +0000
@@ -0,0 +1,597 @@
+##### findSpanin.pl --> findSpanin.py
+##### Much of this code is intentionally "blocked": one step runs, a function is applied to its return value, then another function, and so on.
+
+import argparse
+import os
+import re # new
+import itertools # new
+from collections import Counter, OrderedDict
+from spaninFuncs import (
+ getDescriptions,
+ grabLocs,
+ spaninProximity,
+ splitStrands,
+ tuple_fasta,
+ lineWrapper,
+)
+
+### Required Inputs
+#### INPUT : putative_isp.fa & putative_osp.fa (in that order)
+#### PARAMETERS : --max_isp_osp_distance (max AA distance from i-spanin end to o-spanin start; default 10)
+
+###############################################################################
+def write_output(candidates):
+    """Placeholder for a dedicated output-file writer; currently unused."""
+    pass
+
+
+def reconfigure_dict(spanins):
+ """
+    Reorganizes the spanin dictionary into per-type, per-strand buckets that are easier to run the downstream checks against.
+ """
+
+ new_spanin_dict = {}
+
+    for each_spanin_type, data_dict in spanins.items():
+        new_spanin_dict[each_spanin_type] = {}
+        new_spanin_dict[each_spanin_type]["positive"] = {}
+        new_spanin_dict[each_spanin_type]["negative"] = {}
+        new_spanin_dict[each_spanin_type]["positive"]["coords"] = []
+        new_spanin_dict[each_spanin_type]["negative"]["coords"] = []
+        for outer_orf, inner_data in data_dict.items():
+            for data_content in inner_data:
+                # prepend the i-spanin ORF name so each hit carries its own ID
+                data_content.insert(0, outer_orf)
+                # index 6 holds the strand marker appended by spaninProximity
+                if data_content[6] == "+":
+                    new_spanin_dict[each_spanin_type]["positive"]["coords"] += [
+                        data_content
+                    ]
+                elif data_content[6] == "-":
+                    new_spanin_dict[each_spanin_type]["negative"]["coords"] += [
+                        data_content
+                    ]
+
+    return new_spanin_dict
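+# A minimal sketch of the reshaping above (hypothetical coordinates; each inner
+# list follows the [isp_start, isp_end, osp_orf, osp_start, osp_end, strand]
+# layout produced by spaninProximity, before the ORF name is prepended):
+#
+#   reconfigure_dict({"EMBEDDED": {"ORF1": [[100, 400, "ORF2", 200, 350, "+"]]}})
+#   # -> {"EMBEDDED": {"positive": {"coords": [["ORF1", 100, 400, "ORF2", 200, 350, "+"]]},
+#   #                  "negative": {"coords": []}}}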
+
+
+def check_for_uniques(spanins):
+ """
+ Checks for unique spanins based on spanin_type.
+    If the positive-strand end site is _the same_ for an i-spanin, we group those hits as one;
+    i.e. if ORF1, ORF2, and ORF3 all ended at location 4231, they would not be counted as unique.
+    """
+    pair_dict = {
+ "pairs": {
+ "location_amount": [],
+ "pair_number": {},
+ }
+ }
+ for each_spanin_type, spanin_data in spanins.items():
+ # print(f"{each_spanin_type} ===> {spanin_data}")
+ # early declarations for cases of no results
+ pos_check = [] # end checks
+ pos_uniques = []
+ neg_check = [] # start checks
+ neg_uniques = []
+ unique_ends = []
+ pos_amt_unique = 0
+ neg_amt_unique = 0
+ amt_positive = 0
+ amt_negative = 0
+ spanin_data["uniques"] = 0
+ spanin_data["amount"] = 0
+ if spanin_data["positive"]["coords"]:
+            # collect end sites for positive-strand hits
+ for each_hit in spanin_data["positive"]["coords"]:
+ pos_check.append(each_hit[2])
+ pair_dict["pairs"]["location_amount"].append(each_hit[2])
+            pos_uniques = list(set(pos_check))  # dedupe shared end sites
+ amt_positive = len(spanin_data["positive"]["coords"])
+ pos_amt_unique = len(pos_uniques)
+ if amt_positive:
+ spanin_data["positive"]["amt_positive"] = amt_positive
+ spanin_data["positive"]["pos_amt_unique"] = pos_amt_unique
+ # pair_dict['pairs']['locations'].extend(pos_uniques)
+ else:
+ spanin_data["positive"]["amt_positive"] = 0
+ spanin_data["positive"]["pos_amt_unique"] = 0
+ if spanin_data["negative"]["coords"]:
+            # collect start sites for negative-strand hits
+ for each_hit in spanin_data["negative"]["coords"]:
+ neg_check.append(each_hit[1])
+ pair_dict["pairs"]["location_amount"].append(each_hit[1])
+            neg_uniques = list(set(neg_check))  # dedupe shared start sites
+ amt_negative = len(spanin_data["negative"]["coords"])
+ neg_amt_unique = len(neg_uniques)
+ if amt_negative:
+ spanin_data["negative"]["amt_negative"] = amt_negative
+ spanin_data["negative"]["neg_amt_unique"] = neg_amt_unique
+ # pair_dict['pairs']['locations'].extend(neg_uniques)
+ else:
+ spanin_data["negative"]["amt_negative"] = 0
+ spanin_data["negative"]["neg_amt_unique"] = 0
+ spanin_data["uniques"] += (
+ spanin_data["positive"]["pos_amt_unique"]
+ + spanin_data["negative"]["neg_amt_unique"]
+ )
+ spanin_data["amount"] += (
+ spanin_data["positive"]["amt_positive"]
+ + spanin_data["negative"]["amt_negative"]
+ )
+ pair_dict["pairs"]["location_amount"] = dict(
+ Counter(pair_dict["pairs"]["location_amount"])
+ )
+    # number each distinct location so pairs can be referenced by a stable index
+    for data in pair_dict.values():
+        for v, loc in enumerate(data["location_amount"], start=1):
+            data["pair_number"][loc] = v
+ spanins["total_amount"] = (
+ spanins["EMBEDDED"]["amount"]
+ + spanins["SEPARATED"]["amount"]
+ + spanins["OVERLAPPED"]["amount"]
+ )
+ spanins["total_unique"] = (
+ spanins["EMBEDDED"]["uniques"]
+ + spanins["SEPARATED"]["uniques"]
+ + spanins["OVERLAPPED"]["uniques"]
+ )
+ # spanins['total_unique'] = len(pair_dict['pairs']['pair_number'])
+ return spanins, pair_dict
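+# A hedged sketch of the counting above (hypothetical numbers): three positive-strand
+# EMBEDDED hits that all end at site 4231 give amount == 3 but uniques == 1, and
+# pair_dict["pairs"]["pair_number"] maps each distinct site to a stable index:
+#
+#   spanins, pair_dict = check_for_uniques(reconfigure_dict(SPANIN_TYPES))
+#   pair_dict["pairs"]["pair_number"]  # e.g. {4231: 1, ...}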
+
+
+if __name__ == "__main__":
+
+ # Common parameters for both ISP / OSP portion of script
+
+ parser = argparse.ArgumentParser(
+ description="Trim the putative protein candidates and find potential i-spanin / o-spanin pairs"
+ )
+
+ parser.add_argument(
+ "putative_isp_fasta_file",
+ type=argparse.FileType("r"),
+ help='Putative i-spanin FASTA file, output of "generate-putative-isp"',
+ ) # the "input" argument
+
+ parser.add_argument(
+ "putative_osp_fasta_file",
+ type=argparse.FileType("r"),
+ help='Putative o-spanin FASTA file, output of "generate-putative-osp"',
+ )
+
+ parser.add_argument(
+ "--max_isp_osp_distance",
+ dest="max_isp_osp_distance",
+ default=10,
+ type=int,
+ help="max distance from end of i-spanin to start of o-spanin, measured in AAs",
+ )
+
+ parser.add_argument(
+ "--embedded_txt",
+ dest="embedded_txt",
+ type=argparse.FileType("w"),
+ default="_findSpanin_embedded_results.txt",
+ help="Results of potential embedded spanins",
+ )
+ parser.add_argument(
+ "--overlap_txt",
+ dest="overlap_txt",
+ type=argparse.FileType("w"),
+ default="_findSpanin_overlap_results.txt",
+ help="Results of potential overlapping spanins",
+ )
+ parser.add_argument(
+ "--separate_txt",
+ dest="separate_txt",
+ type=argparse.FileType("w"),
+ default="_findSpanin_separated_results.txt",
+ help="Results of potential separated spanins",
+ )
+
+ parser.add_argument(
+ "--summary_txt",
+ dest="summary_txt",
+ type=argparse.FileType("w"),
+ default="_findSpanin_summary.txt",
+ help="Results of potential spanin pairs",
+ )
+ parser.add_argument(
+ "-v", action="version", version="0.3.0"
+    )  # NOTE: the version string is maintained by hand
+ args = parser.parse_args()
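+    # Hypothetical invocation (placeholder file names, not shipped test data):
+    #   python findSpanin.py putative_isp.fa putative_osp.fa --max_isp_osp_distance 10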
+
+    SPANIN_TYPES = {
+        "EMBEDDED": {},
+        "OVERLAPPED": {},
+        "SEPARATED": {},
+    }
+
+    # getDescriptions() consumes the open handle, so reopen each file before
+    # re-parsing it with tuple_fasta()
+    isp = getDescriptions(args.putative_isp_fasta_file)
+    args.putative_isp_fasta_file = open(args.putative_isp_fasta_file.name, "r")
+    isp_full = tuple_fasta(args.putative_isp_fasta_file)
+
+    osp = getDescriptions(args.putative_osp_fasta_file)
+    args.putative_osp_fasta_file = open(args.putative_osp_fasta_file.name, "r")
+    osp_full = tuple_fasta(args.putative_osp_fasta_file)
+
+ #### location data
+ location_data = {"isp": [], "osp": []}
+ spanins = [isp, osp]
+ for idx, each_spanin_type in enumerate(spanins):
+ for description in each_spanin_type:
+ locations = grabLocs(description)
+ if idx == 0: # i-spanin
+ location_data["isp"].append(locations)
+ elif idx == 1: # o-spanin
+ location_data["osp"].append(locations)
+
+ #### Check for types of spanins
+    embedded, overlap, separate = spaninProximity(
+        isp=location_data["isp"],
+        osp=location_data["osp"],
+        max_dist=args.max_isp_osp_distance * 3,  # convert the AA input to NT
+    )
+
+ SPANIN_TYPES["EMBEDDED"] = embedded
+ SPANIN_TYPES["OVERLAPPED"] = overlap
+ SPANIN_TYPES["SEPARATED"] = separate
+
+ spanins = reconfigure_dict(SPANIN_TYPES)
+ spanins, pair_dict = check_for_uniques(spanins)
+ # print(pair_dict)
+ with args.summary_txt as f:
+ for each_spanin_type, spanin_data in spanins.items():
+ try:
+ if each_spanin_type not in ["total_amount", "total_unique"]:
+ # print(each_spanin_type)
+ # print(each_spanin_type)
+ f.write(
+ "=~~~~~= "
+ + str(each_spanin_type)
+ + " Spanin Candidate Statistics =~~~~~=\n"
+ )
+                    f.write(
+                        "Total Candidate Pairs = " + str(spanin_data["amount"]) + "\n"
+                    )
+                    f.write(
+                        "Total Unique Pairs = " + str(spanin_data["uniques"]) + "\n"
+                    )
+                    if each_spanin_type in ("EMBEDDED", "SEPARATED", "OVERLAPPED"):
+                        for k, v in SPANIN_TYPES[each_spanin_type].items():
+                            f.write(
+                                str(k)
+                                + " ==> Amount of corresponding candidate o-spanin(s): "
+                                + str(len(v))
+                                + "\n"
+                            )
+ except TypeError:
+ continue
+ f.write("\n=~~~~~= Tally from ALL spanin types =~~~~~=\n")
+        f.write("Total Candidates = " + str(spanins["total_amount"]) + "\n")
+        f.write(
+            "Total Unique Candidate Pairs = " + str(spanins["total_unique"]) + "\n"
+        )
+
+    # reopen the handles (consumed above) before grabbing sequences per class
+    args.putative_isp_fasta_file = open(args.putative_isp_fasta_file.name, "r")
+ isp_full = tuple_fasta(args.putative_isp_fasta_file)
+
+ args.putative_osp_fasta_file = open(args.putative_osp_fasta_file.name, "r")
+ osp_full = tuple_fasta(args.putative_osp_fasta_file)
+
+ # print(isp_full)
+ isp_seqs = []
+ osp_seqs = []
+ for isp_tupe in isp_full:
+ # print(isp_tupe)
+ for pisp, posp in embedded.items():
+            # match the ORF name as a whole token (the trailing \D keeps ORF1 from matching ORF10)
+            if re.search("(" + str(pisp) + r")\D", isp_tupe[0]):
+                peri_count = str.split(isp_tupe[0], "~=")[1]
+                isp_seqs.append((pisp, isp_tupe[1], peri_count))
+ # print(isp_seqs)
+ for osp_tupe in osp_full:
+ for pisp, posp in embedded.items():
+ for data in posp:
+                if re.search("(" + str(data[3]) + r")\D", osp_tupe[0]):
+ peri_count = str.split(osp_tupe[0], "~=")[1]
+ osp_seqs.append((data[3], osp_tupe[1], peri_count))
+
+ with args.embedded_txt as f:
+ f.write("================ embedded spanin candidates =================\n")
+ f.write(
+ "isp\tisp_start\tisp_end\tosp\tosp_start\tosp_end\tstrand\tpair_number\n"
+ )
+ if embedded != {}:
+ # print(embedded)
+ for pisp, posp in embedded.items():
+ # print(f"{pisp} - {posp}")
+ f.write(pisp + "\n")
+ for each_posp in posp:
+ # print(posp)
+ f.write(
+ "\t{}\t{}\t{}\t{}\t{}\t{}\t".format(
+ each_posp[1],
+ each_posp[2],
+ each_posp[3],
+ each_posp[4],
+ each_posp[5],
+ each_posp[6],
+ )
+ )
+ if each_posp[6] == "+":
+ if each_posp[2] in pair_dict["pairs"]["pair_number"].keys():
+ f.write(
+ ""
+ + str(pair_dict["pairs"]["pair_number"][each_posp[2]])
+ + "\n"
+ )
+ elif each_posp[6] == "-":
+ if each_posp[1] in pair_dict["pairs"]["pair_number"].keys():
+ f.write(
+ ""
+ + str(pair_dict["pairs"]["pair_number"][each_posp[1]])
+ + "\n"
+ )
+ else:
+ f.write("nothing found")
+
+ with open(args.embedded_txt.name, "a") as f:
+ f.write("\n================= embedded candidate sequences ================\n")
+ f.write("======================= isp ==========================\n\n")
+ for isp_data in isp_seqs:
+ # print(isp_data)
+ f.write(
+ ">isp_orf::{}-peri_count~={}\n{}\n".format(
+ isp_data[0], isp_data[2], lineWrapper(isp_data[1])
+ )
+ )
+ f.write("\n======================= osp ========================\n\n")
+ for osp_data in osp_seqs:
+ f.write(
+ ">osp_orf::{}-peri_count~={}\n{}\n".format(
+ osp_data[0], osp_data[2], lineWrapper(osp_data[1])
+ )
+ )
+
+ args.putative_isp_fasta_file = open(args.putative_isp_fasta_file.name, "r")
+ isp_full = tuple_fasta(args.putative_isp_fasta_file)
+
+ args.putative_osp_fasta_file = open(args.putative_osp_fasta_file.name, "r")
+ osp_full = tuple_fasta(args.putative_osp_fasta_file)
+
+ isp_seqs = []
+ osp_seqs = []
+ for isp_tupe in isp_full:
+        for pisp, posp in overlap.items():
+            if re.search("(" + str(pisp) + r")\D", isp_tupe[0]):
+                peri_count = str.split(isp_tupe[0], "~=")[1]
+                isp_seqs.append((pisp, isp_tupe[1], peri_count))
+
+ for osp_tupe in osp_full:
+ for pisp, posp in overlap.items():
+ for data in posp:
+                if re.search("(" + str(data[3]) + r")\D", osp_tupe[0]):
+ peri_count = str.split(osp_tupe[0], "~=")[1]
+ osp_seqs.append((data[3], osp_tupe[1], peri_count))
+
+ with args.overlap_txt as f:
+ f.write("================ overlap spanin candidates =================\n")
+ f.write(
+ "isp\tisp_start\tisp_end\tosp\tosp_start\tosp_end\tstrand\tpair_number\n"
+ )
+ if overlap != {}:
+ for pisp, posp in overlap.items():
+ f.write(pisp + "\n")
+ for each_posp in posp:
+ f.write(
+ "\t{}\t{}\t{}\t{}\t{}\t{}\t".format(
+ each_posp[1],
+ each_posp[2],
+ each_posp[3],
+ each_posp[4],
+ each_posp[5],
+ each_posp[6],
+ )
+ )
+ if each_posp[6] == "+":
+ if each_posp[2] in pair_dict["pairs"]["pair_number"].keys():
+ # print('ovl ; +')
+ f.write(
+ ""
+ + str(pair_dict["pairs"]["pair_number"][each_posp[2]])
+ + "\n"
+ )
+ elif each_posp[6] == "-":
+ if each_posp[1] in pair_dict["pairs"]["pair_number"].keys():
+ f.write(
+ ""
+ + str(pair_dict["pairs"]["pair_number"][each_posp[1]])
+ + "\n"
+ )
+ else:
+ f.write("nothing found")
+
+ with open(args.overlap_txt.name, "a") as f:
+ # print(isp_seqs)
+ f.write("\n================= overlap candidate sequences ================\n")
+ f.write("======================= isp ==========================\n\n")
+ for isp_data in isp_seqs:
+ f.write(
+            ">isp_orf::{}-peri_count~={}\n{}\n".format(
+ isp_data[0], isp_data[2], lineWrapper(isp_data[1])
+ )
+ )
+ f.write("\n======================= osp ========================\n\n")
+ for osp_data in osp_seqs:
+ f.write(
+            ">osp_orf::{}-peri_count~={}\n{}\n".format(
+ osp_data[0], osp_data[2], lineWrapper(osp_data[1])
+ )
+ )
+
+ args.putative_isp_fasta_file = open(args.putative_isp_fasta_file.name, "r")
+ isp_full = tuple_fasta(args.putative_isp_fasta_file)
+ args.putative_osp_fasta_file = open(args.putative_osp_fasta_file.name, "r")
+ osp_full = tuple_fasta(args.putative_osp_fasta_file)
+
+ isp_seqs = []
+ osp_seqs = []
+ for isp_tupe in isp_full:
+ for pisp, posp in separate.items():
+            if re.search("(" + str(pisp) + r")\D", isp_tupe[0]):
+ peri_count = str.split(isp_tupe[0], "~=")[1]
+ isp_seqs.append((pisp, isp_tupe[1], peri_count))
+ # print(isp_seqs)
+ for osp_tupe in osp_full:
+ for pisp, posp in separate.items():
+ for data in posp:
+                if re.search("(" + str(data[3]) + r")\D", osp_tupe[0]):
+ peri_count = str.split(osp_tupe[0], "~=")[1]
+ osp_seqs.append((data[3], osp_tupe[1], peri_count))
+
+ with args.separate_txt as f:
+ f.write("================ separated spanin candidates =================\n")
+ f.write(
+ "isp\tisp_start\tisp_end\tosp\tosp_start\tosp_end\tstrand\tpair_number\n"
+ )
+ if separate != {}:
+ for pisp, posp in separate.items():
+ f.write(pisp + "\n")
+ for each_posp in posp:
+ f.write(
+ "\t{}\t{}\t{}\t{}\t{}\t{}\t".format(
+ each_posp[1],
+ each_posp[2],
+ each_posp[3],
+ each_posp[4],
+ each_posp[5],
+ each_posp[6],
+ )
+ )
+ if each_posp[6] == "+":
+ if each_posp[2] in pair_dict["pairs"]["pair_number"].keys():
+ f.write(
+ ""
+ + str(pair_dict["pairs"]["pair_number"][each_posp[2]])
+ + "\n"
+ )
+ elif each_posp[6] == "-":
+ if each_posp[1] in pair_dict["pairs"]["pair_number"].keys():
+ f.write(
+ ""
+ + str(pair_dict["pairs"]["pair_number"][each_posp[1]])
+ + "\n"
+ )
+ else:
+ f.write("nothing found")
+
+ with open(args.separate_txt.name, "a") as f:
+ f.write("\n================= separated candidate sequences ================\n")
+ f.write("======================= isp ==========================\n\n")
+ for isp_data in isp_seqs:
+ f.write(
+            ">isp_orf::{}-peri_count~={}\n{}\n".format(
+ isp_data[0], isp_data[2], lineWrapper(isp_data[1])
+ )
+ )
+ f.write("\n======================= osp ========================\n\n")
+ for osp_data in osp_seqs:
+ f.write(
+            ">osp_orf::{}-peri_count~={}\n{}\n".format(
+ osp_data[0], osp_data[2], lineWrapper(osp_data[1])
+ )
+ )
diff -r 1a7fef71aee3 -r fd70980a516b findSpanin.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/findSpanin.xml Mon Jun 05 02:42:01 2023 +0000
@@ -0,0 +1,43 @@
+
+ With the outputs from the ISP and OSP candidate tools, cull the list down to candidate pairs
+
+ macros.xml
+ cpt-macros.xml
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Putative i-spanin and o-spanin protein multiFASTAs (generated from the ISP/OSP Candidate Tools).
+
+**METHODOLOGY**
+Performs a pairwise comparison between candidate i-spanins and o-spanins based on their genomic locations, and classifies each pair into one of the known bimolecular spanin genetic architectures: embedded, overlapping, separated, or NOT a potential pair.
+
+**OUTPUT** --> One file of candidate pairs for each bimolecular spanin class, plus a basic summary statistics file.
+
+]]>
+
+
diff -r 1a7fef71aee3 -r fd70980a516b macros.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/macros.xml Mon Jun 05 02:42:01 2023 +0000
@@ -0,0 +1,58 @@
+
+
+
+ regex
+ python
+ biopython
+ cpt_gffparser
+
+
+
+
+ $blast_tsv
+
+
+
+
+
+ $blast_xml
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ $gff3_data
+
+
+ genomeref.fa
+
+
+ ln -s $genome_fasta genomeref.fa;
+
+
+ genomeref.fa
+
+
+
+
+
+ $sequences
+
+
+
+
+
diff -r 1a7fef71aee3 -r fd70980a516b spaninFuncs.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/spaninFuncs.py Mon Jun 05 02:42:01 2023 +0000
@@ -0,0 +1,530 @@
+"""
+PREMISE
+### Functions/Classes that are used in both generate-putative-osp.py and generate-putative-isp.py
+###### Main premise here is to make the above scripts a little more DRY, as well as easily readable for execution.
+###### Documentation will ATTEMPT to be thourough here
+"""
+
+import re
+from Bio import SeqIO
+
+# Not written in OOP, to keep the complexity down in case adjustments are needed by someone else.
+# Much of the manipulation is string based, so it should be straightforward as well as moderately quick.
+################## Global Variables
+Lys = "K"
+
+
+def check_back_end_snorkels(seq, tmsize):
+    """
+    Searches through the back end of a potential TMD snorkel. This is the 2nd part of a TMD snorkel lysine match.
+    --> seq : the sequence fed from the "search_region" portion of the sequence
+    --> tmsize : size of the potential TMD being investigated
+    """
+    # a lysine in any of the last four TMD positions counts as a snorkel when the
+    # residue immediately before it is hydrophobic
+    for offset in range(4, 0, -1):
+        if seq[tmsize - offset] == Lys and re.search(
+            "[FIWLVMYCATGS]", seq[tmsize - offset - 1]
+        ):
+            return "match"
+    return "NOTmatch"
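+# Example (toy sequence, not real data): with seq = "AAAAALK" and tmsize = 7,
+# seq[6] == "K" with a hydrophobic "L" just before it, so this returns "match".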
+
+
+def prep_a_gff3(fa, spanin_type, org):
+    """
+    Parses an input detailed 'fa' file and returns row data ready for a 'gff3' file
+    ---> fa = input .fa file
+    ---> spanin_type = 'isp' or 'osp'
+    ---> org = organism FASTA file; its first header supplies the accession
+    ---> output = returned list of data, easily portable to a gff3 next
+    """
+ with org as f:
+ header = f.readline()
+ orgacc = header.split(" ")
+ orgacc = orgacc[0].split(">")[1].strip()
+ fa_zip = tuple_fasta(fa)
+ data = []
+ for a_pair in fa_zip:
+ # print(a_pair)
+        if re.search(r"(\[1\])", a_pair[0]):
+            strand = "+"
+        elif re.search(r"(\[-1\])", a_pair[0]):
+            strand = "-"  # column 7
+        start = re.search(r"[\d]+\.\.", a_pair[0]).group(0).split("..")[0]  # column 4
+        end = re.search(r"\.\.[\d]+", a_pair[0]).group(0).split("..")[1]  # column 5
+        orfid = re.search(r"(ORF)[\d]+", a_pair[0]).group(0)  # column 1
+        if spanin_type in ("isp", "osp", "usp"):
+            methodtype = "CDS"  # column 3
+            spanin = spanin_type
+        else:
+            raise ValueError("spanin_type must be 'isp', 'osp', or 'usp'")
+ source = "cpt.py|putative-*.py" # column 2
+ score = "." # column 6
+ phase = "." # column 8
+ attributes = (
+ "ID=" + orgacc + "|" + orfid + ";ALIAS=" + spanin + ";SEQ=" + a_pair[1]
+ ) # column 9
+ sequence = [
+ [orgacc, source, methodtype, start, end, score, strand, phase, attributes]
+ ]
+ data += sequence
+ return data
+
+
+def write_gff3(data, output="results.gff3"):
+    """
+    Writes results from prep_a_gff3 into a gff3 file
+    ---> data : list from prep_a_gff3
+    ---> output : gff3 file name, or an already-open handle
+    """
+    # the default is a plain path string, but an open handle also works
+    f = output if hasattr(output, "write") else open(output, "w")
+    with f:
+        f.write("##gff-version 3\n")
+        for value in data:
+            f.write("\t".join(str(v) for v in value) + "\n")
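+# Sketch of the intended chaining (hypothetical paths; prep_a_gff3 expects an
+# open organism-FASTA handle whose first header carries the accession):
+#
+#   with open("organism.fa") as org:
+#       rows = prep_a_gff3(fa="putative_isp.fa", spanin_type="isp", org=org)
+#   write_gff3(rows, output="results.gff3")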
+
+
+def find_tmd(
+ pair,
+ minimum=10,
+ maximum=30,
+ TMDmin=10,
+ TMDmax=20,
+ isp_mode=False,
+ peri_min=18,
+ peri_max=206,
+):
+ """
+ Function that searches for lysine snorkels and then for a spanning hydrophobic region that indicates a potential TMD
+ ---> pair : Input of tuple with description and AA sequence (str)
+ ---> minimum : How close from the initial start codon a TMD can be within
+ ---> maximum : How far from the initial start codon a TMD can be within
+ ---> TMDmin : The minimum size that a transmembrane can be (default = 10)
+    ---> TMDmax : The maximum size that a transmembrane can be (default = 20)
+    ---> isp_mode / peri_min / peri_max : when isp_mode is set, only keep hits whose
+         periplasmic tail length falls within [peri_min, peri_max]
+    """
+ # hydrophobicAAs = ['P', 'F', 'I', 'W', 'L', 'V', 'M', 'Y', 'C', 'A', 'T', 'G', 'S']
+ tmd = []
+ s = str(pair[1]) # sequence being analyzed
+ # print(s) # for trouble shooting
+ if maximum > len(s):
+ maximum = len(s)
+ search_region = s[minimum - 1 : maximum + 1]
+ # print(f"this is the search region: {search_region}")
+ # print(search_region) # for trouble shooting
+
+ for tmsize in range(TMDmin, TMDmax + 1, 1):
+ # print(f"this is the current tmsize we're trying: {tmsize}")
+ # print('==============='+str(tmsize)+'================') # print for troubleshooting
+ pattern = (
+ "[PFIWLVMYCATGS]{" + str(tmsize) + "}"
+ ) # searches for these hydrophobic residues tmsize total times
+ # print(pattern)
+ # print(f"sending to regex: {search_region}")
+ if re.search(
+ ("[K]"), search_region[1:8]
+ ): # grabbing one below with search region, so I want to grab one ahead here when I query.
+ store_search = re.search(
+ ("[K]"), search_region[1:8]
+ ) # storing regex object
+ where_we_are = store_search.start() # finding where we got the hit
+ if re.search(
+ ("[PFIWLVMYCATGS]"), search_region[where_we_are + 1]
+ ) and re.search(
+ ("[PFIWLVMYCATGS]"), search_region[where_we_are - 1]
+ ): # hydrophobic neighbor
+ # try:
+ g = re.search(
+ ("[PFIWLVMYCATGS]"), search_region[where_we_are + 1]
+ ).group()
+ backend = check_back_end_snorkels(search_region, tmsize)
+ if backend == "match":
+ if isp_mode:
+ g = re.search((pattern), search_region).group()
+ end_of_tmd = re.search((g), s).end() + 1
+ amt_peri = len(s) - end_of_tmd
+ if peri_min <= amt_peri <= peri_max:
+ pair_desc = pair[0] + ", peri_count~=" + str(amt_peri)
+ new_pair = (pair_desc, pair[1])
+ tmd.append(new_pair)
+ else:
+ tmd.append(pair)
+ else:
+ continue
+ if re.search((pattern), search_region):
+ if isp_mode:
+ g = re.search((pattern), search_region).group()
+ end_of_tmd = re.search((g), s).end() + 1
+ amt_peri = len(s) - end_of_tmd
+ if peri_min <= amt_peri <= peri_max:
+ pair_desc = pair[0] + ", peri_count~=" + str(amt_peri)
+ new_pair = (pair_desc, pair[1])
+ tmd.append(new_pair)
+ else:
+ tmd.append(pair)
+ else:
+ continue
+
+ return tmd
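+# Example from the test block at the bottom of this file: sequence "one" carries a
+# 15-residue hydrophobic FMC run starting 15 residues in, so find_tmd(pair=...) keeps
+# the pair (note it may be appended once per matching TMD size, hence duplicates).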
+
+
+def find_lipobox(
+ pair, minimum=10, maximum=50, min_after=30, max_after=185, regex=1, osp_mode=False
+):
+    """
+    Takes an input tuple and returns (description, sequence) pairs that contain a lipobox
+    ---> minimum - min distance from start codon to first AA of lipobox
+    ---> maximum - max distance from start codon to first AA of lipobox
+    ---> regex - option 1 (default) => stricter regular expression ; option 2 => looser selection, imported from LipoRy
+    """
+ if regex == 1:
+ pattern = "[ILMFTV][^REKD][GAS]C" # regex for Lipobox from findSpanin.pl
+ elif regex == 2:
+ pattern = "[ACGSILMFTV][^REKD][GAS]C" # regex for Lipobox from LipoRy
+
+    candidates = []
+    s = str(pair[1])
+    search_region = s[
+        minimum - 1 : maximum + 5
+    ]  # properly slice the input... pad the end to catch a lipobox hanging off at the max
+    # honor the regex choice above, plus the special-case AW[AGS]C lipobox
+    patterns = [pattern, "AW[AGS]C"]
+ for pattern in patterns:
+ # print(pattern) # trouble shooting
+ if re.search((pattern), search_region): # lipobox must be WITHIN the range...
+            g = re.search(
+                (pattern), search_region
+            ).group()  # keep the exact text of the matched lipobox
+ amt_peri = len(s) - re.search((g), s).end() + 1
+ if min_after <= amt_peri <= max_after: # find the lipobox end region
+ if osp_mode:
+ pair_desc = pair[0] + ", peri_count~=" + str(amt_peri)
+ new_pair = (pair_desc, pair[1])
+ candidates.append(new_pair)
+ else:
+ candidates.append(pair)
+
+ return candidates
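+# Example from the LipoBox test below: sequence "seven" contains "ISGC", which
+# satisfies [ILMFTV][^REKD][GAS]C within the search window, so the pair is kept.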
+
+
+def tuple_fasta(fasta_file):
+    """
+    #### INPUT: FASTA file
+    #### OUTPUT: zip object : pairwise relationship of description to sequence
+    """
+ fasta = SeqIO.parse(fasta_file, "fasta")
+ descriptions = []
+ sequences = []
+ for r in fasta: # iterates and stores each description and sequence
+ description = r.description
+ sequence = str(r.seq)
+ if (
+ sequence[0] != "I"
+ ): # the translation table currently has I as a potential start codon ==> this will remove all ORFs that start with I
+ descriptions.append(description)
+ sequences.append(sequence)
+ else:
+ continue
+
+ return zip(descriptions, sequences)
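+# Usage sketch (hypothetical path): tuple_fasta("putative_isp.fa") yields
+# ("ORF1 ... [100..400] [1]", "MKT...") style pairs; the zip is single-use,
+# which is why findSpanin.py reopens and re-parses the file between passes.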
+
+
+def lineWrapper(text, charactersize=60):
+    """Recursively wraps a sequence string into fixed-width lines (FASTA style)."""
+ if len(text) <= charactersize:
+ return text
+ else:
+ return (
+ text[:charactersize]
+ + "\n"
+ + lineWrapper(text[charactersize:], charactersize)
+ )
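+# Example: lineWrapper("A" * 130) returns three lines of 60, 60, and 10 characters.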
+
+
+def getDescriptions(fasta):
+    """
+    Takes an output FASTA file and retrieves the description headers. These headers contain the location
+    information needed for finding the proximity of a potential i-spanin and o-spanin to one another.
+    """
+ desc = []
+ with fasta as f:
+ for line in f:
+ if line.startswith(">"):
+ desc.append(line)
+ return desc
+
+
+def splitStrands(text, strand="+"):
+    """Returns the text if its strand marker ([1] or [-1]) matches the requested strand."""
+    if strand == "+":
+        if re.search(r"(\[1\])", text):
+            return text
+    elif strand == "-":
+        if re.search(r"(\[-1\])", text):
+            return text
+
+
+def parse_a_range(pair, start, end):
+    """
+    Takes fasta tuple pairs and keeps only those whose coordinates fall within the input range
+    ---> pair : list of fasta tuple data
+    ---> start : start of the range to keep
+    ---> end : end of the range to keep (will need to + 1)
+    """
+    matches = []
+    for each_pair in pair:
+        s = re.search(r"[\d]+\.\.", each_pair[0]).group(0)  # start of the sequence
+        s = int(s.split("..")[0])
+        e = re.search(r"\.\.[\d]+", each_pair[0]).group(0)
+        e = int(e.split("..")[1])
+        if start - 1 <= s and e <= end + 1:
+            matches.append(each_pair)
+    return matches
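+# Example (hypothetical header): a pair described as "ORF5 ... [250..800] ..." survives
+# parse_a_range(pair, start=200, end=900), since 199 <= 250 and 800 <= 901.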
+
+
+def grabLocs(text):
+    """
+    Grabs the locations of the spanin based on NT location (as seen from the ORF). Also grabs the ORF name,
+    as named by the ORF class/module from cpt.py
+    """
+    # start of the sequence ; looks for [numbers]..
+    start = re.search(r"[\d]+\.\.", text).group(0)
+    # end of the sequence ; looks for ..[numbers]
+    end = re.search(r"\.\.[\d]+", text).group(0)
+    # looks for ORF and the numbers after it
+    orf = re.search(r"(ORF)[\d]+", text).group(0)
+    if re.search(r"(\[1\])", text):  # stores strand
+        strand = "+"
+    elif re.search(r"(\[-1\])", text):  # stores strand
+        strand = "-"
+    start = int(start.split("..")[0])
+    end = int(end.split("..")[1])
+    vals = [start, end, orf, strand]
+
+    return vals
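+# Example (hypothetical header in the style produced by cpt.py's ORF module):
+#   grabLocs("ORF12 [123..456] [1]")  # -> [123, 456, "ORF12", "+"]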
+
+
+def spaninProximity(isp, osp, max_dist=30):
+    """
+    _NOTE THIS FUNCTION COULD BE MODIFIED TO RETURN SEQUENCES_
+    Compares the locations of i-spanins and o-spanins. max_dist is the NT distance from the i-spanin END site
+    to the o-spanin START. The user inputs an AA distance, so a conversion is necessary ( * 3 )
+    Modified 07.30.2020 to bypass picking the + or - strand up front.
+    INPUT: lists of ISP and OSP candidates
+    OUTPUT: dictionaries of embedded, overlapping, and separated candidates
+    """
+
+ embedded = {}
+ overlap = {}
+ separate = {}
+ for iseq in isp:
+ embedded[iseq[2]] = []
+ overlap[iseq[2]] = []
+ separate[iseq[2]] = []
+ for oseq in osp:
+ if iseq[3] == "+":
+ if oseq[3] == "+":
+ if iseq[0] < oseq[0] < iseq[1] and oseq[1] < iseq[1]:
+ ### EMBEDDED ###
+ combo = [
+ iseq[0],
+ iseq[1],
+ oseq[2],
+ oseq[0],
+ oseq[1],
+ iseq[3],
+                    ]  # ordering a return for dict
+ embedded[iseq[2]] += [combo]
+ elif iseq[0] < oseq[0] <= iseq[1] and oseq[1] > iseq[1]:
+ ### OVERLAP / SEPARATE ###
+ if (iseq[1] - oseq[0]) < 6:
+ combo = [
+ iseq[0],
+ iseq[1],
+ oseq[2],
+ oseq[0],
+ oseq[1],
+ iseq[3],
+ ]
+ separate[iseq[2]] += [combo]
+ else:
+ combo = [
+ iseq[0],
+ iseq[1],
+ oseq[2],
+ oseq[0],
+ oseq[1],
+ iseq[3],
+ ]
+ overlap[iseq[2]] += [combo]
+ elif iseq[1] <= oseq[0] <= iseq[1] + max_dist:
+ combo = [iseq[0], iseq[1], oseq[2], oseq[0], oseq[1], iseq[3]]
+ separate[iseq[2]] += [combo]
+ else:
+ continue
+ if iseq[3] == "-":
+ if oseq[3] == "-":
+ if iseq[0] <= oseq[1] <= iseq[1] and oseq[0] > iseq[0]:
+ ### EMBEDDED ###
+ combo = [
+ iseq[0],
+ iseq[1],
+ oseq[2],
+ oseq[0],
+ oseq[1],
+ iseq[3],
+ ] # ordering a return for dict
+ embedded[iseq[2]] += [combo]
+ elif iseq[0] <= oseq[1] <= iseq[1] and oseq[0] < iseq[0]:
+ if (oseq[1] - iseq[0]) < 6:
+ combo = [
+ iseq[0],
+ iseq[1],
+ oseq[2],
+ oseq[0],
+ oseq[1],
+ iseq[3],
+ ]
+ separate[iseq[2]] += [combo]
+ else:
+ combo = [
+ iseq[0],
+ iseq[1],
+ oseq[2],
+ oseq[0],
+ oseq[1],
+ iseq[3],
+ ]
+ overlap[iseq[2]] += [combo]
+ elif iseq[0] - 10 < oseq[1] < iseq[0]:
+ combo = [iseq[0], iseq[1], oseq[2], oseq[0], oseq[1], iseq[3]]
+ separate[iseq[2]] += [combo]
+ else:
+ continue
+
+ embedded = {k: embedded[k] for k in embedded if embedded[k]}
+ overlap = {k: overlap[k] for k in overlap if overlap[k]}
+ separate = {k: separate[k] for k in separate if separate[k]}
+
+ return embedded, overlap, separate
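+# Worked sketch with hypothetical "+"-strand coordinates and max_dist = 30 NT,
+# for an i-spanin at [100, 400, "ORF1", "+"]:
+#   o-spanin [200, 350] sits inside 100..400           -> embedded["ORF1"]
+#   o-spanin [380, 500] overlaps by 400 - 380 = 20 NT  -> overlap["ORF1"]
+#   o-spanin [405, 500] starts within 400 + max_dist   -> separate["ORF1"]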
+
+
+def check_for_usp():
+    """Placeholder for a future u-spanin check; not yet implemented."""
+    pass
+
+
+############################################### TEST RANGE #########################################################################
+####################################################################################################################################
+if __name__ == "__main__":
+
+ #### TMD TEST
+ test_desc = ["one", "two", "three", "four", "five"]
+ test_seq = [
+ "XXXXXXXXXXXXXXXFMCFMCFMCFMCFMCXXXXXXXXXXXXXXXXXXXXXXXXXX",
+ "XXXXXXXXAAKKKKKKKKKKKKKKKXXXXXXXXXXXXX",
+ "XXXXXXX",
+ "XXXXXXXXXXXKXXXXXXXXXX",
+ "XXXXXXXXXXAKXXXXXXXXXXAKXXXXXXXX",
+ ]
+ pairs = zip(test_desc, test_seq)
+ tmd = []
+ for each_pair in pairs:
+ # print(each_pair)
+ try:
+ tmd += find_tmd(pair=each_pair)
+ except (IndexError, TypeError):
+ continue
+
+ #### tuple-fasta TEST
+ # fasta_file = 'out_isp.fa'
+ # ret = tuple_fasta(fasta_file)
+ # print('=============')
+ # for i in ret:
+ # print(i[1])
+
+ #### LipoBox TEST
+ test_desc = ["one", "two", "three", "four", "five", "six", "seven"]
+ test_seq = [
+ "XXXXXXXXXTGGCXXXXXXXXXXXXXXXX",
+ "XXXXXXXXAAKKKKKKKKKKKKKKKXXXXXXXXXXXXX",
+ "XXXXXXX",
+ "AGGCXXXXXXXXXXXXXXXXXXXXTT",
+ "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXTGGC",
+ "XXXXXXXXXXXXXXXXXXXXXXXXXXTGGC",
+ "MSTLRELRLRRALKEQSMRYLLSIKKTLPRWKGALIGLFLICVATISGCASESKLPEPPMVSVDSSLMVEPNLTTEMLNVFSQ*",
+ ]
+ pairs = zip(test_desc, test_seq)
+ lipo = []
+ for each_pair in pairs:
+        try:
+ lipo += find_lipobox(pair=each_pair, regex=2) # , minimum=8)
+        except TypeError:  # catches anything that doesn't meet the min/max requirements (something is too small)
+ continue
+    #### gff3 prep/write TEST
+ # g = prep_a_gff3(fa='putative_isp.fa', spanin_type='isp')
+ # print(g)
+ # write_gff3(data=g)