# HG changeset patch
# User cpt
# Date 1685932921 0
# Node ID fd70980a516b2db8b444cc15577cbb06234cc421
# Parent 1a7fef71aee3ae232349b37f3a3ce5f76a03523c
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
diff -r 1a7fef71aee3 -r fd70980a516b cpt-macros.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/cpt-macros.xml Mon Jun 05 02:42:01 2023 +0000
@@ -0,0 +1,114 @@
+<?xml version="1.0"?>
+<macros>
+    <xml name="gff_requirements">
+        <requirements>
+            <requirement type="package">python</requirement>
+            <requirement type="package">biopython</requirement>
+            <requirement type="package">requests</requirement>
+            <yield/>
+        </requirements>
+    </xml>
+    <xml name="citation/mijalisrasche">
+        <citations>
+            <citation type="doi">10.1371/journal.pcbi.1008214</citation>
+            <citation type="bibtex">@unpublished{galaxyTools,
+                author = {E. Mijalis, H. Rasche},
+                title = {CPT Galaxy Tools},
+                year = {2013-2017},
+                note = {https://github.com/tamu-cpt/galaxy-tools/}
+            }
+            </citation>
+        </citations>
+    </xml>
+    <xml name="citations">
+        <citations>
+            <citation type="doi">10.1371/journal.pcbi.1008214</citation>
+            <citation type="bibtex">
+            @unpublished{galaxyTools,
+                author = {E. Mijalis, H. Rasche},
+                title = {CPT Galaxy Tools},
+                year = {2013-2017},
+                note = {https://github.com/tamu-cpt/galaxy-tools/}
+            }
+            </citation>
+            <yield/>
+        </citations>
+    </xml>
+    <xml name="citations-crr">
+        <citations>
+            <citation type="doi">10.1371/journal.pcbi.1008214</citation>
+            <citation type="bibtex">
+            @unpublished{galaxyTools,
+                author = {C. Ross},
+                title = {CPT Galaxy Tools},
+                year = {2020-},
+                note = {https://github.com/tamu-cpt/galaxy-tools/}
+            }
+            </citation>
+            <yield/>
+        </citations>
+    </xml>
+    <xml name="citations-2020">
+        <citations>
+            <citation type="doi">10.1371/journal.pcbi.1008214</citation>
+            <citation type="bibtex">
+            @unpublished{galaxyTools,
+                author = {E. Mijalis, H. Rasche},
+                title = {CPT Galaxy Tools},
+                year = {2013-2017},
+                note = {https://github.com/tamu-cpt/galaxy-tools/}
+            }
+            </citation>
+            <citation type="bibtex">
+            @unpublished{galaxyTools,
+                author = {A. Criscione},
+                title = {CPT Galaxy Tools},
+                year = {2019-2021},
+                note = {https://github.com/tamu-cpt/galaxy-tools/}
+            }
+            </citation>
+            <yield/>
+        </citations>
+    </xml>
+    <xml name="citations-2020-AJC-solo">
+        <citations>
+            <citation type="doi">10.1371/journal.pcbi.1008214</citation>
+            <citation type="bibtex">
+            @unpublished{galaxyTools,
+                author = {A. Criscione},
+                title = {CPT Galaxy Tools},
+                year = {2019-2021},
+                note = {https://github.com/tamu-cpt/galaxy-tools/}
+            }
+            </citation>
+            <yield/>
+        </citations>
+    </xml>
+    <xml name="citations-clm">
+        <citations>
+            <citation type="doi">10.1371/journal.pcbi.1008214</citation>
+            <citation type="bibtex">
+            @unpublished{galaxyTools,
+                author = {C. Maughmer},
+                title = {CPT Galaxy Tools},
+                year = {2017-2020},
+                note = {https://github.com/tamu-cpt/galaxy-tools/}
+            }
+            </citation>
+            <yield/>
+        </citations>
+    </xml>
+    <xml name="citations-clm-solo">
+        <citations>
+            <citation type="bibtex">
+            @unpublished{galaxyTools,
+                author = {C. Maughmer},
+                title = {CPT Galaxy Tools},
+                year = {2017-2020},
+                note = {https://github.com/tamu-cpt/galaxy-tools/}
+            }
+            </citation>
+            <yield/>
+        </citations>
+    </xml>
+</macros>
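+<!-- Hypothetical usage sketch (the macro names follow the reconstruction above
+     and may differ upstream): a tool wrapper imports this file and expands a
+     citation macro, e.g.
+         <macros><import>cpt-macros.xml</import></macros>
+         <expand macro="citations-2020"/>
+-->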
diff -r 1a7fef71aee3 -r fd70980a516b cpt.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/cpt.py Mon Jun 05 02:42:01 2023 +0000
@@ -0,0 +1,341 @@
+#!/usr/bin/env python
+from Bio.Seq import Seq, reverse_complement, translate
+from Bio.SeqRecord import SeqRecord
+from Bio import SeqIO
+from Bio.Data import CodonTable
+import regex as re  # third-party "regex" package: finditer(..., overlapped=True) below requires it; stdlib re does not support overlapped matching
+import logging
+
+logging.basicConfig()
+log = logging.getLogger()
+
+PHAGE_IN_MIDDLE = re.compile(r"^(?P<host>.*)\s*phage (?P<phage>.*)$")
+BACTERIOPHAGE_IN_MIDDLE = re.compile(r"^(?P<host>.*)\s*bacteriophage (?P<phage>.*)$")
+STARTS_WITH_PHAGE = re.compile(
+    r"^(bacterio|vibrio|Bacterio|Vibrio|)?[Pp]hage (?P<phage>.*)$"
+)
+NEW_STYLE_NAMES = re.compile(r"(?P<phage>v[A-Z]_[A-Z][a-z]{2}_.*)")
+
+
+def phage_name_parser(name):
+ host = None
+ phage = None
+ name = name.replace(", complete genome.", "")
+ name = name.replace(", complete genome", "")
+
+ m = BACTERIOPHAGE_IN_MIDDLE.match(name)
+ if m:
+ host = m.group("host")
+ phage = m.group("phage")
+ return (host, phage)
+
+ m = PHAGE_IN_MIDDLE.match(name)
+ if m:
+ host = m.group("host")
+ phage = m.group("phage")
+ return (host, phage)
+
+ m = STARTS_WITH_PHAGE.match(name)
+ if m:
+ phage = m.group("phage")
+ return (host, phage)
+
+ m = NEW_STYLE_NAMES.match(name)
+ if m:
+ phage = m.group("phage")
+ return (host, phage)
+
+ return (host, phage)
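+# Illustrative examples (hypothetical inputs, not from this changeset), showing
+# the match order above:
+#   phage_name_parser("Escherichia coli bacteriophage T4, complete genome")
+#     -> ("Escherichia coli ", "T4")  # host may keep a trailing space, since
+#                                     # the greedy (?P<host>.*) group absorbs it
+#   phage_name_parser("Phage P22") -> (None, "P22")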
+
+
+class OrfFinder(object):
+ def __init__(self, table, ftype, ends, min_len, strand):
+ self.table = table
+ self.table_obj = CodonTable.ambiguous_generic_by_id[table]
+ self.ends = ends
+ self.ftype = ftype
+ self.min_len = min_len
+ self.starts = sorted(self.table_obj.start_codons)
+ self.stops = sorted(self.table_obj.stop_codons)
+ self.re_starts = re.compile("|".join(self.starts))
+ self.re_stops = re.compile("|".join(self.stops))
+ self.strand = strand
+
+ def locate(self, fasta_file, out_nuc, out_prot, out_bed, out_gff3):
+ seq_format = "fasta"
+ log.debug("Genetic code table %i" % self.table)
+ log.debug("Minimum length %i aa" % self.min_len)
+
+ out_count = 0
+
+ out_gff3.write("##gff-version 3\n")
+
+ for idx, record in enumerate(SeqIO.parse(fasta_file, seq_format)):
+ for i, (f_start, f_end, f_strand, n, t) in enumerate(
+ self.get_all_peptides(str(record.seq).upper())
+ ):
+ out_count += 1
+
+ descr = "length %i aa, %i bp, from %s..%s[%s] of %s" % (
+ len(t),
+ len(n),
+ f_start,
+ f_end,
+ f_strand,
+ record.description,
+ )
+ fid = record.id + "|%s%i" % (self.ftype, i + 1)
+
+ r = SeqRecord(Seq(n), id=fid, name="", description=descr)
+ t = SeqRecord(Seq(t), id=fid, name="", description=descr)
+
+ SeqIO.write(r, out_nuc, "fasta")
+ SeqIO.write(t, out_prot, "fasta")
+
+ nice_strand = "+" if f_strand == +1 else "-"
+
+ out_bed.write(
+ "\t".join(
+ map(str, [record.id, f_start, f_end, fid, 0, nice_strand])
+ )
+ + "\n"
+ )
+
+ out_gff3.write(
+ "\t".join(
+ map(
+ str,
+ [
+ record.id,
+ "getOrfsOrCds",
+ "CDS",
+ f_start + 1,
+ f_end,
+ ".",
+ nice_strand,
+ 0,
+ "ID=%s.%s.%s" % (self.ftype, idx, i + 1),
+ ],
+ )
+ )
+ + "\n"
+ )
+ log.info("Found %i %ss", out_count, self.ftype)
+
+ def start_chop_and_trans(self, s, strict=True):
+        """Yields offset, trimmed nuc, protein."""
+ if strict:
+ assert s[-3:] in self.stops, s
+ assert len(s) % 3 == 0
+ for match in self.re_starts.finditer(s, overlapped=True):
+ # Must check the start is in frame
+ start = match.start()
+ if start % 3 == 0:
+ n = s[start:]
+ assert len(n) % 3 == 0, "%s is len %i" % (n, len(n))
+ if strict:
+ t = translate(n, self.table)
+ else:
+ # Use when missing stop codon,
+ t = "M" + translate(n[3:], self.table, to_stop=True)
+ yield start, n, t # Edited by CPT to be a generator
+
+ def break_up_frame(self, s):
+        """Yields offset, nuc, protein."""
+ start = 0
+ for match in self.re_stops.finditer(s, overlapped=True):
+ index = match.start() + 3
+ if index % 3 != 0:
+ continue
+ n = s[start:index]
+ for (offset, n, t) in self.start_chop_and_trans(n):
+ if n and len(t) >= self.min_len:
+ yield start + offset, n, t
+ start = index
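+    # Worked example (sketch, assuming table=11): for s = "ATGAAATAG", re_stops
+    # matches "TAG" ending at index 9 (in frame), so n = "ATGAAATAG" and
+    # start_chop_and_trans() yields offset 0 with protein "MK*" (strict mode
+    # keeps the trailing stop as "*").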
+
+ def putative_genes_in_sequence(self, nuc_seq):
+ """Returns start, end, strand, nucleotides, protein.
+ Co-ordinates are Python style zero-based.
+ """
+ nuc_seq = nuc_seq.upper()
+ # TODO - Refactor to use a generator function (in start order)
+ # rather than making a list and sorting?
+ answer = []
+ full_len = len(nuc_seq)
+
+ for frame in range(0, 3):
+ for offset, n, t in self.break_up_frame(nuc_seq[frame:]):
+ start = frame + offset # zero based
+ answer.append((start, start + len(n), +1, n, t))
+
+ rc = reverse_complement(nuc_seq)
+ for frame in range(0, 3):
+ for offset, n, t in self.break_up_frame(rc[frame:]):
+ start = full_len - frame - offset # zero based
+ answer.append((start, start - len(n), -1, n, t))
+ answer.sort()
+ return answer
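+    # Note: unlike get_all_peptides() below, this variant materializes every hit
+    # in a list and sorts it, rather than yielding results lazily.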
+
+ def get_all_peptides(self, nuc_seq):
+        """Yields start, end, strand, nucleotides, protein.
+
+ Co-ordinates are Python style zero-based.
+ """
+ # Refactored into generator by CPT
+ full_len = len(nuc_seq)
+ if self.strand != "reverse":
+ for frame in range(0, 3):
+ for offset, n, t in self.break_up_frame(nuc_seq[frame:]):
+ start = frame + offset # zero based
+ yield (start, start + len(n), +1, n, t)
+ if self.strand != "forward":
+ rc = reverse_complement(nuc_seq)
+ for frame in range(0, 3):
+ for offset, n, t in self.break_up_frame(rc[frame:]):
+ start = full_len - frame - offset # zero based
+ yield (start - len(n), start, -1, n, t)
+
+
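+# Hedged usage sketch (file names and argument values are illustrative, not part
+# of this changeset):
+#
+#     finder = OrfFinder(table=11, ftype="ORF", ends="open", min_len=30,
+#                        strand="both")
+#     with open("genome.fa") as fa, open("orfs.fna", "w") as nuc, \
+#          open("orfs.faa", "w") as prot, open("orfs.bed", "w") as bed, \
+#          open("orfs.gff3", "w") as gff3:
+#         finder.locate(fa, nuc, prot, bed, gff3)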
+class MGAFinder(object):
+ def __init__(self, table, ftype, ends, min_len):
+ self.table = table
+ self.table_obj = CodonTable.ambiguous_generic_by_id[table]
+ self.ends = ends
+ self.ftype = ftype
+ self.min_len = min_len
+ self.starts = sorted(self.table_obj.start_codons)
+ self.stops = sorted(self.table_obj.stop_codons)
+ self.re_starts = re.compile("|".join(self.starts))
+ self.re_stops = re.compile("|".join(self.stops))
+
+ def locate(self, fasta_file, out_nuc, out_prot, out_bed, out_gff3):
+ seq_format = "fasta"
+ log.debug("Genetic code table %i" % self.table)
+ log.debug("Minimum length %i aa" % self.min_len)
+
+ out_count = 0
+
+ out_gff3.write("##gff-version 3\n")
+
+ for idx, record in enumerate(SeqIO.parse(fasta_file, seq_format)):
+ for i, (f_start, f_end, f_strand, n, t) in enumerate(
+ self.get_all_peptides(str(record.seq).upper())
+ ):
+ out_count += 1
+
+ descr = "length %i aa, %i bp, from %s..%s[%s] of %s" % (
+ len(t),
+ len(n),
+ f_start,
+ f_end,
+ f_strand,
+ record.description,
+ )
+ fid = record.id + "|%s%i" % (self.ftype, i + 1)
+
+ r = SeqRecord(Seq(n), id=fid, name="", description=descr)
+ t = SeqRecord(Seq(t), id=fid, name="", description=descr)
+
+ SeqIO.write(r, out_nuc, "fasta")
+ SeqIO.write(t, out_prot, "fasta")
+
+ nice_strand = "+" if f_strand == +1 else "-"
+
+ out_bed.write(
+ "\t".join(
+ map(str, [record.id, f_start, f_end, fid, 0, nice_strand])
+ )
+ + "\n"
+ )
+
+ out_gff3.write(
+ "\t".join(
+ map(
+ str,
+ [
+ record.id,
+ "getOrfsOrCds",
+ "CDS",
+ f_start + 1,
+ f_end,
+ ".",
+ nice_strand,
+ 0,
+ "ID=%s.%s.%s" % (self.ftype, idx, i + 1),
+ ],
+ )
+ )
+ + "\n"
+ )
+ log.info("Found %i %ss", out_count, self.ftype)
+
+ def start_chop_and_trans(self, s, strict=True):
+        """Yields offset, trimmed nuc, protein."""
+ if strict:
+ assert s[-3:] in self.stops, s
+ assert len(s) % 3 == 0
+ for match in self.re_starts.finditer(s, overlapped=True):
+ # Must check the start is in frame
+ start = match.start()
+ if start % 3 == 0:
+ n = s[start:]
+ assert len(n) % 3 == 0, "%s is len %i" % (n, len(n))
+ if strict:
+ t = translate(n, self.table)
+ else:
+ # Use when missing stop codon,
+ t = "M" + translate(n[3:], self.table, to_stop=True)
+ yield start, n, t
+
+ def break_up_frame(self, s):
+        """Yields offset, nuc, protein."""
+ start = 0
+ for match in self.re_stops.finditer(s, overlapped=True):
+ index = match.start() + 3
+ if index % 3 != 0:
+ continue
+ n = s[start:index]
+ for (offset, n, t) in self.start_chop_and_trans(n):
+ if n and len(t) >= self.min_len:
+ yield start + offset, n, t
+ start = index
+
+ def putative_genes_in_sequence(self, nuc_seq):
+ """Returns start, end, strand, nucleotides, protein.
+ Co-ordinates are Python style zero-based.
+ """
+ nuc_seq = nuc_seq.upper()
+ # TODO - Refactor to use a generator function (in start order)
+ # rather than making a list and sorting?
+ answer = []
+ full_len = len(nuc_seq)
+
+ for frame in range(0, 3):
+ for offset, n, t in self.break_up_frame(nuc_seq[frame:]):
+ start = frame + offset # zero based
+ answer.append((start, start + len(n), +1, n, t))
+
+ rc = reverse_complement(nuc_seq)
+ for frame in range(0, 3):
+ for offset, n, t in self.break_up_frame(rc[frame:]):
+ start = full_len - frame - offset # zero based
+ answer.append((start, start - len(n), -1, n, t))
+ answer.sort()
+ return answer
+
+ def get_all_peptides(self, nuc_seq):
+        """Yields start, end, strand, nucleotides, protein.
+
+ Co-ordinates are Python style zero-based.
+ """
+ # Refactored into generator by CPT
+
+ full_len = len(nuc_seq)
+ for frame in range(0, 3):
+ for offset, n, t in self.break_up_frame(nuc_seq[frame:]):
+ start = frame + offset # zero based
+ yield (start, start + len(n), +1, n, t)
+ rc = reverse_complement(nuc_seq)
+ for frame in range(0, 3):
+ for offset, n, t in self.break_up_frame(rc[frame:]):
+ start = full_len - frame - offset # zero based
+ yield (start - len(n), start, -1, n, t)
diff -r 1a7fef71aee3 -r fd70980a516b cpt_find_spanins/cpt-macros.xml
--- a/cpt_find_spanins/cpt-macros.xml Fri Jun 17 12:40:59 2022 +0000
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,115 +0,0 @@
-<?xml version="1.0"?>
-<macros>
-    <xml name="gff_requirements">
-        <requirements>
-            <requirement type="package">python</requirement>
-            <requirement type="package">biopython</requirement>
-            <requirement type="package">requests</requirement>
-            <yield/>
-        </requirements>
-    </xml>
-    <xml name="citation/mijalisrasche">
-        <citations>
-            <citation type="doi">10.1371/journal.pcbi.1008214</citation>
-            <citation type="bibtex">@unpublished{galaxyTools,
-                author = {E. Mijalis, H. Rasche},
-                title = {CPT Galaxy Tools},
-                year = {2013-2017},
-                note = {https://github.com/tamu-cpt/galaxy-tools/}
-            }
-            </citation>
-        </citations>
-    </xml>
-    <xml name="citations">
-        <citations>
-            <citation type="doi">10.1371/journal.pcbi.1008214</citation>
-            <citation type="bibtex">
-            @unpublished{galaxyTools,
-                author = {E. Mijalis, H. Rasche},
-                title = {CPT Galaxy Tools},
-                year = {2013-2017},
-                note = {https://github.com/tamu-cpt/galaxy-tools/}
-            }
-            </citation>
-            <yield/>
-        </citations>
-    </xml>
-    <xml name="citations-crr">
-        <citations>
-            <citation type="doi">10.1371/journal.pcbi.1008214</citation>
-            <citation type="bibtex">
-            @unpublished{galaxyTools,
-                author = {C. Ross},
-                title = {CPT Galaxy Tools},
-                year = {2020-},
-                note = {https://github.com/tamu-cpt/galaxy-tools/}
-            }
-            </citation>
-            <yield/>
-        </citations>
-    </xml>
-    <xml name="citations-2020">
-        <citations>
-            <citation type="doi">10.1371/journal.pcbi.1008214</citation>
-            <citation type="bibtex">
-            @unpublished{galaxyTools,
-                author = {E. Mijalis, H. Rasche},
-                title = {CPT Galaxy Tools},
-                year = {2013-2017},
-                note = {https://github.com/tamu-cpt/galaxy-tools/}
-            }
-            </citation>
-            <citation type="bibtex">
-            @unpublished{galaxyTools,
-                author = {A. Criscione},
-                title = {CPT Galaxy Tools},
-                year = {2019-2021},
-                note = {https://github.com/tamu-cpt/galaxy-tools/}
-            }
-            </citation>
-            <yield/>
-        </citations>
-    </xml>
-    <xml name="citations-2020-AJC-solo">
-        <citations>
-            <citation type="doi">10.1371/journal.pcbi.1008214</citation>
-            <citation type="bibtex">
-            @unpublished{galaxyTools,
-                author = {A. Criscione},
-                title = {CPT Galaxy Tools},
-                year = {2019-2021},
-                note = {https://github.com/tamu-cpt/galaxy-tools/}
-            }
-            </citation>
-            <yield/>
-        </citations>
-    </xml>
-    <xml name="citations-clm">
-        <citations>
-            <citation type="doi">10.1371/journal.pcbi.1008214</citation>
-            <citation type="bibtex">
-            @unpublished{galaxyTools,
-                author = {C. Maughmer},
-                title = {CPT Galaxy Tools},
-                year = {2017-2020},
-                note = {https://github.com/tamu-cpt/galaxy-tools/}
-            }
-            </citation>
-            <yield/>
-        </citations>
-    </xml>
-    <xml name="citations-clm-solo">
-        <citations>
-            <citation type="bibtex">
-            @unpublished{galaxyTools,
-                author = {C. Maughmer},
-                title = {CPT Galaxy Tools},
-                year = {2017-2020},
-                note = {https://github.com/tamu-cpt/galaxy-tools/}
-            }
-            </citation>
-            <yield/>
-        </citations>
-    </xml>
-</macros>
diff -r 1a7fef71aee3 -r fd70980a516b cpt_find_spanins/cpt.py
--- a/cpt_find_spanins/cpt.py Fri Jun 17 12:40:59 2022 +0000
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,341 +0,0 @@
-#!/usr/bin/env python
-from Bio.Seq import Seq, reverse_complement, translate
-from Bio.SeqRecord import SeqRecord
-from Bio import SeqIO
-from Bio.Data import CodonTable
-import regex as re  # third-party "regex" package: finditer(..., overlapped=True) below requires it; stdlib re does not support overlapped matching
-import logging
-
-logging.basicConfig()
-log = logging.getLogger()
-
-PHAGE_IN_MIDDLE = re.compile(r"^(?P<host>.*)\s*phage (?P<phage>.*)$")
-BACTERIOPHAGE_IN_MIDDLE = re.compile(r"^(?P<host>.*)\s*bacteriophage (?P<phage>.*)$")
-STARTS_WITH_PHAGE = re.compile(
-    r"^(bacterio|vibrio|Bacterio|Vibrio|)?[Pp]hage (?P<phage>.*)$"
-)
-NEW_STYLE_NAMES = re.compile(r"(?P<phage>v[A-Z]_[A-Z][a-z]{2}_.*)")
-
-
-def phage_name_parser(name):
- host = None
- phage = None
- name = name.replace(", complete genome.", "")
- name = name.replace(", complete genome", "")
-
- m = BACTERIOPHAGE_IN_MIDDLE.match(name)
- if m:
- host = m.group("host")
- phage = m.group("phage")
- return (host, phage)
-
- m = PHAGE_IN_MIDDLE.match(name)
- if m:
- host = m.group("host")
- phage = m.group("phage")
- return (host, phage)
-
- m = STARTS_WITH_PHAGE.match(name)
- if m:
- phage = m.group("phage")
- return (host, phage)
-
- m = NEW_STYLE_NAMES.match(name)
- if m:
- phage = m.group("phage")
- return (host, phage)
-
- return (host, phage)
-
-
-class OrfFinder(object):
- def __init__(self, table, ftype, ends, min_len, strand):
- self.table = table
- self.table_obj = CodonTable.ambiguous_generic_by_id[table]
- self.ends = ends
- self.ftype = ftype
- self.min_len = min_len
- self.starts = sorted(self.table_obj.start_codons)
- self.stops = sorted(self.table_obj.stop_codons)
- self.re_starts = re.compile("|".join(self.starts))
- self.re_stops = re.compile("|".join(self.stops))
- self.strand = strand
-
- def locate(self, fasta_file, out_nuc, out_prot, out_bed, out_gff3):
- seq_format = "fasta"
- log.debug("Genetic code table %i" % self.table)
- log.debug("Minimum length %i aa" % self.min_len)
-
- out_count = 0
-
- out_gff3.write("##gff-version 3\n")
-
- for idx, record in enumerate(SeqIO.parse(fasta_file, seq_format)):
- for i, (f_start, f_end, f_strand, n, t) in enumerate(
- self.get_all_peptides(str(record.seq).upper())
- ):
- out_count += 1
-
- descr = "length %i aa, %i bp, from %s..%s[%s] of %s" % (
- len(t),
- len(n),
- f_start,
- f_end,
- f_strand,
- record.description,
- )
- fid = record.id + "|%s%i" % (self.ftype, i + 1)
-
- r = SeqRecord(Seq(n), id=fid, name="", description=descr)
- t = SeqRecord(Seq(t), id=fid, name="", description=descr)
-
- SeqIO.write(r, out_nuc, "fasta")
- SeqIO.write(t, out_prot, "fasta")
-
- nice_strand = "+" if f_strand == +1 else "-"
-
- out_bed.write(
- "\t".join(
- map(str, [record.id, f_start, f_end, fid, 0, nice_strand])
- )
- + "\n"
- )
-
- out_gff3.write(
- "\t".join(
- map(
- str,
- [
- record.id,
- "getOrfsOrCds",
- "CDS",
- f_start + 1,
- f_end,
- ".",
- nice_strand,
- 0,
- "ID=%s.%s.%s" % (self.ftype, idx, i + 1),
- ],
- )
- )
- + "\n"
- )
- log.info("Found %i %ss", out_count, self.ftype)
-
- def start_chop_and_trans(self, s, strict=True):
- """Returns offset, trimmed nuc, protein."""
- if strict:
- assert s[-3:] in self.stops, s
- assert len(s) % 3 == 0
- for match in self.re_starts.finditer(s, overlapped=True):
- # Must check the start is in frame
- start = match.start()
- if start % 3 == 0:
- n = s[start:]
- assert len(n) % 3 == 0, "%s is len %i" % (n, len(n))
- if strict:
- t = translate(n, self.table)
- else:
- # Use when missing stop codon,
- t = "M" + translate(n[3:], self.table, to_stop=True)
- yield start, n, t # Edited by CPT to be a generator
-
- def break_up_frame(self, s):
- """Returns offset, nuc, protein."""
- start = 0
- for match in self.re_stops.finditer(s, overlapped=True):
- index = match.start() + 3
- if index % 3 != 0:
- continue
- n = s[start:index]
- for (offset, n, t) in self.start_chop_and_trans(n):
- if n and len(t) >= self.min_len:
- yield start + offset, n, t
- start = index
-
- def putative_genes_in_sequence(self, nuc_seq):
- """Returns start, end, strand, nucleotides, protein.
- Co-ordinates are Python style zero-based.
- """
- nuc_seq = nuc_seq.upper()
- # TODO - Refactor to use a generator function (in start order)
- # rather than making a list and sorting?
- answer = []
- full_len = len(nuc_seq)
-
- for frame in range(0, 3):
- for offset, n, t in self.break_up_frame(nuc_seq[frame:]):
- start = frame + offset # zero based
- answer.append((start, start + len(n), +1, n, t))
-
- rc = reverse_complement(nuc_seq)
- for frame in range(0, 3):
- for offset, n, t in self.break_up_frame(rc[frame:]):
- start = full_len - frame - offset # zero based
- answer.append((start, start - len(n), -1, n, t))
- answer.sort()
- return answer
-
- def get_all_peptides(self, nuc_seq):
- """Returns start, end, strand, nucleotides, protein.
-
- Co-ordinates are Python style zero-based.
- """
- # Refactored into generator by CPT
- full_len = len(nuc_seq)
- if self.strand != "reverse":
- for frame in range(0, 3):
- for offset, n, t in self.break_up_frame(nuc_seq[frame:]):
- start = frame + offset # zero based
- yield (start, start + len(n), +1, n, t)
- if self.strand != "forward":
- rc = reverse_complement(nuc_seq)
- for frame in range(0, 3):
- for offset, n, t in self.break_up_frame(rc[frame:]):
- start = full_len - frame - offset # zero based
- yield (start - len(n), start, -1, n, t)
-
-
-class MGAFinder(object):
- def __init__(self, table, ftype, ends, min_len):
- self.table = table
- self.table_obj = CodonTable.ambiguous_generic_by_id[table]
- self.ends = ends
- self.ftype = ftype
- self.min_len = min_len
- self.starts = sorted(self.table_obj.start_codons)
- self.stops = sorted(self.table_obj.stop_codons)
- self.re_starts = re.compile("|".join(self.starts))
- self.re_stops = re.compile("|".join(self.stops))
-
- def locate(self, fasta_file, out_nuc, out_prot, out_bed, out_gff3):
- seq_format = "fasta"
- log.debug("Genetic code table %i" % self.table)
- log.debug("Minimum length %i aa" % self.min_len)
-
- out_count = 0
-
- out_gff3.write("##gff-version 3\n")
-
- for idx, record in enumerate(SeqIO.parse(fasta_file, seq_format)):
- for i, (f_start, f_end, f_strand, n, t) in enumerate(
- self.get_all_peptides(str(record.seq).upper())
- ):
- out_count += 1
-
- descr = "length %i aa, %i bp, from %s..%s[%s] of %s" % (
- len(t),
- len(n),
- f_start,
- f_end,
- f_strand,
- record.description,
- )
- fid = record.id + "|%s%i" % (self.ftype, i + 1)
-
- r = SeqRecord(Seq(n), id=fid, name="", description=descr)
- t = SeqRecord(Seq(t), id=fid, name="", description=descr)
-
- SeqIO.write(r, out_nuc, "fasta")
- SeqIO.write(t, out_prot, "fasta")
-
- nice_strand = "+" if f_strand == +1 else "-"
-
- out_bed.write(
- "\t".join(
- map(str, [record.id, f_start, f_end, fid, 0, nice_strand])
- )
- + "\n"
- )
-
- out_gff3.write(
- "\t".join(
- map(
- str,
- [
- record.id,
- "getOrfsOrCds",
- "CDS",
- f_start + 1,
- f_end,
- ".",
- nice_strand,
- 0,
- "ID=%s.%s.%s" % (self.ftype, idx, i + 1),
- ],
- )
- )
- + "\n"
- )
- log.info("Found %i %ss", out_count, self.ftype)
-
- def start_chop_and_trans(self, s, strict=True):
- """Returns offset, trimmed nuc, protein."""
- if strict:
- assert s[-3:] in self.stops, s
- assert len(s) % 3 == 0
- for match in self.re_starts.finditer(s, overlapped=True):
- # Must check the start is in frame
- start = match.start()
- if start % 3 == 0:
- n = s[start:]
- assert len(n) % 3 == 0, "%s is len %i" % (n, len(n))
- if strict:
- t = translate(n, self.table)
- else:
- # Use when missing stop codon,
- t = "M" + translate(n[3:], self.table, to_stop=True)
- yield start, n, t
-
- def break_up_frame(self, s):
- """Returns offset, nuc, protein."""
- start = 0
- for match in self.re_stops.finditer(s, overlapped=True):
- index = match.start() + 3
- if index % 3 != 0:
- continue
- n = s[start:index]
- for (offset, n, t) in self.start_chop_and_trans(n):
- if n and len(t) >= self.min_len:
- yield start + offset, n, t
- start = index
-
- def putative_genes_in_sequence(self, nuc_seq):
- """Returns start, end, strand, nucleotides, protein.
- Co-ordinates are Python style zero-based.
- """
- nuc_seq = nuc_seq.upper()
- # TODO - Refactor to use a generator function (in start order)
- # rather than making a list and sorting?
- answer = []
- full_len = len(nuc_seq)
-
- for frame in range(0, 3):
- for offset, n, t in self.break_up_frame(nuc_seq[frame:]):
- start = frame + offset # zero based
- answer.append((start, start + len(n), +1, n, t))
-
- rc = reverse_complement(nuc_seq)
- for frame in range(0, 3):
- for offset, n, t in self.break_up_frame(rc[frame:]):
- start = full_len - frame - offset # zero based
- answer.append((start, start - len(n), -1, n, t))
- answer.sort()
- return answer
-
- def get_all_peptides(self, nuc_seq):
- """Returns start, end, strand, nucleotides, protein.
-
- Co-ordinates are Python style zero-based.
- """
- # Refactored into generator by CPT
-
- full_len = len(nuc_seq)
- for frame in range(0, 3):
- for offset, n, t in self.break_up_frame(nuc_seq[frame:]):
- start = frame + offset # zero based
- yield (start, start + len(n), +1, n, t)
- rc = reverse_complement(nuc_seq)
- for frame in range(0, 3):
- for offset, n, t in self.break_up_frame(rc[frame:]):
- start = full_len - frame - offset # zero based
- yield (start - len(n), start, -1, n, t)
diff -r 1a7fef71aee3 -r fd70980a516b cpt_find_spanins/findSpanin.py
--- a/cpt_find_spanins/findSpanin.py Fri Jun 17 12:40:59 2022 +0000
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,477 +0,0 @@
-##### findSpanin.pl --> findSpanin.py
-######### Much of this code is very "blocked", in the sense that one thing happens...then a function happens on the return...then another function...etc...etc...
-
-import argparse
-import os
-import re # new
-import itertools # new
-from collections import Counter, OrderedDict
-from spaninFuncs import getDescriptions, grabLocs, spaninProximity, splitStrands, tuple_fasta, lineWrapper
-
-### Requirement Inputs
-#### INPUT : putative_isp.fa & putative_osp.fa (in that order)
-#### PARAMETERS :
-
-###############################################################################
-def write_output(candidates):
- """ output file function...maybe not needed """
- pass
-
-def reconfigure_dict(spanins):
- """
-    Reorganizes the dictionary to be friendlier for downstream checks.
- """
-
- new_spanin_dict = {}
-
- for each_spanin_type, data_dict in spanins.items():
- #print(f"{each_spanin_type} == {data_dict}")
- new_spanin_dict[each_spanin_type] = {}
- new_spanin_dict[each_spanin_type]['positive'] = {}
- new_spanin_dict[each_spanin_type]['negative'] = {}
- new_spanin_dict[each_spanin_type]['positive']['coords'] = []
- new_spanin_dict[each_spanin_type]['negative']['coords'] = []
- for outter_orf, inner_data in data_dict.items():
- list_of_hits = []
- for data_content in inner_data:
- #print(data_content)
- data_content.insert(0, outter_orf)
- #print(f"new data_content -> {data_content}")
- #print(data_content)
- #list_of_hits += [data_content]
- #new_spanin_dict[each_spanin_type] += [data_content]
- if data_content[6] == "+":
- #print(f"{each_spanin_type} @ POSITIVE")
- new_spanin_dict[each_spanin_type]['positive']['coords'] += [data_content]
- elif data_content[6] == "-":
- #print(f"{each_spanin_type} @ NEGATIVE")
- new_spanin_dict[each_spanin_type]['negative']['coords'] += [data_content]
- #print(new_spanin_dict[each_spanin_type])
- #print(reorganized)
- #print(f"{outter_orf} => {inner_data}")
- #print(new_spanin_dict)
-
- #print('\n')
- #for k, v in new_spanin_dict.items():
- #print(k)
- #print(v)
- return new_spanin_dict
-
-
-
-def check_for_uniques(spanins):
- """
- Checks for unique spanins based on spanin_type.
-    If the positive strand end site is _the same_ for an i-spanin, we would group that as "1".
- i.e. if ORF1, ORF2, and ORF3 all ended with location 4231, they would not be unique.
- """
- pair_dict = {}
- pair_dict = {
- 'pairs' : {
- 'location_amount' : [],
- 'pair_number' : {},
- }
- }
- for each_spanin_type, spanin_data in spanins.items():
- #print(f"{each_spanin_type} ===> {spanin_data}")
- # early declarations for cases of no results
- pos_check = [] # end checks
- pos_uniques = []
- neg_check = [] # start checks
- neg_uniques = []
- unique_ends = []
- pos_amt_unique = 0
- neg_amt_unique = 0
- amt_positive = 0
- amt_negative = 0
- spanin_data['uniques'] = 0
- spanin_data['amount'] = 0
- #spanin_data['positive']['amt_positive'] = 0
- #spanin_data['positive']['pos_amt_unique'] = 0
- #spanin_data['positive']['isp_match'] = []
- #spanin_data['negative']['amt_negative'] = 0
- #spanin_data['negative']['neg_amt_unique'] = 0
- #spanin_data['negative']['isp_match'] = []
- #print(spanin_data)
- if spanin_data['positive']['coords']:
- # do something...
- #print('in other function')
- #print(spanin_data['positive']['coords'])
- for each_hit in spanin_data['positive']['coords']:
- pos_check.append(each_hit[2])
- pair_dict['pairs']['location_amount'].append(each_hit[2])
- pos_uniques = list(set([end_site for end_site in pos_check if pos_check.count(end_site) >= 1]))
- #print(pos_check)
- #print(pos_uniques)
- amt_positive = len(spanin_data['positive']['coords'])
- pos_amt_unique = len(pos_uniques)
- if amt_positive:
- spanin_data['positive']['amt_positive'] = amt_positive
- spanin_data['positive']['pos_amt_unique'] = pos_amt_unique
- #pair_dict['pairs']['locations'].extend(pos_uniques)
- else:
- spanin_data['positive']['amt_positive'] = 0
- spanin_data['positive']['pos_amt_unique'] = 0
- if spanin_data['negative']['coords']:
-
- # do something else...
- #print('in other function')
- #print(spanin_data['negative']['coords'])
- for each_hit in spanin_data['negative']['coords']:
- neg_check.append(each_hit[1])
- pair_dict['pairs']['location_amount'].append(each_hit[1])
- neg_uniques = list(set([start_site for start_site in neg_check if neg_check.count(start_site) >= 1]))
- #print(neg_uniques)
- amt_negative = len(spanin_data['negative']['coords'])
- neg_amt_unique = len(neg_uniques)
- if amt_negative:
- spanin_data['negative']['amt_negative'] = amt_negative
- spanin_data['negative']['neg_amt_unique'] = neg_amt_unique
- #pair_dict['pairs']['locations'].extend(neg_uniques)
- else:
- spanin_data['negative']['amt_negative'] = 0
- spanin_data['negative']['neg_amt_unique'] = 0
- spanin_data['uniques'] += (spanin_data['positive']['pos_amt_unique'] + spanin_data['negative']['neg_amt_unique'])
- spanin_data['amount'] += (spanin_data['positive']['amt_positive'] + spanin_data['negative']['amt_negative'])
- #print(spanin_data['uniques'])
- list(set(pair_dict['pairs']['location_amount']))
- pair_dict['pairs']['location_amount'] = dict(Counter(pair_dict['pairs']['location_amount']))
- for data in pair_dict.values():
- #print(data['locations'])
- #print(type(data['locations']))
- v = 0
- for loc, count in data['location_amount'].items():
- #data['pair_number'] = {loc
- v += 1
- data['pair_number'][loc] = v
- #print(dict(Counter(pair_dict['pairs']['locations'])))
- #print(pair_dict)
- spanins['total_amount'] = spanins['EMBEDDED']['amount'] + spanins['SEPARATED']['amount'] + spanins['OVERLAPPED']['amount']
- spanins['total_unique'] = spanins['EMBEDDED']['uniques'] + spanins['SEPARATED']['uniques'] + spanins['OVERLAPPED']['uniques']
- #spanins['total_unique'] = len(pair_dict['pairs']['pair_number'])
- return spanins, pair_dict
-
-if __name__ == "__main__":
-
- # Common parameters for both ISP / OSP portion of script
-
- parser = argparse.ArgumentParser(
- description="Trim the putative protein candidates and find potential i-spanin / o-spanin pairs"
- )
-
- parser.add_argument(
- "putative_isp_fasta_file",
- type=argparse.FileType("r"),
- help='Putative i-spanin FASTA file, output of "generate-putative-isp"',
- ) # the "input" argument
-
- parser.add_argument(
- "putative_osp_fasta_file",
- type=argparse.FileType("r"),
- help='Putative o-spanin FASTA file, output of "generate-putative-osp"',
- )
-
- parser.add_argument(
- "--max_isp_osp_distance",
- dest="max_isp_osp_distance",
- default=10,
- type=int,
- help="max distance from end of i-spanin to start of o-spanin, measured in AAs",
- )
-
- parser.add_argument(
- "--embedded_txt",
- dest="embedded_txt",
- type=argparse.FileType("w"),
- default="_findSpanin_embedded_results.txt",
- help="Results of potential embedded spanins",
- )
- parser.add_argument(
- "--overlap_txt",
- dest="overlap_txt",
- type=argparse.FileType("w"),
- default="_findSpanin_overlap_results.txt",
- help="Results of potential overlapping spanins",
- )
- parser.add_argument(
- "--separate_txt",
- dest="separate_txt",
- type=argparse.FileType("w"),
- default="_findSpanin_separated_results.txt",
- help="Results of potential separated spanins",
- )
-
- parser.add_argument(
- "--summary_txt",
- dest="summary_txt",
- type=argparse.FileType("w"),
- default="_findSpanin_summary.txt",
- help="Results of potential spanin pairs",
- )
- parser.add_argument(
- "-v", action="version", version="0.3.0"
- ) # Is this manually updated?
- args = parser.parse_args()
-
-
- #### RE-WRITE
- SPANIN_TYPES = {}
- SPANIN_TYPES['EMBEDDED'] = {}
- SPANIN_TYPES['OVERLAPPED'] = {}
- SPANIN_TYPES['SEPARATED'] = {}
- #SPANIN_TYPES = {
- # 'EMBEDDED' : {},
- # 'OVERLAPPED' : {},
- # 'SEPARATED' : {},
- #}
-
- isp = getDescriptions(args.putative_isp_fasta_file)
- args.putative_isp_fasta_file = open(args.putative_isp_fasta_file.name, "r")
- isp_full = tuple_fasta(args.putative_isp_fasta_file)
-
- osp = getDescriptions(args.putative_osp_fasta_file)
- args.putative_osp_fasta_file = open(args.putative_osp_fasta_file.name, "r")
- osp_full = tuple_fasta(args.putative_osp_fasta_file)
-
- #### location data
- location_data = {
- 'isp' : [],
- 'osp' : []
- }
- spanins = [isp, osp]
- for idx, each_spanin_type in enumerate(spanins):
- for description in each_spanin_type:
- locations = grabLocs(description)
- if idx == 0: # i-spanin
- location_data['isp'].append(locations)
- elif idx == 1: # o-spanin
- location_data['osp'].append(locations)
-
- #### Check for types of spanins
- embedded, overlap, separate = spaninProximity(
- isp=location_data['isp'],
- osp=location_data['osp'],
- max_dist=args.max_isp_osp_distance * 3
- )
-
- SPANIN_TYPES['EMBEDDED'] = embedded
- SPANIN_TYPES['OVERLAPPED'] = overlap
- SPANIN_TYPES['SEPARATED'] = separate
-
- #for spanin_type, spanin in SPANIN_TYPES.items():
- # s = 0
- # for sequence in spanin.values():
- # s += len(sequence)
- # SPANIN_TYPES[spanin_type]['amount'] = s
- # SPANIN_TYPES[spanin_type]['unique'] = len(spanin.keys())
-
- #check_for_unique_spanins(SPANIN_TYPES)
- spanins = reconfigure_dict(SPANIN_TYPES)
- spanins, pair_dict = check_for_uniques(spanins)
- #print(pair_dict)
- with args.summary_txt as f:
- for each_spanin_type, spanin_data in spanins.items():
- try:
- if each_spanin_type not in ["total_amount","total_unique"]:
- #print(each_spanin_type)
- #print(each_spanin_type)
- f.write("=~~~~~= "+str(each_spanin_type) +" Spanin Candidate Statistics =~~~~~=\n")
- f.writelines("Total Candidate Pairs = "+str(spanin_data['amount'])+"\n")
- f.writelines("Total Unique Pairs = "+str(spanin_data['uniques'])+"\n")
-                    if each_spanin_type == "EMBEDDED":
-                        for k, v in SPANIN_TYPES['EMBEDDED'].items():
-                            #print(k)
-                            f.writelines(""+str(k)+" ==> Number of corresponding candidate o-spanin(s): "+str(len(v))+"\n")
-                    if each_spanin_type == "SEPARATED":
-                        for k, v in SPANIN_TYPES['SEPARATED'].items():
-                            f.writelines(""+str(k)+" ==> Number of corresponding candidate o-spanin(s): "+str(len(v))+"\n")
-                    if each_spanin_type == "OVERLAPPED":
-                        for k, v in SPANIN_TYPES['OVERLAPPED'].items():
-                            f.writelines(""+str(k)+" ==> Number of corresponding candidate o-spanin(s): "+str(len(v))+"\n")
- except TypeError:
- continue
- f.write("\n=~~~~~= Tally from ALL spanin types =~~~~~=\n")
- f.writelines("Total Candidates = "+str(spanins['total_amount'])+"\n")
- f.writelines("Total Unique Candidate Pairs = "+str(spanins['total_unique'])+"\n")
-
- args.putative_isp_fasta_file = open(args.putative_isp_fasta_file.name, "r")
- isp_full = tuple_fasta(args.putative_isp_fasta_file)
-
- args.putative_osp_fasta_file = open(args.putative_osp_fasta_file.name, "r")
- osp_full = tuple_fasta(args.putative_osp_fasta_file)
-
- #print(isp_full)
- isp_seqs = []
- osp_seqs = []
- for isp_tupe in isp_full:
- #print(isp_tupe)
- for pisp, posp in embedded.items():
- #print(f"ISP = searching for {pisp} in {isp_tupe[0]}")
- if re.search(("("+str(pisp)+")\D"), isp_tupe[0]):
- #print(isp_tupe[0])
- #print(peri_count)
- peri_count = str.split(isp_tupe[0],"~=")[1]
- isp_seqs.append((pisp,isp_tupe[1],peri_count))
- #print(isp_seqs)
- for osp_tupe in osp_full:
- for pisp, posp in embedded.items():
- for data in posp:
- #print(f"OSP = searching for {data[3]} in {osp_tupe[0]}, coming from this object: {data}")
- if re.search(("("+str(data[3])+")\D"), osp_tupe[0]):
- peri_count = str.split(osp_tupe[0],"~=")[1]
- osp_seqs.append((data[3],osp_tupe[1],peri_count))
-
- with args.embedded_txt as f:
- f.write("================ embedded spanin candidates =================\n")
- f.write("isp\tisp_start\tisp_end\tosp\tosp_start\tosp_end\tstrand\tpair_number\n")
- if embedded != {}:
- #print(embedded)
- for pisp, posp in embedded.items():
- #print(f"{pisp} - {posp}")
- f.write(pisp + "\n")
- for each_posp in posp:
- #print(posp)
- f.write(
- "\t{}\t{}\t{}\t{}\t{}\t{}\t".format(
- each_posp[1],
- each_posp[2],
- each_posp[3],
- each_posp[4],
- each_posp[5],
- each_posp[6],
- )
- )
- if each_posp[6] == "+":
- if each_posp[2] in pair_dict['pairs']['pair_number'].keys():
- f.write(""+str(pair_dict['pairs']['pair_number'][each_posp[2]])+"\n")
- elif each_posp[6] == "-":
- if each_posp[1] in pair_dict['pairs']['pair_number'].keys():
- f.write(""+str(pair_dict['pairs']['pair_number'][each_posp[1]])+"\n")
- else:
- f.write("nothing found")
-
- with open(args.embedded_txt.name, "a") as f:
- f.write("\n================= embedded candidate sequences ================\n")
- f.write("======================= isp ==========================\n\n")
- for isp_data in isp_seqs:
- #print(isp_data)
- f.write(">isp_orf::{}-peri_count~={}\n{}\n".format(isp_data[0],isp_data[2],lineWrapper(isp_data[1])))
- f.write("\n======================= osp ========================\n\n")
- for osp_data in osp_seqs:
- f.write(">osp_orf::{}-peri_count~={}\n{}\n".format(osp_data[0],osp_data[2],lineWrapper(osp_data[1])))
-
- args.putative_isp_fasta_file = open(args.putative_isp_fasta_file.name, "r")
- isp_full = tuple_fasta(args.putative_isp_fasta_file)
-
- args.putative_osp_fasta_file = open(args.putative_osp_fasta_file.name, "r")
- osp_full = tuple_fasta(args.putative_osp_fasta_file)
-
- isp_seqs = []
- osp_seqs = []
- for isp_tupe in isp_full:
- peri_count = str.split(isp_tupe[0],"~=")[1]
- for pisp, posp in overlap.items():
- if re.search(("("+str(pisp)+")\D"), isp_tupe[0]):
- peri_count = str.split(isp_tupe[0],"~=")[1]
- isp_seqs.append((pisp,isp_tupe[1],peri_count))
-
- for osp_tupe in osp_full:
- for pisp, posp in overlap.items():
- for data in posp:
- if re.search(("("+str(data[3])+")\D"), osp_tupe[0]):
- peri_count = str.split(osp_tupe[0],"~=")[1]
- osp_seqs.append((data[3],osp_tupe[1],peri_count))
-
-
-
- with args.overlap_txt as f:
- f.write("================ overlap spanin candidates =================\n")
- f.write("isp\tisp_start\tisp_end\tosp\tosp_start\tosp_end\tstrand\tpair_number\n")
- if overlap != {}:
- for pisp, posp in overlap.items():
- f.write(pisp + "\n")
- for each_posp in posp:
- f.write(
- "\t{}\t{}\t{}\t{}\t{}\t{}\t".format(
- each_posp[1],
- each_posp[2],
- each_posp[3],
- each_posp[4],
- each_posp[5],
- each_posp[6],
- )
- )
- if each_posp[6] == "+":
- if each_posp[2] in pair_dict['pairs']['pair_number'].keys():
- #print('ovl ; +')
- f.write(""+str(pair_dict['pairs']['pair_number'][each_posp[2]])+"\n")
- elif each_posp[6] == "-":
- if each_posp[1] in pair_dict['pairs']['pair_number'].keys():
- f.write(""+str(pair_dict['pairs']['pair_number'][each_posp[1]])+"\n")
- else:
- f.write("nothing found")
-
- with open(args.overlap_txt.name, "a") as f:
- #print(isp_seqs)
- f.write("\n================= overlap candidate sequences ================\n")
- f.write("======================= isp ==========================\n\n")
- for isp_data in isp_seqs:
- f.write(">isp_orf::{}-pericount~={}\n{}\n".format(isp_data[0],isp_data[2],lineWrapper(isp_data[1])))
- f.write("\n======================= osp ========================\n\n")
- for osp_data in osp_seqs:
- f.write(">osp_orf::{}-pericount~={}\n{}\n".format(osp_data[0],osp_data[2],lineWrapper(osp_data[1])))
-
- args.putative_isp_fasta_file = open(args.putative_isp_fasta_file.name, "r")
- isp_full = tuple_fasta(args.putative_isp_fasta_file)
- args.putative_osp_fasta_file = open(args.putative_osp_fasta_file.name, "r")
- osp_full = tuple_fasta(args.putative_osp_fasta_file)
-
- isp_seqs = []
- osp_seqs = []
- for isp_tupe in isp_full:
- for pisp, posp in separate.items():
- if re.search(("("+str(pisp)+")\D"), isp_tupe[0]):
- peri_count = str.split(isp_tupe[0],"~=")[1]
- isp_seqs.append((pisp,isp_tupe[1],peri_count))
- #print(isp_seqs)
- for osp_tupe in osp_full:
- for pisp, posp in separate.items():
- for data in posp:
- if re.search(("("+str(data[3])+")\D"), osp_tupe[0]):
- peri_count = str.split(osp_tupe[0],"~=")[1]
- osp_seqs.append((data[3],osp_tupe[1],peri_count))
-
- with args.separate_txt as f:
- f.write("================ separated spanin candidates =================\n")
- f.write("isp\tisp_start\tisp_end\tosp\tosp_start\tosp_end\tstrand\tpair_number\n")
- if separate != {}:
- for pisp, posp in separate.items():
- f.write(pisp + "\n")
- for each_posp in posp:
- f.write(
- "\t{}\t{}\t{}\t{}\t{}\t{}\t".format(
- each_posp[1],
- each_posp[2],
- each_posp[3],
- each_posp[4],
- each_posp[5],
- each_posp[6],
- )
- )
- if each_posp[6] == "+":
- if each_posp[2] in pair_dict['pairs']['pair_number'].keys():
- f.write(""+str(pair_dict['pairs']['pair_number'][each_posp[2]])+"\n")
- elif each_posp[6] == "-":
- if each_posp[1] in pair_dict['pairs']['pair_number'].keys():
- f.write(""+str(pair_dict['pairs']['pair_number'][each_posp[1]])+"\n")
- else:
- f.write("nothing found")
-
- with open(args.separate_txt.name, "a") as f:
- f.write("\n================= separated candidate sequences ================\n")
- f.write("======================= isp ==========================\n\n")
- for isp_data in isp_seqs:
- f.write(">isp_orf::{}-pericount~={}\n{}\n".format(isp_data[0],isp_data[2],lineWrapper(isp_data[1])))
- f.write("\n======================= osp ========================\n\n")
- for osp_data in osp_seqs:
- f.write(">osp_orf::{}-pericount~={}\n{}\n".format(osp_data[0],osp_data[2],lineWrapper(osp_data[1])))
diff -r 1a7fef71aee3 -r fd70980a516b cpt_find_spanins/findSpanin.xml
--- a/cpt_find_spanins/findSpanin.xml Fri Jun 17 12:40:59 2022 +0000
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,44 +0,0 @@
-<?xml version="1.0"?>
-<tool>
-    <description>With the outputs from the ISP and OSP candidate tools, cull the list down to candidate pairs</description>
-    <macros>
-        <import>macros.xml</import>
-        <import>cpt-macros.xml</import>
-    </macros>
-
-
-
-
-
-
-
-
-
-
-
-
-    <help><![CDATA[
-Putative i-spanin and o-spanin protein multiFASTAs (generated from the ISP/OSP Candidate Tools).
-
-**METHODOLOGY**
-Does a pairwise comparison between candidate i-spanins and o-spanins based on their genomic location, and classifies them into the known bimolecular spanin genetic architectures. Classes are: embedded, overlapping, separated, or NOT a potential pair.
-
-**OUTPUT** --> File with candidate pairs for each bimolecular spanin class and a basic summary statistics file.
-
-]]></help>
-</tool>
diff -r 1a7fef71aee3 -r fd70980a516b cpt_find_spanins/macros.xml
--- a/cpt_find_spanins/macros.xml Fri Jun 17 12:40:59 2022 +0000
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,63 +0,0 @@
-<?xml version="1.0"?>
-<macros>
-    <xml name="requirements">
-        <requirements>
-            <requirement type="package">regex</requirement>
-            <requirement type="package">python</requirement>
-            <requirement type="package">biopython</requirement>
-            <requirement type="package">cpt_gffparser</requirement>
-            <yield/>
-        </requirements>
-    </xml>
-    <token name="@BLAST_TSV@">
-        "$blast_tsv"
-    </token>
-    <xml name="blast_tsv">
-        <param label="Blast Results" name="blast_tsv" type="data" format="tabular"/>
-    </xml>
-    <token name="@BLAST_XML@">
-        "$blast_xml"
-    </token>
-    <xml name="blast_xml">
-        <param label="Blast Results" name="blast_xml" type="data" format="blastxml"/>
-    </xml>
-
-
-
-
-
-
-
-
-
-
-
-
-    <token name="@INPUT_GFF@">
-        "$gff3_data"
-    </token>
-    <token name="@INPUT_FASTA@">
-        genomeref.fa
-    </token>
-    <token name="@GENOME_SELECTOR_PRE@">
-        ln -s $genome_fasta genomeref.fa;
-    </token>
-    <token name="@GENOME_SELECTOR@">
-        genomeref.fa
-    </token>
-    <token name="@SEQUENCE@">
-        "$sequences"
-    </token>
-    <xml name="input_sequences">
-        <param label="Sequences" name="sequences" type="data" format="fasta"/>
-    </xml>
-</macros>
diff -r 1a7fef71aee3 -r fd70980a516b cpt_find_spanins/spaninFuncs.py
--- a/cpt_find_spanins/spaninFuncs.py Fri Jun 17 12:40:59 2022 +0000
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,469 +0,0 @@
-"""
-PREMISE
-### Functions/Classes that are used in both generate-putative-osp.py and generate-putative-isp.py
-###### Main premise here is to make the above scripts a little more DRY, as well as easily readable for execution.
-###### Documentation will ATTEMPT to be thorough here
-"""
-
-import re
-from Bio import SeqIO
-from Bio import Seq
-from collections import OrderedDict
-
-# Not written in OOP, to keep complexity down in case adjustments are needed by someone else.
-# Much of the manipulation is string based, so it should be straightforward and reasonably quick.
-################## Global Variables
-Lys = "K"
-
-
-def check_back_end_snorkels(seq, tmsize):
- """
- Searches through the backend of a potential TMD snorkel. This is the 2nd part of a TMD snorkel lysine match.
- --> seq : should be the sequence fed from the "search_region" portion of the sequence
- --> tmsize : size of the potential TMD being investigated
- """
- found = []
- if seq[tmsize - 4] == Lys and re.search(("[FIWLVMYCATGS]"), seq[tmsize - 5]):
- found = "match"
- return found
- elif seq[tmsize - 3] == Lys and re.search(("[FIWLVMYCATGS]"), seq[tmsize - 4]):
- found = "match"
- return found
- elif seq[tmsize - 2] == Lys and re.search(("[FIWLVMYCATGS]"), seq[tmsize - 3]):
- found = "match"
- return found
- elif seq[tmsize - 1] == Lys and re.search(("[FIWLVMYCATGS]"), seq[tmsize - 2]):
- found = "match"
- return found
- else:
- found = "NOTmatch"
- return found
-
-
-def prep_a_gff3(fa, spanin_type, org):
- """
- Function parses an input detailed 'fa' file and outputs a 'gff3' file
- ---> fa = input .fa file
- ---> output = output a returned list of data, easily portable to a gff3 next
- ---> spanin_type = 'isp' or 'osp'
- """
- with org as f:
- header = f.readline()
- orgacc = header.split(" ")
- orgacc = orgacc[0].split(">")[1].strip()
- fa_zip = tuple_fasta(fa)
- data = []
- for a_pair in fa_zip:
- # print(a_pair)
- if re.search(("(\[1\])"), a_pair[0]):
- strand = "+"
- elif re.search(("(\[-1\])"), a_pair[0]):
- strand = "-" # column 7
- start = re.search(("[\d]+\.\."), a_pair[0]).group(0).split("..")[0] # column 4
- end = re.search(("\.\.[\d]+"), a_pair[0]).group(0).split("..")[1] # column 5
- orfid = re.search(("(ORF)[\d]+"), a_pair[0]).group(0) # column 1
- if spanin_type == "isp":
- methodtype = "CDS" # column 3
- spanin = "isp"
- elif spanin_type == "osp":
- methodtype = "CDS" # column 3
- spanin = "osp"
- elif spanin_type == "usp":
- methodtype = "CDS"
- spanin = "usp"
- else:
-            raise ValueError("need to input spanin type")
- source = "cpt.py|putative-*.py" # column 2
- score = "." # column 6
- phase = "." # column 8
- attributes = "ID=" +orgacc+ "|"+ orfid + ";ALIAS=" + spanin + ";SEQ="+a_pair[1] # column 9
- sequence = [[orgacc, source, methodtype, start, end, score, strand, phase, attributes]]
- data += sequence
- return data
-
-
-def write_gff3(data, output="results.gff3"):
- """
- Parses results from prep_a_gff3 into a gff3 file
- ---> input : list from prep_a_gff3
- ---> output : gff3 file
- """
-    with output as f:
-        f.write("##gff-version 3\n")
- for value in data:
- f.write(
- "{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\n".format(
- value[0],
- value[1],
- value[2],
- value[3],
- value[4],
- value[5],
- value[6],
- value[7],
- value[8],
- )
- )
-
-
-def find_tmd(pair, minimum=10, maximum=30, TMDmin=10, TMDmax=20, isp_mode=False, peri_min=18, peri_max=206):
- """
- Function that searches for lysine snorkels and then for a spanning hydrophobic region that indicates a potential TMD
- ---> pair : Input of tuple with description and AA sequence (str)
-    ---> minimum : how close to the initial start codon a TMD may begin
-    ---> maximum : how far from the initial start codon a TMD may extend
-    ---> TMDmin : The minimum size that a transmembrane domain can be (default = 10)
-    ---> TMDmax : The maximum size that a transmembrane domain can be (default = 20)
- """
- # hydrophobicAAs = ['P', 'F', 'I', 'W', 'L', 'V', 'M', 'Y', 'C', 'A', 'T', 'G', 'S']
- tmd = []
- s = str(pair[1]) # sequence being analyzed
- # print(s) # for trouble shooting
- if maximum > len(s):
- maximum = len(s)
- search_region = s[minimum - 1 : maximum + 1]
- #print(f"this is the search region: {search_region}")
- # print(search_region) # for trouble shooting
-
- for tmsize in range(TMDmin, TMDmax+1, 1):
- #print(f"this is the current tmsize we're trying: {tmsize}")
- # print('==============='+str(tmsize)+'================') # print for troubleshooting
- pattern = "[PFIWLVMYCATGS]{"+str(tmsize)+"}" # searches for these hydrophobic residues tmsize total times
- #print(pattern)
- #print(f"sending to regex: {search_region}")
- if re.search(
- ("[K]"), search_region[1:8]): # grabbing one below with search region, so I want to grab one ahead here when I query.
- store_search = re.search(("[K]"), search_region[1:8]) # storing regex object
- where_we_are = store_search.start() # finding where we got the hit
- if re.search(
- ("[PFIWLVMYCATGS]"), search_region[where_we_are + 1]
- ) and re.search(
- ("[PFIWLVMYCATGS]"), search_region[where_we_are - 1]
- ): # hydrophobic neighbor
- #try:
- g = re.search(("[PFIWLVMYCATGS]"), search_region[where_we_are + 1]).group()
- backend = check_back_end_snorkels(search_region, tmsize)
- if backend == "match":
- if isp_mode:
- g = re.search((pattern), search_region).group()
- end_of_tmd = re.search((g), s).end()+1
- amt_peri = len(s) - end_of_tmd
- if peri_min <= amt_peri <= peri_max:
- pair_desc = pair[0] + ", peri_count~="+str(amt_peri)
- new_pair = (pair_desc,pair[1])
- tmd.append(new_pair)
- else:
- tmd.append(pair)
- else:
- continue
- #else:
- #print("I'm continuing out of snorkel loop")
- #print(f"{search_region}")
- #continue
- if re.search((pattern), search_region):
- #print(f"found match: {}")
- #print("I AM HEREEEEEEEEEEEEEEEEEEEEEEE")
- #try:
- if isp_mode:
- g = re.search((pattern), search_region).group()
- end_of_tmd = re.search((g), s).end()+1
- amt_peri = len(s) - end_of_tmd
- if peri_min <= amt_peri <= peri_max:
- pair_desc = pair[0] + ", peri_count~="+str(amt_peri)
- new_pair = (pair_desc,pair[1])
- tmd.append(new_pair)
- else:
- tmd.append(pair)
- else:
- continue
-
- return tmd
-
-
-def find_lipobox(pair, minimum=10, maximum=50, min_after=30, max_after=185, regex=1, osp_mode=False):
- """
-    Function that takes an input tuple and returns description/sequence pairs that have a lipobox
- ---> minimum - min distance from start codon to first AA of lipobox
- ---> maximum - max distance from start codon to first AA of lipobox
- ---> regex - option 1 (default) => more strict regular expression ; option 2 => looser selection, imported from LipoRy
-
- """
- if regex == 1:
- pattern = "[ILMFTV][^REKD][GAS]C" # regex for Lipobox from findSpanin.pl
- elif regex == 2:
- pattern = "[ACGSILMFTV][^REKD][GAS]C" # regex for Lipobox from LipoRy
-
- candidates = []
- s = str(pair[1])
- # print(s) # trouble shooting
- search_region = s[minimum-1 : maximum + 5] # properly slice the input... add 4 to catch if it hangs off at max input
- # print(search_region) # trouble shooting
- patterns = ["[ILMFTV][^REKD][GAS]C","AW[AGS]C"]
- for pattern in patterns:
- #print(pattern) # trouble shooting
- if re.search((pattern), search_region): # lipobox must be WITHIN the range...
- # searches the sequence with the input RegEx AND omits if
- g = re.search((pattern), search_region).group() # find the exact group match
- amt_peri = len(s) - re.search((g), s).end() + 1
- if min_after <= amt_peri <= max_after: # find the lipobox end region
- if osp_mode:
- pair_desc = pair[0] + ", peri_count~="+str(amt_peri)
- new_pair = (pair_desc,pair[1])
- candidates.append(new_pair)
- else:
- candidates.append(pair)
-
- return candidates
-
-
-def tuple_fasta(fasta_file):
- """
- #### INPUT: Fasta File
- #### OUTPUT: zipped (zip) : pairwise relationship of description to sequence
- ####
- """
- fasta = SeqIO.parse(fasta_file, "fasta")
- descriptions = []
- sequences = []
- for r in fasta: # iterates and stores each description and sequence
- description = r.description
- sequence = str(r.seq)
- if (
- sequence[0] != "I"
- ): # the translation table currently has I as a potential start codon ==> this will remove all ORFs that start with I
- descriptions.append(description)
- sequences.append(sequence)
- else:
- continue
-
- return zip(descriptions, sequences)
-
-
-def lineWrapper(text, charactersize=60):
-
- if len(text) <= charactersize:
- return text
- else:
- return (
- text[:charactersize]
- + "\n"
- + lineWrapper(text[charactersize:], charactersize)
- )
-
-
-def getDescriptions(fasta):
- """
-    Takes an output FASTA file and retrieves the description headers. These headers contain information needed
-    for locating a potential i-spanin and a potential o-spanin in proximity to one another.
- """
- desc = []
- with fasta as f:
- for line in f:
- if line.startswith(">"):
- desc.append(line)
- return desc
-
-
-def splitStrands(text, strand="+"):
- # positive_strands = []
- # negative_strands = []
- if strand == "+":
- if re.search(("(\[1\])"), text):
- return text
- elif strand == "-":
- if re.search(("(\[-1\])"), text):
- return text
- # return positive_strands, negative_strands
-
-
-def parse_a_range(pair, start, end):
- """
- Takes an input data tuple from a fasta tuple pair and keeps only those within the input sequence range
- ---> data : fasta tuple data
- ---> start : start range to keep
- ---> end : end range to keep (will need to + 1)
- """
- matches = []
- for each_pair in pair:
-
- s = re.search(("[\d]+\.\."), each_pair[0]).group(0) # Start of the sequence
- s = int(s.split("..")[0])
- e = re.search(("\.\.[\d]+"), each_pair[0]).group(0)
- e = int(e.split("..")[1])
- if start - 1 <= s and e <= end + 1:
- matches.append(each_pair)
- else:
- continue
- # else:
- # continue
- # if matches != []:
- return matches
- # else:
- # print('no candidates within selected range')
-
-
-def grabLocs(text):
- """
-    Grabs the locations of the spanin based on NT location (seen from the ORF). Grabs the ORF name, as named by the ORF class/module
- from cpt.py
- """
- start = re.search(("[\d]+\.\."), text).group(0) # Start of the sequence ; looks for [numbers]..
- end = re.search(("\.\.[\d]+"), text).group(0) # End of the sequence ; Looks for ..[numbers]
- orf = re.search(("(ORF)[\d]+"), text).group(0) # Looks for ORF and the numbers that are after it
- if re.search(("(\[1\])"), text): # stores strand
- strand = "+"
- elif re.search(("(\[-1\])"), text): # stores strand
- strand = "-"
- start = int(start.split("..")[0])
- end = int(end.split("..")[1])
- vals = [start, end, orf, strand]
-
- return vals
-
-
-def spaninProximity(isp, osp, max_dist=30):
- """
- _NOTE THIS FUNCTION COULD BE MODIFIED TO RETURN SEQUENCES_
- Compares the locations of i-spanins and o-spanins. max_dist is the distance in NT measurement from i-spanin END site
- to o-spanin START. The user will be inputting AA distance, so a conversion will be necessary ( * 3)
-    I modified this on 07.30.2020 to bypass picking the + or - strand.
- INPUT: list of OSP and ISP candidates
- OUTPUT: Return (improved) candidates for overlapping, embedded, and separate list
- """
-
- embedded = {}
- overlap = {}
- separate = {}
- for iseq in isp:
- embedded[iseq[2]] = []
- overlap[iseq[2]] = []
- separate[iseq[2]] = []
- for oseq in osp:
- if iseq[3] == "+":
- if oseq[3] == "+":
- if iseq[0] < oseq[0] < iseq[1] and oseq[1] < iseq[1]:
- ### EMBEDDED ###
- combo = [
- iseq[0],
- iseq[1],
- oseq[2],
- oseq[0],
- oseq[1],
- iseq[3],
- ] # ordering a return for dic
- embedded[iseq[2]] += [combo]
- elif iseq[0] < oseq[0] <= iseq[1] and oseq[1] > iseq[1]:
- ### OVERLAP / SEPARATE ###
- if (iseq[1] - oseq[0]) < 6:
- combo = [iseq[0], iseq[1], oseq[2], oseq[0], oseq[1],iseq[3]]
- separate[iseq[2]] += [combo]
- else:
- combo = [iseq[0], iseq[1], oseq[2], oseq[0], oseq[1],iseq[3]]
- overlap[iseq[2]] += [combo]
- elif iseq[1] <= oseq[0] <= iseq[1] + max_dist:
- combo = [iseq[0], iseq[1], oseq[2], oseq[0], oseq[1],iseq[3]]
- separate[iseq[2]] += [combo]
- else:
- continue
- if iseq[3] == "-":
- if oseq[3] == "-":
- if iseq[0] <= oseq[1] <= iseq[1] and oseq[0] > iseq[0]:
- ### EMBEDDED ###
- combo = [
- iseq[0],
- iseq[1],
- oseq[2],
- oseq[0],
- oseq[1],
- iseq[3],
- ] # ordering a return for dict
- embedded[iseq[2]] += [combo]
- elif iseq[0] <= oseq[1] <= iseq[1] and oseq[0] < iseq[0]:
- if (oseq[1] - iseq[0]) < 6:
- combo = [iseq[0], iseq[1], oseq[2], oseq[0], oseq[1],iseq[3]]
- separate[iseq[2]] += [combo]
- else:
- combo = [iseq[0], iseq[1], oseq[2], oseq[0], oseq[1],iseq[3]]
- overlap[iseq[2]] += [combo]
- elif iseq[0] - 10 < oseq[1] < iseq[0]:
- combo = [iseq[0], iseq[1], oseq[2], oseq[0], oseq[1],iseq[3]]
- separate[iseq[2]] += [combo]
- else:
- continue
-
- embedded = {k: embedded[k] for k in embedded if embedded[k]}
- overlap = {k: overlap[k] for k in overlap if overlap[k]}
- separate = {k: separate[k] for k in separate if separate[k]}
-
- return embedded, overlap, separate
-
-
-def check_for_usp():
-    pass
-
-############################################### TEST RANGE #########################################################################
-####################################################################################################################################
-if __name__ == "__main__":
-
- #### TMD TEST
- test_desc = ["one", "two", "three", "four", "five"]
- test_seq = [
- "XXXXXXXXXXXXXXXFMCFMCFMCFMCFMCXXXXXXXXXXXXXXXXXXXXXXXXXX",
- "XXXXXXXXAAKKKKKKKKKKKKKKKXXXXXXXXXXXXX",
- "XXXXXXX",
- "XXXXXXXXXXXKXXXXXXXXXX",
- "XXXXXXXXXXAKXXXXXXXXXXAKXXXXXXXX",
- ]
- # for l in
- # combo = zip(test_desc,test_seq)
- pairs = zip(test_desc, test_seq)
- tmd = []
- for each_pair in pairs:
- # print(each_pair)
- try:
- tmd += find_tmd(pair=each_pair)
- except (IndexError, TypeError):
- continue
- # try:s = each_pair[1]
- # tmd += find_tmd(seq=s, tmsize=15)
- # print('\n+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\n')
- # print(tmd)
- # print('\n+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\n')
-
- #### tuple-fasta TEST
- # fasta_file = 'out_isp.fa'
- # ret = tuple_fasta(fasta_file)
- # print('=============')
- # for i in ret:
- # print(i[1])
-
- #### LipoBox TEST
- test_desc = ["one", "two", "three", "four", "five", "six", "seven"]
- test_seq = [
- "XXXXXXXXXTGGCXXXXXXXXXXXXXXXX",
- "XXXXXXXXAAKKKKKKKKKKKKKKKXXXXXXXXXXXXX",
- "XXXXXXX",
- "AGGCXXXXXXXXXXXXXXXXXXXXTT",
- "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXTGGC",
- "XXXXXXXXXXXXXXXXXXXXXXXXXXTGGC",
- "MSTLRELRLRRALKEQSMRYLLSIKKTLPRWKGALIGLFLICVATISGCASESKLPEPPMVSVDSSLMVEPNLTTEMLNVFSQ*",
- ]
- pairs = zip(test_desc, test_seq)
- lipo = []
- for each_pair in pairs:
- #print(each_pair)
- # try:
- try:
- lipo += find_lipobox(pair=each_pair, regex=2) # , minimum=8)
-        except TypeError: # catches if something doesn't have the min/max requirements (something is too small)
- continue
- # except:
- # continue
- # print('\n+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\n')
- #############################3
- # g = prep_a_gff3(fa='putative_isp.fa', spanin_type='isp')
- # print(g)
- # write_gff3(data=g)
diff -r 1a7fef71aee3 -r fd70980a516b findSpanin.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/findSpanin.py Mon Jun 05 02:42:01 2023 +0000
@@ -0,0 +1,597 @@
+##### findSpanin.pl --> findSpanin.py
+##### Much of this code is intentionally "blocked": one step runs, a function is applied to its return value, then another function, and so on.
+
+import argparse
+import os
+import re # new
+import itertools # new
+from collections import Counter, OrderedDict
+from spaninFuncs import (
+ getDescriptions,
+ grabLocs,
+ spaninProximity,
+ splitStrands,
+ tuple_fasta,
+ lineWrapper,
+)
+
+### Required Inputs
+#### INPUT : putative_isp.fa & putative_osp.fa (in that order)
+#### PARAMETERS : --max_isp_osp_distance (max AA distance from i-spanin end to o-spanin start; default 10)
+
+###############################################################################
+def write_output(candidates):
+    """Placeholder for a dedicated output-file writer; currently unused."""
+    pass
+
+
+def reconfigure_dict(spanins):
+ """
+    Reorganizes the spanin dictionary into per-type, per-strand buckets that are easier to run the downstream checks against.
+ """
+
+ new_spanin_dict = {}
+
+    for each_spanin_type, data_dict in spanins.items():
+        new_spanin_dict[each_spanin_type] = {}
+        new_spanin_dict[each_spanin_type]["positive"] = {}
+        new_spanin_dict[each_spanin_type]["negative"] = {}
+        new_spanin_dict[each_spanin_type]["positive"]["coords"] = []
+        new_spanin_dict[each_spanin_type]["negative"]["coords"] = []
+        for outer_orf, inner_data in data_dict.items():
+            for data_content in inner_data:
+                # prepend the i-spanin ORF name so each hit carries its own ID
+                data_content.insert(0, outer_orf)
+                # index 6 holds the strand marker appended by spaninProximity
+                if data_content[6] == "+":
+                    new_spanin_dict[each_spanin_type]["positive"]["coords"] += [
+                        data_content
+                    ]
+                elif data_content[6] == "-":
+                    new_spanin_dict[each_spanin_type]["negative"]["coords"] += [
+                        data_content
+                    ]
+
+    return new_spanin_dict
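+# A minimal sketch of the reshaping above (hypothetical coordinates; each inner
+# list follows the [isp_start, isp_end, osp_orf, osp_start, osp_end, strand]
+# layout produced by spaninProximity, before the ORF name is prepended):
+#
+#   reconfigure_dict({"EMBEDDED": {"ORF1": [[100, 400, "ORF2", 200, 350, "+"]]}})
+#   # -> {"EMBEDDED": {"positive": {"coords": [["ORF1", 100, 400, "ORF2", 200, 350, "+"]]},
+#   #                  "negative": {"coords": []}}}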
+
+
+def check_for_uniques(spanins):
+ """
+ Checks for unique spanins based on spanin_type.
+    If the positive-strand end site is _the same_ for an i-spanin, we group those hits as one;
+    i.e. if ORF1, ORF2, and ORF3 all ended at location 4231, they would not be counted as unique.
+    """
+    pair_dict = {
+ "pairs": {
+ "location_amount": [],
+ "pair_number": {},
+ }
+ }
+ for each_spanin_type, spanin_data in spanins.items():
+ # print(f"{each_spanin_type} ===> {spanin_data}")
+ # early declarations for cases of no results
+ pos_check = [] # end checks
+ pos_uniques = []
+ neg_check = [] # start checks
+ neg_uniques = []
+ unique_ends = []
+ pos_amt_unique = 0
+ neg_amt_unique = 0
+ amt_positive = 0
+ amt_negative = 0
+ spanin_data["uniques"] = 0
+ spanin_data["amount"] = 0
+ if spanin_data["positive"]["coords"]:
+            # collect end sites for positive-strand hits
+ for each_hit in spanin_data["positive"]["coords"]:
+ pos_check.append(each_hit[2])
+ pair_dict["pairs"]["location_amount"].append(each_hit[2])
+            pos_uniques = list(set(pos_check))  # dedupe shared end sites
+ amt_positive = len(spanin_data["positive"]["coords"])
+ pos_amt_unique = len(pos_uniques)
+ if amt_positive:
+ spanin_data["positive"]["amt_positive"] = amt_positive
+ spanin_data["positive"]["pos_amt_unique"] = pos_amt_unique
+ # pair_dict['pairs']['locations'].extend(pos_uniques)
+ else:
+ spanin_data["positive"]["amt_positive"] = 0
+ spanin_data["positive"]["pos_amt_unique"] = 0
+ if spanin_data["negative"]["coords"]:
+            # collect start sites for negative-strand hits
+ for each_hit in spanin_data["negative"]["coords"]:
+ neg_check.append(each_hit[1])
+ pair_dict["pairs"]["location_amount"].append(each_hit[1])
+            neg_uniques = list(set(neg_check))  # dedupe shared start sites
+ amt_negative = len(spanin_data["negative"]["coords"])
+ neg_amt_unique = len(neg_uniques)
+ if amt_negative:
+ spanin_data["negative"]["amt_negative"] = amt_negative
+ spanin_data["negative"]["neg_amt_unique"] = neg_amt_unique
+ # pair_dict['pairs']['locations'].extend(neg_uniques)
+ else:
+ spanin_data["negative"]["amt_negative"] = 0
+ spanin_data["negative"]["neg_amt_unique"] = 0
+ spanin_data["uniques"] += (
+ spanin_data["positive"]["pos_amt_unique"]
+ + spanin_data["negative"]["neg_amt_unique"]
+ )
+ spanin_data["amount"] += (
+ spanin_data["positive"]["amt_positive"]
+ + spanin_data["negative"]["amt_negative"]
+ )
+ pair_dict["pairs"]["location_amount"] = dict(
+ Counter(pair_dict["pairs"]["location_amount"])
+ )
+    # number each distinct location so pairs can be referenced by a stable index
+    for data in pair_dict.values():
+        for v, loc in enumerate(data["location_amount"], start=1):
+            data["pair_number"][loc] = v
+ spanins["total_amount"] = (
+ spanins["EMBEDDED"]["amount"]
+ + spanins["SEPARATED"]["amount"]
+ + spanins["OVERLAPPED"]["amount"]
+ )
+ spanins["total_unique"] = (
+ spanins["EMBEDDED"]["uniques"]
+ + spanins["SEPARATED"]["uniques"]
+ + spanins["OVERLAPPED"]["uniques"]
+ )
+ # spanins['total_unique'] = len(pair_dict['pairs']['pair_number'])
+ return spanins, pair_dict
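+# A hedged sketch of the counting above (hypothetical numbers): three positive-strand
+# EMBEDDED hits that all end at site 4231 give amount == 3 but uniques == 1, and
+# pair_dict["pairs"]["pair_number"] maps each distinct site to a stable index:
+#
+#   spanins, pair_dict = check_for_uniques(reconfigure_dict(SPANIN_TYPES))
+#   pair_dict["pairs"]["pair_number"]  # e.g. {4231: 1, ...}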
+
+
+if __name__ == "__main__":
+
+ # Common parameters for both ISP / OSP portion of script
+
+ parser = argparse.ArgumentParser(
+ description="Trim the putative protein candidates and find potential i-spanin / o-spanin pairs"
+ )
+
+ parser.add_argument(
+ "putative_isp_fasta_file",
+ type=argparse.FileType("r"),
+ help='Putative i-spanin FASTA file, output of "generate-putative-isp"',
+ ) # the "input" argument
+
+ parser.add_argument(
+ "putative_osp_fasta_file",
+ type=argparse.FileType("r"),
+ help='Putative o-spanin FASTA file, output of "generate-putative-osp"',
+ )
+
+ parser.add_argument(
+ "--max_isp_osp_distance",
+ dest="max_isp_osp_distance",
+ default=10,
+ type=int,
+ help="max distance from end of i-spanin to start of o-spanin, measured in AAs",
+ )
+
+ parser.add_argument(
+ "--embedded_txt",
+ dest="embedded_txt",
+ type=argparse.FileType("w"),
+ default="_findSpanin_embedded_results.txt",
+ help="Results of potential embedded spanins",
+ )
+ parser.add_argument(
+ "--overlap_txt",
+ dest="overlap_txt",
+ type=argparse.FileType("w"),
+ default="_findSpanin_overlap_results.txt",
+ help="Results of potential overlapping spanins",
+ )
+ parser.add_argument(
+ "--separate_txt",
+ dest="separate_txt",
+ type=argparse.FileType("w"),
+ default="_findSpanin_separated_results.txt",
+ help="Results of potential separated spanins",
+ )
+
+ parser.add_argument(
+ "--summary_txt",
+ dest="summary_txt",
+ type=argparse.FileType("w"),
+ default="_findSpanin_summary.txt",
+ help="Results of potential spanin pairs",
+ )
+ parser.add_argument(
+ "-v", action="version", version="0.3.0"
+    )  # NOTE: the version string is maintained by hand
+ args = parser.parse_args()
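+    # Hypothetical invocation (placeholder file names, not shipped test data):
+    #   python findSpanin.py putative_isp.fa putative_osp.fa --max_isp_osp_distance 10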
+
+    SPANIN_TYPES = {
+        "EMBEDDED": {},
+        "OVERLAPPED": {},
+        "SEPARATED": {},
+    }
+
+    # getDescriptions() consumes the open handle, so reopen each file before
+    # re-parsing it with tuple_fasta()
+    isp = getDescriptions(args.putative_isp_fasta_file)
+    args.putative_isp_fasta_file = open(args.putative_isp_fasta_file.name, "r")
+    isp_full = tuple_fasta(args.putative_isp_fasta_file)
+
+    osp = getDescriptions(args.putative_osp_fasta_file)
+    args.putative_osp_fasta_file = open(args.putative_osp_fasta_file.name, "r")
+    osp_full = tuple_fasta(args.putative_osp_fasta_file)
+
+ #### location data
+ location_data = {"isp": [], "osp": []}
+ spanins = [isp, osp]
+ for idx, each_spanin_type in enumerate(spanins):
+ for description in each_spanin_type:
+ locations = grabLocs(description)
+ if idx == 0: # i-spanin
+ location_data["isp"].append(locations)
+ elif idx == 1: # o-spanin
+ location_data["osp"].append(locations)
+
+ #### Check for types of spanins
+    embedded, overlap, separate = spaninProximity(
+        isp=location_data["isp"],
+        osp=location_data["osp"],
+        max_dist=args.max_isp_osp_distance * 3,  # convert the AA input to NT
+    )
+
+ SPANIN_TYPES["EMBEDDED"] = embedded
+ SPANIN_TYPES["OVERLAPPED"] = overlap
+ SPANIN_TYPES["SEPARATED"] = separate
+
+ spanins = reconfigure_dict(SPANIN_TYPES)
+ spanins, pair_dict = check_for_uniques(spanins)
+ # print(pair_dict)
+ with args.summary_txt as f:
+ for each_spanin_type, spanin_data in spanins.items():
+ try:
+ if each_spanin_type not in ["total_amount", "total_unique"]:
+ # print(each_spanin_type)
+ # print(each_spanin_type)
+ f.write(
+ "=~~~~~= "
+ + str(each_spanin_type)
+ + " Spanin Candidate Statistics =~~~~~=\n"
+ )
+                    f.write(
+                        "Total Candidate Pairs = " + str(spanin_data["amount"]) + "\n"
+                    )
+                    f.write(
+                        "Total Unique Pairs = " + str(spanin_data["uniques"]) + "\n"
+                    )
+                    if each_spanin_type in ("EMBEDDED", "SEPARATED", "OVERLAPPED"):
+                        for k, v in SPANIN_TYPES[each_spanin_type].items():
+                            f.write(
+                                str(k)
+                                + " ==> Amount of corresponding candidate o-spanin(s): "
+                                + str(len(v))
+                                + "\n"
+                            )
+ except TypeError:
+ continue
+ f.write("\n=~~~~~= Tally from ALL spanin types =~~~~~=\n")
+        f.write("Total Candidates = " + str(spanins["total_amount"]) + "\n")
+        f.write(
+            "Total Unique Candidate Pairs = " + str(spanins["total_unique"]) + "\n"
+        )
+
+    # reopen the handles (consumed above) before grabbing sequences per class
+    args.putative_isp_fasta_file = open(args.putative_isp_fasta_file.name, "r")
+ isp_full = tuple_fasta(args.putative_isp_fasta_file)
+
+ args.putative_osp_fasta_file = open(args.putative_osp_fasta_file.name, "r")
+ osp_full = tuple_fasta(args.putative_osp_fasta_file)
+
+ # print(isp_full)
+ isp_seqs = []
+ osp_seqs = []
+ for isp_tupe in isp_full:
+ # print(isp_tupe)
+ for pisp, posp in embedded.items():
+            # match the ORF name as a whole token (the trailing \D keeps ORF1 from matching ORF10)
+            if re.search("(" + str(pisp) + r")\D", isp_tupe[0]):
+                peri_count = str.split(isp_tupe[0], "~=")[1]
+                isp_seqs.append((pisp, isp_tupe[1], peri_count))
+ # print(isp_seqs)
+ for osp_tupe in osp_full:
+ for pisp, posp in embedded.items():
+ for data in posp:
+                if re.search("(" + str(data[3]) + r")\D", osp_tupe[0]):
+ peri_count = str.split(osp_tupe[0], "~=")[1]
+ osp_seqs.append((data[3], osp_tupe[1], peri_count))
+
+ with args.embedded_txt as f:
+ f.write("================ embedded spanin candidates =================\n")
+ f.write(
+ "isp\tisp_start\tisp_end\tosp\tosp_start\tosp_end\tstrand\tpair_number\n"
+ )
+ if embedded != {}:
+ # print(embedded)
+ for pisp, posp in embedded.items():
+ # print(f"{pisp} - {posp}")
+ f.write(pisp + "\n")
+ for each_posp in posp:
+ # print(posp)
+ f.write(
+ "\t{}\t{}\t{}\t{}\t{}\t{}\t".format(
+ each_posp[1],
+ each_posp[2],
+ each_posp[3],
+ each_posp[4],
+ each_posp[5],
+ each_posp[6],
+ )
+ )
+ if each_posp[6] == "+":
+ if each_posp[2] in pair_dict["pairs"]["pair_number"].keys():
+ f.write(
+ ""
+ + str(pair_dict["pairs"]["pair_number"][each_posp[2]])
+ + "\n"
+ )
+ elif each_posp[6] == "-":
+ if each_posp[1] in pair_dict["pairs"]["pair_number"].keys():
+ f.write(
+ ""
+ + str(pair_dict["pairs"]["pair_number"][each_posp[1]])
+ + "\n"
+ )
+ else:
+ f.write("nothing found")
+
+ with open(args.embedded_txt.name, "a") as f:
+ f.write("\n================= embedded candidate sequences ================\n")
+ f.write("======================= isp ==========================\n\n")
+ for isp_data in isp_seqs:
+ # print(isp_data)
+ f.write(
+ ">isp_orf::{}-peri_count~={}\n{}\n".format(
+ isp_data[0], isp_data[2], lineWrapper(isp_data[1])
+ )
+ )
+ f.write("\n======================= osp ========================\n\n")
+ for osp_data in osp_seqs:
+ f.write(
+ ">osp_orf::{}-peri_count~={}\n{}\n".format(
+ osp_data[0], osp_data[2], lineWrapper(osp_data[1])
+ )
+ )
+
+ args.putative_isp_fasta_file = open(args.putative_isp_fasta_file.name, "r")
+ isp_full = tuple_fasta(args.putative_isp_fasta_file)
+
+ args.putative_osp_fasta_file = open(args.putative_osp_fasta_file.name, "r")
+ osp_full = tuple_fasta(args.putative_osp_fasta_file)
+
+ isp_seqs = []
+ osp_seqs = []
+ for isp_tupe in isp_full:
+        for pisp, posp in overlap.items():
+            if re.search("(" + str(pisp) + r")\D", isp_tupe[0]):
+                peri_count = str.split(isp_tupe[0], "~=")[1]
+                isp_seqs.append((pisp, isp_tupe[1], peri_count))
+
+ for osp_tupe in osp_full:
+ for pisp, posp in overlap.items():
+ for data in posp:
+                if re.search("(" + str(data[3]) + r")\D", osp_tupe[0]):
+ peri_count = str.split(osp_tupe[0], "~=")[1]
+ osp_seqs.append((data[3], osp_tupe[1], peri_count))
+
+ with args.overlap_txt as f:
+ f.write("================ overlap spanin candidates =================\n")
+ f.write(
+ "isp\tisp_start\tisp_end\tosp\tosp_start\tosp_end\tstrand\tpair_number\n"
+ )
+ if overlap != {}:
+ for pisp, posp in overlap.items():
+ f.write(pisp + "\n")
+ for each_posp in posp:
+ f.write(
+ "\t{}\t{}\t{}\t{}\t{}\t{}\t".format(
+ each_posp[1],
+ each_posp[2],
+ each_posp[3],
+ each_posp[4],
+ each_posp[5],
+ each_posp[6],
+ )
+ )
+ if each_posp[6] == "+":
+ if each_posp[2] in pair_dict["pairs"]["pair_number"].keys():
+ # print('ovl ; +')
+ f.write(
+ ""
+ + str(pair_dict["pairs"]["pair_number"][each_posp[2]])
+ + "\n"
+ )
+ elif each_posp[6] == "-":
+ if each_posp[1] in pair_dict["pairs"]["pair_number"].keys():
+ f.write(
+ ""
+ + str(pair_dict["pairs"]["pair_number"][each_posp[1]])
+ + "\n"
+ )
+ else:
+ f.write("nothing found")
+
+ with open(args.overlap_txt.name, "a") as f:
+ # print(isp_seqs)
+ f.write("\n================= overlap candidate sequences ================\n")
+ f.write("======================= isp ==========================\n\n")
+ for isp_data in isp_seqs:
+ f.write(
+            ">isp_orf::{}-peri_count~={}\n{}\n".format(
+ isp_data[0], isp_data[2], lineWrapper(isp_data[1])
+ )
+ )
+ f.write("\n======================= osp ========================\n\n")
+ for osp_data in osp_seqs:
+ f.write(
+            ">osp_orf::{}-peri_count~={}\n{}\n".format(
+ osp_data[0], osp_data[2], lineWrapper(osp_data[1])
+ )
+ )
+
+ args.putative_isp_fasta_file = open(args.putative_isp_fasta_file.name, "r")
+ isp_full = tuple_fasta(args.putative_isp_fasta_file)
+ args.putative_osp_fasta_file = open(args.putative_osp_fasta_file.name, "r")
+ osp_full = tuple_fasta(args.putative_osp_fasta_file)
+
+ isp_seqs = []
+ osp_seqs = []
+ for isp_tupe in isp_full:
+ for pisp, posp in separate.items():
+            if re.search("(" + str(pisp) + r")\D", isp_tupe[0]):
+ peri_count = str.split(isp_tupe[0], "~=")[1]
+ isp_seqs.append((pisp, isp_tupe[1], peri_count))
+ # print(isp_seqs)
+ for osp_tupe in osp_full:
+ for pisp, posp in separate.items():
+ for data in posp:
+                if re.search("(" + str(data[3]) + r")\D", osp_tupe[0]):
+ peri_count = str.split(osp_tupe[0], "~=")[1]
+ osp_seqs.append((data[3], osp_tupe[1], peri_count))
+
+ with args.separate_txt as f:
+ f.write("================ separated spanin candidates =================\n")
+ f.write(
+ "isp\tisp_start\tisp_end\tosp\tosp_start\tosp_end\tstrand\tpair_number\n"
+ )
+ if separate != {}:
+ for pisp, posp in separate.items():
+ f.write(pisp + "\n")
+ for each_posp in posp:
+ f.write(
+ "\t{}\t{}\t{}\t{}\t{}\t{}\t".format(
+ each_posp[1],
+ each_posp[2],
+ each_posp[3],
+ each_posp[4],
+ each_posp[5],
+ each_posp[6],
+ )
+ )
+ if each_posp[6] == "+":
+ if each_posp[2] in pair_dict["pairs"]["pair_number"].keys():
+ f.write(
+ ""
+ + str(pair_dict["pairs"]["pair_number"][each_posp[2]])
+ + "\n"
+ )
+ elif each_posp[6] == "-":
+ if each_posp[1] in pair_dict["pairs"]["pair_number"].keys():
+ f.write(
+ ""
+ + str(pair_dict["pairs"]["pair_number"][each_posp[1]])
+ + "\n"
+ )
+ else:
+ f.write("nothing found")
+
+ with open(args.separate_txt.name, "a") as f:
+ f.write("\n================= separated candidate sequences ================\n")
+ f.write("======================= isp ==========================\n\n")
+ for isp_data in isp_seqs:
+ f.write(
+            ">isp_orf::{}-peri_count~={}\n{}\n".format(
+ isp_data[0], isp_data[2], lineWrapper(isp_data[1])
+ )
+ )
+ f.write("\n======================= osp ========================\n\n")
+ for osp_data in osp_seqs:
+ f.write(
+            ">osp_orf::{}-peri_count~={}\n{}\n".format(
+ osp_data[0], osp_data[2], lineWrapper(osp_data[1])
+ )
+ )
diff -r 1a7fef71aee3 -r fd70980a516b findSpanin.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/findSpanin.xml Mon Jun 05 02:42:01 2023 +0000
@@ -0,0 +1,43 @@
+
+ With the outputs from the ISP and OSP candidate tools, cull the list down to candidate pairs
+
+ macros.xml
+ cpt-macros.xml
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Putative i-spanin and o-spanin protein multiFASTAs (generated from the ISP/OSP Candidate Tools).
+
+**METHODOLOGY**
+Performs a pairwise comparison between candidate i-spanins and o-spanins based on their genomic locations, and classifies each pair into one of the known bimolecular spanin genetic architectures: embedded, overlapping, separated, or NOT a potential pair.
+
+**OUTPUT** --> One file of candidate pairs for each bimolecular spanin class, plus a basic summary statistics file.
+
+]]>
+
+
diff -r 1a7fef71aee3 -r fd70980a516b macros.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/macros.xml Mon Jun 05 02:42:01 2023 +0000
@@ -0,0 +1,58 @@
+
+
+
+ regex
+ python
+ biopython
+ cpt_gffparser
+
+
+
+
+ $blast_tsv
+
+
+
+
+
+ $blast_xml
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ $gff3_data
+
+
+ genomeref.fa
+
+
+ ln -s $genome_fasta genomeref.fa;
+
+
+ genomeref.fa
+
+
+
+
+
+ $sequences
+
+
+
+
+
diff -r 1a7fef71aee3 -r fd70980a516b spaninFuncs.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/spaninFuncs.py Mon Jun 05 02:42:01 2023 +0000
@@ -0,0 +1,530 @@
+"""
+PREMISE
+### Functions/Classes that are used in both generate-putative-osp.py and generate-putative-isp.py
+###### Main premise here is to make the above scripts a little more DRY, as well as easily readable for execution.
+###### Documentation will ATTEMPT to be thourough here
+"""
+
+import re
+from Bio import SeqIO
+
+# Not written in OOP, to keep the complexity down in case adjustments are needed by someone else.
+# Much of the manipulation is string based, so it should be straightforward as well as moderately quick.
+################## Global Variables
+Lys = "K"
+
+
+def check_back_end_snorkels(seq, tmsize):
+    """
+    Searches through the back end of a potential TMD snorkel. This is the 2nd part of a TMD snorkel lysine match.
+    --> seq : the sequence fed from the "search_region" portion of the sequence
+    --> tmsize : size of the potential TMD being investigated
+    """
+    # a lysine in any of the last four TMD positions counts as a snorkel when the
+    # residue immediately before it is hydrophobic
+    for offset in range(4, 0, -1):
+        if seq[tmsize - offset] == Lys and re.search(
+            "[FIWLVMYCATGS]", seq[tmsize - offset - 1]
+        ):
+            return "match"
+    return "NOTmatch"
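+# Example (toy sequence, not real data): with seq = "AAAAALK" and tmsize = 7,
+# seq[6] == "K" with a hydrophobic "L" just before it, so this returns "match".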
+
+
+def prep_a_gff3(fa, spanin_type, org):
+    """
+    Parses an input detailed 'fa' file and returns row data ready for a 'gff3' file
+    ---> fa = input .fa file
+    ---> spanin_type = 'isp' or 'osp'
+    ---> org = organism FASTA file; its first header supplies the accession
+    ---> output = returned list of data, easily portable to a gff3 next
+    """
+ with org as f:
+ header = f.readline()
+ orgacc = header.split(" ")
+ orgacc = orgacc[0].split(">")[1].strip()
+ fa_zip = tuple_fasta(fa)
+ data = []
+ for a_pair in fa_zip:
+ # print(a_pair)
+        if re.search(r"(\[1\])", a_pair[0]):
+            strand = "+"
+        elif re.search(r"(\[-1\])", a_pair[0]):
+            strand = "-"  # column 7
+        start = re.search(r"[\d]+\.\.", a_pair[0]).group(0).split("..")[0]  # column 4
+        end = re.search(r"\.\.[\d]+", a_pair[0]).group(0).split("..")[1]  # column 5
+        orfid = re.search(r"(ORF)[\d]+", a_pair[0]).group(0)  # column 1
+        if spanin_type in ("isp", "osp", "usp"):
+            methodtype = "CDS"  # column 3
+            spanin = spanin_type
+        else:
+            raise ValueError("spanin_type must be 'isp', 'osp', or 'usp'")
+ source = "cpt.py|putative-*.py" # column 2
+ score = "." # column 6
+ phase = "." # column 8
+ attributes = (
+ "ID=" + orgacc + "|" + orfid + ";ALIAS=" + spanin + ";SEQ=" + a_pair[1]
+ ) # column 9
+ sequence = [
+ [orgacc, source, methodtype, start, end, score, strand, phase, attributes]
+ ]
+ data += sequence
+ return data
+
+
+def write_gff3(data, output="results.gff3"):
+    """
+    Writes results from prep_a_gff3 into a gff3 file
+    ---> data : list from prep_a_gff3
+    ---> output : gff3 file name, or an already-open handle
+    """
+    # the default is a plain path string, but an open handle also works
+    f = output if hasattr(output, "write") else open(output, "w")
+    with f:
+        f.write("##gff-version 3\n")
+        for value in data:
+            f.write("\t".join(str(v) for v in value) + "\n")
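+# Sketch of the intended chaining (hypothetical paths; prep_a_gff3 expects an
+# open organism-FASTA handle whose first header carries the accession):
+#
+#   with open("organism.fa") as org:
+#       rows = prep_a_gff3(fa="putative_isp.fa", spanin_type="isp", org=org)
+#   write_gff3(rows, output="results.gff3")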
+
+
+def find_tmd(
+ pair,
+ minimum=10,
+ maximum=30,
+ TMDmin=10,
+ TMDmax=20,
+ isp_mode=False,
+ peri_min=18,
+ peri_max=206,
+):
+ """
+ Function that searches for lysine snorkels and then for a spanning hydrophobic region that indicates a potential TMD
+ ---> pair : Input of tuple with description and AA sequence (str)
+ ---> minimum : How close from the initial start codon a TMD can be within
+ ---> maximum : How far from the initial start codon a TMD can be within
+ ---> TMDmin : The minimum size that a transmembrane can be (default = 10)
+    ---> TMDmax : The maximum size that a transmembrane can be (default = 20)
+    ---> isp_mode / peri_min / peri_max : when isp_mode is set, only keep hits whose
+         periplasmic tail length falls within [peri_min, peri_max]
+    """
+ # hydrophobicAAs = ['P', 'F', 'I', 'W', 'L', 'V', 'M', 'Y', 'C', 'A', 'T', 'G', 'S']
+ tmd = []
+ s = str(pair[1]) # sequence being analyzed
+ # print(s) # for trouble shooting
+ if maximum > len(s):
+ maximum = len(s)
+ search_region = s[minimum - 1 : maximum + 1]
+ # print(f"this is the search region: {search_region}")
+ # print(search_region) # for trouble shooting
+
+ for tmsize in range(TMDmin, TMDmax + 1, 1):
+ # print(f"this is the current tmsize we're trying: {tmsize}")
+ # print('==============='+str(tmsize)+'================') # print for troubleshooting
+ pattern = (
+ "[PFIWLVMYCATGS]{" + str(tmsize) + "}"
+ ) # searches for these hydrophobic residues tmsize total times
+ # print(pattern)
+ # print(f"sending to regex: {search_region}")
+ if re.search(
+ ("[K]"), search_region[1:8]
+ ): # grabbing one below with search region, so I want to grab one ahead here when I query.
+ store_search = re.search(
+ ("[K]"), search_region[1:8]
+ ) # storing regex object
+ where_we_are = store_search.start() # finding where we got the hit
+ if re.search(
+ ("[PFIWLVMYCATGS]"), search_region[where_we_are + 1]
+ ) and re.search(
+ ("[PFIWLVMYCATGS]"), search_region[where_we_are - 1]
+ ): # hydrophobic neighbor
+ # try:
+ g = re.search(
+ ("[PFIWLVMYCATGS]"), search_region[where_we_are + 1]
+ ).group()
+ backend = check_back_end_snorkels(search_region, tmsize)
+ if backend == "match":
+ if isp_mode:
+ g = re.search((pattern), search_region).group()
+ end_of_tmd = re.search((g), s).end() + 1
+ amt_peri = len(s) - end_of_tmd
+ if peri_min <= amt_peri <= peri_max:
+ pair_desc = pair[0] + ", peri_count~=" + str(amt_peri)
+ new_pair = (pair_desc, pair[1])
+ tmd.append(new_pair)
+ else:
+ tmd.append(pair)
+ else:
+ continue
+ if re.search((pattern), search_region):
+ if isp_mode:
+ g = re.search((pattern), search_region).group()
+ end_of_tmd = re.search((g), s).end() + 1
+ amt_peri = len(s) - end_of_tmd
+ if peri_min <= amt_peri <= peri_max:
+ pair_desc = pair[0] + ", peri_count~=" + str(amt_peri)
+ new_pair = (pair_desc, pair[1])
+ tmd.append(new_pair)
+ else:
+ tmd.append(pair)
+ else:
+ continue
+
+ return tmd
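+# Example from the test block at the bottom of this file: sequence "one" carries a
+# 15-residue hydrophobic FMC run starting 15 residues in, so find_tmd(pair=...) keeps
+# the pair (note it may be appended once per matching TMD size, hence duplicates).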
+
+
+def find_lipobox(
+ pair, minimum=10, maximum=50, min_after=30, max_after=185, regex=1, osp_mode=False
+):
+    """
+    Takes an input tuple and returns (description, sequence) pairs that contain a lipobox
+    ---> minimum - min distance from start codon to first AA of lipobox
+    ---> maximum - max distance from start codon to first AA of lipobox
+    ---> regex - option 1 (default) => stricter regular expression ; option 2 => looser selection, imported from LipoRy
+    """
+ if regex == 1:
+ pattern = "[ILMFTV][^REKD][GAS]C" # regex for Lipobox from findSpanin.pl
+ elif regex == 2:
+ pattern = "[ACGSILMFTV][^REKD][GAS]C" # regex for Lipobox from LipoRy
+
+    candidates = []
+    s = str(pair[1])
+    search_region = s[
+        minimum - 1 : maximum + 5
+    ]  # properly slice the input... pad the end to catch a lipobox hanging off at the max
+    # honor the regex choice above, plus the special-case AW[AGS]C lipobox
+    patterns = [pattern, "AW[AGS]C"]
+ for pattern in patterns:
+ # print(pattern) # trouble shooting
+ if re.search((pattern), search_region): # lipobox must be WITHIN the range...
+            g = re.search(
+                (pattern), search_region
+            ).group()  # keep the exact text of the matched lipobox
+ amt_peri = len(s) - re.search((g), s).end() + 1
+ if min_after <= amt_peri <= max_after: # find the lipobox end region
+ if osp_mode:
+ pair_desc = pair[0] + ", peri_count~=" + str(amt_peri)
+ new_pair = (pair_desc, pair[1])
+ candidates.append(new_pair)
+ else:
+ candidates.append(pair)
+
+ return candidates
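+# Example from the LipoBox test below: sequence "seven" contains "ISGC", which
+# satisfies [ILMFTV][^REKD][GAS]C within the search window, so the pair is kept.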
+
+
+def tuple_fasta(fasta_file):
+    """
+    #### INPUT: FASTA file
+    #### OUTPUT: zip object : pairwise relationship of description to sequence
+    """
+ fasta = SeqIO.parse(fasta_file, "fasta")
+ descriptions = []
+ sequences = []
+ for r in fasta: # iterates and stores each description and sequence
+ description = r.description
+ sequence = str(r.seq)
+ if (
+ sequence[0] != "I"
+ ): # the translation table currently has I as a potential start codon ==> this will remove all ORFs that start with I
+ descriptions.append(description)
+ sequences.append(sequence)
+ else:
+ continue
+
+ return zip(descriptions, sequences)
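+# Usage sketch (hypothetical path): tuple_fasta("putative_isp.fa") yields
+# ("ORF1 ... [100..400] [1]", "MKT...") style pairs; the zip is single-use,
+# which is why findSpanin.py reopens and re-parses the file between passes.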
+
+
+def lineWrapper(text, charactersize=60):
+    """Recursively wraps a sequence string into fixed-width lines (FASTA style)."""
+ if len(text) <= charactersize:
+ return text
+ else:
+ return (
+ text[:charactersize]
+ + "\n"
+ + lineWrapper(text[charactersize:], charactersize)
+ )
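+# Example: lineWrapper("A" * 130) returns three lines of 60, 60, and 10 characters.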
+
+
+def getDescriptions(fasta):
+    """
+    Takes an output FASTA file and retrieves the description headers. These headers contain the location
+    information needed for finding the proximity of a potential i-spanin and o-spanin to one another.
+    """
+ desc = []
+ with fasta as f:
+ for line in f:
+ if line.startswith(">"):
+ desc.append(line)
+ return desc
+
+
+def splitStrands(text, strand="+"):
+    """Returns the text if its strand marker ([1] or [-1]) matches the requested strand."""
+    if strand == "+":
+        if re.search(r"(\[1\])", text):
+            return text
+    elif strand == "-":
+        if re.search(r"(\[-1\])", text):
+            return text
+
+
+def parse_a_range(pair, start, end):
+    """
+    Takes fasta tuple pairs and keeps only those whose coordinates fall within the input range
+    ---> pair : list of fasta tuple data
+    ---> start : start of the range to keep
+    ---> end : end of the range to keep (will need to + 1)
+    """
+    matches = []
+    for each_pair in pair:
+        s = re.search(r"[\d]+\.\.", each_pair[0]).group(0)  # start of the sequence
+        s = int(s.split("..")[0])
+        e = re.search(r"\.\.[\d]+", each_pair[0]).group(0)
+        e = int(e.split("..")[1])
+        if start - 1 <= s and e <= end + 1:
+            matches.append(each_pair)
+    return matches
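+# Example (hypothetical header): a pair described as "ORF5 ... [250..800] ..." survives
+# parse_a_range(pair, start=200, end=900), since 199 <= 250 and 800 <= 901.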
+
+
+def grabLocs(text):
+    """
+    Grabs the locations of the spanin based on NT location (as seen from the ORF). Also grabs the ORF name,
+    as named by the ORF class/module from cpt.py
+    """
+    # start of the sequence ; looks for [numbers]..
+    start = re.search(r"[\d]+\.\.", text).group(0)
+    # end of the sequence ; looks for ..[numbers]
+    end = re.search(r"\.\.[\d]+", text).group(0)
+    # looks for ORF and the numbers after it
+    orf = re.search(r"(ORF)[\d]+", text).group(0)
+    if re.search(r"(\[1\])", text):  # stores strand
+        strand = "+"
+    elif re.search(r"(\[-1\])", text):  # stores strand
+        strand = "-"
+    start = int(start.split("..")[0])
+    end = int(end.split("..")[1])
+    vals = [start, end, orf, strand]
+
+    return vals
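+# Example (hypothetical header in the style produced by cpt.py's ORF module):
+#   grabLocs("ORF12 [123..456] [1]")  # -> [123, 456, "ORF12", "+"]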
+
+
+def spaninProximity(isp, osp, max_dist=30):
+    """
+    _NOTE THIS FUNCTION COULD BE MODIFIED TO RETURN SEQUENCES_
+    Compares the locations of i-spanins and o-spanins. max_dist is the NT distance from the i-spanin END site
+    to the o-spanin START. The user inputs an AA distance, so a conversion is necessary ( * 3 )
+    Modified 07.30.2020 to bypass picking the + or - strand up front.
+    INPUT: lists of ISP and OSP candidates
+    OUTPUT: dictionaries of embedded, overlapping, and separated candidates
+    """
+
+ embedded = {}
+ overlap = {}
+ separate = {}
+ for iseq in isp:
+ embedded[iseq[2]] = []
+ overlap[iseq[2]] = []
+ separate[iseq[2]] = []
+ for oseq in osp:
+ if iseq[3] == "+":
+ if oseq[3] == "+":
+ if iseq[0] < oseq[0] < iseq[1] and oseq[1] < iseq[1]:
+ ### EMBEDDED ###
+ combo = [
+ iseq[0],
+ iseq[1],
+ oseq[2],
+ oseq[0],
+ oseq[1],
+ iseq[3],
+                    ]  # ordering a return for dict
+ embedded[iseq[2]] += [combo]
+ elif iseq[0] < oseq[0] <= iseq[1] and oseq[1] > iseq[1]:
+ ### OVERLAP / SEPARATE ###
+ if (iseq[1] - oseq[0]) < 6:
+ combo = [
+ iseq[0],
+ iseq[1],
+ oseq[2],
+ oseq[0],
+ oseq[1],
+ iseq[3],
+ ]
+ separate[iseq[2]] += [combo]
+ else:
+ combo = [
+ iseq[0],
+ iseq[1],
+ oseq[2],
+ oseq[0],
+ oseq[1],
+ iseq[3],
+ ]
+ overlap[iseq[2]] += [combo]
+ elif iseq[1] <= oseq[0] <= iseq[1] + max_dist:
+ combo = [iseq[0], iseq[1], oseq[2], oseq[0], oseq[1], iseq[3]]
+ separate[iseq[2]] += [combo]
+ else:
+ continue
+ if iseq[3] == "-":
+ if oseq[3] == "-":
+ if iseq[0] <= oseq[1] <= iseq[1] and oseq[0] > iseq[0]:
+ ### EMBEDDED ###
+ combo = [
+ iseq[0],
+ iseq[1],
+ oseq[2],
+ oseq[0],
+ oseq[1],
+ iseq[3],
+ ] # ordering a return for dict
+ embedded[iseq[2]] += [combo]
+ elif iseq[0] <= oseq[1] <= iseq[1] and oseq[0] < iseq[0]:
+ if (oseq[1] - iseq[0]) < 6:
+ combo = [
+ iseq[0],
+ iseq[1],
+ oseq[2],
+ oseq[0],
+ oseq[1],
+ iseq[3],
+ ]
+ separate[iseq[2]] += [combo]
+ else:
+ combo = [
+ iseq[0],
+ iseq[1],
+ oseq[2],
+ oseq[0],
+ oseq[1],
+ iseq[3],
+ ]
+ overlap[iseq[2]] += [combo]
+ elif iseq[0] - 10 < oseq[1] < iseq[0]:
+ combo = [iseq[0], iseq[1], oseq[2], oseq[0], oseq[1], iseq[3]]
+ separate[iseq[2]] += [combo]
+ else:
+ continue
+
+ embedded = {k: embedded[k] for k in embedded if embedded[k]}
+ overlap = {k: overlap[k] for k in overlap if overlap[k]}
+ separate = {k: separate[k] for k in separate if separate[k]}
+
+ return embedded, overlap, separate
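+# Worked sketch with hypothetical "+"-strand coordinates and max_dist = 30 NT,
+# for an i-spanin at [100, 400, "ORF1", "+"]:
+#   o-spanin [200, 350] sits inside 100..400           -> embedded["ORF1"]
+#   o-spanin [380, 500] overlaps by 400 - 380 = 20 NT  -> overlap["ORF1"]
+#   o-spanin [405, 500] starts within 400 + max_dist   -> separate["ORF1"]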
+
+
+def check_for_usp():
+    """Placeholder for a future u-spanin check; not yet implemented."""
+    pass
+
+
+############################################### TEST RANGE #########################################################################
+####################################################################################################################################
+if __name__ == "__main__":
+
+ #### TMD TEST
+ test_desc = ["one", "two", "three", "four", "five"]
+ test_seq = [
+ "XXXXXXXXXXXXXXXFMCFMCFMCFMCFMCXXXXXXXXXXXXXXXXXXXXXXXXXX",
+ "XXXXXXXXAAKKKKKKKKKKKKKKKXXXXXXXXXXXXX",
+ "XXXXXXX",
+ "XXXXXXXXXXXKXXXXXXXXXX",
+ "XXXXXXXXXXAKXXXXXXXXXXAKXXXXXXXX",
+ ]
+ pairs = zip(test_desc, test_seq)
+ tmd = []
+ for each_pair in pairs:
+ # print(each_pair)
+ try:
+ tmd += find_tmd(pair=each_pair)
+ except (IndexError, TypeError):
+ continue
+
+ #### tuple-fasta TEST
+ # fasta_file = 'out_isp.fa'
+ # ret = tuple_fasta(fasta_file)
+ # print('=============')
+ # for i in ret:
+ # print(i[1])
+
+ #### LipoBox TEST
+ test_desc = ["one", "two", "three", "four", "five", "six", "seven"]
+ test_seq = [
+ "XXXXXXXXXTGGCXXXXXXXXXXXXXXXX",
+ "XXXXXXXXAAKKKKKKKKKKKKKKKXXXXXXXXXXXXX",
+ "XXXXXXX",
+ "AGGCXXXXXXXXXXXXXXXXXXXXTT",
+ "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXTGGC",
+ "XXXXXXXXXXXXXXXXXXXXXXXXXXTGGC",
+ "MSTLRELRLRRALKEQSMRYLLSIKKTLPRWKGALIGLFLICVATISGCASESKLPEPPMVSVDSSLMVEPNLTTEMLNVFSQ*",
+ ]
+ pairs = zip(test_desc, test_seq)
+ lipo = []
+ for each_pair in pairs:
+        try:
+ lipo += find_lipobox(pair=each_pair, regex=2) # , minimum=8)
+        except TypeError:  # catches anything that doesn't meet the min/max requirements (something is too small)
+ continue
+    #### gff3 prep/write TEST
+ # g = prep_a_gff3(fa='putative_isp.fa', spanin_type='isp')
+ # print(g)
+ # write_gff3(data=g)