Mercurial > repos > cpt > cpt_putative_osp
changeset 0:03670eba3480 draft
Uploaded
author | cpt |
---|---|
date | Fri, 17 Jun 2022 13:09:00 +0000 |
parents | |
children | 05b97a4dce94 |
files | cpt_putative_osp/cpt-macros.xml cpt_putative_osp/cpt.py cpt_putative_osp/generate-putative-osp.py cpt_putative_osp/generate-putative-osp.xml cpt_putative_osp/macros.xml cpt_putative_osp/spaninFuncs.py |
diffstat | 6 files changed, 1367 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/cpt_putative_osp/cpt-macros.xml Fri Jun 17 13:09:00 2022 +0000 @@ -0,0 +1,115 @@ +<?xml version="1.0"?> +<macros> + <xml name="gff_requirements"> + <requirements> + <requirement type="package" version="2.7">python</requirement> + <requirement type="package" version="1.65">biopython</requirement> + <requirement type="package" version="2.12.1">requests</requirement> + <yield/> + </requirements> + <version_command> + <![CDATA[ + cd $__tool_directory__ && git rev-parse HEAD + ]]> + </version_command> + </xml> + <xml name="citation/mijalisrasche"> + <citation type="doi">10.1371/journal.pcbi.1008214</citation> + <citation type="bibtex">@unpublished{galaxyTools, + author = {E. Mijalis, H. Rasche}, + title = {CPT Galaxy Tools}, + year = {2013-2017}, + note = {https://github.com/tamu-cpt/galaxy-tools/} + } + </citation> + </xml> + <xml name="citations"> + <citations> + <citation type="doi">10.1371/journal.pcbi.1008214</citation> + <citation type="bibtex"> + @unpublished{galaxyTools, + author = {E. Mijalis, H. Rasche}, + title = {CPT Galaxy Tools}, + year = {2013-2017}, + note = {https://github.com/tamu-cpt/galaxy-tools/} + } + </citation> + <yield/> + </citations> + </xml> + <xml name="citations-crr"> + <citations> + <citation type="doi">10.1371/journal.pcbi.1008214</citation> + <citation type="bibtex"> + @unpublished{galaxyTools, + author = {C. Ross}, + title = {CPT Galaxy Tools}, + year = {2020-}, + note = {https://github.com/tamu-cpt/galaxy-tools/} + } + </citation> + <yield/> + </citations> + </xml> + <xml name="citations-2020"> + <citations> + <citation type="doi">10.1371/journal.pcbi.1008214</citation> + <citation type="bibtex"> + @unpublished{galaxyTools, + author = {E. Mijalis, H. Rasche}, + title = {CPT Galaxy Tools}, + year = {2013-2017}, + note = {https://github.com/tamu-cpt/galaxy-tools/} + } + </citation> + <citation type="bibtex"> + @unpublished{galaxyTools, + author = {A. Criscione}, + title = {CPT Galaxy Tools}, + year = {2019-2021}, + note = {https://github.com/tamu-cpt/galaxy-tools/} + } + </citation> + <yield/> + </citations> + </xml> + <xml name="citations-2020-AJC-solo"> + <citations> + <citation type="doi">10.1371/journal.pcbi.1008214</citation> + <citation type="bibtex"> + @unpublished{galaxyTools, + author = {A. Criscione}, + title = {CPT Galaxy Tools}, + year = {2019-2021}, + note = {https://github.com/tamu-cpt/galaxy-tools/} + } + </citation> + <yield/> + </citations> + </xml> + <xml name="citations-clm"> + <citations> + <citation type="doi">10.1371/journal.pcbi.1008214</citation> + <citation type="bibtex"> + @unpublished{galaxyTools, + author = {C. Maughmer}, + title = {CPT Galaxy Tools}, + year = {2017-2020}, + note = {https://github.com/tamu-cpt/galaxy-tools/} + } + </citation> + <yield/> + </citations> + </xml> + <xml name="sl-citations-clm"> + <citation type="bibtex"> + @unpublished{galaxyTools, + author = {C. Maughmer}, + title = {CPT Galaxy Tools}, + year = {2017-2020}, + note = {https://github.com/tamu-cpt/galaxy-tools/} + } + </citation> + <yield/> + </xml> +</macros>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/cpt_putative_osp/cpt.py Fri Jun 17 13:09:00 2022 +0000 @@ -0,0 +1,341 @@ +#!/usr/bin/env python +from Bio.Seq import Seq, reverse_complement, translate +from Bio.SeqRecord import SeqRecord +from Bio import SeqIO +from Bio.Data import CodonTable +import logging + +logging.basicConfig() +log = logging.getLogger() + +PHAGE_IN_MIDDLE = re.compile("^(?P<host>.*)\s*phage (?P<phage>.*)$") +BACTERIOPHAGE_IN_MIDDLE = re.compile("^(?P<host>.*)\s*bacteriophage (?P<phage>.*)$") +STARTS_WITH_PHAGE = re.compile( + "^(bacterio|vibrio|Bacterio|Vibrio|)?[Pp]hage (?P<phage>.*)$" +) +NEW_STYLE_NAMES = re.compile("(?P<phage>v[A-Z]_[A-Z][a-z]{2}_.*)") + + +def phage_name_parser(name): + host = None + phage = None + name = name.replace(", complete genome.", "") + name = name.replace(", complete genome", "") + + m = BACTERIOPHAGE_IN_MIDDLE.match(name) + if m: + host = m.group("host") + phage = m.group("phage") + return (host, phage) + + m = PHAGE_IN_MIDDLE.match(name) + if m: + host = m.group("host") + phage = m.group("phage") + return (host, phage) + + m = STARTS_WITH_PHAGE.match(name) + if m: + phage = m.group("phage") + return (host, phage) + + m = NEW_STYLE_NAMES.match(name) + if m: + phage = m.group("phage") + return (host, phage) + + return (host, phage) + + +class OrfFinder(object): + def __init__(self, table, ftype, ends, min_len, strand): + self.table = table + self.table_obj = CodonTable.ambiguous_generic_by_id[table] + self.ends = ends + self.ftype = ftype + self.min_len = min_len + self.starts = sorted(self.table_obj.start_codons) + self.stops = sorted(self.table_obj.stop_codons) + self.re_starts = re.compile("|".join(self.starts)) + self.re_stops = re.compile("|".join(self.stops)) + self.strand = strand + + def locate(self, fasta_file, out_nuc, out_prot, out_bed, out_gff3): + seq_format = "fasta" + log.debug("Genetic code table %i" % self.table) + log.debug("Minimum length %i aa" % self.min_len) + + out_count = 0 + + out_gff3.write("##gff-version 3\n") + + for idx, record in enumerate(SeqIO.parse(fasta_file, seq_format)): + for i, (f_start, f_end, f_strand, n, t) in enumerate( + self.get_all_peptides(str(record.seq).upper()) + ): + out_count += 1 + + descr = "length %i aa, %i bp, from %s..%s[%s] of %s" % ( + len(t), + len(n), + f_start, + f_end, + f_strand, + record.description, + ) + fid = record.id + "|%s%i" % (self.ftype, i + 1) + + r = SeqRecord(Seq(n), id=fid, name="", description=descr) + t = SeqRecord(Seq(t), id=fid, name="", description=descr) + + SeqIO.write(r, out_nuc, "fasta") + SeqIO.write(t, out_prot, "fasta") + + nice_strand = "+" if f_strand == +1 else "-" + + out_bed.write( + "\t".join( + map(str, [record.id, f_start, f_end, fid, 0, nice_strand]) + ) + + "\n" + ) + + out_gff3.write( + "\t".join( + map( + str, + [ + record.id, + "getOrfsOrCds", + "CDS", + f_start + 1, + f_end, + ".", + nice_strand, + 0, + "ID=%s.%s.%s" % (self.ftype, idx, i + 1), + ], + ) + ) + + "\n" + ) + log.info("Found %i %ss", out_count, self.ftype) + + def start_chop_and_trans(self, s, strict=True): + """Returns offset, trimmed nuc, protein.""" + if strict: + assert s[-3:] in self.stops, s + assert len(s) % 3 == 0 + for match in self.re_starts.finditer(s, overlapped=True): + # Must check the start is in frame + start = match.start() + if start % 3 == 0: + n = s[start:] + assert len(n) % 3 == 0, "%s is len %i" % (n, len(n)) + if strict: + t = translate(n, self.table) + else: + # Use when missing stop codon, + t = "M" + translate(n[3:], self.table, to_stop=True) + yield start, n, t # Edited by CPT to be a generator + + def break_up_frame(self, s): + """Returns offset, nuc, protein.""" + start = 0 + for match in self.re_stops.finditer(s, overlapped=True): + index = match.start() + 3 + if index % 3 != 0: + continue + n = s[start:index] + for (offset, n, t) in self.start_chop_and_trans(n): + if n and len(t) >= self.min_len: + yield start + offset, n, t + start = index + + def putative_genes_in_sequence(self, nuc_seq): + """Returns start, end, strand, nucleotides, protein. + Co-ordinates are Python style zero-based. + """ + nuc_seq = nuc_seq.upper() + # TODO - Refactor to use a generator function (in start order) + # rather than making a list and sorting? + answer = [] + full_len = len(nuc_seq) + + for frame in range(0, 3): + for offset, n, t in self.break_up_frame(nuc_seq[frame:]): + start = frame + offset # zero based + answer.append((start, start + len(n), +1, n, t)) + + rc = reverse_complement(nuc_seq) + for frame in range(0, 3): + for offset, n, t in self.break_up_frame(rc[frame:]): + start = full_len - frame - offset # zero based + answer.append((start, start - len(n), -1, n, t)) + answer.sort() + return answer + + def get_all_peptides(self, nuc_seq): + """Returns start, end, strand, nucleotides, protein. + + Co-ordinates are Python style zero-based. + """ + # Refactored into generator by CPT + full_len = len(nuc_seq) + if self.strand != "reverse": + for frame in range(0, 3): + for offset, n, t in self.break_up_frame(nuc_seq[frame:]): + start = frame + offset # zero based + yield (start, start + len(n), +1, n, t) + if self.strand != "forward": + rc = reverse_complement(nuc_seq) + for frame in range(0, 3): + for offset, n, t in self.break_up_frame(rc[frame:]): + start = full_len - frame - offset # zero based + yield (start - len(n), start, -1, n, t) + + +class MGAFinder(object): + def __init__(self, table, ftype, ends, min_len): + self.table = table + self.table_obj = CodonTable.ambiguous_generic_by_id[table] + self.ends = ends + self.ftype = ftype + self.min_len = min_len + self.starts = sorted(self.table_obj.start_codons) + self.stops = sorted(self.table_obj.stop_codons) + self.re_starts = re.compile("|".join(self.starts)) + self.re_stops = re.compile("|".join(self.stops)) + + def locate(self, fasta_file, out_nuc, out_prot, out_bed, out_gff3): + seq_format = "fasta" + log.debug("Genetic code table %i" % self.table) + log.debug("Minimum length %i aa" % self.min_len) + + out_count = 0 + + out_gff3.write("##gff-version 3\n") + + for idx, record in enumerate(SeqIO.parse(fasta_file, seq_format)): + for i, (f_start, f_end, f_strand, n, t) in enumerate( + self.get_all_peptides(str(record.seq).upper()) + ): + out_count += 1 + + descr = "length %i aa, %i bp, from %s..%s[%s] of %s" % ( + len(t), + len(n), + f_start, + f_end, + f_strand, + record.description, + ) + fid = record.id + "|%s%i" % (self.ftype, i + 1) + + r = SeqRecord(Seq(n), id=fid, name="", description=descr) + t = SeqRecord(Seq(t), id=fid, name="", description=descr) + + SeqIO.write(r, out_nuc, "fasta") + SeqIO.write(t, out_prot, "fasta") + + nice_strand = "+" if f_strand == +1 else "-" + + out_bed.write( + "\t".join( + map(str, [record.id, f_start, f_end, fid, 0, nice_strand]) + ) + + "\n" + ) + + out_gff3.write( + "\t".join( + map( + str, + [ + record.id, + "getOrfsOrCds", + "CDS", + f_start + 1, + f_end, + ".", + nice_strand, + 0, + "ID=%s.%s.%s" % (self.ftype, idx, i + 1), + ], + ) + ) + + "\n" + ) + log.info("Found %i %ss", out_count, self.ftype) + + def start_chop_and_trans(self, s, strict=True): + """Returns offset, trimmed nuc, protein.""" + if strict: + assert s[-3:] in self.stops, s + assert len(s) % 3 == 0 + for match in self.re_starts.finditer(s, overlapped=True): + # Must check the start is in frame + start = match.start() + if start % 3 == 0: + n = s[start:] + assert len(n) % 3 == 0, "%s is len %i" % (n, len(n)) + if strict: + t = translate(n, self.table) + else: + # Use when missing stop codon, + t = "M" + translate(n[3:], self.table, to_stop=True) + yield start, n, t + + def break_up_frame(self, s): + """Returns offset, nuc, protein.""" + start = 0 + for match in self.re_stops.finditer(s, overlapped=True): + index = match.start() + 3 + if index % 3 != 0: + continue + n = s[start:index] + for (offset, n, t) in self.start_chop_and_trans(n): + if n and len(t) >= self.min_len: + yield start + offset, n, t + start = index + + def putative_genes_in_sequence(self, nuc_seq): + """Returns start, end, strand, nucleotides, protein. + Co-ordinates are Python style zero-based. + """ + nuc_seq = nuc_seq.upper() + # TODO - Refactor to use a generator function (in start order) + # rather than making a list and sorting? + answer = [] + full_len = len(nuc_seq) + + for frame in range(0, 3): + for offset, n, t in self.break_up_frame(nuc_seq[frame:]): + start = frame + offset # zero based + answer.append((start, start + len(n), +1, n, t)) + + rc = reverse_complement(nuc_seq) + for frame in range(0, 3): + for offset, n, t in self.break_up_frame(rc[frame:]): + start = full_len - frame - offset # zero based + answer.append((start, start - len(n), -1, n, t)) + answer.sort() + return answer + + def get_all_peptides(self, nuc_seq): + """Returns start, end, strand, nucleotides, protein. + + Co-ordinates are Python style zero-based. + """ + # Refactored into generator by CPT + + full_len = len(nuc_seq) + for frame in range(0, 3): + for offset, n, t in self.break_up_frame(nuc_seq[frame:]): + start = frame + offset # zero based + yield (start, start + len(n), +1, n, t) + rc = reverse_complement(nuc_seq) + for frame in range(0, 3): + for offset, n, t in self.break_up_frame(rc[frame:]): + start = full_len - frame - offset # zero based + yield (start - len(n), start, -1, n, t)
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/cpt_putative_osp/generate-putative-osp.py Fri Jun 17 13:09:00 2022 +0000 @@ -0,0 +1,304 @@ +#!/usr/bin/env python +import argparse +from cpt import OrfFinder +from Bio import SeqIO +from Bio import Seq +from CPT_GFFParser import gffParse, gffWrite +from spaninFuncs import * +import re +import os +import sys + +### Requirement Inputs +#### INPUT : Genomic FASTA +#### OUTPUT : Putative OSP candidates in FASTA format. +######### Optional OUTPUT: "Complete" potential ORFs, in BED/FASTAs/GFF3 formats +### Notes: +####### As of 2.13.2020 - RegEx pattern: [ACGSILMFTV][^REKD][GASNL]C for LipoRy +if __name__ == "__main__": + + # Common parameters for both ISP / OSP portion of scripts + + parser = argparse.ArgumentParser( + description="Get putative protein candidates for spanins" + ) + + parser.add_argument( + "fasta_file", type=argparse.FileType("r"), help="Fasta file" + ) # the "input" argument + + parser.add_argument( + "-f", + "--format", + dest="seq_format", + default="fasta", + help="Sequence format (e.g. fasta, fastq, sff)", + ) # optional formats for input, currently just going to do ntFASTA + + parser.add_argument( + "--strand", + dest="strand", + choices=("both", "forward", "reverse"), + default="both", + help="select strand", + ) # Selection of +, -, or both strands + + parser.add_argument( + "--table", dest="table", default=11, help="NCBI Translation table", type=int + ) # Uses "default" NCBI codon table. This should always (afaik) be what we want... + + parser.add_argument( + "-t", + "--ftype", + dest="ftype", + choices=("CDS", "ORF"), + default="ORF", + help="Find ORF or CDSs", + ) # "functional type(?)" --> Finds ORF or CDS, for this we want just the ORF + + parser.add_argument( + "-e", + "--ends", + dest="ends", + choices=("open", "closed"), + default="closed", + help="Open or closed. Closed ensures start/stop codons are present", + ) # includes the start and stop codon + + parser.add_argument( + "-m", + "--mode", + dest="mode", + choices=("all", "top", "one"), + default="all", # I think we want this to JUST be all...nearly always + help="Output all ORFs/CDSs from sequence, all ORFs/CDSs with max length, or first with maximum length", + ) + + parser.add_argument( + "--switch", + dest="switch", + default="all", + help="switch between ALL putative osps, or a range. If not all, insert a range of two integers separated by a colon (:). Eg: 1234:4321", + ) + + # osp parameters + parser.add_argument( + "--osp_min_len", + dest="osp_min_len", + default=30, + help="Minimum ORF length, measured in codons", + type=int, + ) + parser.add_argument( + "--max_osp", + dest="max_osp", + default=200, + help="Maximum ORF length, measured in codons", + type=int, + ) + parser.add_argument( + "--osp_on", + dest="out_osp_nuc", + type=argparse.FileType("w"), + default="_out_osp.fna", + help="Output nucleotide sequences, FASTA", + ) + parser.add_argument( + "--osp_op", + dest="out_osp_prot", + type=argparse.FileType("w"), + default="_out_osp.fa", + help="Output protein sequences, FASTA", + ) + parser.add_argument( + "--osp_ob", + dest="out_osp_bed", + type=argparse.FileType("w"), + default="_out_osp.bed", + help="Output BED file", + ) + parser.add_argument( + "--osp_og", + dest="out_osp_gff3", + type=argparse.FileType("w"), + default="_out_osp.gff3", + help="Output GFF3 file", + ) + parser.add_argument( + "--osp_min_dist", + dest="osp_min_dist", + default=10, + help="Minimal distance to first AA of lipobox, measured in AA", + type=int, + ) + parser.add_argument( + "--osp_max_dist", + dest="osp_max_dist", + default=50, + help="Maximum distance to first AA of lipobox, measured in AA", + type=int, + ) + parser.add_argument( + "--min_lipo_after", + dest="min_lipo_after", + default=25, + help="minimal amount of residues after lipobox", + type=int, + ) + parser.add_argument( + "--max_lipo_after", + dest="max_lipo_after", + default=170, + help="minimal amount of residues after lipobox", + type=int, + ) + parser.add_argument( + "--putative_osp", + dest="putative_osp_fa", + type=argparse.FileType("w"), + default="_putative_osp.fa", + help="Output of putative FASTA file", + ) + parser.add_argument( + "--summary_osp_txt", + dest="summary_osp_txt", + type=argparse.FileType("w"), + default="_summary_osp.txt", + help="Summary statistics on putative o-spanins", + ) + parser.add_argument( + "--putative_osp_gff", + dest="putative_osp_gff", + type=argparse.FileType("w"), + default="_putative_osp.gff3", + help="gff3 output for putative o-spanins", + ) + parser.add_argument( + "--osp_mode", + action="store_true", + default=True + ) + + # parser.add_argument('-v', action='version', version='0.3.0') # Is this manually updated? + args = parser.parse_args() + + the_args = vars(parser.parse_args()) + + ### osp output, naive ORF finding: + osps = OrfFinder(args.table, args.ftype, args.ends, args.osp_min_len, args.strand) + osps.locate( + args.fasta_file, + args.out_osp_nuc, + args.out_osp_prot, + args.out_osp_bed, + args.out_osp_gff3, + ) + + """ + ### For Control: Use T7 and lambda; + # Note the distance from start codon to lipobox region for t7 + o-spanin + 18,7-------------------------------------------------LIPO---------------------------------- + >T7_EOS MSTLRELRLRRALKEQSVRYLLSIKKTLPRWKGALIGLFLICVATISGCASESKLPESPMVSVDSSLMVEPNLTTEMLNVFSQ + -----------------------------LIPO---------------------------------------- + > lambda_EOS MLKLKMMLCVMMLPLVVVGCTSKQSVSQCVKPPPPPAWIMQPPPDWQTPLNGIISPSERG + """ + args.fasta_file.close() + args.fasta_file = open(args.fasta_file.name, "r") + args.out_osp_prot.close() + args.out_osp_prot = open(args.out_osp_prot.name, "r") + + pairs = tuple_fasta(fasta_file=args.out_osp_prot) + have_lipo = [] # empty candidates list to be passed through the user input + + for each_pair in pairs: + if len(each_pair[1]) <= args.max_osp: + try: + have_lipo += find_lipobox( + pair=each_pair, + minimum=args.osp_min_dist, + maximum=args.osp_max_dist, + min_after=args.min_lipo_after, + max_after=args.max_lipo_after, + osp_mode=args.osp_mode, + ) + except (IndexError, TypeError): + continue + + if args.switch == "all": + pass + else: + # for each_pair in have_lipo: + range_of = args.switch + range_of = re.search(("[\d]+:[\d]+"), range_of).group(0) + start = int(range_of.split(":")[0]) + end = int(range_of.split(":")[1]) + have_lipo = parse_a_range(pair=have_lipo, start=start, end=end) + # print(have_lipo) + # matches + + # have_lipo = [] + # have_lipo = matches + total_osp = len(have_lipo) + # print(have_lipo) + # print(total_osp) + + # print(type(have_lipo)) + + # for i in have_lipo: + # print(i) + + # export results in fasta format + ORF = [] + length = [] # grabbing length of the sequences + candidate_dict = {k: v for k, v in have_lipo} + with args.putative_osp_fa as f: + for desc, s in candidate_dict.items(): # description / sequence + f.write(">" + str(desc)) + f.write("\n" + lineWrapper(str(s).replace("*","")) + "\n") + length.append(len(s)) + ORF.append(desc) + if not length: + raise Exception("Parameters yielded no candidates.") + bot_size = min(length) + top_size = max(length) + avg = (sum(length)) / total_osp + n = len(length) + if n == 0: + raise Exception("no median for empty data") + if n % 2 == 1: + med = length[n // 2] + else: + i = n // 2 + med = (length[i - 1] + length[i]) / 2 + + args.out_osp_prot.close() + all_orfs = open(args.out_osp_prot.name, "r") + all_osps = open(args.putative_osp_fa.name, "r") + #record = SeqIO.read(all_orfs, "fasta") + #print(len(record)) + #### Extra stats + n = 0 + for line in all_orfs: + if line.startswith(">"): + n += 1 + all_orfs_counts = n + + c = 0 + for line in all_osps: + if line.startswith(">"): + c += 1 + all_osps_counts = c + + with args.summary_osp_txt as f: + f.write("total potential o-spanins: " + str(total_osp) + "\n") + f.write("average length (AA): " + str(avg) + "\n") + f.write("median length (AA): " + str(med) + "\n") + f.write("maximum orf in size (AA): " + str(top_size) + "\n") + f.write("minimum orf in size (AA): " + str(bot_size) + "\n") + #f.write(f"ratio of osps found from naive orfs: {c}/{n}") + f.write("ratio of osps found from naive orfs: "+ str(c) + "/" +str(n)) + # Output the putative list in gff3 format: + args.putative_osp_fa = open(args.putative_osp_fa.name, "r") + gff_data = prep_a_gff3(fa=args.putative_osp_fa, spanin_type="osp",org=args.fasta_file) + write_gff3(data=gff_data, output=args.putative_osp_gff)
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/cpt_putative_osp/generate-putative-osp.xml Fri Jun 17 13:09:00 2022 +0000 @@ -0,0 +1,75 @@ +<?xml version="1.1"?> +<tool id="edu.tamu.cpt2.spanin.generate-putative-osp" name="OSP candidates" version="1.0"> + <description>constructs a putative list of potential o-spanin from an input genomic FASTA</description> + <macros> + <import>macros.xml</import> + <import>cpt-macros.xml</import> + </macros> + <expand macro="requirements"> + </expand> + <command detect_errors="aggressive"><![CDATA[ +$__tool_directory__/generate-putative-osp.py +$fasta_file +--strand $strand +--switch $switch +--osp_on $osp_on +--osp_op $osp_op +--osp_ob $osp_ob +--osp_og $osp_og +--osp_min_len $osp_min_len +--putative_osp $putative_osp +--summary_osp_txt $summary_osp +--putative_osp_gff $putative_osp_gff +--min_lipo_after $lipo_min +--max_lipo_after $lipo_max +--osp_max $osp_max +]]></command> + <inputs> + <param type="select" label="Strand Choice" name="strand"> + <option value="both">both</option> + <option value="forward">+</option> + <option value="reverse">-</option> + </param> + <param label="Single Genome FASTA" name="fasta_file" type="data" format="fasta" /> + <param label="o-spanin minimal length" name="osp_min_len" type="integer" value="45" /> + <param label="o-spanin maximum length" name="osp_max" type="integer" value="200" /> + <param label="Range Selection; default is all; for a specific range to check for a spanin input integers separated by a colon (eg. 1234:4321)" type="text" name="switch" value="all" /> + <param label="Lipobox minimal distance from start codon" name="osp_min_dist" type="integer" value="10" /> + <param label="Lipobox maximum distance from start codon" name="osp_max_dist" type="integer" value="60" help="Searches for a Lipobox between Lipoboxmin and Lipoboxmax ie [Lipoboxmin,Lipoboxmax]" /> + <param label="Minimum amount of residues after lipobox is found" name="lipo_min" type="integer" value="25" /> + <param label="Maximum amount of residues after lipobox is found" name="lipo_max" type="integer" value="170" /> + </inputs> + <outputs> + <data format="fasta" name="osp_on" label="NucSequences.fa" hidden = "true"/> + <data format="fasta" name="osp_op" label="ProtSequences.fa" hidden = "true"/> + <data format="bed" name="osp_ob" label="BED_Output.bed" hidden = "true"/> + <data format="gff3" name="osp_og" label="GFF_Output.gff" hidden = "true"/> + <data format="fasta" name="putative_osp" label="putative_osp.fa"/> + <data format="txt" name="summary_osp" label="summary_osp.txt"/> + <data format="gff3" name="putative_osp_gff" label="putative_osp.gff3"/> + </outputs> + <help><![CDATA[ + +**What it does** +Searches a genome for candidate o-spanins (OSPs), a phage protein involved in outer membrane disruption during Gram-negative bacterial host cell lysis. + + +**METHODOLOGY** + +Locates ALL potential start sequences, based on TTG / ATG / GTG (M / L / V). This list is pared down to those within the user-set min/max lengths. That filtered list generates a set of files with the ORFs in FASTA (nt and aa), BED, and GFF3 file formats. + +For each sequence in the protein FASTA, the tool then checks within the user-specified range (min/max distance from start codon) for a regular expression (RegEx) to identify a potential lipobox. The following residues are allowed for the potential lipobox: + + * [ILMFTV][^REKD][GAS]C + * AW[AGS]C + +Finally, the protein list is filtered for size with user-set periplasmic length parameters, calculated as the number of residues after the putative lipobox. + +**INPUT** --> Genomic FASTA +*NOTE: This tool only takes a SINGLE genomic fasta. It does not work with multiFASTAs.* + +**OUTPUT** --> putative_osp.fa (FASTA) file, putative_osp.gff3, and basic summary statistics file as sumamry_osp.txt +Protein sequences which passed the above filters are returned as the candidate OSPs. +]]></help> + <expand macro="citations-crr" /> +</tool>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/cpt_putative_osp/macros.xml Fri Jun 17 13:09:00 2022 +0000 @@ -0,0 +1,63 @@ +<?xml version="1.0"?> +<macros> + <xml name="requirements"> + <requirements> + <requirement type="package" version="2019.06.05">regex</requirement> + <requirement type="package" version="3.6">python</requirement> + <requirement type="package" version="1.77">biopython</requirement> + <requirement type="package" version="1.1.7">cpt_gffparser</requirement> + <yield/> + </requirements> + </xml> + <token name="@BLAST_TSV@"> + "$blast_tsv" + </token> + <xml name="blast_tsv"> + <param label="Blast Results" help="TSV/tabular (25 Column)" + name="blast_tsv" type="data" format="tabular" /> + </xml> + + <token name="@BLAST_XML@"> + "$blast_xml" + </token> + <xml name="blast_xml"> + <param label="Blast Results" help="XML format" + name="blast_xml" type="data" format="blastxml" /> + </xml> + <xml name="gff3_with_fasta"> + <param label="Genome Sequences" name="fasta" type="data" format="fasta" /> + <param label="Genome Annotations" name="gff3" type="data" format="gff3" /> + </xml> + <xml name="genome_selector"> + <param name="genome_fasta" type="data" format="fasta" label="Source FASTA Sequence"/> + </xml> + <xml name="gff3_input"> + <param label="GFF3 Annotations" name="gff3_data" type="data" format="gff3"/> + </xml> + <xml name="input/gff3+fasta"> + <expand macro="gff3_input" /> + <expand macro="genome_selector" /> + </xml> + <token name="@INPUT_GFF@"> + "$gff3_data" + </token> + <token name="@INPUT_FASTA@"> + genomeref.fa + </token> + <token name="@GENOME_SELECTOR_PRE@"> + ln -s $genome_fasta genomeref.fa; + </token> + <token name="@GENOME_SELECTOR@"> + genomeref.fa + </token> + <xml name="input/fasta"> + <param label="Fasta file" name="sequences" type="data" format="fasta"/> + </xml> + + <token name="@SEQUENCE@"> + "$sequences" + </token> + <xml name="input/fasta/protein"> + <param label="Protein fasta file" name="sequences" type="data" format="fasta"/> + </xml> +</macros>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/cpt_putative_osp/spaninFuncs.py Fri Jun 17 13:09:00 2022 +0000 @@ -0,0 +1,469 @@ +""" +PREMISE +### Functions/Classes that are used in both generate-putative-osp.py and generate-putative-isp.py +###### Main premise here is to make the above scripts a little more DRY, as well as easily readable for execution. +###### Documentation will ATTEMPT to be thourough here +""" + +import re +from Bio import SeqIO +from Bio import Seq +from collections import OrderedDict + +# Not written in OOP for a LITTLE bit of trying to keep the complication down in case adjustments are needed by someone else. +# Much of the manipulation is string based; so it should be straightforward as well as moderately quick +################## GLobal Variables +Lys = "K" + + +def check_back_end_snorkels(seq, tmsize): + """ + Searches through the backend of a potential TMD snorkel. This is the 2nd part of a TMD snorkel lysine match. + --> seq : should be the sequence fed from the "search_region" portion of the sequence + --> tmsize : size of the potential TMD being investigated + """ + found = [] + if seq[tmsize - 4] == Lys and re.search(("[FIWLVMYCATGS]"), seq[tmsize - 5]): + found = "match" + return found + elif seq[tmsize - 3] == Lys and re.search(("[FIWLVMYCATGS]"), seq[tmsize - 4]): + found = "match" + return found + elif seq[tmsize - 2] == Lys and re.search(("[FIWLVMYCATGS]"), seq[tmsize - 3]): + found = "match" + return found + elif seq[tmsize - 1] == Lys and re.search(("[FIWLVMYCATGS]"), seq[tmsize - 2]): + found = "match" + return found + else: + found = "NOTmatch" + return found + + +def prep_a_gff3(fa, spanin_type, org): + """ + Function parses an input detailed 'fa' file and outputs a 'gff3' file + ---> fa = input .fa file + ---> output = output a returned list of data, easily portable to a gff3 next + ---> spanin_type = 'isp' or 'osp' + """ + with org as f: + header = f.readline() + orgacc = header.split(" ") + orgacc = orgacc[0].split(">")[1].strip() + fa_zip = tuple_fasta(fa) + data = [] + for a_pair in fa_zip: + # print(a_pair) + if re.search(("(\[1\])"), a_pair[0]): + strand = "+" + elif re.search(("(\[-1\])"), a_pair[0]): + strand = "-" # column 7 + start = re.search(("[\d]+\.\."), a_pair[0]).group(0).split("..")[0] # column 4 + end = re.search(("\.\.[\d]+"), a_pair[0]).group(0).split("..")[1] # column 5 + orfid = re.search(("(ORF)[\d]+"), a_pair[0]).group(0) # column 1 + if spanin_type == "isp": + methodtype = "CDS" # column 3 + spanin = "isp" + elif spanin_type == "osp": + methodtype = "CDS" # column 3 + spanin = "osp" + elif spanin_type == "usp": + methodtype = "CDS" + spanin = "usp" + else: + raise "need to input spanin type" + source = "cpt.py|putative-*.py" # column 2 + score = "." # column 6 + phase = "." # column 8 + attributes = "ID=" +orgacc+ "|"+ orfid + ";ALIAS=" + spanin + ";SEQ="+a_pair[1] # column 9 + sequence = [[orgacc, source, methodtype, start, end, score, strand, phase, attributes]] + data += sequence + return data + + +def write_gff3(data, output="results.gff3"): + """ + Parses results from prep_a_gff3 into a gff3 file + ---> input : list from prep_a_gff3 + ---> output : gff3 file + """ + data = data + filename = output + with filename as f: + f.write("#gff-version 3\n") + for value in data: + f.write( + "{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\n".format( + value[0], + value[1], + value[2], + value[3], + value[4], + value[5], + value[6], + value[7], + value[8], + ) + ) + f.close() + + +def find_tmd(pair, minimum=10, maximum=30, TMDmin=10, TMDmax=20, isp_mode=False, peri_min=18, peri_max=206): + """ + Function that searches for lysine snorkels and then for a spanning hydrophobic region that indicates a potential TMD + ---> pair : Input of tuple with description and AA sequence (str) + ---> minimum : How close from the initial start codon a TMD can be within + ---> maximum : How far from the initial start codon a TMD can be within + ---> TMDmin : The minimum size that a transmembrane can be (default = 10) + ---> TMDmax : The maximum size tha ta transmembrane can be (default = 20) + """ + # hydrophobicAAs = ['P', 'F', 'I', 'W', 'L', 'V', 'M', 'Y', 'C', 'A', 'T', 'G', 'S'] + tmd = [] + s = str(pair[1]) # sequence being analyzed + # print(s) # for trouble shooting + if maximum > len(s): + maximum = len(s) + search_region = s[minimum - 1 : maximum + 1] + #print(f"this is the search region: {search_region}") + # print(search_region) # for trouble shooting + + for tmsize in range(TMDmin, TMDmax+1, 1): + #print(f"this is the current tmsize we're trying: {tmsize}") + # print('==============='+str(tmsize)+'================') # print for troubleshooting + pattern = "[PFIWLVMYCATGS]{"+str(tmsize)+"}" # searches for these hydrophobic residues tmsize total times + #print(pattern) + #print(f"sending to regex: {search_region}") + if re.search( + ("[K]"), search_region[1:8]): # grabbing one below with search region, so I want to grab one ahead here when I query. + store_search = re.search(("[K]"), search_region[1:8]) # storing regex object + where_we_are = store_search.start() # finding where we got the hit + if re.search( + ("[PFIWLVMYCATGS]"), search_region[where_we_are + 1] + ) and re.search( + ("[PFIWLVMYCATGS]"), search_region[where_we_are - 1] + ): # hydrophobic neighbor + #try: + g = re.search(("[PFIWLVMYCATGS]"), search_region[where_we_are + 1]).group() + backend = check_back_end_snorkels(search_region, tmsize) + if backend == "match": + if isp_mode: + g = re.search((pattern), search_region).group() + end_of_tmd = re.search((g), s).end()+1 + amt_peri = len(s) - end_of_tmd + if peri_min <= amt_peri <= peri_max: + pair_desc = pair[0] + ", peri_count~="+str(amt_peri) + new_pair = (pair_desc,pair[1]) + tmd.append(new_pair) + else: + tmd.append(pair) + else: + continue + #else: + #print("I'm continuing out of snorkel loop") + #print(f"{search_region}") + #continue + if re.search((pattern), search_region): + #print(f"found match: {}") + #print("I AM HEREEEEEEEEEEEEEEEEEEEEEEE") + #try: + if isp_mode: + g = re.search((pattern), search_region).group() + end_of_tmd = re.search((g), s).end()+1 + amt_peri = len(s) - end_of_tmd + if peri_min <= amt_peri <= peri_max: + pair_desc = pair[0] + ", peri_count~="+str(amt_peri) + new_pair = (pair_desc,pair[1]) + tmd.append(new_pair) + else: + tmd.append(pair) + else: + continue + + return tmd + + +def find_lipobox(pair, minimum=10, maximum=50, min_after=30, max_after=185, regex=1, osp_mode=False): + """ + Function that takes an input tuple, and will return pairs of sequences to their description that have a lipoobox + ---> minimum - min distance from start codon to first AA of lipobox + ---> maximum - max distance from start codon to first AA of lipobox + ---> regex - option 1 (default) => more strict regular expression ; option 2 => looser selection, imported from LipoRy + + """ + if regex == 1: + pattern = "[ILMFTV][^REKD][GAS]C" # regex for Lipobox from findSpanin.pl + elif regex == 2: + pattern = "[ACGSILMFTV][^REKD][GAS]C" # regex for Lipobox from LipoRy + + candidates = [] + s = str(pair[1]) + # print(s) # trouble shooting + search_region = s[minimum-1 : maximum + 5] # properly slice the input... add 4 to catch if it hangs off at max input + # print(search_region) # trouble shooting + patterns = ["[ILMFTV][^REKD][GAS]C","AW[AGS]C"] + for pattern in patterns: + #print(pattern) # trouble shooting + if re.search((pattern), search_region): # lipobox must be WITHIN the range... + # searches the sequence with the input RegEx AND omits if + g = re.search((pattern), search_region).group() # find the exact group match + amt_peri = len(s) - re.search((g), s).end() + 1 + if min_after <= amt_peri <= max_after: # find the lipobox end region + if osp_mode: + pair_desc = pair[0] + ", peri_count~="+str(amt_peri) + new_pair = (pair_desc,pair[1]) + candidates.append(new_pair) + else: + candidates.append(pair) + + return candidates + + +def tuple_fasta(fasta_file): + """ + #### INPUT: Fasta File + #### OUTPUT: zipped (zip) : pairwise relationship of description to sequence + #### + """ + fasta = SeqIO.parse(fasta_file, "fasta") + descriptions = [] + sequences = [] + for r in fasta: # iterates and stores each description and sequence + description = r.description + sequence = str(r.seq) + if ( + sequence[0] != "I" + ): # the translation table currently has I as a potential start codon ==> this will remove all ORFs that start with I + descriptions.append(description) + sequences.append(sequence) + else: + continue + + return zip(descriptions, sequences) + + +def lineWrapper(text, charactersize=60): + + if len(text) <= charactersize: + return text + else: + return ( + text[:charactersize] + + "\n" + + lineWrapper(text[charactersize:], charactersize) + ) + + +def getDescriptions(fasta): + """ + Takes an output FASTA file, and parses retrieves the description headers. These headers contain information needed + for finding locations of a potential i-spanin and o-spanin proximity to one another. + """ + desc = [] + with fasta as f: + for line in f: + if line.startswith(">"): + desc.append(line) + return desc + + +def splitStrands(text, strand="+"): + # positive_strands = [] + # negative_strands = [] + if strand == "+": + if re.search(("(\[1\])"), text): + return text + elif strand == "-": + if re.search(("(\[-1\])"), text): + return text + # return positive_strands, negative_strands + + +def parse_a_range(pair, start, end): + """ + Takes an input data tuple from a fasta tuple pair and keeps only those within the input sequence range + ---> data : fasta tuple data + ---> start : start range to keep + ---> end : end range to keep (will need to + 1) + """ + matches = [] + for each_pair in pair: + + s = re.search(("[\d]+\.\."), each_pair[0]).group(0) # Start of the sequence + s = int(s.split("..")[0]) + e = re.search(("\.\.[\d]+"), each_pair[0]).group(0) + e = int(e.split("..")[1]) + if start - 1 <= s and e <= end + 1: + matches.append(each_pair) + else: + continue + # else: + # continue + # if matches != []: + return matches + # else: + # print('no candidates within selected range') + + +def grabLocs(text): + """ + Grabs the locations of the spanin based on NT location (seen from ORF). Grabs the ORF name, as per named from the ORF class/module + from cpt.py + """ + start = re.search(("[\d]+\.\."), text).group(0) # Start of the sequence ; looks for [numbers].. + end = re.search(("\.\.[\d]+"), text).group(0) # End of the sequence ; Looks for ..[numbers] + orf = re.search(("(ORF)[\d]+"), text).group(0) # Looks for ORF and the numbers that are after it + if re.search(("(\[1\])"), text): # stores strand + strand = "+" + elif re.search(("(\[-1\])"), text): # stores strand + strand = "-" + start = int(start.split("..")[0]) + end = int(end.split("..")[1]) + vals = [start, end, orf, strand] + + return vals + + +def spaninProximity(isp, osp, max_dist=30): + """ + _NOTE THIS FUNCTION COULD BE MODIFIED TO RETURN SEQUENCES_ + Compares the locations of i-spanins and o-spanins. max_dist is the distance in NT measurement from i-spanin END site + to o-spanin START. The user will be inputting AA distance, so a conversion will be necessary (<user_input> * 3) + I modified this on 07.30.2020 to bypass the pick + or - strand. To + INPUT: list of OSP and ISP candidates + OUTPUT: Return (improved) candidates for overlapping, embedded, and separate list + """ + + embedded = {} + overlap = {} + separate = {} + for iseq in isp: + embedded[iseq[2]] = [] + overlap[iseq[2]] = [] + separate[iseq[2]] = [] + for oseq in osp: + if iseq[3] == "+": + if oseq[3] == "+": + if iseq[0] < oseq[0] < iseq[1] and oseq[1] < iseq[1]: + ### EMBEDDED ### + combo = [ + iseq[0], + iseq[1], + oseq[2], + oseq[0], + oseq[1], + iseq[3], + ] # ordering a return for dic + embedded[iseq[2]] += [combo] + elif iseq[0] < oseq[0] <= iseq[1] and oseq[1] > iseq[1]: + ### OVERLAP / SEPARATE ### + if (iseq[1] - oseq[0]) < 6: + combo = [iseq[0], iseq[1], oseq[2], oseq[0], oseq[1],iseq[3]] + separate[iseq[2]] += [combo] + else: + combo = [iseq[0], iseq[1], oseq[2], oseq[0], oseq[1],iseq[3]] + overlap[iseq[2]] += [combo] + elif iseq[1] <= oseq[0] <= iseq[1] + max_dist: + combo = [iseq[0], iseq[1], oseq[2], oseq[0], oseq[1],iseq[3]] + separate[iseq[2]] += [combo] + else: + continue + if iseq[3] == "-": + if oseq[3] == "-": + if iseq[0] <= oseq[1] <= iseq[1] and oseq[0] > iseq[0]: + ### EMBEDDED ### + combo = [ + iseq[0], + iseq[1], + oseq[2], + oseq[0], + oseq[1], + iseq[3], + ] # ordering a return for dict + embedded[iseq[2]] += [combo] + elif iseq[0] <= oseq[1] <= iseq[1] and oseq[0] < iseq[0]: + if (oseq[1] - iseq[0]) < 6: + combo = [iseq[0], iseq[1], oseq[2], oseq[0], oseq[1],iseq[3]] + separate[iseq[2]] += [combo] + else: + combo = [iseq[0], iseq[1], oseq[2], oseq[0], oseq[1],iseq[3]] + overlap[iseq[2]] += [combo] + elif iseq[0] - 10 < oseq[1] < iseq[0]: + combo = [iseq[0], iseq[1], oseq[2], oseq[0], oseq[1],iseq[3]] + separate[iseq[2]] += [combo] + else: + continue + + embedded = {k: embedded[k] for k in embedded if embedded[k]} + overlap = {k: overlap[k] for k in overlap if overlap[k]} + separate = {k: separate[k] for k in separate if separate[k]} + + return embedded, overlap, separate + + +def check_for_usp(): + " pass " + +############################################### TEST RANGE ######################################################################### +#################################################################################################################################### +if __name__ == "__main__": + + #### TMD TEST + test_desc = ["one", "two", "three", "four", "five"] + test_seq = [ + "XXXXXXXXXXXXXXXFMCFMCFMCFMCFMCXXXXXXXXXXXXXXXXXXXXXXXXXX", + "XXXXXXXXAAKKKKKKKKKKKKKKKXXXXXXXXXXXXX", + "XXXXXXX", + "XXXXXXXXXXXKXXXXXXXXXX", + "XXXXXXXXXXAKXXXXXXXXXXAKXXXXXXXX", + ] + # for l in + # combo = zip(test_desc,test_seq) + pairs = zip(test_desc, test_seq) + tmd = [] + for each_pair in pairs: + # print(each_pair) + try: + tmd += find_tmd(pair=each_pair) + except (IndexError, TypeError): + continue + # try:s = each_pair[1] + # tmd += find_tmd(seq=s, tmsize=15) + # print('\n+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\n') + # print(tmd) + # print('\n+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\n') + + #### tuple-fasta TEST + # fasta_file = 'out_isp.fa' + # ret = tuple_fasta(fasta_file) + # print('=============') + # for i in ret: + # print(i[1]) + + #### LipoBox TEST + test_desc = ["one", "two", "three", "four", "five", "six", "seven"] + test_seq = [ + "XXXXXXXXXTGGCXXXXXXXXXXXXXXXX", + "XXXXXXXXAAKKKKKKKKKKKKKKKXXXXXXXXXXXXX", + "XXXXXXX", + "AGGCXXXXXXXXXXXXXXXXXXXXTT", + "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXTGGC", + "XXXXXXXXXXXXXXXXXXXXXXXXXXTGGC", + "MSTLRELRLRRALKEQSMRYLLSIKKTLPRWKGALIGLFLICVATISGCASESKLPEPPMVSVDSSLMVEPNLTTEMLNVFSQ*", + ] + pairs = zip(test_desc, test_seq) + lipo = [] + for each_pair in pairs: + #print(each_pair) + # try: + try: + lipo += find_lipobox(pair=each_pair, regex=2) # , minimum=8) + except TypeError: # catches if something doesnt have the min/max requirements (something is too small) + continue + # except: + # continue + # print('\n+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\n') + #############################3 + # g = prep_a_gff3(fa='putative_isp.fa', spanin_type='isp') + # print(g) + # write_gff3(data=g)