Mercurial > repos > cpt > cpt_sar_finder
changeset 1:112751823323 draft
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
author | cpt |
---|---|
date | Mon, 05 Jun 2023 02:52:57 +0000 |
parents | 9f62910edcc9 |
children | 373c8bca8a82 |
files | SAR_finder.py SAR_finder.xml SAR_functions.py biopython_parsing.py cpt-macros.xml cpt_sar_finder/SAR_finder.py cpt_sar_finder/SAR_finder.xml cpt_sar_finder/SAR_functions.py cpt_sar_finder/biopython_parsing.py cpt_sar_finder/cpt-macros.xml cpt_sar_finder/file_operations.py cpt_sar_finder/macros.xml cpt_sar_finder/test-data/candidate_SAR.fa cpt_sar_finder/test-data/candidate_SAR.gff3 cpt_sar_finder/test-data/candidate_SAR_stats.tsv cpt_sar_finder/test-data/simple-proteins.fa file_operations.py macros.xml test-data/candidate_SAR.fa test-data/candidate_SAR.gff3 test-data/candidate_SAR_stats.tsv test-data/simple-proteins.fa |
diffstat | 22 files changed, 894 insertions(+), 583 deletions(-) [+] |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/SAR_finder.py Mon Jun 05 02:52:57 2023 +0000 @@ -0,0 +1,76 @@ +import sys +import argparse +import os +import re +from biopython_parsing import FASTA_parser +from file_operations import fasta_from_SAR_dict, gff3_from_SAR_dict, tab_from_SAR_dict +from SAR_functions import CheckSequence + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="SAR Finder") + + parser.add_argument( + "fa", type=argparse.FileType("r"), help="organism's multi fasta file" + ) + + parser.add_argument( + "--min", type=int, default=20, help="minimum size of candidate peptide" + ) + + parser.add_argument( + "--max", type=int, default=200, help="maximum size of candidate peptide" + ) + + parser.add_argument( + "--sar_min", + type=int, + default=15, + help="minimum size of candidate peptide TMD domain", + ) + + parser.add_argument( + "--sar_max", + type=int, + default=24, + help="maximum size of candidate peptide TMD domain", + ) + + parser.add_argument( + "--out_fa", + type=argparse.FileType("w"), + help="multifasta output of candidate SAR proteins", + default="candidate_SAR.fa", + ) + + parser.add_argument( + "--out_stat", + type=argparse.FileType("w"), + help="summary statistic file for candidate SAR proteins, tab separated", + default="candidate_SAR_stats.tsv", + ) + + parser.add_argument( + "--out_gff3", + type=argparse.FileType("w"), + help="multigff3 file for candidate SAR proteins", + default="candidate_SAR.gff3", + ) + + args = parser.parse_args() + + fa_dict = FASTA_parser(fa=args.fa).multifasta_dict() + + sars = {} + + for protein_name, protein_data in fa_dict.items(): + sar = CheckSequence(protein_name, protein_data) + # sar.check_sizes(min=args.min,max=args.max) + hydros = sar.shrink_results(sar_min=args.sar_min, sar_max=args.sar_max) + sars.update(hydros) + + gff3_from_SAR_dict(sars, args.out_gff3) + tab_from_SAR_dict( + sars, args.out_stat, "SGAT", sar_min=args.sar_min, sar_max=args.sar_max + ) + fasta_from_SAR_dict(sars, args.out_fa) + # stat_file_from_SAR_dict(sars,args.out_stat,sar_min=args.sar_min,sar_max=args.sar_max) # fix this whenever ready.
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/SAR_finder.xml Mon Jun 05 02:52:57 2023 +0000 @@ -0,0 +1,73 @@ +<tool id="edu.tamu.cpt.sar.sar_finder" name="SAR Finder" version="1.0"> + <description>SAR Domain Finder</description> + <macros> + <import>macros.xml</import> + </macros> + <expand macro="requirements"> + </expand> + <command detect_errors="aggressive"><![CDATA[ +python '$__tool_directory__/SAR_finder.py' +'$fa' +--sar_min '$sar_min' +--sar_max '$sar_max' +--out_fa '$out_fa' +--out_gff3 '$out_gff3' +--out_stat '$out_stat' + ]]></command> + <inputs> + <param label="Multi FASTA File" name="fa" type="data" format="fasta"/> + <param label="SAR domain minimal size" name="sar_min" type="integer" value="15"/> + <param label="SAR domain maximum size" name="sar_max" type="integer" value="20"/> + </inputs> + <outputs> + <data format="tabular" name="out_stat" label="candidate_SAR_stats.tsv"/> + <data format="fasta" name="out_fa" label="candidate_SAR.fa"/> + <data format="gff3" name="out_gff3" label="candidate_SAR.gff3"/> + </outputs> + <tests> + <test> + <param name="fa" value="simple-proteins.fa"/> + <param name="sar_min" value="15"/> + <param name="sar_max" value="20"/> + <output name="out_stat" file="candidate_SAR_stats.tsv"/> + <output name="out_fa" file="candidate_SAR.fa"/> + <output name="out_gff3" file="candidate_SAR.gff3"/> + </test> + </tests> + <help><![CDATA[ + +**What it does** +A tool that analyzes the sequence within the first 50 residues of a protein for a weakly hydrophobic domain called Signal-Anchor-Release (aka SAR). +The tool finds proteins that contain a stretch (default 15-20 residues) of hydrophobic residues (Ile, Leu, Val, Phe, Tyr, Trp, Met, Gly, Ala, Ser) and +calculates the % Gly/Ala/Ser/Thr residues in the hydrophobic stretch. The net charge on the N-terminus is also displayed to aid in determining the +SAR orientation in the membrane.[2] + +Definition: A Signal-Anchor-Release (SAR) domain is an N-terminal, weakly hydrophobic transmembrane region rich in Gly/Ala and/or Ser (and sometimes Thr) residues. +The SAR domain is sometimes found in phage lysis proteins, including endolysins and holins. The SAR domain can be released from the membrane in a proton +motive force-dependent manner. Known SAR domains in phage endolysins often have >50-60% Gly/Ala/Ser/Thr content. SAR endolysins are expected to have a net positive +charge on the N-terminus by the positive-inside rule. + +**INPUT** --> Protein Multi FASTA + +**OUTPUT** --> + +* Multi FASTA with candidate proteins that pass the SAR domain criteria + +* Tabular summary file that lists every subdomain fitting the criteria for each potential SAR domain-containing protein with the following: protein name/sequence/length, SAR length/start/sequence/end, individual and total GAST% content in SAR, and N-terminal sequence/net charge + +* Multi GFF3 for unique candidate SAR domain-containing proteins + + ]]></help> + <citations> + <citation type="doi">10.1371/journal.pcbi.1008214</citation> + <citation type="doi">https://dx.doi.org/10.1016/bs.aivir.2018.09.003</citation> + <citation type="bibtex"> + @unpublished{galaxyTools, + author = {C. Ross}, + title = {CPT Galaxy Tools}, + year = {2020-}, + note = {https://github.com/tamu-cpt/galaxy-tools/} + } + </citation> + </citations> +</tool>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/SAR_functions.py Mon Jun 05 02:52:57 2023 +0000 @@ -0,0 +1,348 @@ +#!/usr/bin/env python + +import sys +import argparse +import os +import re +from Bio import SeqIO + + +class CheckSequence: + """ + SAR endolysin Verification class, which starts with complete FA file, and is shrunk by each function to reveal best candidates of SAR endolysin proteins + """ + + def __init__(self, protein_name, protein_data): + self.name = protein_name + self.seq = protein_data.seq + self.description = protein_data.description + self.size = len(self.seq) + self.store = {} + + def check_sizes(self, min, max): + """check the minimum and maximum peptide lengths""" + if self.size < min: + print("too small") + elif self.size > max: + print("too large") + else: + print(f"{self.name} : {self.seq}") + return True + + def check_hydrophobicity_and_charge( + self, sar_min=15, sar_max=20, perc_residues="SGAT" + ): + """verifies the existence of a hydrophobic region within the sequence""" + hydrophobic_residues = "['FIWLVMYCATGSP']" # fed through regex + hits = self.store + pos_res = "RK" + neg_res = "DE" + + if self.size > 50: + seq = self.seq[0:50] + else: + seq = self.seq + for sar_size in range(sar_min, sar_max, 1): + for i in range(0, len(seq) - sar_size, 1): + sar_seq = str(seq[i : i + sar_size]) + if re.search( + (hydrophobic_residues + "{" + str(sar_size) + "}"), sar_seq + ): + ( + charge_seq, + charge, + perc_cont, + sar_coords, + nterm_coords, + cterm_coords, + sar_start, + sar_end, + ) = rep_funcs( + self, seq, i, pos_res, neg_res, sar_seq, perc_residues, sar_size + ) + storage_dict( + self=self, + sar_size=sar_size, + sar_seq=sar_seq, + hits=hits, + charge_seq=charge_seq, + charge=charge, + perc_cont=perc_cont, + nterm_coords=nterm_coords, + sar_coords=sar_coords, + cterm_coords=cterm_coords, + sar_start=sar_start, + sar_end=sar_end, + ) + # print("TMDSIZE: {}\tINDEX: {}".format(sar_size,i+1)) + elif "K" in sar_seq[0] and re.search( + (hydrophobic_residues + "{" + str(sar_size - 1) + "}"), sar_seq[1:] + ): # check frontend snorkels + ( + charge_seq, + charge, + perc_cont, + sar_coords, + nterm_coords, + cterm_coords, + sar_start, + sar_end, + ) = rep_funcs( + self, seq, i, pos_res, neg_res, sar_seq, perc_residues, sar_size + ) + storage_dict( + self=self, + sar_size=sar_size, + sar_seq=sar_seq, + hits=hits, + charge_seq=charge_seq, + charge=charge, + perc_cont=perc_cont, + nterm_coords=nterm_coords, + sar_coords=sar_coords, + cterm_coords=cterm_coords, + sar_start=sar_start, + sar_end=sar_end, + ) + # print("TMDSIZE: {}\tINDEX: {}".format(sar_size,i+1)) + elif "K" in sar_seq[-1] and re.search( + (hydrophobic_residues + "{" + str(sar_size - 1) + "}"), sar_seq[:-1] + ): # check backend snorkels + ( + charge_seq, + charge, + perc_cont, + sar_coords, + nterm_coords, + cterm_coords, + sar_start, + sar_end, + ) = rep_funcs( + self, seq, i, pos_res, neg_res, sar_seq, perc_residues, sar_size + ) + storage_dict( + self=self, + sar_size=sar_size, + sar_seq=sar_seq, + hits=hits, + charge_seq=charge_seq, + charge=charge, + perc_cont=perc_cont, + nterm_coords=nterm_coords, + sar_coords=sar_coords, + cterm_coords=cterm_coords, + sar_start=sar_start, + sar_end=sar_end, + ) + # print("TMDSIZE: {}\tINDEX: {}".format(sar_size,i+1)) + continue + + return hits + + def shrink_results(self, sar_min=15, sar_max=20, perc_residues="SGAT"): + """removes repetiive hits, keeps only the shortest and longest of each SAR domain""" + compare_candidates = {} + hits = self.check_hydrophobicity_and_charge(sar_min=sar_min, sar_max=sar_max) + for sar_name, data in hits.items(): + # print(sar_name) + compare_candidates[sar_name] = {} + # print("\nThese are the values: {}".format(v)) + # count_of_times = 0 + tmd_log = [] + for sar_size in range(sar_max, sar_min - 1, -1): + if "TMD_" + str(sar_size) in data: + tmd_log.append(sar_size) + # print(tmd_log) + for idx, the_data in enumerate(data["TMD_" + str(sar_size)]): + # print(the_data[7]) + # print(the_data) + # print(f"This is the index: {idx}") + # print(f"This is the list of data at this index: {the_data}") + if ( + the_data[7] in compare_candidates[sar_name] + ): # index to start + compare_candidates[sar_name][the_data[7]]["count"] += 1 + compare_candidates[sar_name][the_data[7]]["size"].append( + sar_size + ) + compare_candidates[sar_name][the_data[7]]["index"].append( + idx + ) + else: + compare_candidates[sar_name][the_data[7]] = {} + compare_candidates[sar_name][the_data[7]]["count"] = 1 + compare_candidates[sar_name][the_data[7]]["size"] = [ + sar_size + ] + compare_candidates[sar_name][the_data[7]]["index"] = [idx] + hits[sar_name]["biggest_sar"] = tmd_log[0] + for sar_name, compare_data in compare_candidates.items(): + for data in compare_data.values(): + if len(data["size"]) >= 3: + # print(f"{each_size} --> {data}") + minmax = [min(data["size"]), max(data["size"])] + nonminmax = [x for x in data["size"] if x not in minmax] + nonminmax_index = [] + for each_nonminmax in nonminmax: + v = data["size"].index(each_nonminmax) + x = data["index"][v] + nonminmax_index.append(x) + nons = zip(nonminmax, nonminmax_index) + for value in nons: + # hits[sar_name]["TMD_"+str(value[0])] = hits[sar_name]["TMD_"+str(value[0])].pop(value[1]) + hits[sar_name]["TMD_" + str(value[0])][value[1]] = [""] + + return hits + + +def rep_funcs(self, seq, loc, pos_res, neg_res, sar_seq, perc_residues, sar_size): + """run a set of functions together before sending the results to the storage dictionary""" + + charge_seq = str(seq[:loc]) + charge = charge_check(charge_seq, pos_res, neg_res) + perc_cont = percent_calc(sar_seq, perc_residues, int(sar_size)) + sar_start = loc + sar_end = loc + sar_size + sar_coords = "{}..{}".format(loc, loc + sar_size) + nterm_coords = "{}..{}".format("0", loc - 1) + cterm_coords = "{}..{}".format(loc + sar_size + 1, self.size) + + return ( + charge_seq, + charge, + perc_cont, + sar_coords, + nterm_coords, + cterm_coords, + sar_start, + sar_end, + ) + + +### Extra "helper" functions +def storage_dict( + self, + sar_size, + sar_seq, + hits, + charge_seq, + charge, + perc_cont, + nterm_coords, + sar_coords, + cterm_coords, + sar_start, + sar_end, +): # probably not good to call "self" a param here...definitley not PEP approved... + """organize dictionary for hydrophobicity check""" + if self.name not in hits: + hits[self.name] = {} + hits[self.name]["description"] = str(self.description) + hits[self.name]["sequence"] = str(self.seq) + hits[self.name]["size"] = str(self.size) + # GAcont = str((str(self.seq).count("G")+str(self.seq).count("A"))/int(self.size)*100) + # hits[self.name]["GAcont"] = "{:.2f}%".format(float(GAcont)) + if "TMD_" + str(sar_size) not in hits[self.name]: + hits[self.name]["TMD_" + str(sar_size)] = [] + hits[self.name]["TMD_" + str(sar_size)].append( + [ + sar_seq, + charge_seq, + charge, + perc_cont, + nterm_coords, + sar_coords, + cterm_coords, + sar_start, + sar_end, + ] + ) + else: + hits[self.name]["TMD_" + str(sar_size)].append( + [ + sar_seq, + charge_seq, + charge, + perc_cont, + nterm_coords, + sar_coords, + cterm_coords, + sar_start, + sar_end, + ] + ) + else: + if "TMD_" + str(sar_size) not in hits[self.name]: + hits[self.name]["TMD_" + str(sar_size)] = [] + hits[self.name]["TMD_" + str(sar_size)].append( + [ + sar_seq, + charge_seq, + charge, + perc_cont, + nterm_coords, + sar_coords, + cterm_coords, + sar_start, + sar_end, + ] + ) + else: + hits[self.name]["TMD_" + str(sar_size)].append( + [ + sar_seq, + charge_seq, + charge, + perc_cont, + nterm_coords, + sar_coords, + cterm_coords, + sar_start, + sar_end, + ] + ) + + +def percent_calc(sequence, residues, size): + """Calculate the percent of a set of residues within an input sequence""" + counted = {} + for aa in sequence: + # print(aa) + if aa in counted: + counted[aa] += 1 + else: + counted[aa] = 1 + residue_amt = 0 + my_ratios = [] + for res_of_interest in residues: + try: + residue_amt = counted[res_of_interest] + except KeyError: + residue_amt = 0 + ratio = residue_amt / size + my_ratios.append((round(ratio * 100, 2))) + + res_rat = list(zip(residues, my_ratios)) + + return res_rat + + +def charge_check(charge_seq, pos_res, neg_res): + charge = 0 + for aa in charge_seq: + if aa in pos_res: + charge += 1 + if aa in neg_res: + charge -= 1 + return charge + + +if __name__ == "__main__": + sequence = "MAGBYYYTRLCVRKLRKGGGHP" + residues = "YL" + size = len(sequence) + print(size) + v = percent_calc(sequence, residues, size) + print(v) + for i in v: + print(i)
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/biopython_parsing.py Mon Jun 05 02:52:57 2023 +0000 @@ -0,0 +1,24 @@ +#!/usr/bin/env python +# Biopython parsing module. Uses in conjunction with the sar_finder script, and potential future scripts down the line. + +from Bio import SeqIO + + +class FASTA_parser: + """Parses multi fasta file, and zips together header with sequence""" + + def __init__(self, fa): + self.fa = fa + + def multifasta_dict(self): + """parses the input multi fasta, and puts results into dictionary""" + + return SeqIO.to_dict(SeqIO.parse(self.fa, "fasta")) + + +if __name__ == "__main__": + fa_file = "test-data/mu-proteins.fa" + d = FASTA_parser(fa_file).multifasta_dict() + print(d) + for k, v in d.items(): + print(v.description)
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/cpt-macros.xml Mon Jun 05 02:52:57 2023 +0000 @@ -0,0 +1,115 @@ +<macros> + <xml name="gff_requirements"> + <requirements> + <requirement type="package" version="2.7">python</requirement> + <requirement type="package" version="1.65">biopython</requirement> + <requirement type="package" version="2.12.1">requests</requirement> + <requirement type="package" version="1.2.2">cpt_gffparser</requirement> + <yield/> + </requirements> + <version_command> + <![CDATA[ + cd '$__tool_directory__' && git rev-parse HEAD + ]]> + </version_command> + </xml> + <xml name="citation/mijalisrasche"> + <citation type="doi">10.1371/journal.pcbi.1008214</citation> + <citation type="bibtex">@unpublished{galaxyTools, + author = {E. Mijalis, H. Rasche}, + title = {CPT Galaxy Tools}, + year = {2013-2017}, + note = {https://github.com/tamu-cpt/galaxy-tools/} + } + </citation> + </xml> + <xml name="citations"> + <citations> + <citation type="doi">10.1371/journal.pcbi.1008214</citation> + <citation type="bibtex"> + @unpublished{galaxyTools, + author = {E. Mijalis, H. Rasche}, + title = {CPT Galaxy Tools}, + year = {2013-2017}, + note = {https://github.com/tamu-cpt/galaxy-tools/} + } + </citation> + <yield/> + </citations> + </xml> + <xml name="citations-crr"> + <citations> + <citation type="doi">10.1371/journal.pcbi.1008214</citation> + <citation type="bibtex"> + @unpublished{galaxyTools, + author = {C. Ross}, + title = {CPT Galaxy Tools}, + year = {2020-}, + note = {https://github.com/tamu-cpt/galaxy-tools/} + } + </citation> + <yield/> + </citations> + </xml> + <xml name="citations-2020"> + <citations> + <citation type="doi">10.1371/journal.pcbi.1008214</citation> + <citation type="bibtex"> + @unpublished{galaxyTools, + author = {E. Mijalis, H. Rasche}, + title = {CPT Galaxy Tools}, + year = {2013-2017}, + note = {https://github.com/tamu-cpt/galaxy-tools/} + } + </citation> + <citation type="bibtex"> + @unpublished{galaxyTools, + author = {A. Criscione}, + title = {CPT Galaxy Tools}, + year = {2019-2021}, + note = {https://github.com/tamu-cpt/galaxy-tools/} + } + </citation> + <yield/> + </citations> + </xml> + <xml name="citations-2020-AJC-solo"> + <citations> + <citation type="doi">10.1371/journal.pcbi.1008214</citation> + <citation type="bibtex"> + @unpublished{galaxyTools, + author = {A. Criscione}, + title = {CPT Galaxy Tools}, + year = {2019-2021}, + note = {https://github.com/tamu-cpt/galaxy-tools/} + } + </citation> + <yield/> + </citations> + </xml> + <xml name="citations-clm"> + <citations> + <citation type="doi">10.1371/journal.pcbi.1008214</citation> + <citation type="bibtex"> + @unpublished{galaxyTools, + author = {C. Maughmer}, + title = {CPT Galaxy Tools}, + year = {2017-2020}, + note = {https://github.com/tamu-cpt/galaxy-tools/} + } + </citation> + <yield/> + </citations> + </xml> + <xml name="sl-citations-clm"> + <citation type="bibtex"> + @unpublished{galaxyTools, + author = {C. Maughmer}, + title = {CPT Galaxy Tools}, + year = {2017-2020}, + note = {https://github.com/tamu-cpt/galaxy-tools/} + } + </citation> + <yield/> + </xml> +</macros>
--- a/cpt_sar_finder/SAR_finder.py Fri Jun 17 13:15:55 2022 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,44 +0,0 @@ -import sys -import argparse -import os -import re -from biopython_parsing import FASTA_parser -from file_operations import fasta_from_SAR_dict, gff3_from_SAR_dict, tab_from_SAR_dict -from SAR_functions import CheckSequence - -if __name__ == "__main__": - parser = argparse.ArgumentParser(description="SAR Finder") - - parser.add_argument("fa",type=argparse.FileType("r"),help="organism's multi fasta file") - - parser.add_argument("--min",type=int,default=20,help="minimum size of candidate peptide") - - parser.add_argument("--max",type=int,default=200,help="maximum size of candidate peptide") - - parser.add_argument("--sar_min",type=int,default=15,help="minimum size of candidate peptide TMD domain") - - parser.add_argument("--sar_max",type=int,default=24,help="maximum size of candidate peptide TMD domain") - - parser.add_argument("--out_fa",type=argparse.FileType("w"),help="multifasta output of candidate SAR proteins",default="candidate_SAR.fa") - - parser.add_argument("--out_stat",type=argparse.FileType("w"),help="summary statistic file for candidate SAR proteins, tab separated",default="candidate_SAR_stats.tsv") - - parser.add_argument("--out_gff3",type=argparse.FileType("w"),help="multigff3 file for candidate SAR proteins",default="candidate_SAR.gff3") - - args = parser.parse_args() - - fa_dict = FASTA_parser(fa=args.fa).multifasta_dict() - - sars = {} - - for protein_name, protein_data in fa_dict.items(): - sar = CheckSequence(protein_name, protein_data) - #sar.check_sizes(min=args.min,max=args.max) - hydros = sar.shrink_results(sar_min=args.sar_min, sar_max=args.sar_max) - sars.update(hydros) - - - gff3_from_SAR_dict(sars, args.out_gff3) - tab_from_SAR_dict(sars,args.out_stat,"SGAT",sar_min=args.sar_min, sar_max=args.sar_max) - fasta_from_SAR_dict(sars,args.out_fa) - #stat_file_from_SAR_dict(sars,args.out_stat,sar_min=args.sar_min,sar_max=args.sar_max) # fix this whenever ready. \ No newline at end of file
--- a/cpt_sar_finder/SAR_finder.xml Fri Jun 17 13:15:55 2022 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,73 +0,0 @@ -<tool id="edu.tamu.cpt.sar.sar_finder" name="SAR Finder" version="1.0"> - <description>SAR Domain Finder</description> - <macros> - <import>macros.xml</import> - </macros> - <expand macro="requirements"> - </expand> - <command detect_errors="aggressive"><![CDATA[ -python $__tool_directory__/SAR_finder.py -$fa ---sar_min $sar_min ---sar_max $sar_max ---out_fa $out_fa ---out_gff3 $out_gff3 ---out_stat $out_stat - ]]></command> - <inputs> - <param label="Multi FASTA File" name="fa" type="data" format="fasta" /> - <param label="SAR domain minimal size" name="sar_min" type="integer" value="15" /> - <param label="SAR domain maximum size" name="sar_max" type="integer" value="20" /> - </inputs> - <outputs> - <data format="tabular" name="out_stat" label="candidate_SAR_stats.tsv"/> - <data format="fasta" name="out_fa" label="candidate_SAR.fa"/> - <data format="gff3" name="out_gff3" label="candidate_SAR.gff3"/> - </outputs> - <tests> - <test> - <param name="fa" value="simple-proteins.fa"/> - <param name="sar_min" value="15"/> - <param name="sar_max" value="20"/> - <output name="out_stat" file="candidate_SAR_stats.tsv"/> - <output name="out_fa" file="candidate_SAR.fa"/> - <output name="out_gff3" file="candidate_SAR.gff3"/> - </test> - </tests> - <help><![CDATA[ - -**What it does** -A tool that analyzes the sequence within the first 50 residues of a protein for a weakly hydrophobic domain called Signal-Anchor-Release (aka SAR). -The tool finds proteins that contain a stretch (default 15-20 residues) of hydrophobic residues (Ile, Leu, Val, Phe, Tyr, Trp, Met, Gly, Ala, Ser) and -calculates the % Gly/Ala/Ser/Thr residues in the hydrophobic stretch. The net charge on the N-terminus is also displayed to aid in determining the -SAR orientation in the membrane.[2] - -Definition: A Signal-Anchor-Release (SAR) domain is an N-terminal, weakly hydrophobic transmembrane region rich in Gly/Ala and/or Ser (and sometimes Thr) residues. -The SAR domain is sometimes found in phage lysis proteins, including endolysins and holins. The SAR domain can be released from the membrane in a proton -motive force-dependent manner. Known SAR domains in phage endolysins often have >50-60% Gly/Ala/Ser/Thr content. SAR endolysins are expected to have a net positive -charge on the N-terminus by the positive-inside rule. - -**INPUT** --> Protein Multi FASTA - -**OUTPUT** --> - -* Multi FASTA with candidate proteins that pass the SAR domain criteria - -* Tabular summary file that lists every subdomain fitting the criteria for each potential SAR domain-containing protein with the following: protein name/sequence/length, SAR length/start/sequence/end, individual and total GAST% content in SAR, and N-terminal sequence/net charge - -* Multi GFF3 for unique candidate SAR domain-containing proteins - - ]]></help> - <citations> - <citation type="doi">10.1371/journal.pcbi.1008214</citation> - <citation type="doi">https://dx.doi.org/10.1016/bs.aivir.2018.09.003</citation> - <citation type="bibtex"> - @unpublished{galaxyTools, - author = {C. Ross}, - title = {CPT Galaxy Tools}, - year = {2020-}, - note = {https://github.com/tamu-cpt/galaxy-tools/} - } - </citation> - </citations> -</tool>
--- a/cpt_sar_finder/SAR_functions.py Fri Jun 17 13:15:55 2022 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,193 +0,0 @@ -#!/usr/bin/env python - -import sys -import argparse -import os -import re -from Bio import SeqIO - - -class CheckSequence: - """ - SAR endolysin Verification class, which starts with complete FA file, and is shrunk by each function to reveal best candidates of SAR endolysin proteins - """ - - - def __init__(self, protein_name, protein_data): - self.name = protein_name - self.seq = protein_data.seq - self.description = protein_data.description - self.size = len(self.seq) - self.store = {} - - - def check_sizes(self,min,max): - """ check the minimum and maximum peptide lengths """ - if self.size < min: - print("too small") - elif self.size > max: - print("too large") - else: - print(f"{self.name} : {self.seq}") - return True - - - def check_hydrophobicity_and_charge(self,sar_min=15,sar_max=20,perc_residues="SGAT"): - """ verifies the existence of a hydrophobic region within the sequence """ - hydrophobic_residues = "['FIWLVMYCATGSP']" # fed through regex - hits = self.store - pos_res = "RK" - neg_res = "DE" - - if self.size > 50: - seq = self.seq[0:50] - else: - seq = self.seq - for sar_size in range(sar_min, sar_max, 1): - for i in range(0,len(seq)-sar_size,1): - sar_seq = str(seq[i:i+sar_size]) - if re.search((hydrophobic_residues+"{"+str(sar_size)+"}"),sar_seq): - charge_seq, charge, perc_cont, sar_coords, nterm_coords, cterm_coords, sar_start, sar_end = rep_funcs(self,seq,i,pos_res,neg_res,sar_seq,perc_residues,sar_size) - storage_dict(self=self,sar_size=sar_size,sar_seq=sar_seq,hits=hits,charge_seq=charge_seq,charge=charge,perc_cont=perc_cont,nterm_coords=nterm_coords,sar_coords=sar_coords,cterm_coords=cterm_coords,sar_start=sar_start,sar_end=sar_end) - #print("TMDSIZE: {}\tINDEX: {}".format(sar_size,i+1)) - elif "K" in sar_seq[0] and re.search((hydrophobic_residues+"{"+str(sar_size-1)+"}"),sar_seq[1:]): # check frontend snorkels - charge_seq, charge, perc_cont, sar_coords, nterm_coords, cterm_coords, sar_start, sar_end = rep_funcs(self,seq,i,pos_res,neg_res,sar_seq,perc_residues,sar_size) - storage_dict(self=self,sar_size=sar_size,sar_seq=sar_seq,hits=hits,charge_seq=charge_seq,charge=charge,perc_cont=perc_cont,nterm_coords=nterm_coords,sar_coords=sar_coords,cterm_coords=cterm_coords,sar_start=sar_start,sar_end=sar_end) - #print("TMDSIZE: {}\tINDEX: {}".format(sar_size,i+1)) - elif "K" in sar_seq[-1] and re.search((hydrophobic_residues+"{"+str(sar_size-1)+"}"),sar_seq[:-1]): # check backend snorkels - charge_seq, charge, perc_cont, sar_coords, nterm_coords, cterm_coords, sar_start, sar_end = rep_funcs(self,seq,i,pos_res,neg_res,sar_seq,perc_residues,sar_size) - storage_dict(self=self,sar_size=sar_size,sar_seq=sar_seq,hits=hits,charge_seq=charge_seq,charge=charge,perc_cont=perc_cont,nterm_coords=nterm_coords,sar_coords=sar_coords,cterm_coords=cterm_coords,sar_start=sar_start,sar_end=sar_end) - #print("TMDSIZE: {}\tINDEX: {}".format(sar_size,i+1)) - continue - - return hits - - def shrink_results(self,sar_min=15,sar_max=20,perc_residues="SGAT"): - """ removes repetiive hits, keeps only the shortest and longest of each SAR domain """ - compare_candidates = {} - hits = self.check_hydrophobicity_and_charge(sar_min=sar_min,sar_max=sar_max) - for sar_name, data in hits.items(): - #print(sar_name) - compare_candidates[sar_name] = {} - #print("\nThese are the values: {}".format(v)) - #count_of_times = 0 - tmd_log = [] - for sar_size in range(sar_max,sar_min-1,-1): - if "TMD_"+str(sar_size) in data: - tmd_log.append(sar_size) - #print(tmd_log) - for idx,the_data in enumerate(data["TMD_"+str(sar_size)]): - #print(the_data[7]) - #print(the_data) - #print(f"This is the index: {idx}") - #print(f"This is the list of data at this index: {the_data}") - if the_data[7] in compare_candidates[sar_name]: # index to start - compare_candidates[sar_name][the_data[7]]["count"] += 1 - compare_candidates[sar_name][the_data[7]]["size"].append(sar_size) - compare_candidates[sar_name][the_data[7]]["index"].append(idx) - else: - compare_candidates[sar_name][the_data[7]] = {} - compare_candidates[sar_name][the_data[7]]["count"] = 1 - compare_candidates[sar_name][the_data[7]]["size"] = [sar_size] - compare_candidates[sar_name][the_data[7]]["index"] = [idx] - hits[sar_name]["biggest_sar"] = tmd_log[0] - for sar_name, compare_data in compare_candidates.items(): - for data in compare_data.values(): - if len(data["size"]) >= 3: - #print(f"{each_size} --> {data}") - minmax = [min(data["size"]),max(data["size"])] - nonminmax = [x for x in data["size"] if x not in minmax] - nonminmax_index = [] - for each_nonminmax in nonminmax: - v = data["size"].index(each_nonminmax) - x = data["index"][v] - nonminmax_index.append(x) - nons = zip(nonminmax,nonminmax_index) - for value in nons: - #hits[sar_name]["TMD_"+str(value[0])] = hits[sar_name]["TMD_"+str(value[0])].pop(value[1]) - hits[sar_name]["TMD_"+str(value[0])][value[1]] = [""] - - return hits - - -def rep_funcs(self,seq,loc,pos_res,neg_res,sar_seq,perc_residues,sar_size): - """ run a set of functions together before sending the results to the storage dictionary """ - - charge_seq = str(seq[:loc]) - charge = charge_check(charge_seq,pos_res,neg_res) - perc_cont = percent_calc(sar_seq,perc_residues,int(sar_size)) - sar_start = loc - sar_end = loc + sar_size - sar_coords = "{}..{}".format(loc,loc+sar_size) - nterm_coords = "{}..{}".format("0",loc-1) - cterm_coords = "{}..{}".format(loc+sar_size+1,self.size) - - return charge_seq, charge, perc_cont, sar_coords, nterm_coords, cterm_coords, sar_start, sar_end - - -### Extra "helper" functions -def storage_dict(self,sar_size,sar_seq,hits,charge_seq,charge,perc_cont,nterm_coords,sar_coords,cterm_coords,sar_start,sar_end): # probably not good to call "self" a param here...definitley not PEP approved... - """ organize dictionary for hydrophobicity check """ - if self.name not in hits: - hits[self.name] = {} - hits[self.name]["description"] = str(self.description) - hits[self.name]["sequence"] = str(self.seq) - hits[self.name]["size"] = str(self.size) - #GAcont = str((str(self.seq).count("G")+str(self.seq).count("A"))/int(self.size)*100) - #hits[self.name]["GAcont"] = "{:.2f}%".format(float(GAcont)) - if "TMD_"+str(sar_size) not in hits[self.name]: - hits[self.name]["TMD_"+str(sar_size)] = [] - hits[self.name]["TMD_"+str(sar_size)].append([sar_seq,charge_seq,charge,perc_cont,nterm_coords,sar_coords,cterm_coords,sar_start,sar_end]) - else: - hits[self.name]["TMD_"+str(sar_size)].append([sar_seq,charge_seq,charge,perc_cont,nterm_coords,sar_coords,cterm_coords,sar_start,sar_end]) - else: - if "TMD_"+str(sar_size) not in hits[self.name]: - hits[self.name]["TMD_"+str(sar_size)] = [] - hits[self.name]["TMD_"+str(sar_size)].append([sar_seq,charge_seq,charge,perc_cont,nterm_coords,sar_coords,cterm_coords,sar_start,sar_end]) - else: - hits[self.name]["TMD_"+str(sar_size)].append([sar_seq,charge_seq,charge,perc_cont,nterm_coords,sar_coords,cterm_coords,sar_start,sar_end]) - - -def percent_calc(sequence,residues,size): - """ Calculate the percent of a set of residues within an input sequence """ - counted = {} - for aa in sequence: - #print(aa) - if aa in counted: - counted[aa] += 1 - else: - counted[aa] = 1 - residue_amt = 0 - my_ratios = [] - for res_of_interest in residues: - try: - residue_amt = counted[res_of_interest] - except KeyError: - residue_amt = 0 - ratio = residue_amt/size - my_ratios.append((round(ratio*100,2))) - - res_rat = list(zip(residues,my_ratios)) - - return res_rat - - -def charge_check(charge_seq,pos_res,neg_res): - charge = 0 - for aa in charge_seq: - if aa in pos_res: - charge += 1 - if aa in neg_res: - charge -= 1 - return charge - -if __name__ == "__main__": - sequence = "MAGBYYYTRLCVRKLRKGGGHP" - residues = "YL" - size = len(sequence) - print(size) - v = percent_calc(sequence,residues,size) - print(v) - for i in v: - print(i) -
--- a/cpt_sar_finder/biopython_parsing.py Fri Jun 17 13:15:55 2022 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,24 +0,0 @@ -#!/usr/bin/env python -# Biopython parsing module. Uses in conjunction with the sar_finder script, and potential future scripts down the line. - -from Bio import SeqIO - -class FASTA_parser: - """ Parses multi fasta file, and zips together header with sequence """ - - def __init__(self, fa): - self.fa = fa - - def multifasta_dict(self): - """ parses the input multi fasta, and puts results into dictionary """ - - return SeqIO.to_dict(SeqIO.parse(self.fa,"fasta")) - - -if __name__ == "__main__": - fa_file = "test-data/mu-proteins.fa" - d = FASTA_parser(fa_file).multifasta_dict() - print(d) - for k, v in d.items(): - print(v.description) -
--- a/cpt_sar_finder/cpt-macros.xml Fri Jun 17 13:15:55 2022 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,115 +0,0 @@ -<?xml version="1.0"?> -<macros> - <xml name="gff_requirements"> - <requirements> - <requirement type="package" version="2.7">python</requirement> - <requirement type="package" version="1.65">biopython</requirement> - <requirement type="package" version="2.12.1">requests</requirement> - <yield/> - </requirements> - <version_command> - <![CDATA[ - cd $__tool_directory__ && git rev-parse HEAD - ]]> - </version_command> - </xml> - <xml name="citation/mijalisrasche"> - <citation type="doi">10.1371/journal.pcbi.1008214</citation> - <citation type="bibtex">@unpublished{galaxyTools, - author = {E. Mijalis, H. Rasche}, - title = {CPT Galaxy Tools}, - year = {2013-2017}, - note = {https://github.com/tamu-cpt/galaxy-tools/} - } - </citation> - </xml> - <xml name="citations"> - <citations> - <citation type="doi">10.1371/journal.pcbi.1008214</citation> - <citation type="bibtex"> - @unpublished{galaxyTools, - author = {E. Mijalis, H. Rasche}, - title = {CPT Galaxy Tools}, - year = {2013-2017}, - note = {https://github.com/tamu-cpt/galaxy-tools/} - } - </citation> - <yield/> - </citations> - </xml> - <xml name="citations-crr"> - <citations> - <citation type="doi">10.1371/journal.pcbi.1008214</citation> - <citation type="bibtex"> - @unpublished{galaxyTools, - author = {C. Ross}, - title = {CPT Galaxy Tools}, - year = {2020-}, - note = {https://github.com/tamu-cpt/galaxy-tools/} - } - </citation> - <yield/> - </citations> - </xml> - <xml name="citations-2020"> - <citations> - <citation type="doi">10.1371/journal.pcbi.1008214</citation> - <citation type="bibtex"> - @unpublished{galaxyTools, - author = {E. Mijalis, H. Rasche}, - title = {CPT Galaxy Tools}, - year = {2013-2017}, - note = {https://github.com/tamu-cpt/galaxy-tools/} - } - </citation> - <citation type="bibtex"> - @unpublished{galaxyTools, - author = {A. Criscione}, - title = {CPT Galaxy Tools}, - year = {2019-2021}, - note = {https://github.com/tamu-cpt/galaxy-tools/} - } - </citation> - <yield/> - </citations> - </xml> - <xml name="citations-2020-AJC-solo"> - <citations> - <citation type="doi">10.1371/journal.pcbi.1008214</citation> - <citation type="bibtex"> - @unpublished{galaxyTools, - author = {A. Criscione}, - title = {CPT Galaxy Tools}, - year = {2019-2021}, - note = {https://github.com/tamu-cpt/galaxy-tools/} - } - </citation> - <yield/> - </citations> - </xml> - <xml name="citations-clm"> - <citations> - <citation type="doi">10.1371/journal.pcbi.1008214</citation> - <citation type="bibtex"> - @unpublished{galaxyTools, - author = {C. Maughmer}, - title = {CPT Galaxy Tools}, - year = {2017-2020}, - note = {https://github.com/tamu-cpt/galaxy-tools/} - } - </citation> - <yield/> - </citations> - </xml> - <xml name="sl-citations-clm"> - <citation type="bibtex"> - @unpublished{galaxyTools, - author = {C. Maughmer}, - title = {CPT Galaxy Tools}, - year = {2017-2020}, - note = {https://github.com/tamu-cpt/galaxy-tools/} - } - </citation> - <yield/> - </xml> -</macros>
--- a/cpt_sar_finder/file_operations.py Fri Jun 17 13:15:55 2022 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,98 +0,0 @@ - -def fasta_from_SAR_dict(sar_dict,fa_file): - """ makes a multi fasta with candidates from SAR dictionary """ - with fa_file as f: - for data in sar_dict.values(): - f.writelines(">{}\n".format(data["description"])) - f.writelines("{}\n".format(data["sequence"])) - -def gff3_from_SAR_dict(sar_dict,gff3_file): - """ make a multi gff3 with candidates from SAR dictionary """ - gff3_cols = ["Seqid","Source","Type","Start","End","Score","Strand","Phase","Attributes"] - with gff3_file as f: - f.writelines(f"{gff3_cols[0]}\t{gff3_cols[1]}\t{gff3_cols[2]}\t{gff3_cols[3]}\t{gff3_cols[4]}\t{gff3_cols[5]}\t{gff3_cols[6]}\t{gff3_cols[7]}\t{gff3_cols[8]}\n") - if sar_dict: - #print(sar_dict) - for name, data in sar_dict.items(): - min_idx = 0 - f.writelines("##gff-version 3\n") - f.writelines(f"##sequence-region {name}\n") - n_start, n_end = split_seq_string(data["TMD_"+str(data["biggest_sar"])][min_idx][4]) - sar_start, sar_end = split_seq_string(data["TMD_"+str(data["biggest_sar"])][min_idx][5]) - c_start, c_end = split_seq_string(data["TMD_"+str(data["biggest_sar"])][min_idx][6]) - f.writelines(f'{name}\tSAR_finder\tTopological domain\t{n_start}\t{n_end}\t.\t.\t.\tNote=N-terminal net charge is {data["TMD_"+str(data["biggest_sar"])][min_idx][2]}\n') - f.writelines(f'{name}\tSAR_finder\tSAR domain\t{sar_start}\t{sar_end}\t.\t.\t.\tNote=residue % in SAR {[perc for perc in data["TMD_"+str(data["biggest_sar"])][min_idx][3]]},Total % is {round(sum(j for i,j in data["TMD_"+str(data["biggest_sar"])][min_idx][3]),2)}\n') - f.writelines(f'{name}\tSAR_finder\tTopological domain\t{c_start}\t{c_end}\t.\t.\t.\tNote=C-terminus\n') - else: - f.writelines("##gff-version 3\n") - f.writelines(f"##sequence-region\n") - - -def tab_from_SAR_dict(sar_dict,stat_file,hydrophillic_res, sar_min, sar_max): - """ convert SAR dict to a dataframe """ - columns = ["Name","Protein Sequence","Protein Length","SAR Length","SAR Start","Putative SAR Sequence","SAR End",[f"{res}%" for res in hydrophillic_res],"% Total","N-term Sequence","N-term net Charge"] # using different residues for percent calc: [f"{res}%" for res in hydrophillic_res] - with stat_file as f: - f.writelines(f"{columns[0]}\t{columns[1]}\t{columns[2]}\t{columns[3]}\t{columns[4]}\t{columns[5]}\t{columns[6]}\t{columns[7]}\t{columns[8]}\t{columns[9]}\t{columns[10]}\n") - if sar_dict: - #print(sar_dict) - for name, data in sar_dict.items(): - for tmd_size in range(sar_max, sar_min-1, -1): - if "TMD_"+str(tmd_size) in data: - for each_match in data["TMD_"+str(tmd_size)]: - if each_match != [""]: - #print(f"{name} - {data}") - #print(each_match) - #for perc in each_match[3]: - # print(perc) - try: - f.writelines(f'{name}\t{data["sequence"]}\t{data["size"]}\t{tmd_size}\t{int(each_match[7])+1}\t{each_match[0]}\t{int(each_match[8])+1}\t{[perc for perc in each_match[3]]}\t{round(sum(j for i,j in each_match[3]),2)}\t{each_match[1]}\t{each_match[2]}\n') - except IndexError: - f.writelines(f'ERROR\tERROR\tERROR\tERROR\tERROR\tERROR\tERROR\tERROR\tERROR\tERROR\tERROR\n') - else: - continue - -def stat_file_from_SAR_dict(sar_dict, stat_file, sar_min, sar_max): - """ summary statistics from SAR finder function """ - with stat_file as f: - f.writelines("..........:::::: Candidate SAR Proteins ::::::..........\n\n") - if sar_dict: - for data in sar_dict.values(): - f.writelines("Protein Description and Name: {}\n".format(data["description"])) - f.writelines("Protein Sequence: {}\n".format(data["sequence"])) - f.writelines("Protein Length: {}\n".format(data["size"])) - f.writelines("SAR Criteria matching region(s)\n") - for tmd_size in range(sar_max, sar_min-1, -1): - if "TMD_"+str(tmd_size) in data: - f.writelines("\nSAR length of {}:\n".format(tmd_size)) - for each_match in data["TMD_"+str(tmd_size)]: - if each_match != ['']: - f.writelines("\nPotential SAR domain sequence: {}\n".format(each_match[0])) - f.writelines("N-term sequence: {}\n".format(each_match[1])) - f.writelines("N-term net charge: {}\n".format(each_match[2])) - for each_perc_calc in each_match[3]: - f.writelines("Percent {} content: {}%\n".format(each_perc_calc[0],each_perc_calc[1])) - f.writelines("N-term coords: {}\n".format(each_match[4])) - f.writelines("SAR coords: {}\n".format(each_match[5])) - f.writelines("C-term coords: {}\n".format(each_match[6])) - f.writelines("SAR start: {}\n".format(each_match[7])) - else: - continue - f.writelines("========================================================\n\n") - else: - f.writelines("No candidate SAR Proteins found") - -def split_seq_string(input_range, python_indexing=True): - """ splits a #..# sequence into the two respective starts and ends, if python indexing, adds 1, otherwise keeps """ - if python_indexing: - values = input_range.split("..") - start =int(values[0]) + 1 - end = int(values[1]) + 1 - else: - values = input_range.split("..") - start = values[0] - end = values[1] - - return start, end - -if __name__ == "__main__": - pass \ No newline at end of file
--- a/cpt_sar_finder/macros.xml Fri Jun 17 13:15:55 2022 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,15 +0,0 @@ -<?xml version="1.0"?> -<macros> - <xml name="requirements"> - <requirements> - <requirement type="package" version="3.6.1">python</requirement> - <requirement type="package" version="1.67">biopython</requirement> - <yield/> - </requirements> - <version_command> - <![CDATA[ - cd $__tool_directory__ && git rev-parse HEAD - ]]> - </version_command> - </xml> -</macros> \ No newline at end of file
--- a/cpt_sar_finder/test-data/candidate_SAR.fa Fri Jun 17 13:15:55 2022 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,2 +0,0 @@ ->SAR-endolysin -MAGIPKKLKAALLAVTIAGGGVGGYQEMTRQSLIHLENIAYMPYRDIAGVLTVCVGHTGPDIEMRRYSHAECMALLDSDLKPVYAAIDRLVRVPLTPYQKTALATFIFNTGVTAFSKSTLLKKLNAGDYAGARDQMARWVFAAGHKWKGLMNRREVEMAIWNIRGADDLRQ
--- a/cpt_sar_finder/test-data/candidate_SAR.gff3 Fri Jun 17 13:15:55 2022 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,6 +0,0 @@ -Seqid Source Type Start End Score Strand Phase Attributes -##gff-version 3 -##sequence-region SAR-endolysin -SAR-endolysin SAR_finder Topological domain 1 8 . . . Note=N-terminal net charge is 2 -SAR-endolysin SAR_finder SAR domain 9 26 . . . Note=residue % in SAR [('S', 0.0), ('G', 29.41), ('A', 23.53), ('T', 5.88)],Total % is 58.82 -SAR-endolysin SAR_finder Topological domain 27 172 . . . Note=C-terminus
--- a/cpt_sar_finder/test-data/candidate_SAR_stats.tsv Fri Jun 17 13:15:55 2022 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,6 +0,0 @@ -Name Protein Sequence Protein Length SAR Length SAR Start Putative SAR Sequence SAR End ['S%', 'G%', 'A%', 'T%'] % Total N-term Sequence N-term net Charge -SAR-endolysin MAGIPKKLKAALLAVTIAGGGVGGYQEMTRQSLIHLENIAYMPYRDIAGVLTVCVGHTGPDIEMRRYSHAECMALLDSDLKPVYAAIDRLVRVPLTPYQKTALATFIFNTGVTAFSKSTLLKKLNAGDYAGARDQMARWVFAAGHKWKGLMNRREVEMAIWNIRGADDLRQ 171 17 9 KAALLAVTIAGGGVGGY 26 [('S', 0.0), ('G', 29.41), ('A', 23.53), ('T', 5.88)] 58.82 MAGIPKKL 2 -SAR-endolysin MAGIPKKLKAALLAVTIAGGGVGGYQEMTRQSLIHLENIAYMPYRDIAGVLTVCVGHTGPDIEMRRYSHAECMALLDSDLKPVYAAIDRLVRVPLTPYQKTALATFIFNTGVTAFSKSTLLKKLNAGDYAGARDQMARWVFAAGHKWKGLMNRREVEMAIWNIRGADDLRQ 171 16 10 AALLAVTIAGGGVGGY 26 [('S', 0.0), ('G', 31.25), ('A', 25.0), ('T', 6.25)] 62.5 MAGIPKKLK 3 -SAR-endolysin MAGIPKKLKAALLAVTIAGGGVGGYQEMTRQSLIHLENIAYMPYRDIAGVLTVCVGHTGPDIEMRRYSHAECMALLDSDLKPVYAAIDRLVRVPLTPYQKTALATFIFNTGVTAFSKSTLLKKLNAGDYAGARDQMARWVFAAGHKWKGLMNRREVEMAIWNIRGADDLRQ 171 15 9 KAALLAVTIAGGGVG 24 [('S', 0.0), ('G', 26.67), ('A', 26.67), ('T', 6.67)] 60.01 MAGIPKKL 2 -SAR-endolysin MAGIPKKLKAALLAVTIAGGGVGGYQEMTRQSLIHLENIAYMPYRDIAGVLTVCVGHTGPDIEMRRYSHAECMALLDSDLKPVYAAIDRLVRVPLTPYQKTALATFIFNTGVTAFSKSTLLKKLNAGDYAGARDQMARWVFAAGHKWKGLMNRREVEMAIWNIRGADDLRQ 171 15 10 AALLAVTIAGGGVGG 25 [('S', 0.0), ('G', 33.33), ('A', 26.67), ('T', 6.67)] 66.67 MAGIPKKLK 3 -SAR-endolysin MAGIPKKLKAALLAVTIAGGGVGGYQEMTRQSLIHLENIAYMPYRDIAGVLTVCVGHTGPDIEMRRYSHAECMALLDSDLKPVYAAIDRLVRVPLTPYQKTALATFIFNTGVTAFSKSTLLKKLNAGDYAGARDQMARWVFAAGHKWKGLMNRREVEMAIWNIRGADDLRQ 171 15 11 ALLAVTIAGGGVGGY 26 [('S', 0.0), ('G', 33.33), ('A', 20.0), ('T', 6.67)] 60.0 MAGIPKKLKA 3
--- a/cpt_sar_finder/test-data/simple-proteins.fa Fri Jun 17 13:15:55 2022 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,7 +0,0 @@ ->SAR-endolysin -MAGIPKKLKAALLAVTIAGGGVGGYQEMTRQSLIHLENIAYMPYRDIAGVLTVCVGHTGP -DIEMRRYSHAECMALLDSDLKPVYAAIDRLVRVPLTPYQKTALATFIFNTGVTAFSKSTL -LKKLNAGDYAGARDQMARWVFAAGHKWKGLMNRREVEMAIWNIRGADDLRQ ->CPT_NC_000929.1_038 -MLKIKPAAGKAIRDPLTMKLLASEGEEKPRNSFWIRRLAAGDVVEVGSTENTADDTDAAP -KKRSKSK \ No newline at end of file
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/file_operations.py Mon Jun 05 02:52:57 2023 +0000 @@ -0,0 +1,163 @@ +def fasta_from_SAR_dict(sar_dict, fa_file): + """makes a multi fasta with candidates from SAR dictionary""" + with fa_file as f: + for data in sar_dict.values(): + f.writelines(">{}\n".format(data["description"])) + f.writelines("{}\n".format(data["sequence"])) + + +def gff3_from_SAR_dict(sar_dict, gff3_file): + """make a multi gff3 with candidates from SAR dictionary""" + gff3_cols = [ + "Seqid", + "Source", + "Type", + "Start", + "End", + "Score", + "Strand", + "Phase", + "Attributes", + ] + with gff3_file as f: + f.writelines( + f"{gff3_cols[0]}\t{gff3_cols[1]}\t{gff3_cols[2]}\t{gff3_cols[3]}\t{gff3_cols[4]}\t{gff3_cols[5]}\t{gff3_cols[6]}\t{gff3_cols[7]}\t{gff3_cols[8]}\n" + ) + if sar_dict: + # print(sar_dict) + for name, data in sar_dict.items(): + min_idx = 0 + f.writelines("##gff-version 3\n") + f.writelines(f"##sequence-region {name}\n") + n_start, n_end = split_seq_string( + data["TMD_" + str(data["biggest_sar"])][min_idx][4] + ) + sar_start, sar_end = split_seq_string( + data["TMD_" + str(data["biggest_sar"])][min_idx][5] + ) + c_start, c_end = split_seq_string( + data["TMD_" + str(data["biggest_sar"])][min_idx][6] + ) + f.writelines( + f'{name}\tSAR_finder\tTopological domain\t{n_start}\t{n_end}\t.\t.\t.\tNote=N-terminal net charge is {data["TMD_"+str(data["biggest_sar"])][min_idx][2]}\n' + ) + f.writelines( + f'{name}\tSAR_finder\tSAR domain\t{sar_start}\t{sar_end}\t.\t.\t.\tNote=residue % in SAR {[perc for perc in data["TMD_"+str(data["biggest_sar"])][min_idx][3]]},Total % is {round(sum(j for i,j in data["TMD_"+str(data["biggest_sar"])][min_idx][3]),2)}\n' + ) + f.writelines( + f"{name}\tSAR_finder\tTopological domain\t{c_start}\t{c_end}\t.\t.\t.\tNote=C-terminus\n" + ) + else: + f.writelines("##gff-version 3\n") + f.writelines(f"##sequence-region\n") + + +def tab_from_SAR_dict(sar_dict, stat_file, hydrophillic_res, sar_min, sar_max): + """convert SAR dict to a dataframe""" + columns = [ + "Name", + "Protein Sequence", + "Protein Length", + "SAR Length", + "SAR Start", + "Putative SAR Sequence", + "SAR End", + [f"{res}%" for res in hydrophillic_res], + "% Total", + "N-term Sequence", + "N-term net Charge", + ] # using different residues for percent calc: [f"{res}%" for res in hydrophillic_res] + with stat_file as f: + f.writelines( + f"{columns[0]}\t{columns[1]}\t{columns[2]}\t{columns[3]}\t{columns[4]}\t{columns[5]}\t{columns[6]}\t{columns[7]}\t{columns[8]}\t{columns[9]}\t{columns[10]}\n" + ) + if sar_dict: + # print(sar_dict) + for name, data in sar_dict.items(): + for tmd_size in range(sar_max, sar_min - 1, -1): + if "TMD_" + str(tmd_size) in data: + for each_match in data["TMD_" + str(tmd_size)]: + if each_match != [""]: + # print(f"{name} - {data}") + # print(each_match) + # for perc in each_match[3]: + # print(perc) + try: + f.writelines( + f'{name}\t{data["sequence"]}\t{data["size"]}\t{tmd_size}\t{int(each_match[7])+1}\t{each_match[0]}\t{int(each_match[8])+1}\t{[perc for perc in each_match[3]]}\t{round(sum(j for i,j in each_match[3]),2)}\t{each_match[1]}\t{each_match[2]}\n' + ) + except IndexError: + f.writelines( + f"ERROR\tERROR\tERROR\tERROR\tERROR\tERROR\tERROR\tERROR\tERROR\tERROR\tERROR\n" + ) + else: + continue + + +def stat_file_from_SAR_dict(sar_dict, stat_file, sar_min, sar_max): + """summary statistics from SAR finder function""" + with stat_file as f: + f.writelines("..........:::::: Candidate SAR Proteins ::::::..........\n\n") + if sar_dict: + for data in sar_dict.values(): + f.writelines( + "Protein Description and Name: {}\n".format(data["description"]) + ) + f.writelines("Protein Sequence: {}\n".format(data["sequence"])) + f.writelines("Protein Length: {}\n".format(data["size"])) + f.writelines("SAR Criteria matching region(s)\n") + for tmd_size in range(sar_max, sar_min - 1, -1): + if "TMD_" + str(tmd_size) in data: + f.writelines("\nSAR length of {}:\n".format(tmd_size)) + for each_match in data["TMD_" + str(tmd_size)]: + if each_match != [""]: + f.writelines( + "\nPotential SAR domain sequence: {}\n".format( + each_match[0] + ) + ) + f.writelines( + "N-term sequence: {}\n".format(each_match[1]) + ) + f.writelines( + "N-term net charge: {}\n".format(each_match[2]) + ) + for each_perc_calc in each_match[3]: + f.writelines( + "Percent {} content: {}%\n".format( + each_perc_calc[0], each_perc_calc[1] + ) + ) + f.writelines( + "N-term coords: {}\n".format(each_match[4]) + ) + f.writelines("SAR coords: {}\n".format(each_match[5])) + f.writelines( + "C-term coords: {}\n".format(each_match[6]) + ) + f.writelines("SAR start: {}\n".format(each_match[7])) + else: + continue + f.writelines( + "========================================================\n\n" + ) + else: + f.writelines("No candidate SAR Proteins found") + + +def split_seq_string(input_range, python_indexing=True): + """splits a #..# sequence into the two respective starts and ends, if python indexing, adds 1, otherwise keeps""" + if python_indexing: + values = input_range.split("..") + start = int(values[0]) + 1 + end = int(values[1]) + 1 + else: + values = input_range.split("..") + start = values[0] + end = values[1] + + return start, end + + +if __name__ == "__main__": + pass
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/macros.xml Mon Jun 05 02:52:57 2023 +0000 @@ -0,0 +1,74 @@ +<macros> + <xml name="requirements"> + <requirements> + <requirement type="package">progressivemauve</requirement> + <!--<requirement type="package" version="2.7">python</requirement>--> + <requirement type="package" version="0.6.4">bcbiogff</requirement> + <yield/> + </requirements> + </xml> + <token name="@WRAPPER_VERSION@">2.4.0</token> + <xml name="citation/progressive_mauve"> + <citation type="doi">10.1371/journal.pone.0011147</citation> + </xml> + <xml name="citation/gepard"> + <citation type="doi">10.1093/bioinformatics/btm039</citation> + </xml> + <token name="@XMFA_INPUT@"> + '$xmfa' + </token> + <xml name="xmfa_input" token_formats="xmfa"> + <param type="data" format="@FORMATS@" name="xmfa" label="XMFA MSA"/> + </xml> + <token name="@XMFA_FA_INPUT@"> + '$sequences' + </token> + <xml name="xmfa_fa_input"> + <param type="data" format="fasta" name="sequences" label="Sequences in alignment" help="These sequences should be the SAME DATASET that was used in the progressiveMauve run. Failing that, they should be provided in the same order as in original progressiveMauve run"/> + </xml> + <xml name="genome_selector"> + <conditional name="reference_genome"> + <param name="reference_genome_source" type="select" label="Reference Genome"> + <option value="history" selected="True">From History</option> + <option value="cached">Locally Cached</option> + </param> + <when value="cached"> + <param name="fasta_indexes" type="select" label="Source FASTA Sequence"> + <options from_data_table="all_fasta"/> + </param> + </when> + <when value="history"> + <param name="genome_fasta" type="data" format="fasta" label="Source FASTA Sequence"/> + </when> + </conditional> + </xml> + <xml name="gff3_input"> + <param label="GFF3 Annotations" name="gff3_data" type="data" format="gff3"/> + </xml> + <xml name="input/gff3+fasta"> + <expand macro="gff3_input"/> + <expand macro="genome_selector"/> + </xml> + <token name="@INPUT_GFF@"> + '$gff3_data' + </token> + <token name="@INPUT_FASTA@"> + #if str($reference_genome.reference_genome_source) == 'cached': + '${reference_genome.fasta_indexes.fields.path}' + #else if str($reference_genome.reference_genome_source) == 'history': + genomeref.fa + #end if + </token> + <token name="@GENOME_SELECTOR_PRE@"> + #if $reference_genome.reference_genome_source == 'history': + ln -s '$reference_genome.genome_fasta' genomeref.fa; + #end if + </token> + <token name="@GENOME_SELECTOR@"> + #if str($reference_genome.reference_genome_source) == 'cached': + '${reference_genome.fasta_indexes.fields.path}' + #else if str($reference_genome.reference_genome_source) == 'history': + genomeref.fa + #end if + </token> +</macros>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/candidate_SAR.fa Mon Jun 05 02:52:57 2023 +0000 @@ -0,0 +1,2 @@ +>SAR-endolysin +MAGIPKKLKAALLAVTIAGGGVGGYQEMTRQSLIHLENIAYMPYRDIAGVLTVCVGHTGPDIEMRRYSHAECMALLDSDLKPVYAAIDRLVRVPLTPYQKTALATFIFNTGVTAFSKSTLLKKLNAGDYAGARDQMARWVFAAGHKWKGLMNRREVEMAIWNIRGADDLRQ
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/candidate_SAR.gff3 Mon Jun 05 02:52:57 2023 +0000 @@ -0,0 +1,6 @@ +Seqid Source Type Start End Score Strand Phase Attributes +##gff-version 3 +##sequence-region SAR-endolysin +SAR-endolysin SAR_finder Topological domain 1 8 . . . Note=N-terminal net charge is 2 +SAR-endolysin SAR_finder SAR domain 9 26 . . . Note=residue % in SAR [('S', 0.0), ('G', 29.41), ('A', 23.53), ('T', 5.88)],Total % is 58.82 +SAR-endolysin SAR_finder Topological domain 27 172 . . . Note=C-terminus
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/candidate_SAR_stats.tsv Mon Jun 05 02:52:57 2023 +0000 @@ -0,0 +1,6 @@ +Name Protein Sequence Protein Length SAR Length SAR Start Putative SAR Sequence SAR End ['S%', 'G%', 'A%', 'T%'] % Total N-term Sequence N-term net Charge +SAR-endolysin MAGIPKKLKAALLAVTIAGGGVGGYQEMTRQSLIHLENIAYMPYRDIAGVLTVCVGHTGPDIEMRRYSHAECMALLDSDLKPVYAAIDRLVRVPLTPYQKTALATFIFNTGVTAFSKSTLLKKLNAGDYAGARDQMARWVFAAGHKWKGLMNRREVEMAIWNIRGADDLRQ 171 17 9 KAALLAVTIAGGGVGGY 26 [('S', 0.0), ('G', 29.41), ('A', 23.53), ('T', 5.88)] 58.82 MAGIPKKL 2 +SAR-endolysin MAGIPKKLKAALLAVTIAGGGVGGYQEMTRQSLIHLENIAYMPYRDIAGVLTVCVGHTGPDIEMRRYSHAECMALLDSDLKPVYAAIDRLVRVPLTPYQKTALATFIFNTGVTAFSKSTLLKKLNAGDYAGARDQMARWVFAAGHKWKGLMNRREVEMAIWNIRGADDLRQ 171 16 10 AALLAVTIAGGGVGGY 26 [('S', 0.0), ('G', 31.25), ('A', 25.0), ('T', 6.25)] 62.5 MAGIPKKLK 3 +SAR-endolysin MAGIPKKLKAALLAVTIAGGGVGGYQEMTRQSLIHLENIAYMPYRDIAGVLTVCVGHTGPDIEMRRYSHAECMALLDSDLKPVYAAIDRLVRVPLTPYQKTALATFIFNTGVTAFSKSTLLKKLNAGDYAGARDQMARWVFAAGHKWKGLMNRREVEMAIWNIRGADDLRQ 171 15 9 KAALLAVTIAGGGVG 24 [('S', 0.0), ('G', 26.67), ('A', 26.67), ('T', 6.67)] 60.01 MAGIPKKL 2 +SAR-endolysin MAGIPKKLKAALLAVTIAGGGVGGYQEMTRQSLIHLENIAYMPYRDIAGVLTVCVGHTGPDIEMRRYSHAECMALLDSDLKPVYAAIDRLVRVPLTPYQKTALATFIFNTGVTAFSKSTLLKKLNAGDYAGARDQMARWVFAAGHKWKGLMNRREVEMAIWNIRGADDLRQ 171 15 10 AALLAVTIAGGGVGG 25 [('S', 0.0), ('G', 33.33), ('A', 26.67), ('T', 6.67)] 66.67 MAGIPKKLK 3 +SAR-endolysin MAGIPKKLKAALLAVTIAGGGVGGYQEMTRQSLIHLENIAYMPYRDIAGVLTVCVGHTGPDIEMRRYSHAECMALLDSDLKPVYAAIDRLVRVPLTPYQKTALATFIFNTGVTAFSKSTLLKKLNAGDYAGARDQMARWVFAAGHKWKGLMNRREVEMAIWNIRGADDLRQ 171 15 11 ALLAVTIAGGGVGGY 26 [('S', 0.0), ('G', 33.33), ('A', 20.0), ('T', 6.67)] 60.0 MAGIPKKLKA 3
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/simple-proteins.fa Mon Jun 05 02:52:57 2023 +0000 @@ -0,0 +1,7 @@ +>SAR-endolysin +MAGIPKKLKAALLAVTIAGGGVGGYQEMTRQSLIHLENIAYMPYRDIAGVLTVCVGHTGP +DIEMRRYSHAECMALLDSDLKPVYAAIDRLVRVPLTPYQKTALATFIFNTGVTAFSKSTL +LKKLNAGDYAGARDQMARWVFAAGHKWKGLMNRREVEMAIWNIRGADDLRQ +>CPT_NC_000929.1_038 +MLKIKPAAGKAIRDPLTMKLLASEGEEKPRNSFWIRRLAAGDVVEVGSTENTADDTDAAP +KKRSKSK \ No newline at end of file