Mercurial > repos > cpt > cpt_sar_finder
diff SAR_finder.py @ 1:112751823323 draft
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
author | cpt |
---|---|
date | Mon, 05 Jun 2023 02:52:57 +0000 |
parents | |
children |
line wrap: on
line diff
"""Command-line entry point for SAR Finder.

Reads a multi-FASTA file, runs ``CheckSequence.shrink_results`` on every
record, and writes the collected candidates as GFF3, a tab-separated
summary, and a multi-FASTA file.
"""
import sys
import argparse
import os
import re
from biopython_parsing import FASTA_parser
from file_operations import fasta_from_SAR_dict, gff3_from_SAR_dict, tab_from_SAR_dict
from SAR_functions import CheckSequence


def build_parser():
    """Return the argument parser describing the SAR Finder command line.

    The ``--out_*`` options use ``argparse.FileType("w")`` with string
    defaults, so argparse opens the default path whenever an option is not
    supplied on the command line.
    """
    parser = argparse.ArgumentParser(description="SAR Finder")

    parser.add_argument(
        "fa", type=argparse.FileType("r"), help="organism's multi fasta file"
    )

    # NOTE: --min/--max are accepted for interface compatibility but are not
    # applied anywhere yet; the corresponding ``sar.check_sizes(min=..., max=...)``
    # call is disabled in the processing loop.
    parser.add_argument(
        "--min", type=int, default=20, help="minimum size of candidate peptide"
    )

    parser.add_argument(
        "--max", type=int, default=200, help="maximum size of candidate peptide"
    )

    parser.add_argument(
        "--sar_min",
        type=int,
        default=15,
        help="minimum size of candidate peptide TMD domain",
    )

    parser.add_argument(
        "--sar_max",
        type=int,
        default=24,
        help="maximum size of candidate peptide TMD domain",
    )

    parser.add_argument(
        "--out_fa",
        type=argparse.FileType("w"),
        help="multifasta output of candidate SAR proteins",
        default="candidate_SAR.fa",
    )

    parser.add_argument(
        "--out_stat",
        type=argparse.FileType("w"),
        help="summary statistic file for candidate SAR proteins, tab separated",
        default="candidate_SAR_stats.tsv",
    )

    parser.add_argument(
        "--out_gff3",
        type=argparse.FileType("w"),
        help="multigff3 file for candidate SAR proteins",
        default="candidate_SAR.gff3",
    )

    return parser


def find_sars(fa_dict, sar_min, sar_max):
    """Collect the ``shrink_results`` output of every record into one dict.

    Args:
        fa_dict: mapping of protein name to protein data, as returned by
            ``FASTA_parser(...).multifasta_dict()``.
        sar_min: forwarded to ``CheckSequence.shrink_results`` as ``sar_min``.
        sar_max: forwarded to ``CheckSequence.shrink_results`` as ``sar_max``.

    Returns:
        A dict built by ``dict.update``-ing each record's result in input
        order, so a later record overwrites an earlier one on key collision.
    """
    sars = {}
    for protein_name, protein_data in fa_dict.items():
        sar = CheckSequence(protein_name, protein_data)
        # NOTE: size filtering via sar.check_sizes(min=..., max=...) is disabled.
        sars.update(sar.shrink_results(sar_min=sar_min, sar_max=sar_max))
    return sars


def _close_handles(*handles):
    """Close each handle unless it is one of the interpreter's standard streams."""
    for handle in handles:
        # argparse.FileType maps "-" to sys.stdin / sys.stdout; leave those open.
        if handle is sys.stdin or handle is sys.stdout or handle is sys.stderr:
            continue
        handle.close()


def main(argv=None):
    """Run SAR Finder.

    Args:
        argv: argument list to parse; ``None`` makes argparse read
            ``sys.argv[1:]``.
    """
    args = build_parser().parse_args(argv)

    try:
        fa_dict = FASTA_parser(fa=args.fa).multifasta_dict()
        sars = find_sars(fa_dict, args.sar_min, args.sar_max)

        gff3_from_SAR_dict(sars, args.out_gff3)
        # The literal "SGAT" is passed through unchanged; its meaning is
        # defined by tab_from_SAR_dict.
        tab_from_SAR_dict(
            sars, args.out_stat, "SGAT", sar_min=args.sar_min, sar_max=args.sar_max
        )
        fasta_from_SAR_dict(sars, args.out_fa)
        # TODO: call stat_file_from_SAR_dict(sars, args.out_stat,
        # sar_min=args.sar_min, sar_max=args.sar_max) once it is ready.
    finally:
        # argparse.FileType never closes what it opens; close explicitly so
        # the outputs are flushed even when processing fails part-way.
        _close_handles(args.fa, args.out_gff3, args.out_stat, args.out_fa)


if __name__ == "__main__":
    main()