diff SAR_finder.py @ 1:112751823323 draft

planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
author cpt
date Mon, 05 Jun 2023 02:52:57 +0000
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/SAR_finder.py	Mon Jun 05 02:52:57 2023 +0000
@@ -0,0 +1,76 @@
+import sys
+import argparse
+import os
+import re
+from biopython_parsing import FASTA_parser
+from file_operations import fasta_from_SAR_dict, gff3_from_SAR_dict, tab_from_SAR_dict
+from SAR_functions import CheckSequence
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description="SAR Finder")
+
+    parser.add_argument(
+        "fa", type=argparse.FileType("r"), help="organism's multi fasta file"
+    )
+
+    parser.add_argument(
+        "--min", type=int, default=20, help="minimum size of candidate peptide"
+    )
+
+    parser.add_argument(
+        "--max", type=int, default=200, help="maximum size of candidate peptide"
+    )
+
+    parser.add_argument(
+        "--sar_min",
+        type=int,
+        default=15,
+        help="minimum size of candidate peptide TMD domain",
+    )
+
+    parser.add_argument(
+        "--sar_max",
+        type=int,
+        default=24,
+        help="maximum size of candidate peptide TMD domain",
+    )
+
+    parser.add_argument(
+        "--out_fa",
+        type=argparse.FileType("w"),
+        help="multifasta output of candidate SAR proteins",
+        default="candidate_SAR.fa",
+    )
+
+    parser.add_argument(
+        "--out_stat",
+        type=argparse.FileType("w"),
+        help="summary statistic file for candidate SAR proteins, tab separated",
+        default="candidate_SAR_stats.tsv",
+    )
+
+    parser.add_argument(
+        "--out_gff3",
+        type=argparse.FileType("w"),
+        help="multigff3 file for candidate SAR proteins",
+        default="candidate_SAR.gff3",
+    )
+
+    args = parser.parse_args()
+
+    fa_dict = FASTA_parser(fa=args.fa).multifasta_dict()
+
+    sars = {}
+
+    for protein_name, protein_data in fa_dict.items():
+        sar = CheckSequence(protein_name, protein_data)
+        # sar.check_sizes(min=args.min,max=args.max)
+        hydros = sar.shrink_results(sar_min=args.sar_min, sar_max=args.sar_max)
+        sars.update(hydros)
+
+    gff3_from_SAR_dict(sars, args.out_gff3)
+    tab_from_SAR_dict(
+        sars, args.out_stat, "SGAT", sar_min=args.sar_min, sar_max=args.sar_max
+    )
+    fasta_from_SAR_dict(sars, args.out_fa)
+    # stat_file_from_SAR_dict(sars,args.out_stat,sar_min=args.sar_min,sar_max=args.sar_max) # fix this whenever ready.