diff graphprot_predict_wrapper.py @ 5:ddcf35a868b8 draft

planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/rna_tools/graphprot commit ad60258f5759eaa205fec4af6143c728ea131419
author bgruening
date Wed, 05 Jun 2024 16:40:51 +0000
parents 4ad83aed5c3c
children
line wrap: on
line diff
--- a/graphprot_predict_wrapper.py	Thu Jan 28 15:06:14 2021 +0000
+++ b/graphprot_predict_wrapper.py	Wed Jun 05 16:40:51 2024 +0000
@@ -7,7 +7,6 @@
 
 import gplib
 
-
 """
 
 TOOL DEPENDENCIES
@@ -81,7 +80,8 @@
 """
 
 
-###############################################################################
+#######################################################################
+
 
 def setup_argument_parser():
     """Setup argparse parser."""
@@ -100,100 +100,119 @@
 
     """
     # Define argument parser.
-    p = ap.ArgumentParser(add_help=False,
-                          prog="graphprot_predict_wrapper.py",
-                          description=help_description,
-                          formatter_class=ap.MetavarTypeHelpFormatter)
+    p = ap.ArgumentParser(
+        add_help=False,
+        prog="graphprot_predict_wrapper.py",
+        description=help_description,
+        formatter_class=ap.MetavarTypeHelpFormatter,
+    )
 
     # Argument groups.
     p_man = p.add_argument_group("REQUIRED ARGUMENTS")
     p_opt = p.add_argument_group("OPTIONAL ARGUMENTS")
 
     # Required arguments.
-    p_opt.add_argument("-h", "--help",
-                       action="help",
-                       help="Print help message")
-    p_man.add_argument("--fasta",
-                       dest="in_fa",
-                       type=str,
-                       required=True,
-                       help="Sequences .fa file to predict"
-                            " on (option -fasta)")
-    p_man.add_argument("--model",
-                       dest="in_model",
-                       type=str,
-                       required=True,
-                       help="GraphProt model file to use for predictions"
-                            " (option -model)")
-    p_man.add_argument("--params",
-                       dest="in_params",
-                       type=str,
-                       required=True,
-                       help="Parameter file for given model")
-    p_man.add_argument("--data-id",
-                       dest="data_id",
-                       type=str,
-                       required=True,
-                       help="Data ID (option -prefix)")
+    p_opt.add_argument("-h", "--help", action="help", help="Print help message")
+    p_man.add_argument(
+        "--fasta",
+        dest="in_fa",
+        type=str,
+        required=True,
+        help="Sequences .fa file to predict" " on (option -fasta)",
+    )
+    p_man.add_argument(
+        "--model",
+        dest="in_model",
+        type=str,
+        required=True,
+        help="GraphProt model file to use for predictions" " (option -model)",
+    )
+    p_man.add_argument(
+        "--params",
+        dest="in_params",
+        type=str,
+        required=True,
+        help="Parameter file for given model",
+    )
+    p_man.add_argument(
+        "--data-id",
+        dest="data_id",
+        type=str,
+        required=True,
+        help="Data ID (option -prefix)",
+    )
     # ---> I'm  a conditional argument <---
-    p_opt.add_argument("--ws-pred",
-                       dest="ws_pred",
-                       default=False,
-                       action="store_true",
-                       help="Run a whole site prediction instead "
-                            "of calculating profiles (default: false)")
+    p_opt.add_argument(
+        "--ws-pred",
+        dest="ws_pred",
+        default=False,
+        action="store_true",
+        help="Run a whole site prediction instead "
+        "of calculating profiles (default: false)",
+    )
     # Additional arguments.
-    p_opt.add_argument("--sc-thr",
-                       dest="score_thr",
-                       type=float,
-                       default=0,
-                       help="Score threshold for extracting "
-                            "average profile peak regions (default: 0)")
-    p_opt.add_argument("--max-merge-dist",
-                       dest="max_merge_dist",
-                       type=int,
-                       default=0,
-                       choices=[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
-                       help="Maximum merge distance for nearby peak regions"
-                            " (default: report all non-overlapping regions)")
-    p_opt.add_argument("--gen-site-bed",
-                       dest="genomic_sites_bed",
-                       type=str,
-                       help=".bed file specifying the genomic regions of "
-                            "the input .fa sequences. Corrupt .bed "
-                            "information will be punished (default: false)")
-    p_opt.add_argument("--conf-out",
-                       dest="conf_out",
-                       default=False,
-                       action="store_true",
-                       help="Output filtered peak regions BED file or "
-                            "predictions file (if --ws-pred) using the median "
-                            "positive training site score for filtering "
-                            "(default: false)")
-    p_opt.add_argument("--gp-output",
-                       dest="gp_output",
-                       default=False,
-                       action="store_true",
-                       help="Print output produced by GraphProt "
-                            "(default: false)")
-    p_opt.add_argument("--ap-extlr",
-                       dest="ap_extlr",
-                       type=int,
-                       default=5,
-                       choices=[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
-                       help="Define average profile up- and downstream "
-                            "extension to produce the average profile. "
-                            "The mean over small sequence windows "
-                            "(window length = --ap-extlr*2 + 1) is used to "
-                            "get position scores, thus the average profile "
-                            "is more smooth than the initial profile output "
-                            "by GraphProt (default: 5)")
+    p_opt.add_argument(
+        "--sc-thr",
+        dest="score_thr",
+        type=float,
+        default=0,
+        help="Score threshold for extracting "
+        "average profile peak regions (default: 0)",
+    )
+    p_opt.add_argument(
+        "--max-merge-dist",
+        dest="max_merge_dist",
+        type=int,
+        default=0,
+        choices=[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
+        help="Maximum merge distance for nearby peak regions"
+        " (default: report all non-overlapping regions)",
+    )
+    p_opt.add_argument(
+        "--gen-site-bed",
+        dest="genomic_sites_bed",
+        type=str,
+        help=".bed file specifying the genomic regions of "
+        "the input .fa sequences. Corrupt .bed "
+        "information will be punished (default: false)",
+    )
+    p_opt.add_argument(
+        "--conf-out",
+        dest="conf_out",
+        default=False,
+        action="store_true",
+        help="Output filtered peak regions BED file or "
+        "predictions file (if --ws-pred) using the median "
+        "positive training site score for filtering "
+        "(default: false)",
+    )
+    p_opt.add_argument(
+        "--gp-output",
+        dest="gp_output",
+        default=False,
+        action="store_true",
+        help="Print output produced by GraphProt " "(default: false)",
+    )
+    p_opt.add_argument(
+        "--ap-extlr",
+        dest="ap_extlr",
+        type=int,
+        default=5,
+        choices=[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
+        help="Define average profile up- and downstream "
+        "extension to produce the average profile. "
+        "The mean over small sequence windows "
+        "(window length = --ap-extlr*2 + 1) is used to "
+        "get position scores, thus the average profile "
+        "is more smooth than the initial profile output "
+        "by GraphProt (default: 5)",
+    )
     return p
 
 
-###############################################################################
+#######################################################################
 
-if __name__ == '__main__':
+if __name__ == "__main__":
 
     # Setup argparse.
     parser = setup_argument_parser()
@@ -209,70 +228,86 @@
     # Check tool availability.
     assert gplib.is_tool("GraphProt.pl"), "GraphProt.pl not in PATH"
     # Check file inputs.
-    assert os.path.exists(args.in_fa), \
-        "input .fa file \"%s\" not found" % (args.in_fa)
-    assert os.path.exists(args.in_model), \
-        "input .model file \"%s\" not found" % (args.in_model)
-    assert os.path.exists(args.in_params), \
-        "input .params file \"%s\" not found" % (args.in_params)
+    assert os.path.exists(args.in_fa), 'input .fa file "%s" not found' % (args.in_fa)
+    assert os.path.exists(args.in_model), 'input .model file "%s" not found' % (
+        args.in_model
+    )
+    assert os.path.exists(args.in_params), 'input .params file "%s" not found' % (
+        args.in_params
+    )
     # Count .fa entries.
     c_in_fa = gplib.count_fasta_headers(args.in_fa)
-    assert c_in_fa, "input .fa file \"%s\" no headers found" % (args.in_fa)
+    assert c_in_fa, 'input .fa file "%s" no headers found' % (args.in_fa)
     print("# input .fa sequences:   %i" % (c_in_fa))
     # Read in FASTA sequences to check for uppercase sequences.
     seqs_dic = gplib.read_fasta_into_dic(args.in_fa)
     # Check for lowercase only sequences, which cause GP to crash.
-    error_mess = "input sequences encountered containing "\
-        "only lowercase characters or lowercase characters in between "\
-        "uppercase characters. Please provide either all uppercase "\
-        "sequences or sequences containing uppercase regions surrounded "\
-        "by lowercase context regions for structure calculation (see "\
-        "viewpoint concept in original GraphProt publication "\
+    error_mess = (
+        "input sequences encountered containing "
+        "only lowercase characters or lowercase characters in between "
+        "uppercase characters. Please provide either all uppercase "
+        "sequences or sequences containing uppercase regions surrounded "
+        "by lowercase context regions for structure calculation (see "
+        "viewpoint concept in original GraphProt publication "
         "for more details)"
+    )
     if args.ws_pred:
         bad_ids = gplib.check_seqs_dic_format(seqs_dic)
         assert not bad_ids, "%s" % (error_mess)
 
     c_uc_nt = gplib.seqs_dic_count_uc_nts(seqs_dic)
-    assert c_uc_nt, "no uppercase nucleotides in input .fa sequences. "\
-                    "Please change sequences to uppercase (keep in mind "\
-                    "GraphProt only scores uppercase regions (according "\
-                    "to its viewpoint concept)"
+    assert c_uc_nt, (
+        "no uppercase nucleotides in input .fa sequences. "
+        "Please change sequences to uppercase (keep in mind "
+        "GraphProt only scores uppercase regions (according "
+        "to its viewpoint concept)"
+    )
     if not args.ws_pred:
         # Check for lowercase sequences.
         c_lc_nt = gplib.seqs_dic_count_lc_nts(seqs_dic)
-        assert not c_lc_nt, "lowercase nucleotides in input .fa not allowed"\
-            " in profile predictions, since GraphProt only scores "\
+        assert not c_lc_nt, (
+            "lowercase nucleotides in input .fa not allowed"
+            " in profile predictions, since GraphProt only scores "
             "uppercase regions (according to its viewpoint concept)"
+        )
     # Check .bed.
     if args.genomic_sites_bed:
         # An array of checks, marvelous.
-        assert \
-            os.path.exists(args.genomic_sites_bed), \
-            "genomic .bed file \"%s\" not found" % (args.genomic_sites_bed)
+        assert os.path.exists(
+            args.genomic_sites_bed
+        ), 'genomic .bed file "%s" not found' % (args.genomic_sites_bed)
         # Check .bed for content.
-        assert gplib.count_file_rows(args.genomic_sites_bed), \
-            "genomic .bed file \"%s\" is empty" % (args.genomic_sites_bed)
+        assert gplib.count_file_rows(
+            args.genomic_sites_bed
+        ), 'genomic .bed file "%s" is empty' % (args.genomic_sites_bed)
         # Check .bed for 6-column format.
-        assert gplib.bed_check_six_col_format(args.genomic_sites_bed), \
-            "genomic .bed file \"%s\" appears to not "\
-            "be in 6-column .bed format" % (args.genomic_sites_bed)
+        assert gplib.bed_check_six_col_format(
+            args.genomic_sites_bed
+        ), 'genomic .bed file "%s" appears to not ' "be in 6-column .bed format" % (
+            args.genomic_sites_bed
+        )
         # Check for unique column 4 IDs.
-        assert gplib.bed_check_unique_ids(args.genomic_sites_bed), \
-            "genomic .bed file \"%s\" column 4 IDs not unique" % \
-            (args.genomic_sites_bed)
+        assert gplib.bed_check_unique_ids(
+            args.genomic_sites_bed
+        ), 'genomic .bed file "%s" column 4 IDs not unique' % (args.genomic_sites_bed)
         # Read in .bed regions, compare to FASTA sequences.
         seq_len_dic = gplib.get_seq_lengths_from_seqs_dic(seqs_dic)
         reg_len_dic = gplib.bed_get_region_lengths(args.genomic_sites_bed)
         for seq_id in seq_len_dic:
             seq_l = seq_len_dic[seq_id]
-            assert seq_id in reg_len_dic, \
-                "sequence ID \"%s\" missing in input .bed \"%s\"" % \
-                (seq_id, args.genomic_sites_bed)
+            assert (
+                seq_id in reg_len_dic
+            ), 'sequence ID "%s" missing in input .bed "%s"' % (
+                seq_id,
+                args.genomic_sites_bed,
+            )
             reg_l = reg_len_dic[seq_id]
-            assert seq_l == reg_l, \
-                "sequence length differs from .bed region length (%i != %i)" \
-                % (seq_l, reg_l)
+            assert (
+                seq_l == reg_l
+            ), "sequence length differs from .bed region length (%i != %i)" % (
+                seq_l,
+                reg_l,
+            )
     # Read in model parameters.
     param_dic = gplib.graphprot_get_param_dic(args.in_params)
     # Create GraphProt parameter string.
@@ -284,110 +319,147 @@
     """
     if args.ws_pred:
         # Do whole site prediction.
-        print("Starting whole site predictions on input .fa file"
-              " (-action predict) ... ")
-        check_cmd = "GraphProt.pl -action predict -prefix " \
-            + args.data_id + " -fasta " + args.in_fa + " " \
-            + param_string + " -model " + args.in_model
+        print(
+            "Starting whole site predictions on input .fa file"
+            " (-action predict) ... "
+        )
+        check_cmd = (
+            "GraphProt.pl -action predict -prefix "
+            + args.data_id
+            + " -fasta "
+            + args.in_fa
+            + " "
+            + param_string
+            + " -model "
+            + args.in_model
+        )
         output = subprocess.getoutput(check_cmd)
-        assert output, "the following call of GraphProt.pl "\
-            "produced no output:\n%s" % (check_cmd)
+        assert (
+            output
+        ), "the following call of GraphProt.pl " "produced no output:\n%s" % (check_cmd)
         if args.gp_output:
             print(output)
         ws_predictions_file = args.data_id + ".predictions"
-        assert os.path.exists(ws_predictions_file), \
-            "Whole site prediction output .predictions file \"%s\" not found" \
-            % (ws_predictions_file)
+        assert os.path.exists(
+            ws_predictions_file
+        ), 'Whole site prediction output .predictions file "%s" not found' % (
+            ws_predictions_file
+        )
         if args.conf_out:
             # Filter by pos_train_ws_pred_median median.
-            assert "pos_train_ws_pred_median" in param_dic, \
-                "whole site top scores median information "\
-                "missing in .params file"
-            pos_tr_ws_pred_med = \
-                float(param_dic["pos_train_ws_pred_median"])
+            assert "pos_train_ws_pred_median" in param_dic, (
+                "whole site top scores median information " "missing in .params file"
+            )
+            pos_tr_ws_pred_med = float(param_dic["pos_train_ws_pred_median"])
             # Filtered file.
             filt_ws_predictions_file = args.data_id + ".p50.predictions"
-            print("Extracting p50 sites from whole site predictions"
-                  " (score threshold = %f) ... " % (pos_tr_ws_pred_med))
-            gplib.graphprot_filter_predictions_file(ws_predictions_file,
-                                                    filt_ws_predictions_file,
-                                                    sc_thr=pos_tr_ws_pred_med)
+            print(
+                "Extracting p50 sites from whole site predictions"
+                " (score threshold = %f) ... " % (pos_tr_ws_pred_med)
+            )
+            gplib.graphprot_filter_predictions_file(
+                ws_predictions_file, filt_ws_predictions_file, sc_thr=pos_tr_ws_pred_med
+            )
     else:
         # Do profile prediction.
-        print("Starting profile predictions on on input .fa file "
-              "(-action predict_profile) ... ")
-        check_cmd = "GraphProt.pl -action predict_profile -prefix " \
-                    + args.data_id + " -fasta " + args.in_fa + " " + \
-                    param_string + " -model " + args.in_model
+        print(
+            "Starting profile predictions on on input .fa file "
+            "(-action predict_profile) ... "
+        )
+        check_cmd = (
+            "GraphProt.pl -action predict_profile -prefix "
+            + args.data_id
+            + " -fasta "
+            + args.in_fa
+            + " "
+            + param_string
+            + " -model "
+            + args.in_model
+        )
         output = subprocess.getoutput(check_cmd)
-        assert output, "the following call of GraphProt.pl produced "\
-                       "no output:\n%s" % (check_cmd)
+        assert (
+            output
+        ), "the following call of GraphProt.pl produced " "no output:\n%s" % (check_cmd)
         if args.gp_output:
             print(output)
         profile_predictions_file = args.data_id + ".profile"
-        assert os.path.exists(profile_predictions_file), \
-            "Profile prediction output .profile file \"%s\" not found" % \
-            (profile_predictions_file)
+        assert os.path.exists(
+            profile_predictions_file
+        ), 'Profile prediction output .profile file "%s" not found' % (
+            profile_predictions_file
+        )
 
         # Profile prediction output files.
         avg_prof_file = args.data_id + ".avg_profile"
         avg_prof_peaks_file = args.data_id + ".avg_profile.peaks.bed"
-        avg_prof_gen_peaks_file = args.data_id + \
-            ".avg_profile.genomic_peaks.bed"
-        avg_prof_peaks_p50_file = args.data_id + \
-            ".avg_profile.p50.peaks.bed"
-        avg_prof_gen_peaks_p50_file = args.data_id + \
-            ".avg_profile.p50.genomic_peaks.bed"
+        avg_prof_gen_peaks_file = args.data_id + ".avg_profile.genomic_peaks.bed"
+        avg_prof_peaks_p50_file = args.data_id + ".avg_profile.p50.peaks.bed"
+        avg_prof_gen_peaks_p50_file = (
+            args.data_id + ".avg_profile.p50.genomic_peaks.bed"
+        )
 
         # Get sequence IDs in order from input .fa file.
         seq_ids_list = gplib.fasta_read_in_ids(args.in_fa)
         # Calculate average profiles.
-        print("Getting average profile from profile "
-              "(extlr for smoothing: %i) ... " % (args.ap_extlr))
-        gplib.graphprot_profile_calc_avg_profile(profile_predictions_file,
-                                                 avg_prof_file,
-                                                 ap_extlr=args.ap_extlr,
-                                                 seq_ids_list=seq_ids_list,
-                                                 method=2)
+        print(
+            "Getting average profile from profile "
+            "(extlr for smoothing: %i) ... " % (args.ap_extlr)
+        )
+        gplib.graphprot_profile_calc_avg_profile(
+            profile_predictions_file,
+            avg_prof_file,
+            ap_extlr=args.ap_extlr,
+            seq_ids_list=seq_ids_list,
+            method=2,
+        )
         # Extract peak regions on sequences with threshold score 0.
-        print("Extracting peak regions from average profile "
-              "(score threshold = 0) ... ")
+        print(
+            "Extracting peak regions from average profile " "(score threshold = 0) ... "
+        )
         killpep8 = args.max_merge_dist
-        gplib.graphprot_profile_extract_peak_regions(avg_prof_file,
-                                                     avg_prof_peaks_file,
-                                                     max_merge_dist=killpep8,
-                                                     sc_thr=args.score_thr)
+        gplib.graphprot_profile_extract_peak_regions(
+            avg_prof_file,
+            avg_prof_peaks_file,
+            max_merge_dist=killpep8,
+            sc_thr=args.score_thr,
+        )
         # Convert peaks to genomic coordinates.
         if args.genomic_sites_bed:
             print("Converting peak regions to genomic coordinates ... ")
             killit = args.genomic_sites_bed
-            gplib.bed_peaks_to_genomic_peaks(avg_prof_peaks_file,
-                                             avg_prof_gen_peaks_file,
-                                             print_rows=False,
-                                             genomic_sites_bed=killit)
+            gplib.bed_peaks_to_genomic_peaks(
+                avg_prof_peaks_file,
+                avg_prof_gen_peaks_file,
+                print_rows=False,
+                genomic_sites_bed=killit,
+            )
         # Extract peak regions with threshold score p50.
         if args.conf_out:
             sc_id = "pos_train_avg_profile_median_%i" % (args.ap_extlr)
             # Filter by pos_tr_ws_pred_med median.
-            assert sc_id in param_dic, "average profile extlr %i median "\
+            assert sc_id in param_dic, (
+                "average profile extlr %i median "
                 "information missing in .params file" % (args.ap_extlr)
+            )
             p50_sc_thr = float(param_dic[sc_id])
-            print("Extracting p50 peak regions from average profile "
-                  "(score threshold = %f) ... " % (p50_sc_thr))
+            print(
+                "Extracting p50 peak regions from average profile "
+                "(score threshold = %f) ... " % (p50_sc_thr)
+            )
             despair = avg_prof_peaks_p50_file
             pain = args.max_merge_dist
-            gplib.graphprot_profile_extract_peak_regions(avg_prof_file,
-                                                         despair,
-                                                         max_merge_dist=pain,
-                                                         sc_thr=p50_sc_thr)
+            gplib.graphprot_profile_extract_peak_regions(
+                avg_prof_file, despair, max_merge_dist=pain, sc_thr=p50_sc_thr
+            )
             # Convert peaks to genomic coordinates.
             if args.genomic_sites_bed:
-                print("Converting p50 peak regions to "
-                      "genomic coordinates ... ")
+                print("Converting p50 peak regions to " "genomic coordinates ... ")
                 madness = args.genomic_sites_bed
-                gplib.bed_peaks_to_genomic_peaks(avg_prof_peaks_p50_file,
-                                                 avg_prof_gen_peaks_p50_file,
-                                                 genomic_sites_bed=madness)
+                gplib.bed_peaks_to_genomic_peaks(
+                    avg_prof_peaks_p50_file,
+                    avg_prof_gen_peaks_p50_file,
+                    genomic_sites_bed=madness,
+                )
     # Done.
     print("Script: I'm done.")
     print("Author: ... ")