Repository 'graphprot_predict_profile'
hg clone https://toolshed.g2.bx.psu.edu/repos/rnateam/graphprot_predict_profile

Changeset 5:ddcf35a868b8 (2024-06-05)
Previous changeset 4:4ad83aed5c3c (2021-01-28)
Commit message:
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/rna_tools/graphprot commit ad60258f5759eaa205fec4af6143c728ea131419
modified:
gplib.py
graphprot_predict_wrapper.py
graphprot_train_predict.xml
graphprot_train_wrapper.py
b
diff -r 4ad83aed5c3c -r ddcf35a868b8 gplib.py
--- a/gplib.py Thu Jan 28 15:06:14 2021 +0000
+++ b/gplib.py Wed Jun 05 16:40:51 2024 +0000
[
b'@@ -1,4 +1,3 @@\n-\n import gzip\n import random\n import re\n@@ -6,7 +5,6 @@\n import subprocess\n from distutils.spawn import find_executable\n \n-\n """\n \n Run doctests:\n@@ -17,7 +15,8 @@\n """\n \n \n-###############################################################################\n+#######################################################################\n+\n \n def graphprot_predictions_get_median(predictions_file):\n     """\n@@ -41,11 +40,12 @@\n     return statistics.median(sc_list)\n \n \n-###############################################################################\n+#######################################################################\n+\n \n-def graphprot_profile_get_tsm(profile_file,\n-                              profile_type="profile",\n-                              avg_profile_extlr=5):\n+def graphprot_profile_get_tsm(\n+    profile_file, profile_type="profile", avg_profile_extlr=5\n+):\n \n     """\n     Given a GraphProt .profile file, extract for each site (identified by\n@@ -88,23 +88,21 @@\n             max_list.append(max_sc)\n         elif profile_type == "avg_profile":\n             # Convert profile score list to average profile scores list.\n-            aps_list = \\\n-                list_moving_window_average_values(lists_dic[seq_id],\n-                                                  win_extlr=avg_profile_extlr)\n+            aps_list = list_moving_window_average_values(\n+                lists_dic[seq_id], win_extlr=avg_profile_extlr\n+            )\n             max_sc = max(aps_list)\n             max_list.append(max_sc)\n         else:\n-            assert 0, "invalid profile_type argument given: \\"%s\\"" \\\n-                % (profile_type)\n+            assert 0, \'invalid profile_type argument given: "%s"\' % (profile_type)\n     # Return the median.\n     return statistics.median(max_list)\n \n \n-###############################################################################\n+#######################################################################\n \n-def list_moving_window_average_values(in_list,\n-                                      win_extlr=5,\n-                                      method=1):\n+\n+def list_moving_window_average_values(in_list, win_extlr=5, method=1):\n     """\n     Take a list of numeric values, and calculate for each position a new value,\n     by taking the mean value of the window of positions -win_extlr and\n@@ -152,7 +150,8 @@\n     return new_list\n \n \n-###############################################################################\n+#######################################################################\n+\n \n def echo_add_to_file(echo_string, out_file):\n     """\n@@ -167,14 +166,16 @@\n     assert not error, "echo is complaining:\\n%s\\n%s" % (check_cmd, output)\n \n \n-###############################################################################\n+#######################################################################\n+\n \n def is_tool(name):\n     """Check whether tool "name" is in PATH."""\n     return find_executable(name) is not None\n \n \n-###############################################################################\n+#######################################################################\n+\n \n def count_fasta_headers(fasta_file):\n     """\n@@ -194,7 +195,8 @@\n     return row_count\n \n \n-###############################################################################\n+#######################################################################\n+\n \n def make_file_copy(in_file, out_file):\n     """\n@@ -202,21 +204,26 @@\n \n     """\n     check_cmd = "cat " + in_file + " > " + out_file\n-    assert in_file != out_file, \\\n-        "cat does not like to cat file into same file (%s)" % (check_cmd)\n+    assert in_file != out_file, "cat does not like to cat file into same file (%s)" % (\n+        check_cmd\n+    )\n     output = subprocess.getoutput(check_cmd)\n     error = False\n     if output:\n         error = True\n-    assert not error, \\\n-        "cat did not like your input (in_file: %s, out_file: %s):\\n%s" \\\n-        % (in_file, out_file, output)\n'..b'        sc_thr=0):\n+\n+def list_extract_peaks(in_list, max_merge_dist=0, coords="list", sc_thr=0):\n     """\n     Extract peak regions from list.\n     Peak region is defined as region >= score threshold.\n@@ -969,8 +991,12 @@\n                     if peak_list[i][3] < peak_list[j][3]:\n                         new_top_pos = peak_list[j][2]\n                         new_top_sc = peak_list[j][3]\n-                    new_peak = [peak_list[i][0], peak_list[j][1],\n-                                new_top_pos, new_top_sc]\n+                    new_peak = [\n+                        peak_list[i][0],\n+                        peak_list[j][1],\n+                        new_top_pos,\n+                        new_top_sc,\n+                    ]\n                 # If two peaks were merged.\n                 if new_peak:\n                     merged_peak_list.append(new_peak)\n@@ -991,10 +1017,12 @@\n     return peak_list\n \n \n-###############################################################################\n+#######################################################################\n+\n \n-def bed_peaks_to_genomic_peaks(peak_file, genomic_peak_file, genomic_sites_bed,\n-                               print_rows=False):\n+def bed_peaks_to_genomic_peaks(\n+    peak_file, genomic_peak_file, genomic_sites_bed, print_rows=False\n+):\n     """\n     Given a .bed file of sequence peak regions (possible coordinates from\n     0 to length of s), convert peak coordinates to genomic coordinates.\n@@ -1017,10 +1045,9 @@\n             row = line.strip()\n             cols = line.strip().split("\\t")\n             site_id = cols[3]\n-            assert site_id \\\n-                not in id2row_dic, \\\n-                "column 4 IDs not unique in given .bed file \\"%s\\"" \\\n-                % (genomic_sites_bed)\n+            assert (\n+                site_id not in id2row_dic\n+            ), \'column 4 IDs not unique in given .bed file "%s"\' % (genomic_sites_bed)\n             id2row_dic[site_id] = row\n     f.close()\n \n@@ -1034,13 +1061,14 @@\n             site_e = int(cols[2])\n             site_id2 = cols[3]\n             site_sc = float(cols[4])\n-            assert re.search(".+,.+", site_id2), \\\n-                "regular expression failed for ID \\"%s\\"" % (site_id2)\n+            assert re.search(\n+                ".+,.+", site_id2\n+            ), \'regular expression failed for ID "%s"\' % (site_id2)\n             m = re.search(r".+,(\\d+)", site_id2)\n             sc_pos = int(m.group(1))  # 1-based.\n-            assert site_id in id2row_dic, \\\n-                "site ID \\"%s\\" not found in genomic sites dictionary" \\\n-                % (site_id)\n+            assert (\n+                site_id in id2row_dic\n+            ), \'site ID "%s" not found in genomic sites dictionary\' % (site_id)\n             row = id2row_dic[site_id]\n             rowl = row.split("\\t")\n             gen_chr = rowl[0]\n@@ -1054,16 +1082,23 @@\n                 new_s = gen_e - site_e\n                 new_e = gen_e - site_s\n                 new_sc_pos = gen_e - sc_pos + 1  # keep 1-based.\n-            new_row = "%s\\t%i\\t%i\\t%s,%i\\t%f\\t%s" \\\n-                      % (gen_chr, new_s, new_e,\n-                         site_id, new_sc_pos, site_sc, gen_pol)\n+            new_row = "%s\\t%i\\t%i\\t%s,%i\\t%f\\t%s" % (\n+                gen_chr,\n+                new_s,\n+                new_e,\n+                site_id,\n+                new_sc_pos,\n+                site_sc,\n+                gen_pol,\n+            )\n             OUTPEAKS.write("%s\\n" % (new_row))\n             if print_rows:\n                 print(new_row)\n     OUTPEAKS.close()\n \n \n-###############################################################################\n+#######################################################################\n+\n \n def diff_two_files_identical(file1, file2):\n     """\n@@ -1087,4 +1122,4 @@\n     return same\n \n \n-###############################################################################\n+#######################################################################\n'
b
diff -r 4ad83aed5c3c -r ddcf35a868b8 graphprot_predict_wrapper.py
--- a/graphprot_predict_wrapper.py Thu Jan 28 15:06:14 2021 +0000
+++ b/graphprot_predict_wrapper.py Wed Jun 05 16:40:51 2024 +0000
[
b'@@ -7,7 +7,6 @@\n \n import gplib\n \n-\n """\n \n TOOL DEPENDENCIES\n@@ -81,7 +80,8 @@\n """\n \n \n-###############################################################################\n+#######################################################################\n+\n \n def setup_argument_parser():\n     """Setup argparse parser."""\n@@ -100,100 +100,119 @@\n \n     """\n     # Define argument parser.\n-    p = ap.ArgumentParser(add_help=False,\n-                          prog="graphprot_predict_wrapper.py",\n-                          description=help_description,\n-                          formatter_class=ap.MetavarTypeHelpFormatter)\n+    p = ap.ArgumentParser(\n+        add_help=False,\n+        prog="graphprot_predict_wrapper.py",\n+        description=help_description,\n+        formatter_class=ap.MetavarTypeHelpFormatter,\n+    )\n \n     # Argument groups.\n     p_man = p.add_argument_group("REQUIRED ARGUMENTS")\n     p_opt = p.add_argument_group("OPTIONAL ARGUMENTS")\n \n     # Required arguments.\n-    p_opt.add_argument("-h", "--help",\n-                       action="help",\n-                       help="Print help message")\n-    p_man.add_argument("--fasta",\n-                       dest="in_fa",\n-                       type=str,\n-                       required=True,\n-                       help="Sequences .fa file to predict"\n-                            " on (option -fasta)")\n-    p_man.add_argument("--model",\n-                       dest="in_model",\n-                       type=str,\n-                       required=True,\n-                       help="GraphProt model file to use for predictions"\n-                            " (option -model)")\n-    p_man.add_argument("--params",\n-                       dest="in_params",\n-                       type=str,\n-                       required=True,\n-                       help="Parameter file for given model")\n-    p_man.add_argument("--data-id",\n-                       dest="data_id",\n-                       type=str,\n-                       required=True,\n-                       help="Data ID (option -prefix)")\n+    p_opt.add_argument("-h", "--help", action="help", help="Print help message")\n+    p_man.add_argument(\n+        "--fasta",\n+        dest="in_fa",\n+        type=str,\n+        required=True,\n+        help="Sequences .fa file to predict" " on (option -fasta)",\n+    )\n+    p_man.add_argument(\n+        "--model",\n+        dest="in_model",\n+        type=str,\n+        required=True,\n+        help="GraphProt model file to use for predictions" " (option -model)",\n+    )\n+    p_man.add_argument(\n+        "--params",\n+        dest="in_params",\n+        type=str,\n+        required=True,\n+        help="Parameter file for given model",\n+    )\n+    p_man.add_argument(\n+        "--data-id",\n+        dest="data_id",\n+        type=str,\n+        required=True,\n+        help="Data ID (option -prefix)",\n+    )\n     # ---> I\'m  a conditional argument <---\n-    p_opt.add_argument("--ws-pred",\n-                       dest="ws_pred",\n-                       default=False,\n-                       action="store_true",\n-                       help="Run a whole site prediction instead "\n-                            "of calculating profiles (default: false)")\n+    p_opt.add_argument(\n+        "--ws-pred",\n+        dest="ws_pred",\n+        default=False,\n+        action="store_true",\n+        help="Run a whole site prediction instead "\n+        "of calculating profiles (default: false)",\n+    )\n     # Additional arguments.\n-    p_opt.add_argument("--sc-thr",\n-                       dest="score_thr",\n-                       type=float,\n-                       default=0,\n-                       help="Score threshold for extracting "\n-                            "average profile peak regions (default: 0)")\n-    p_opt.add_argument("--max-merge-dist",\n-                       dest="max_merge_dist",\n-                       type=int,\n-                       default=0,\n-                       choices=[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10],\n-  '..b'args.ap_extlr)\n+        )\n+        gplib.graphprot_profile_calc_avg_profile(\n+            profile_predictions_file,\n+            avg_prof_file,\n+            ap_extlr=args.ap_extlr,\n+            seq_ids_list=seq_ids_list,\n+            method=2,\n+        )\n         # Extract peak regions on sequences with threshold score 0.\n-        print("Extracting peak regions from average profile "\n-              "(score threshold = 0) ... ")\n+        print(\n+            "Extracting peak regions from average profile " "(score threshold = 0) ... "\n+        )\n         killpep8 = args.max_merge_dist\n-        gplib.graphprot_profile_extract_peak_regions(avg_prof_file,\n-                                                     avg_prof_peaks_file,\n-                                                     max_merge_dist=killpep8,\n-                                                     sc_thr=args.score_thr)\n+        gplib.graphprot_profile_extract_peak_regions(\n+            avg_prof_file,\n+            avg_prof_peaks_file,\n+            max_merge_dist=killpep8,\n+            sc_thr=args.score_thr,\n+        )\n         # Convert peaks to genomic coordinates.\n         if args.genomic_sites_bed:\n             print("Converting peak regions to genomic coordinates ... ")\n             killit = args.genomic_sites_bed\n-            gplib.bed_peaks_to_genomic_peaks(avg_prof_peaks_file,\n-                                             avg_prof_gen_peaks_file,\n-                                             print_rows=False,\n-                                             genomic_sites_bed=killit)\n+            gplib.bed_peaks_to_genomic_peaks(\n+                avg_prof_peaks_file,\n+                avg_prof_gen_peaks_file,\n+                print_rows=False,\n+                genomic_sites_bed=killit,\n+            )\n         # Extract peak regions with threshold score p50.\n         if args.conf_out:\n             sc_id = "pos_train_avg_profile_median_%i" % (args.ap_extlr)\n             # Filter by pos_tr_ws_pred_med median.\n-            assert sc_id in param_dic, "average profile extlr %i median "\\\n+            assert sc_id in param_dic, (\n+                "average profile extlr %i median "\n                 "information missing in .params file" % (args.ap_extlr)\n+            )\n             p50_sc_thr = float(param_dic[sc_id])\n-            print("Extracting p50 peak regions from average profile "\n-                  "(score threshold = %f) ... " % (p50_sc_thr))\n+            print(\n+                "Extracting p50 peak regions from average profile "\n+                "(score threshold = %f) ... " % (p50_sc_thr)\n+            )\n             despair = avg_prof_peaks_p50_file\n             pain = args.max_merge_dist\n-            gplib.graphprot_profile_extract_peak_regions(avg_prof_file,\n-                                                         despair,\n-                                                         max_merge_dist=pain,\n-                                                         sc_thr=p50_sc_thr)\n+            gplib.graphprot_profile_extract_peak_regions(\n+                avg_prof_file, despair, max_merge_dist=pain, sc_thr=p50_sc_thr\n+            )\n             # Convert peaks to genomic coordinates.\n             if args.genomic_sites_bed:\n-                print("Converting p50 peak regions to "\n-                      "genomic coordinates ... ")\n+                print("Converting p50 peak regions to " "genomic coordinates ... ")\n                 madness = args.genomic_sites_bed\n-                gplib.bed_peaks_to_genomic_peaks(avg_prof_peaks_p50_file,\n-                                                 avg_prof_gen_peaks_p50_file,\n-                                                 genomic_sites_bed=madness)\n+                gplib.bed_peaks_to_genomic_peaks(\n+                    avg_prof_peaks_p50_file,\n+                    avg_prof_gen_peaks_p50_file,\n+                    genomic_sites_bed=madness,\n+                )\n     # Done.\n     print("Script: I\'m done.")\n     print("Author: ... ")\n'
b
diff -r 4ad83aed5c3c -r ddcf35a868b8 graphprot_train_predict.xml
--- a/graphprot_train_predict.xml Thu Jan 28 15:06:14 2021 +0000
+++ b/graphprot_train_predict.xml Wed Jun 05 16:40:51 2024 +0000
b
@@ -20,6 +20,7 @@
                 $action_type.training_options.disable_cv
                 $action_type.training_options.disable_motifs
                 --min-train $action_type.training_options.min_train
+                --gp-output
 
         #elif $action_type.action_type_selector == 'predict':
             python '$__tool_directory__/graphprot_predict_wrapper.py'
@@ -35,6 +36,7 @@
                 --ap-extlr $action_type.prediction_options.ap_extlr
                 $action_type.prediction_options.conf_out
                 $action_type.prediction_options.ws_pred_out
+                --gp-output
         #end if
 
 
b
diff -r 4ad83aed5c3c -r ddcf35a868b8 graphprot_train_wrapper.py
--- a/graphprot_train_wrapper.py Thu Jan 28 15:06:14 2021 +0000
+++ b/graphprot_train_wrapper.py Wed Jun 05 16:40:51 2024 +0000
b
b'@@ -7,7 +7,6 @@\n \n import gplib\n \n-\n """\n \n TOOL DEPENDENCIES\n@@ -62,7 +61,8 @@\n """\n \n \n-###############################################################################\n+#######################################################################\n+\n \n def setup_argument_parser():\n     """Setup argparse parser."""\n@@ -84,89 +84,107 @@\n \n     """\n     # Define argument parser.\n-    p = ap.ArgumentParser(add_help=False,\n-                          prog="graphprot_train_wrapper.py",\n-                          description=help_description,\n-                          formatter_class=ap.MetavarTypeHelpFormatter)\n+    p = ap.ArgumentParser(\n+        add_help=False,\n+        prog="graphprot_train_wrapper.py",\n+        description=help_description,\n+        formatter_class=ap.MetavarTypeHelpFormatter,\n+    )\n \n     # Argument groups.\n     p_man = p.add_argument_group("REQUIRED ARGUMENTS")\n     p_opt = p.add_argument_group("OPTIONAL ARGUMENTS")\n \n     # Required arguments.\n-    p_opt.add_argument("-h", "--help",\n-                       action="help",\n-                       help="Print help message")\n-    p_man.add_argument("--pos",\n-                       dest="in_pos_fa",\n-                       type=str,\n-                       required=True,\n-                       help="Positive (= binding site) sequences .fa file "\n-                            "for model training (option -fasta)")\n-    p_man.add_argument("--neg",\n-                       dest="in_neg_fa",\n-                       type=str,\n-                       required=True,\n-                       help="Negative sequences .fa file for model "\n-                            "training (option -negfasta)")\n-    p_man.add_argument("--data-id",\n-                       dest="data_id",\n-                       type=str,\n-                       required=True,\n-                       help="Data ID (option -prefix)")\n+    p_opt.add_argument("-h", "--help", action="help", help="Print help message")\n+    p_man.add_argument(\n+        "--pos",\n+        dest="in_pos_fa",\n+        type=str,\n+        required=True,\n+        help="Positive (= binding site) sequences .fa file "\n+        "for model training (option -fasta)",\n+    )\n+    p_man.add_argument(\n+        "--neg",\n+        dest="in_neg_fa",\n+        type=str,\n+        required=True,\n+        help="Negative sequences .fa file for model " "training (option -negfasta)",\n+    )\n+    p_man.add_argument(\n+        "--data-id",\n+        dest="data_id",\n+        type=str,\n+        required=True,\n+        help="Data ID (option -prefix)",\n+    )\n     # Additional arguments.\n-    p_opt.add_argument("--opt-set-size",\n-                       dest="opt_set_size",\n-                       type=int,\n-                       default=500,\n-                       help="Hyperparameter optimization set size (taken "\n-                            "away from both --pos and --neg) (default: 500)")\n-    p_opt.add_argument("--opt-pos",\n-                       dest="opt_pos_fa",\n-                       type=str,\n-                       help="Positive (= binding site) sequences .fa file "\n-                            "for hyperparameter optimization (default: take "\n-                            "--opt-set-size from --pos)")\n-    p_opt.add_argument("--opt-neg",\n-                       dest="opt_neg_fa",\n-                       type=str,\n-                       help="Negative sequences .fa file for hyperparameter "\n-                            "optimization (default: take --opt-set-size "\n-                            "from --neg)")\n-    p_opt.add_argument("--min-train",\n-                       dest="min_train",\n-                       type=int,\n-                       default=500,\n-                       help="Minimum amount of training sites demanded "\n-                            "(default: 500)")\n-    p_opt.add_argument("--disable-cv",\n-                       dest="disable_cv",\n-                       default=False,\n-                       action="store_true",\n-               '..b' " -model "\n+        + model_file\n+    )\n     print(check_cmd)\n     output = subprocess.getoutput(check_cmd)\n-    assert output, \\\n-        "The following call of GraphProt.pl produced no output:\\n%s" \\\n-        % (check_cmd)\n+    assert output, "The following call of GraphProt.pl produced no output:\\n%s" % (\n+        check_cmd\n+    )\n     if args.gp_output:\n         print(output)\n     ws_predictions_file = args.data_id + ".predictions"\n-    assert os.path.exists(ws_predictions_file), \\\n-        "Whole site prediction output .predictions file \\"%s\\" not found" \\\n-        % (ws_predictions_file)\n+    assert os.path.exists(\n+        ws_predictions_file\n+    ), \'Whole site prediction output .predictions file "%s" not found\' % (\n+        ws_predictions_file\n+    )\n \n     """\n     Do profile predictions on positive training set.\n \n     """\n-    print("Starting profile predictions on positive training set "\n-          "-action predict_profile) ... ")\n-    check_cmd = "GraphProt.pl -action predict_profile -prefix " \\\n-        + args.data_id + " -fasta " + pos_train_fa + " " \\\n-        + param_string + " -model " + model_file\n+    print(\n+        "Starting profile predictions on positive training set "\n+        "-action predict_profile) ... "\n+    )\n+    check_cmd = (\n+        "GraphProt.pl -action predict_profile -prefix "\n+        + args.data_id\n+        + " -fasta "\n+        + pos_train_fa\n+        + " "\n+        + param_string\n+        + " -model "\n+        + model_file\n+    )\n     print(check_cmd)\n     output = subprocess.getoutput(check_cmd)\n-    assert output, \\\n-        "The following call of GraphProt.pl produced no output:\\n%s" \\\n-        % (check_cmd)\n+    assert output, "The following call of GraphProt.pl produced no output:\\n%s" % (\n+        check_cmd\n+    )\n     if args.gp_output:\n         print(output)\n     profile_predictions_file = args.data_id + ".profile"\n-    assert os.path.exists(profile_predictions_file), \\\n-        "Profile prediction output .profile file \\"%s\\" not found" \\\n-        % (profile_predictions_file)\n+    assert os.path.exists(\n+        profile_predictions_file\n+    ), \'Profile prediction output .profile file "%s" not found\' % (\n+        profile_predictions_file\n+    )\n \n     """\n     Get 50 % score (median) for .predictions and .profile file.\n@@ -454,12 +550,11 @@\n     print("Getting .profile and .predictions median scores ... ")\n \n     # Whole site scores median.\n-    ws_pred_median = \\\n-        gplib.graphprot_predictions_get_median(ws_predictions_file)\n+    ws_pred_median = gplib.graphprot_predictions_get_median(ws_predictions_file)\n     # Profile top site scores median.\n-    profile_median = \\\n-        gplib.graphprot_profile_get_tsm(profile_predictions_file,\n-                                        profile_type="profile")\n+    profile_median = gplib.graphprot_profile_get_tsm(\n+        profile_predictions_file, profile_type="profile"\n+    )\n     ws_pred_string = "pos_train_ws_pred_median: %f" % (ws_pred_median)\n     profile_string = "pos_train_profile_median: %f" % (profile_median)\n     gplib.echo_add_to_file(ws_pred_string, params_file)\n@@ -467,13 +562,14 @@\n     # Average profile top site scores median for extlr 1 to 10.\n     for i in range(10):\n         i += 1\n-        avg_profile_median = \\\n-            gplib.graphprot_profile_get_tsm(profile_predictions_file,\n-                                            profile_type="avg_profile",\n-                                            avg_profile_extlr=i)\n+        avg_profile_median = gplib.graphprot_profile_get_tsm(\n+            profile_predictions_file, profile_type="avg_profile", avg_profile_extlr=i\n+        )\n \n-        avg_profile_string = "pos_train_avg_profile_median_%i: %f" \\\n-            % (i, avg_profile_median)\n+        avg_profile_string = "pos_train_avg_profile_median_%i: %f" % (\n+            i,\n+            avg_profile_median,\n+        )\n         gplib.echo_add_to_file(avg_profile_string, params_file)\n \n     print("Script: I\'m done.")\n'