changeset 5:ddcf35a868b8 draft

planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/rna_tools/graphprot commit ad60258f5759eaa205fec4af6143c728ea131419
author bgruening
date Wed, 05 Jun 2024 16:40:51 +0000
parents 4ad83aed5c3c
children 33b590aa07c1
files gplib.py graphprot_predict_wrapper.py graphprot_train_predict.xml graphprot_train_wrapper.py
diffstat 4 files changed, 723 insertions(+), 518 deletions(-) [+]
line wrap: on
line diff
--- a/gplib.py	Thu Jan 28 15:06:14 2021 +0000
+++ b/gplib.py	Wed Jun 05 16:40:51 2024 +0000
@@ -1,4 +1,3 @@
-
 import gzip
 import random
 import re
@@ -6,7 +5,6 @@
 import subprocess
 from distutils.spawn import find_executable
 
-
 """
 
 Run doctests:
@@ -17,7 +15,8 @@
 """
 
 
-###############################################################################
+#######################################################################
+
 
 def graphprot_predictions_get_median(predictions_file):
     """
@@ -41,11 +40,12 @@
     return statistics.median(sc_list)
 
 
-###############################################################################
+#######################################################################
+
 
-def graphprot_profile_get_tsm(profile_file,
-                              profile_type="profile",
-                              avg_profile_extlr=5):
+def graphprot_profile_get_tsm(
+    profile_file, profile_type="profile", avg_profile_extlr=5
+):
 
     """
     Given a GraphProt .profile file, extract for each site (identified by
@@ -88,23 +88,21 @@
             max_list.append(max_sc)
         elif profile_type == "avg_profile":
             # Convert profile score list to average profile scores list.
-            aps_list = \
-                list_moving_window_average_values(lists_dic[seq_id],
-                                                  win_extlr=avg_profile_extlr)
+            aps_list = list_moving_window_average_values(
+                lists_dic[seq_id], win_extlr=avg_profile_extlr
+            )
             max_sc = max(aps_list)
             max_list.append(max_sc)
         else:
-            assert 0, "invalid profile_type argument given: \"%s\"" \
-                % (profile_type)
+            assert 0, 'invalid profile_type argument given: "%s"' % (profile_type)
     # Return the median.
     return statistics.median(max_list)
 
 
-###############################################################################
+#######################################################################
 
-def list_moving_window_average_values(in_list,
-                                      win_extlr=5,
-                                      method=1):
+
+def list_moving_window_average_values(in_list, win_extlr=5, method=1):
     """
     Take a list of numeric values, and calculate for each position a new value,
     by taking the mean value of the window of positions -win_extlr and
@@ -152,7 +150,8 @@
     return new_list
 
 
-###############################################################################
+#######################################################################
+
 
 def echo_add_to_file(echo_string, out_file):
     """
@@ -167,14 +166,16 @@
     assert not error, "echo is complaining:\n%s\n%s" % (check_cmd, output)
 
 
-###############################################################################
+#######################################################################
+
 
 def is_tool(name):
     """Check whether tool "name" is in PATH."""
     return find_executable(name) is not None
 
 
-###############################################################################
+#######################################################################
+
 
 def count_fasta_headers(fasta_file):
     """
@@ -194,7 +195,8 @@
     return row_count
 
 
-###############################################################################
+#######################################################################
+
 
 def make_file_copy(in_file, out_file):
     """
@@ -202,21 +204,26 @@
 
     """
     check_cmd = "cat " + in_file + " > " + out_file
-    assert in_file != out_file, \
-        "cat does not like to cat file into same file (%s)" % (check_cmd)
+    assert in_file != out_file, "cat does not like to cat file into same file (%s)" % (
+        check_cmd
+    )
     output = subprocess.getoutput(check_cmd)
     error = False
     if output:
         error = True
-    assert not error, \
-        "cat did not like your input (in_file: %s, out_file: %s):\n%s" \
-        % (in_file, out_file, output)
+    assert not error, "cat did not like your input (in_file: %s, out_file: %s):\n%s" % (
+        in_file,
+        out_file,
+        output,
+    )
 
 
-###############################################################################
+#######################################################################
+
 
-def split_fasta_into_test_train_files(in_fasta, test_out_fa, train_out_fa,
-                                      test_size=500):
+def split_fasta_into_test_train_files(
+    in_fasta, test_out_fa, train_out_fa, test_size=500
+):
     """
     Split in_fasta .fa file into two files (e.g. test, train).
 
@@ -230,7 +237,7 @@
     TRAINOUT = open(train_out_fa, "w")
     for seq_id in rand_ids_list:
         seq = seqs_dic[seq_id]
-        if (c_out >= test_size):
+        if c_out >= test_size:
             TRAINOUT.write(">%s\n%s\n" % (seq_id, seq))
         else:
             TESTOUT.write(">%s\n%s\n" % (seq_id, seq))
@@ -239,7 +246,8 @@
     TRAINOUT.close()
 
 
-###############################################################################
+#######################################################################
+
 
 def check_seqs_dic_format(seqs_dic):
     """
@@ -267,16 +275,19 @@
     return bad_seq_ids
 
 
-###############################################################################
+#######################################################################
+
 
-def read_fasta_into_dic(fasta_file,
-                        seqs_dic=False,
-                        ids_dic=False,
-                        read_dna=False,
-                        short_ensembl=False,
-                        reject_lc=False,
-                        convert_to_uc=False,
-                        skip_n_seqs=True):
+def read_fasta_into_dic(
+    fasta_file,
+    seqs_dic=False,
+    ids_dic=False,
+    read_dna=False,
+    short_ensembl=False,
+    reject_lc=False,
+    convert_to_uc=False,
+    skip_n_seqs=True,
+):
     """
     Read in FASTA sequences, convert to RNA, store in dictionary
     and return dictionary.
@@ -302,7 +313,7 @@
 
     # Go through FASTA file, extract sequences.
     if re.search(r".+\.gz$", fasta_file):
-        f = gzip.open(fasta_file, 'rt')
+        f = gzip.open(fasta_file, "rt")
     else:
         f = open(fasta_file, "r")
     for line in f:
@@ -315,9 +326,10 @@
                 if re.search(r".+\..+", seq_id):
                     m = re.search(r"(.+?)\..+", seq_id)
                     seq_id = m.group(1)
-            assert seq_id not in seqs_dic, \
-                "non-unique FASTA header \"%s\" in \"%s\"" \
-                % (seq_id, fasta_file)
+            assert seq_id not in seqs_dic, 'non-unique FASTA header "%s" in "%s"' % (
+                seq_id,
+                fasta_file,
+            )
             if ids_dic:
                 if seq_id in ids_dic:
                     seqs_dic[seq_id] = ""
@@ -328,31 +340,31 @@
                 m = re.search("([ACGTUN]+)", line, re.I)
                 seq = m.group(1)
                 if reject_lc:
-                    assert \
-                        not re.search("[a-z]", seq), \
-                        "lc char detected in seq \"%i\" (reject_lc=True)" \
-                        % (seq_id)
+                    assert not re.search(
+                        "[a-z]", seq
+                    ), 'lc char detected in seq "%i" (reject_lc=True)' % (seq_id)
                 if convert_to_uc:
                     seq = seq.upper()
                 # If sequences with N nucleotides should be skipped.
                 if skip_n_seqs:
                     if "n" in m.group(1) or "N" in m.group(1):
-                        print("WARNING: \"%s\" contains N. Discarding "
-                              "sequence ... " % (seq_id))
+                        print(
+                            'WARNING: "%s" contains N. Discarding '
+                            "sequence ... " % (seq_id)
+                        )
                         del seqs_dic[seq_id]
                         continue
                 # Convert to RNA, concatenate sequence.
                 if read_dna:
-                    seqs_dic[seq_id] += \
-                        m.group(1).replace("U", "T").replace("u", "t")
+                    seqs_dic[seq_id] += m.group(1).replace("U", "T").replace("u", "t")
                 else:
-                    seqs_dic[seq_id] += \
-                        m.group(1).replace("T", "U").replace("t", "u")
+                    seqs_dic[seq_id] += m.group(1).replace("T", "U").replace("t", "u")
     f.close()
     return seqs_dic
 
 
-###############################################################################
+#######################################################################
+
 
 def random_order_dic_keys_into_list(in_dic):
     """
@@ -366,7 +378,8 @@
     return id_list
 
 
-###############################################################################
+#######################################################################
+
 
 def graphprot_get_param_string(params_file):
     """
@@ -394,11 +407,12 @@
                 else:
                     param_string += "-%s %s " % (par, setting)
             else:
-                assert 0, "pattern matching failed for string \"%s\"" % (param)
+                assert 0, 'pattern matching failed for string "%s"' % (param)
     return param_string
 
 
-###############################################################################
+#######################################################################
+
 
 def seqs_dic_count_uc_nts(seqs_dic):
     """
@@ -416,11 +430,12 @@
     assert seqs_dic, "Given sequence dictionary empty"
     c_uc = 0
     for seq_id in seqs_dic:
-        c_uc += len(re.findall(r'[A-Z]', seqs_dic[seq_id]))
+        c_uc += len(re.findall(r"[A-Z]", seqs_dic[seq_id]))
     return c_uc
 
 
-###############################################################################
+#######################################################################
+
 
 def seqs_dic_count_lc_nts(seqs_dic):
     """
@@ -438,11 +453,12 @@
     assert seqs_dic, "Given sequence dictionary empty"
     c_uc = 0
     for seq_id in seqs_dic:
-        c_uc += len(re.findall(r'[a-z]', seqs_dic[seq_id]))
+        c_uc += len(re.findall(r"[a-z]", seqs_dic[seq_id]))
     return c_uc
 
 
-###############################################################################
+#######################################################################
+
 
 def count_file_rows(in_file):
     """
@@ -462,7 +478,8 @@
     return row_count
 
 
-###############################################################################
+#######################################################################
+
 
 def bed_check_six_col_format(bed_file):
     """
@@ -488,7 +505,8 @@
     return six_col_format
 
 
-###############################################################################
+#######################################################################
+
 
 def bed_check_unique_ids(bed_file):
     """
@@ -512,7 +530,8 @@
         return True
 
 
-###############################################################################
+#######################################################################
+
 
 def get_seq_lengths_from_seqs_dic(seqs_dic):
     """
@@ -527,7 +546,8 @@
     return seq_len_dic
 
 
-###############################################################################
+#######################################################################
+
 
 def bed_get_region_lengths(bed_file):
     """
@@ -548,19 +568,19 @@
             site_e = int(cols[2])
             site_id = cols[3]
             site_l = site_e - site_s
-            assert site_id \
-                not in id2len_dic, \
-                "column 4 IDs not unique in given .bed file \"%s\"" \
-                % (bed_file)
+            assert (
+                site_id not in id2len_dic
+            ), 'column 4 IDs not unique in given .bed file "%s"' % (bed_file)
             id2len_dic[site_id] = site_l
     f.closed
-    assert id2len_dic, \
-        "No IDs read into dic (input file \"%s\" empty or malformatted?)" \
-        % (bed_file)
+    assert (
+        id2len_dic
+    ), 'No IDs read into dic (input file "%s" empty or malformatted?)' % (bed_file)
     return id2len_dic
 
 
-###############################################################################
+#######################################################################
+
 
 def graphprot_get_param_dic(params_file):
     """
@@ -599,10 +619,10 @@
     return param_dic
 
 
-###############################################################################
+#######################################################################
 
-def graphprot_filter_predictions_file(in_file, out_file,
-                                      sc_thr=0):
+
+def graphprot_filter_predictions_file(in_file, out_file, sc_thr=0):
     """
     Filter GraphProt .predictions file by given score thr_sc.
     """
@@ -619,7 +639,8 @@
     OUTPRED.close()
 
 
-###############################################################################
+#######################################################################
+
 
 def fasta_read_in_ids(fasta_file):
     """
@@ -642,12 +663,12 @@
     return ids_list
 
 
-###############################################################################
+#######################################################################
+
 
-def graphprot_profile_calc_avg_profile(in_file, out_file,
-                                       ap_extlr=5,
-                                       seq_ids_list=False,
-                                       method=1):
+def graphprot_profile_calc_avg_profile(
+    in_file, out_file, ap_extlr=5, seq_ids_list=False, method=1
+):
     """
     Given a GraphProt .profile file, calculate average profiles and output
     average profile file.
@@ -702,15 +723,16 @@
         if seq_ids_list:
             c_seq_ids = len(seq_ids_list)
             c_site_ids = len(site_starts_dic)
-            assert c_seq_ids == c_site_ids, \
-                "# sequence IDs != # site IDs (%i != %i)" \
-                % (c_seq_ids, c_site_ids)
+            assert (
+                c_seq_ids == c_site_ids
+            ), "# sequence IDs != # site IDs (%i != %i)" % (c_seq_ids, c_site_ids)
         OUTPROF = open(out_file, "w")
         # For each site, calculate average profile scores list.
         for site_id in lists_dic:
             # Convert profile score list to average profile scores list.
-            aps_list = list_moving_window_average_values(lists_dic[site_id],
-                                                         win_extlr=ap_extlr)
+            aps_list = list_moving_window_average_values(
+                lists_dic[site_id], win_extlr=ap_extlr
+            )
             start_pos = site_starts_dic[site_id]
             # Get original FASTA sequence ID.
             if seq_ids_list:
@@ -741,10 +763,9 @@
                 if cur_id != old_id:
                     # Process old id scores.
                     if scores_list:
-                        aps_list = \
-                            list_moving_window_average_values(
-                                scores_list,
-                                win_extlr=ap_extlr)
+                        aps_list = list_moving_window_average_values(
+                            scores_list, win_extlr=ap_extlr
+                        )
                         start_pos = site_starts_dic[old_id]
                         seq_id = old_id
                         # Get original FASTA sequence ID.
@@ -763,8 +784,9 @@
         f.close()
         # Process last block.
         if scores_list:
-            aps_list = list_moving_window_average_values(scores_list,
-                                                         win_extlr=ap_extlr)
+            aps_list = list_moving_window_average_values(
+                scores_list, win_extlr=ap_extlr
+            )
             start_pos = site_starts_dic[old_id]
             seq_id = old_id
             # Get original FASTA sequence ID.
@@ -776,11 +798,12 @@
         OUTPROF.close()
 
 
-###############################################################################
+#######################################################################
+
 
-def graphprot_profile_extract_peak_regions(in_file, out_file,
-                                           max_merge_dist=0,
-                                           sc_thr=0):
+def graphprot_profile_extract_peak_regions(
+    in_file, out_file, max_merge_dist=0, sc_thr=0
+):
     """
     Extract peak regions from GraphProt .profile file.
     Store the peak regions (defined as regions with scores >= sc_thr)
@@ -835,21 +858,22 @@
                 # Process old id scores.
                 if scores_list:
                     # Extract peaks from region.
-                    peak_list = \
-                        list_extract_peaks(scores_list,
-                                           max_merge_dist=max_merge_dist,
-                                           coords="bed",
-                                           sc_thr=sc_thr)
+                    peak_list = list_extract_peaks(
+                        scores_list,
+                        max_merge_dist=max_merge_dist,
+                        coords="bed",
+                        sc_thr=sc_thr,
+                    )
                     start_pos = site_starts_dic[old_id]
                     # Print out peaks in .bed format.
                     for ln in peak_list:
                         peak_s = start_pos + ln[0]
                         peak_e = start_pos + ln[1]
                         site_id = "%s,%i" % (old_id, ln[2])
-                        OUTPEAKS.write("%s\t%i\t%i"
-                                       "\t%s\t%f\t+\n"
-                                       % (old_id, peak_s,
-                                          peak_e, site_id, ln[3]))
+                        OUTPEAKS.write(
+                            "%s\t%i\t%i"
+                            "\t%s\t%f\t+\n" % (old_id, peak_s, peak_e, site_id, ln[3])
+                        )
                     # Reset list.
                     scores_list = []
                 old_id = cur_id
@@ -861,27 +885,25 @@
     # Process last block.
     if scores_list:
         # Extract peaks from region.
-        peak_list = list_extract_peaks(scores_list,
-                                       max_merge_dist=max_merge_dist,
-                                       coords="bed",
-                                       sc_thr=sc_thr)
+        peak_list = list_extract_peaks(
+            scores_list, max_merge_dist=max_merge_dist, coords="bed", sc_thr=sc_thr
+        )
         start_pos = site_starts_dic[old_id]
         # Print out peaks in .bed format.
         for ln in peak_list:
             peak_s = start_pos + ln[0]
             peak_e = start_pos + ln[1]
             site_id = "%s,%i" % (old_id, ln[2])  # best score also 1-based.
-            OUTPEAKS.write("%s\t%i\t%i\t%s\t%f\t+\n"
-                           % (old_id, peak_s, peak_e, site_id, ln[3]))
+            OUTPEAKS.write(
+                "%s\t%i\t%i\t%s\t%f\t+\n" % (old_id, peak_s, peak_e, site_id, ln[3])
+            )
     OUTPEAKS.close()
 
 
-###############################################################################
+#######################################################################
 
-def list_extract_peaks(in_list,
-                       max_merge_dist=0,
-                       coords="list",
-                       sc_thr=0):
+
+def list_extract_peaks(in_list, max_merge_dist=0, coords="list", sc_thr=0):
     """
     Extract peak regions from list.
     Peak region is defined as region >= score threshold.
@@ -969,8 +991,12 @@
                     if peak_list[i][3] < peak_list[j][3]:
                         new_top_pos = peak_list[j][2]
                         new_top_sc = peak_list[j][3]
-                    new_peak = [peak_list[i][0], peak_list[j][1],
-                                new_top_pos, new_top_sc]
+                    new_peak = [
+                        peak_list[i][0],
+                        peak_list[j][1],
+                        new_top_pos,
+                        new_top_sc,
+                    ]
                 # If two peaks were merged.
                 if new_peak:
                     merged_peak_list.append(new_peak)
@@ -991,10 +1017,12 @@
     return peak_list
 
 
-###############################################################################
+#######################################################################
+
 
-def bed_peaks_to_genomic_peaks(peak_file, genomic_peak_file, genomic_sites_bed,
-                               print_rows=False):
+def bed_peaks_to_genomic_peaks(
+    peak_file, genomic_peak_file, genomic_sites_bed, print_rows=False
+):
     """
     Given a .bed file of sequence peak regions (possible coordinates from
     0 to length of s), convert peak coordinates to genomic coordinates.
@@ -1017,10 +1045,9 @@
             row = line.strip()
             cols = line.strip().split("\t")
             site_id = cols[3]
-            assert site_id \
-                not in id2row_dic, \
-                "column 4 IDs not unique in given .bed file \"%s\"" \
-                % (genomic_sites_bed)
+            assert (
+                site_id not in id2row_dic
+            ), 'column 4 IDs not unique in given .bed file "%s"' % (genomic_sites_bed)
             id2row_dic[site_id] = row
     f.close()
 
@@ -1034,13 +1061,14 @@
             site_e = int(cols[2])
             site_id2 = cols[3]
             site_sc = float(cols[4])
-            assert re.search(".+,.+", site_id2), \
-                "regular expression failed for ID \"%s\"" % (site_id2)
+            assert re.search(
+                ".+,.+", site_id2
+            ), 'regular expression failed for ID "%s"' % (site_id2)
             m = re.search(r".+,(\d+)", site_id2)
             sc_pos = int(m.group(1))  # 1-based.
-            assert site_id in id2row_dic, \
-                "site ID \"%s\" not found in genomic sites dictionary" \
-                % (site_id)
+            assert (
+                site_id in id2row_dic
+            ), 'site ID "%s" not found in genomic sites dictionary' % (site_id)
             row = id2row_dic[site_id]
             rowl = row.split("\t")
             gen_chr = rowl[0]
@@ -1054,16 +1082,23 @@
                 new_s = gen_e - site_e
                 new_e = gen_e - site_s
                 new_sc_pos = gen_e - sc_pos + 1  # keep 1-based.
-            new_row = "%s\t%i\t%i\t%s,%i\t%f\t%s" \
-                      % (gen_chr, new_s, new_e,
-                         site_id, new_sc_pos, site_sc, gen_pol)
+            new_row = "%s\t%i\t%i\t%s,%i\t%f\t%s" % (
+                gen_chr,
+                new_s,
+                new_e,
+                site_id,
+                new_sc_pos,
+                site_sc,
+                gen_pol,
+            )
             OUTPEAKS.write("%s\n" % (new_row))
             if print_rows:
                 print(new_row)
     OUTPEAKS.close()
 
 
-###############################################################################
+#######################################################################
+
 
 def diff_two_files_identical(file1, file2):
     """
@@ -1087,4 +1122,4 @@
     return same
 
 
-###############################################################################
+#######################################################################
--- a/graphprot_predict_wrapper.py	Thu Jan 28 15:06:14 2021 +0000
+++ b/graphprot_predict_wrapper.py	Wed Jun 05 16:40:51 2024 +0000
@@ -7,7 +7,6 @@
 
 import gplib
 
-
 """
 
 TOOL DEPENDENCIES
@@ -81,7 +80,8 @@
 """
 
 
-###############################################################################
+#######################################################################
+
 
 def setup_argument_parser():
     """Setup argparse parser."""
@@ -100,100 +100,119 @@
 
     """
     # Define argument parser.
-    p = ap.ArgumentParser(add_help=False,
-                          prog="graphprot_predict_wrapper.py",
-                          description=help_description,
-                          formatter_class=ap.MetavarTypeHelpFormatter)
+    p = ap.ArgumentParser(
+        add_help=False,
+        prog="graphprot_predict_wrapper.py",
+        description=help_description,
+        formatter_class=ap.MetavarTypeHelpFormatter,
+    )
 
     # Argument groups.
     p_man = p.add_argument_group("REQUIRED ARGUMENTS")
     p_opt = p.add_argument_group("OPTIONAL ARGUMENTS")
 
     # Required arguments.
-    p_opt.add_argument("-h", "--help",
-                       action="help",
-                       help="Print help message")
-    p_man.add_argument("--fasta",
-                       dest="in_fa",
-                       type=str,
-                       required=True,
-                       help="Sequences .fa file to predict"
-                            " on (option -fasta)")
-    p_man.add_argument("--model",
-                       dest="in_model",
-                       type=str,
-                       required=True,
-                       help="GraphProt model file to use for predictions"
-                            " (option -model)")
-    p_man.add_argument("--params",
-                       dest="in_params",
-                       type=str,
-                       required=True,
-                       help="Parameter file for given model")
-    p_man.add_argument("--data-id",
-                       dest="data_id",
-                       type=str,
-                       required=True,
-                       help="Data ID (option -prefix)")
+    p_opt.add_argument("-h", "--help", action="help", help="Print help message")
+    p_man.add_argument(
+        "--fasta",
+        dest="in_fa",
+        type=str,
+        required=True,
+        help="Sequences .fa file to predict" " on (option -fasta)",
+    )
+    p_man.add_argument(
+        "--model",
+        dest="in_model",
+        type=str,
+        required=True,
+        help="GraphProt model file to use for predictions" " (option -model)",
+    )
+    p_man.add_argument(
+        "--params",
+        dest="in_params",
+        type=str,
+        required=True,
+        help="Parameter file for given model",
+    )
+    p_man.add_argument(
+        "--data-id",
+        dest="data_id",
+        type=str,
+        required=True,
+        help="Data ID (option -prefix)",
+    )
     # ---> I'm  a conditional argument <---
-    p_opt.add_argument("--ws-pred",
-                       dest="ws_pred",
-                       default=False,
-                       action="store_true",
-                       help="Run a whole site prediction instead "
-                            "of calculating profiles (default: false)")
+    p_opt.add_argument(
+        "--ws-pred",
+        dest="ws_pred",
+        default=False,
+        action="store_true",
+        help="Run a whole site prediction instead "
+        "of calculating profiles (default: false)",
+    )
     # Additional arguments.
-    p_opt.add_argument("--sc-thr",
-                       dest="score_thr",
-                       type=float,
-                       default=0,
-                       help="Score threshold for extracting "
-                            "average profile peak regions (default: 0)")
-    p_opt.add_argument("--max-merge-dist",
-                       dest="max_merge_dist",
-                       type=int,
-                       default=0,
-                       choices=[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
-                       help="Maximum merge distance for nearby peak regions"
-                            " (default: report all non-overlapping regions)")
-    p_opt.add_argument("--gen-site-bed",
-                       dest="genomic_sites_bed",
-                       type=str,
-                       help=".bed file specifying the genomic regions of "
-                            "the input .fa sequences. Corrupt .bed "
-                            "information will be punished (default: false)")
-    p_opt.add_argument("--conf-out",
-                       dest="conf_out",
-                       default=False,
-                       action="store_true",
-                       help="Output filtered peak regions BED file or "
-                            "predictions file (if --ws-pred) using the median "
-                            "positive training site score for filtering "
-                            "(default: false)")
-    p_opt.add_argument("--gp-output",
-                       dest="gp_output",
-                       default=False,
-                       action="store_true",
-                       help="Print output produced by GraphProt "
-                            "(default: false)")
-    p_opt.add_argument("--ap-extlr",
-                       dest="ap_extlr",
-                       type=int,
-                       default=5,
-                       choices=[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
-                       help="Define average profile up- and downstream "
-                            "extension to produce the average profile. "
-                            "The mean over small sequence windows "
-                            "(window length = --ap-extlr*2 + 1) is used to "
-                            "get position scores, thus the average profile "
-                            "is more smooth than the initial profile output "
-                            "by GraphProt (default: 5)")
+    p_opt.add_argument(
+        "--sc-thr",
+        dest="score_thr",
+        type=float,
+        default=0,
+        help="Score threshold for extracting "
+        "average profile peak regions (default: 0)",
+    )
+    p_opt.add_argument(
+        "--max-merge-dist",
+        dest="max_merge_dist",
+        type=int,
+        default=0,
+        choices=[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
+        help="Maximum merge distance for nearby peak regions"
+        " (default: report all non-overlapping regions)",
+    )
+    p_opt.add_argument(
+        "--gen-site-bed",
+        dest="genomic_sites_bed",
+        type=str,
+        help=".bed file specifying the genomic regions of "
+        "the input .fa sequences. Corrupt .bed "
+        "information will be punished (default: false)",
+    )
+    p_opt.add_argument(
+        "--conf-out",
+        dest="conf_out",
+        default=False,
+        action="store_true",
+        help="Output filtered peak regions BED file or "
+        "predictions file (if --ws-pred) using the median "
+        "positive training site score for filtering "
+        "(default: false)",
+    )
+    p_opt.add_argument(
+        "--gp-output",
+        dest="gp_output",
+        default=False,
+        action="store_true",
+        help="Print output produced by GraphProt " "(default: false)",
+    )
+    p_opt.add_argument(
+        "--ap-extlr",
+        dest="ap_extlr",
+        type=int,
+        default=5,
+        choices=[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
+        help="Define average profile up- and downstream "
+        "extension to produce the average profile. "
+        "The mean over small sequence windows "
+        "(window length = --ap-extlr*2 + 1) is used to "
+        "get position scores, thus the average profile "
+        "is more smooth than the initial profile output "
+        "by GraphProt (default: 5)",
+    )
     return p
 
 
-###############################################################################
+#######################################################################
 
-if __name__ == '__main__':
+if __name__ == "__main__":
 
     # Setup argparse.
     parser = setup_argument_parser()
@@ -209,70 +228,86 @@
     # Check tool availability.
     assert gplib.is_tool("GraphProt.pl"), "GraphProt.pl not in PATH"
     # Check file inputs.
-    assert os.path.exists(args.in_fa), \
-        "input .fa file \"%s\" not found" % (args.in_fa)
-    assert os.path.exists(args.in_model), \
-        "input .model file \"%s\" not found" % (args.in_model)
-    assert os.path.exists(args.in_params), \
-        "input .params file \"%s\" not found" % (args.in_params)
+    assert os.path.exists(args.in_fa), 'input .fa file "%s" not found' % (args.in_fa)
+    assert os.path.exists(args.in_model), 'input .model file "%s" not found' % (
+        args.in_model
+    )
+    assert os.path.exists(args.in_params), 'input .params file "%s" not found' % (
+        args.in_params
+    )
     # Count .fa entries.
     c_in_fa = gplib.count_fasta_headers(args.in_fa)
-    assert c_in_fa, "input .fa file \"%s\" no headers found" % (args.in_fa)
+    assert c_in_fa, 'input .fa file "%s" no headers found' % (args.in_fa)
     print("# input .fa sequences:   %i" % (c_in_fa))
     # Read in FASTA sequences to check for uppercase sequences.
     seqs_dic = gplib.read_fasta_into_dic(args.in_fa)
     # Check for lowercase only sequences, which cause GP to crash.
-    error_mess = "input sequences encountered containing "\
-        "only lowercase characters or lowercase characters in between "\
-        "uppercase characters. Please provide either all uppercase "\
-        "sequences or sequences containing uppercase regions surrounded "\
-        "by lowercase context regions for structure calculation (see "\
-        "viewpoint concept in original GraphProt publication "\
+    error_mess = (
+        "input sequences encountered containing "
+        "only lowercase characters or lowercase characters in between "
+        "uppercase characters. Please provide either all uppercase "
+        "sequences or sequences containing uppercase regions surrounded "
+        "by lowercase context regions for structure calculation (see "
+        "viewpoint concept in original GraphProt publication "
         "for more details)"
+    )
     if args.ws_pred:
         bad_ids = gplib.check_seqs_dic_format(seqs_dic)
         assert not bad_ids, "%s" % (error_mess)
 
     c_uc_nt = gplib.seqs_dic_count_uc_nts(seqs_dic)
-    assert c_uc_nt, "no uppercase nucleotides in input .fa sequences. "\
-                    "Please change sequences to uppercase (keep in mind "\
-                    "GraphProt only scores uppercase regions (according "\
-                    "to its viewpoint concept)"
+    assert c_uc_nt, (
+        "no uppercase nucleotides in input .fa sequences. "
+        "Please change sequences to uppercase (keep in mind "
+        "GraphProt only scores uppercase regions (according "
+        "to its viewpoint concept)"
+    )
     if not args.ws_pred:
         # Check for lowercase sequences.
         c_lc_nt = gplib.seqs_dic_count_lc_nts(seqs_dic)
-        assert not c_lc_nt, "lowercase nucleotides in input .fa not allowed"\
-            " in profile predictions, since GraphProt only scores "\
+        assert not c_lc_nt, (
+            "lowercase nucleotides in input .fa not allowed"
+            " in profile predictions, since GraphProt only scores "
             "uppercase regions (according to its viewpoint concept)"
+        )
     # Check .bed.
     if args.genomic_sites_bed:
         # An array of checks, marvelous.
-        assert \
-            os.path.exists(args.genomic_sites_bed), \
-            "genomic .bed file \"%s\" not found" % (args.genomic_sites_bed)
+        assert os.path.exists(
+            args.genomic_sites_bed
+        ), 'genomic .bed file "%s" not found' % (args.genomic_sites_bed)
         # Check .bed for content.
-        assert gplib.count_file_rows(args.genomic_sites_bed), \
-            "genomic .bed file \"%s\" is empty" % (args.genomic_sites_bed)
+        assert gplib.count_file_rows(
+            args.genomic_sites_bed
+        ), 'genomic .bed file "%s" is empty' % (args.genomic_sites_bed)
         # Check .bed for 6-column format.
-        assert gplib.bed_check_six_col_format(args.genomic_sites_bed), \
-            "genomic .bed file \"%s\" appears to not "\
-            "be in 6-column .bed format" % (args.genomic_sites_bed)
+        assert gplib.bed_check_six_col_format(
+            args.genomic_sites_bed
+        ), 'genomic .bed file "%s" appears to not ' "be in 6-column .bed format" % (
+            args.genomic_sites_bed
+        )
         # Check for unique column 4 IDs.
-        assert gplib.bed_check_unique_ids(args.genomic_sites_bed), \
-            "genomic .bed file \"%s\" column 4 IDs not unique" % \
-            (args.genomic_sites_bed)
+        assert gplib.bed_check_unique_ids(
+            args.genomic_sites_bed
+        ), 'genomic .bed file "%s" column 4 IDs not unique' % (args.genomic_sites_bed)
         # Read in .bed regions, compare to FASTA sequences.
         seq_len_dic = gplib.get_seq_lengths_from_seqs_dic(seqs_dic)
         reg_len_dic = gplib.bed_get_region_lengths(args.genomic_sites_bed)
         for seq_id in seq_len_dic:
             seq_l = seq_len_dic[seq_id]
-            assert seq_id in reg_len_dic, \
-                "sequence ID \"%s\" missing in input .bed \"%s\"" % \
-                (seq_id, args.genomic_sites_bed)
+            assert (
+                seq_id in reg_len_dic
+            ), 'sequence ID "%s" missing in input .bed "%s"' % (
+                seq_id,
+                args.genomic_sites_bed,
+            )
             reg_l = reg_len_dic[seq_id]
-            assert seq_l == reg_l, \
-                "sequence length differs from .bed region length (%i != %i)" \
-                % (seq_l, reg_l)
+            assert (
+                seq_l == reg_l
+            ), "sequence length differs from .bed region length (%i != %i)" % (
+                seq_l,
+                reg_l,
+            )
     # Read in model parameters.
     param_dic = gplib.graphprot_get_param_dic(args.in_params)
     # Create GraphProt parameter string.
@@ -284,110 +319,147 @@
     """
     if args.ws_pred:
         # Do whole site prediction.
-        print("Starting whole site predictions on input .fa file"
-              " (-action predict) ... ")
-        check_cmd = "GraphProt.pl -action predict -prefix " \
-            + args.data_id + " -fasta " + args.in_fa + " " \
-            + param_string + " -model " + args.in_model
+        print(
+            "Starting whole site predictions on input .fa file"
+            " (-action predict) ... "
+        )
+        check_cmd = (
+            "GraphProt.pl -action predict -prefix "
+            + args.data_id
+            + " -fasta "
+            + args.in_fa
+            + " "
+            + param_string
+            + " -model "
+            + args.in_model
+        )
         output = subprocess.getoutput(check_cmd)
-        assert output, "the following call of GraphProt.pl "\
-            "produced no output:\n%s" % (check_cmd)
+        assert (
+            output
+        ), "the following call of GraphProt.pl " "produced no output:\n%s" % (check_cmd)
         if args.gp_output:
             print(output)
         ws_predictions_file = args.data_id + ".predictions"
-        assert os.path.exists(ws_predictions_file), \
-            "Whole site prediction output .predictions file \"%s\" not found" \
-            % (ws_predictions_file)
+        assert os.path.exists(
+            ws_predictions_file
+        ), 'Whole site prediction output .predictions file "%s" not found' % (
+            ws_predictions_file
+        )
         if args.conf_out:
             # Filter by pos_train_ws_pred_median median.
-            assert "pos_train_ws_pred_median" in param_dic, \
-                "whole site top scores median information "\
-                "missing in .params file"
-            pos_tr_ws_pred_med = \
-                float(param_dic["pos_train_ws_pred_median"])
+            assert "pos_train_ws_pred_median" in param_dic, (
+                "whole site top scores median information " "missing in .params file"
+            )
+            pos_tr_ws_pred_med = float(param_dic["pos_train_ws_pred_median"])
             # Filtered file.
             filt_ws_predictions_file = args.data_id + ".p50.predictions"
-            print("Extracting p50 sites from whole site predictions"
-                  " (score threshold = %f) ... " % (pos_tr_ws_pred_med))
-            gplib.graphprot_filter_predictions_file(ws_predictions_file,
-                                                    filt_ws_predictions_file,
-                                                    sc_thr=pos_tr_ws_pred_med)
+            print(
+                "Extracting p50 sites from whole site predictions"
+                " (score threshold = %f) ... " % (pos_tr_ws_pred_med)
+            )
+            gplib.graphprot_filter_predictions_file(
+                ws_predictions_file, filt_ws_predictions_file, sc_thr=pos_tr_ws_pred_med
+            )
     else:
         # Do profile prediction.
-        print("Starting profile predictions on on input .fa file "
-              "(-action predict_profile) ... ")
-        check_cmd = "GraphProt.pl -action predict_profile -prefix " \
-                    + args.data_id + " -fasta " + args.in_fa + " " + \
-                    param_string + " -model " + args.in_model
+        print(
+            "Starting profile predictions on on input .fa file "
+            "(-action predict_profile) ... "
+        )
+        check_cmd = (
+            "GraphProt.pl -action predict_profile -prefix "
+            + args.data_id
+            + " -fasta "
+            + args.in_fa
+            + " "
+            + param_string
+            + " -model "
+            + args.in_model
+        )
         output = subprocess.getoutput(check_cmd)
-        assert output, "the following call of GraphProt.pl produced "\
-                       "no output:\n%s" % (check_cmd)
+        assert (
+            output
+        ), "the following call of GraphProt.pl produced " "no output:\n%s" % (check_cmd)
         if args.gp_output:
             print(output)
         profile_predictions_file = args.data_id + ".profile"
-        assert os.path.exists(profile_predictions_file), \
-            "Profile prediction output .profile file \"%s\" not found" % \
-            (profile_predictions_file)
+        assert os.path.exists(
+            profile_predictions_file
+        ), 'Profile prediction output .profile file "%s" not found' % (
+            profile_predictions_file
+        )
 
         # Profile prediction output files.
         avg_prof_file = args.data_id + ".avg_profile"
         avg_prof_peaks_file = args.data_id + ".avg_profile.peaks.bed"
-        avg_prof_gen_peaks_file = args.data_id + \
-            ".avg_profile.genomic_peaks.bed"
-        avg_prof_peaks_p50_file = args.data_id + \
-            ".avg_profile.p50.peaks.bed"
-        avg_prof_gen_peaks_p50_file = args.data_id + \
-            ".avg_profile.p50.genomic_peaks.bed"
+        avg_prof_gen_peaks_file = args.data_id + ".avg_profile.genomic_peaks.bed"
+        avg_prof_peaks_p50_file = args.data_id + ".avg_profile.p50.peaks.bed"
+        avg_prof_gen_peaks_p50_file = (
+            args.data_id + ".avg_profile.p50.genomic_peaks.bed"
+        )
 
         # Get sequence IDs in order from input .fa file.
         seq_ids_list = gplib.fasta_read_in_ids(args.in_fa)
         # Calculate average profiles.
-        print("Getting average profile from profile "
-              "(extlr for smoothing: %i) ... " % (args.ap_extlr))
-        gplib.graphprot_profile_calc_avg_profile(profile_predictions_file,
-                                                 avg_prof_file,
-                                                 ap_extlr=args.ap_extlr,
-                                                 seq_ids_list=seq_ids_list,
-                                                 method=2)
+        print(
+            "Getting average profile from profile "
+            "(extlr for smoothing: %i) ... " % (args.ap_extlr)
+        )
+        gplib.graphprot_profile_calc_avg_profile(
+            profile_predictions_file,
+            avg_prof_file,
+            ap_extlr=args.ap_extlr,
+            seq_ids_list=seq_ids_list,
+            method=2,
+        )
         # Extract peak regions on sequences with threshold score 0.
-        print("Extracting peak regions from average profile "
-              "(score threshold = 0) ... ")
+        print(
+            "Extracting peak regions from average profile " "(score threshold = 0) ... "
+        )
         killpep8 = args.max_merge_dist
-        gplib.graphprot_profile_extract_peak_regions(avg_prof_file,
-                                                     avg_prof_peaks_file,
-                                                     max_merge_dist=killpep8,
-                                                     sc_thr=args.score_thr)
+        gplib.graphprot_profile_extract_peak_regions(
+            avg_prof_file,
+            avg_prof_peaks_file,
+            max_merge_dist=killpep8,
+            sc_thr=args.score_thr,
+        )
         # Convert peaks to genomic coordinates.
         if args.genomic_sites_bed:
             print("Converting peak regions to genomic coordinates ... ")
             killit = args.genomic_sites_bed
-            gplib.bed_peaks_to_genomic_peaks(avg_prof_peaks_file,
-                                             avg_prof_gen_peaks_file,
-                                             print_rows=False,
-                                             genomic_sites_bed=killit)
+            gplib.bed_peaks_to_genomic_peaks(
+                avg_prof_peaks_file,
+                avg_prof_gen_peaks_file,
+                print_rows=False,
+                genomic_sites_bed=killit,
+            )
         # Extract peak regions with threshold score p50.
         if args.conf_out:
             sc_id = "pos_train_avg_profile_median_%i" % (args.ap_extlr)
             # Filter by pos_tr_ws_pred_med median.
-            assert sc_id in param_dic, "average profile extlr %i median "\
+            assert sc_id in param_dic, (
+                "average profile extlr %i median "
                 "information missing in .params file" % (args.ap_extlr)
+            )
             p50_sc_thr = float(param_dic[sc_id])
-            print("Extracting p50 peak regions from average profile "
-                  "(score threshold = %f) ... " % (p50_sc_thr))
+            print(
+                "Extracting p50 peak regions from average profile "
+                "(score threshold = %f) ... " % (p50_sc_thr)
+            )
             despair = avg_prof_peaks_p50_file
             pain = args.max_merge_dist
-            gplib.graphprot_profile_extract_peak_regions(avg_prof_file,
-                                                         despair,
-                                                         max_merge_dist=pain,
-                                                         sc_thr=p50_sc_thr)
+            gplib.graphprot_profile_extract_peak_regions(
+                avg_prof_file, despair, max_merge_dist=pain, sc_thr=p50_sc_thr
+            )
             # Convert peaks to genomic coordinates.
             if args.genomic_sites_bed:
-                print("Converting p50 peak regions to "
-                      "genomic coordinates ... ")
+                print("Converting p50 peak regions to " "genomic coordinates ... ")
                 madness = args.genomic_sites_bed
-                gplib.bed_peaks_to_genomic_peaks(avg_prof_peaks_p50_file,
-                                                 avg_prof_gen_peaks_p50_file,
-                                                 genomic_sites_bed=madness)
+                gplib.bed_peaks_to_genomic_peaks(
+                    avg_prof_peaks_p50_file,
+                    avg_prof_gen_peaks_p50_file,
+                    genomic_sites_bed=madness,
+                )
     # Done.
     print("Script: I'm done.")
     print("Author: ... ")
--- a/graphprot_train_predict.xml	Thu Jan 28 15:06:14 2021 +0000
+++ b/graphprot_train_predict.xml	Wed Jun 05 16:40:51 2024 +0000
@@ -20,6 +20,7 @@
                 $action_type.training_options.disable_cv
                 $action_type.training_options.disable_motifs
                 --min-train $action_type.training_options.min_train
+                --gp-output
 
         #elif $action_type.action_type_selector == 'predict':
             python '$__tool_directory__/graphprot_predict_wrapper.py'
@@ -35,6 +36,7 @@
                 --ap-extlr $action_type.prediction_options.ap_extlr
                 $action_type.prediction_options.conf_out
                 $action_type.prediction_options.ws_pred_out
+                --gp-output
         #end if
 
 
--- a/graphprot_train_wrapper.py	Thu Jan 28 15:06:14 2021 +0000
+++ b/graphprot_train_wrapper.py	Wed Jun 05 16:40:51 2024 +0000
@@ -7,7 +7,6 @@
 
 import gplib
 
-
 """
 
 TOOL DEPENDENCIES
@@ -62,7 +61,8 @@
 """
 
 
-###############################################################################
+#######################################################################
+
 
 def setup_argument_parser():
     """Setup argparse parser."""
@@ -84,89 +84,107 @@
 
     """
     # Define argument parser.
-    p = ap.ArgumentParser(add_help=False,
-                          prog="graphprot_train_wrapper.py",
-                          description=help_description,
-                          formatter_class=ap.MetavarTypeHelpFormatter)
+    p = ap.ArgumentParser(
+        add_help=False,
+        prog="graphprot_train_wrapper.py",
+        description=help_description,
+        formatter_class=ap.MetavarTypeHelpFormatter,
+    )
 
     # Argument groups.
     p_man = p.add_argument_group("REQUIRED ARGUMENTS")
     p_opt = p.add_argument_group("OPTIONAL ARGUMENTS")
 
     # Required arguments.
-    p_opt.add_argument("-h", "--help",
-                       action="help",
-                       help="Print help message")
-    p_man.add_argument("--pos",
-                       dest="in_pos_fa",
-                       type=str,
-                       required=True,
-                       help="Positive (= binding site) sequences .fa file "
-                            "for model training (option -fasta)")
-    p_man.add_argument("--neg",
-                       dest="in_neg_fa",
-                       type=str,
-                       required=True,
-                       help="Negative sequences .fa file for model "
-                            "training (option -negfasta)")
-    p_man.add_argument("--data-id",
-                       dest="data_id",
-                       type=str,
-                       required=True,
-                       help="Data ID (option -prefix)")
+    p_opt.add_argument("-h", "--help", action="help", help="Print help message")
+    p_man.add_argument(
+        "--pos",
+        dest="in_pos_fa",
+        type=str,
+        required=True,
+        help="Positive (= binding site) sequences .fa file "
+        "for model training (option -fasta)",
+    )
+    p_man.add_argument(
+        "--neg",
+        dest="in_neg_fa",
+        type=str,
+        required=True,
+        help="Negative sequences .fa file for model " "training (option -negfasta)",
+    )
+    p_man.add_argument(
+        "--data-id",
+        dest="data_id",
+        type=str,
+        required=True,
+        help="Data ID (option -prefix)",
+    )
     # Additional arguments.
-    p_opt.add_argument("--opt-set-size",
-                       dest="opt_set_size",
-                       type=int,
-                       default=500,
-                       help="Hyperparameter optimization set size (taken "
-                            "away from both --pos and --neg) (default: 500)")
-    p_opt.add_argument("--opt-pos",
-                       dest="opt_pos_fa",
-                       type=str,
-                       help="Positive (= binding site) sequences .fa file "
-                            "for hyperparameter optimization (default: take "
-                            "--opt-set-size from --pos)")
-    p_opt.add_argument("--opt-neg",
-                       dest="opt_neg_fa",
-                       type=str,
-                       help="Negative sequences .fa file for hyperparameter "
-                            "optimization (default: take --opt-set-size "
-                            "from --neg)")
-    p_opt.add_argument("--min-train",
-                       dest="min_train",
-                       type=int,
-                       default=500,
-                       help="Minimum amount of training sites demanded "
-                            "(default: 500)")
-    p_opt.add_argument("--disable-cv",
-                       dest="disable_cv",
-                       default=False,
-                       action="store_true",
-                       help="Disable cross validation step (default: false)")
-    p_opt.add_argument("--disable-motifs",
-                       dest="disable_motifs",
-                       default=False,
-                       action="store_true",
-                       help="Disable motif generation step (default: false)")
-    p_opt.add_argument("--gp-output",
-                       dest="gp_output",
-                       default=False,
-                       action="store_true",
-                       help="Print output produced by GraphProt "
-                            "(default: false)")
-    p_opt.add_argument("--str-model",
-                       dest="train_str_model",
-                       default=False,
-                       action="store_true",
-                       help="Train a structure model (default: train "
-                            "a sequence model)")
+    p_opt.add_argument(
+        "--opt-set-size",
+        dest="opt_set_size",
+        type=int,
+        default=500,
+        help="Hyperparameter optimization set size (taken "
+        "away from both --pos and --neg) (default: 500)",
+    )
+    p_opt.add_argument(
+        "--opt-pos",
+        dest="opt_pos_fa",
+        type=str,
+        help="Positive (= binding site) sequences .fa file "
+        "for hyperparameter optimization (default: take "
+        "--opt-set-size from --pos)",
+    )
+    p_opt.add_argument(
+        "--opt-neg",
+        dest="opt_neg_fa",
+        type=str,
+        help="Negative sequences .fa file for hyperparameter "
+        "optimization (default: take --opt-set-size "
+        "from --neg)",
+    )
+    p_opt.add_argument(
+        "--min-train",
+        dest="min_train",
+        type=int,
+        default=500,
+        help="Minimum amount of training sites demanded " "(default: 500)",
+    )
+    p_opt.add_argument(
+        "--disable-cv",
+        dest="disable_cv",
+        default=False,
+        action="store_true",
+        help="Disable cross validation step (default: false)",
+    )
+    p_opt.add_argument(
+        "--disable-motifs",
+        dest="disable_motifs",
+        default=False,
+        action="store_true",
+        help="Disable motif generation step (default: false)",
+    )
+    p_opt.add_argument(
+        "--gp-output",
+        dest="gp_output",
+        default=False,
+        action="store_true",
+        help="Print output produced by GraphProt " "(default: false)",
+    )
+    p_opt.add_argument(
+        "--str-model",
+        dest="train_str_model",
+        default=False,
+        action="store_true",
+        help="Train a structure model (default: train " "a sequence model)",
+    )
     return p
 
 
-###############################################################################
+#######################################################################
 
-if __name__ == '__main__':
+if __name__ == "__main__":
 
     # Setup argparse.
     parser = setup_argument_parser()
@@ -182,17 +200,17 @@
     # Check tool availability.
     assert gplib.is_tool("GraphProt.pl"), "GraphProt.pl not in PATH"
     # Check file inputs.
-    assert os.path.exists(args.in_pos_fa), \
-        "positives .fa file \"%s\" not found" % (args.in_pos_fa)
-    assert os.path.exists(args.in_neg_fa), \
-        "negatives .fa file \"%s\" not found" % (args.in_neg_fa)
+    assert os.path.exists(args.in_pos_fa), 'positives .fa file "%s" not found' % (
+        args.in_pos_fa
+    )
+    assert os.path.exists(args.in_neg_fa), 'negatives .fa file "%s" not found' % (
+        args.in_neg_fa
+    )
     # Count .fa entries.
     c_pos_fa = gplib.count_fasta_headers(args.in_pos_fa)
     c_neg_fa = gplib.count_fasta_headers(args.in_neg_fa)
-    assert c_pos_fa, "positives .fa file \"%s\" no headers found" % \
-        (args.in_pos_fa)
-    assert c_neg_fa, "negatives .fa file \"%s\" no headers found" % \
-        (args.in_neg_fa)
+    assert c_pos_fa, 'positives .fa file "%s" no headers found' % (args.in_pos_fa)
+    assert c_neg_fa, 'negatives .fa file "%s" no headers found' % (args.in_neg_fa)
     print("# positive .fa sequences:   %i" % (c_pos_fa))
     print("# negative .fa sequences:   %i" % (c_neg_fa))
     # Check additional files.
@@ -201,13 +219,15 @@
     if args.opt_neg_fa:
         assert args.opt_pos_fa, "--opt-neg but no --opt-pos given"
     # Check for lowercase only sequences, which cause GP to crash.
-    error_mess = "input sequences encountered containing "\
-        "only lowercase characters or lowercase characters in between "\
-        "uppercase characters. Please provide either all uppercase "\
-        "sequences or sequences containing uppercase regions surrounded "\
-        "by lowercase context regions for structure calculation (see "\
-        "viewpoint concept in original GraphProt publication "\
+    error_mess = (
+        "input sequences encountered containing "
+        "only lowercase characters or lowercase characters in between "
+        "uppercase characters. Please provide either all uppercase "
+        "sequences or sequences containing uppercase regions surrounded "
+        "by lowercase context regions for structure calculation (see "
+        "viewpoint concept in original GraphProt publication "
         "for more details)"
+    )
     seqs_dic = gplib.read_fasta_into_dic(args.in_pos_fa)
     bad_ids = gplib.check_seqs_dic_format(seqs_dic)
     assert not bad_ids, "%s" % (error_mess)
@@ -227,57 +247,73 @@
     if args.opt_pos_fa and args.opt_neg_fa:
         c_parop_pos_fa = gplib.count_fasta_headers(args.opt_pos_fa)
         c_parop_neg_fa = gplib.count_fasta_headers(args.opt_neg_fa)
-        assert c_parop_pos_fa, "--opt-pos .fa file \"%s\" no headers found" \
-            % (args.opt_pos_fa)
-        assert c_parop_neg_fa, "--opt-neg .fa file \"%s\" no headers found" \
-            % (args.opt_neg_fa)
+        assert c_parop_pos_fa, '--opt-pos .fa file "%s" no headers found' % (
+            args.opt_pos_fa
+        )
+        assert c_parop_neg_fa, '--opt-neg .fa file "%s" no headers found' % (
+            args.opt_neg_fa
+        )
         # Less than 500 for training?? You gotta be kidding.
-        assert c_pos_fa >= args.min_train, \
-            "--pos for training < %i, please provide more (try at least "\
+        assert c_pos_fa >= args.min_train, (
+            "--pos for training < %i, please provide more (try at least "
             "> 1000, the more the better)" % (args.min_train)
-        assert c_neg_fa >= args.min_train, \
-            "--neg for training < %i, please provide more (try at least "\
+        )
+        assert c_neg_fa >= args.min_train, (
+            "--neg for training < %i, please provide more (try at least "
             "> 1000, the more the better)" % (args.min_train)
+        )
         # Looking closer at ratios.
         pos_neg_ratio = c_parop_pos_fa / c_parop_neg_fa
         if pos_neg_ratio < 0.8 or pos_neg_ratio > 1.25:
-            assert 0, "ratio of --opt-pos to --opt-neg < 0.8 or > 1.25 "\
-                "(ratio = %f). Try to keep ratio closer to 1 or better use "\
-                "identical numbers (keep in mind that performance measures "\
-                "such as accuracy or AUROC are not suitable for imbalanced "\
+            assert 0, (
+                "ratio of --opt-pos to --opt-neg < 0.8 or > 1.25 "
+                "(ratio = %f). Try to keep ratio closer to 1 or better use "
+                "identical numbers (keep in mind that performance measures "
+                "such as accuracy or AUROC are not suitable for imbalanced "
                 " datasets!)" % (pos_neg_ratio)
+            )
     else:
         # Define some minimum amount of training sites for the sake of sanity.
         c_pos_train = c_pos_fa - args.opt_set_size
         c_neg_train = c_neg_fa - args.opt_set_size
         # Start complaining.
-        assert c_pos_fa >= args.opt_set_size, \
-            "# positives < --opt-set-size (%i < %i)" \
-            % (c_pos_fa, args.opt_set_size)
-        assert c_neg_fa >= args.opt_set_size, \
-            "# negatives < --opt-set-size (%i < %i)" \
-            % (c_neg_fa, args.opt_set_size)
-        assert c_pos_train >= args.opt_set_size, \
-            "# positives remaining for training < --opt-set-size "\
-            "(%i < %i)" % (c_pos_train, args.opt_set_size)
-        assert c_neg_train >= args.opt_set_size, "# negatives remaining "\
-            "for training < --opt-set-size (%i < %i)" \
-            % (c_neg_train, args.opt_set_size)
+        assert (
+            c_pos_fa >= args.opt_set_size
+        ), "# positives < --opt-set-size (%i < %i)" % (c_pos_fa, args.opt_set_size)
+        assert (
+            c_neg_fa >= args.opt_set_size
+        ), "# negatives < --opt-set-size (%i < %i)" % (c_neg_fa, args.opt_set_size)
+        assert (
+            c_pos_train >= args.opt_set_size
+        ), "# positives remaining for training < --opt-set-size " "(%i < %i)" % (
+            c_pos_train,
+            args.opt_set_size,
+        )
+        assert (
+            c_neg_train >= args.opt_set_size
+        ), "# negatives remaining " "for training < --opt-set-size (%i < %i)" % (
+            c_neg_train,
+            args.opt_set_size,
+        )
         # Less than 500?? You gotta be kidding.
-        assert c_pos_train >= args.min_train, \
-            "# positives remaining for training < %i, please provide more "\
+        assert c_pos_train >= args.min_train, (
+            "# positives remaining for training < %i, please provide more "
             " (try at least > 1000, the more the better)" % (args.min_train)
-        assert c_neg_train >= args.min_train, \
-            "# negatives remaining for training < %i, please provide more "\
+        )
+        assert c_neg_train >= args.min_train, (
+            "# negatives remaining for training < %i, please provide more "
             "(try at least > 1000, the more the better)" % (args.min_train)
+        )
         # Looking closer at ratios.
         pos_neg_ratio = c_pos_train / c_neg_train
         if pos_neg_ratio < 0.8 or pos_neg_ratio > 1.25:
-            assert 0, "ratio of --pos to --neg < 0.8 or > 1.25 "\
-                "(ratio = %f). Try to keep ratio closer to 1 or better use "\
-                "identical numbers (keep in mind that performance measures "\
-                "such as accuracy or AUROC are not suitable for imbalanced "\
+            assert 0, (
+                "ratio of --pos to --neg < 0.8 or > 1.25 "
+                "(ratio = %f). Try to keep ratio closer to 1 or better use "
+                "identical numbers (keep in mind that performance measures "
+                "such as accuracy or AUROC are not suitable for imbalanced "
                 "datasets!)" % (pos_neg_ratio)
+            )
 
     """
     Generate parop + train .fa output files for hyperparameter
@@ -299,28 +335,37 @@
         gplib.make_file_copy(args.in_neg_fa, neg_train_fa)
     else:
         # Generate parop + train .fa files from input .fa files.
-        gplib.split_fasta_into_test_train_files(args.in_pos_fa, pos_parop_fa,
-                                                pos_train_fa,
-                                                test_size=args.opt_set_size)
-        gplib.split_fasta_into_test_train_files(args.in_neg_fa, neg_parop_fa,
-                                                neg_train_fa,
-                                                test_size=args.opt_set_size)
+        gplib.split_fasta_into_test_train_files(
+            args.in_pos_fa, pos_parop_fa, pos_train_fa, test_size=args.opt_set_size
+        )
+        gplib.split_fasta_into_test_train_files(
+            args.in_neg_fa, neg_parop_fa, neg_train_fa, test_size=args.opt_set_size
+        )
 
     """
     Do the hyperparameter optimization.
 
     """
     print("Starting hyperparameter optimization (-action ls) ... ")
-    check_cmd = "GraphProt.pl -action ls -prefix " + args.data_id + \
-        " -fasta " + pos_parop_fa + " -negfasta " + neg_parop_fa
+    check_cmd = (
+        "GraphProt.pl -action ls -prefix "
+        + args.data_id
+        + " -fasta "
+        + pos_parop_fa
+        + " -negfasta "
+        + neg_parop_fa
+    )
     # If sequence model should be trained (default).
     if not args.train_str_model:
         check_cmd += " -onlyseq"
     print(check_cmd)
     output = subprocess.getoutput(check_cmd)
     params_file = args.data_id + ".params"
-    assert os.path.exists(params_file), "Hyperparameter optimization output "\
-        " .params file \"%s\" not found" % (params_file)
+    assert os.path.exists(
+        params_file
+    ), "Hyperparameter optimization output " ' .params file "%s" not found' % (
+        params_file
+    )
     # Add model type to params file.
     if args.train_str_model:
         gplib.echo_add_to_file("model_type: structure", params_file)
@@ -334,19 +379,27 @@
 
     """
     print("Starting model training (-action train) ... ")
-    check_cmd = "GraphProt.pl -action train -prefix " + args.data_id \
-        + " -fasta " + pos_train_fa + " -negfasta " + neg_train_fa \
-        + " " + param_string
+    check_cmd = (
+        "GraphProt.pl -action train -prefix "
+        + args.data_id
+        + " -fasta "
+        + pos_train_fa
+        + " -negfasta "
+        + neg_train_fa
+        + " "
+        + param_string
+    )
     print(check_cmd)
     output = subprocess.getoutput(check_cmd)
-    assert output, \
-        "The following call of GraphProt.pl produced no output:\n%s" \
-        % (check_cmd)
+    assert output, "The following call of GraphProt.pl produced no output:\n%s" % (
+        check_cmd
+    )
     if args.gp_output:
         print(output)
     model_file = args.data_id + ".model"
-    assert os.path.exists(model_file), \
-        "Training output .model file \"%s\" not found" % (model_file)
+    assert os.path.exists(model_file), 'Training output .model file "%s" not found' % (
+        model_file
+    )
 
     """
     Do the 10-fold cross validation.
@@ -354,19 +407,29 @@
     """
     if not args.disable_cv:
         print("Starting 10-fold cross validation (-action cv) ... ")
-        check_cmd = "GraphProt.pl -action cv -prefix " + args.data_id \
-            + " -fasta " + pos_train_fa + " -negfasta " + neg_train_fa \
-            + " " + param_string + " -model " + model_file
+        check_cmd = (
+            "GraphProt.pl -action cv -prefix "
+            + args.data_id
+            + " -fasta "
+            + pos_train_fa
+            + " -negfasta "
+            + neg_train_fa
+            + " "
+            + param_string
+            + " -model "
+            + model_file
+        )
         print(check_cmd)
         output = subprocess.getoutput(check_cmd)
-        assert output, \
-            "The following call of GraphProt.pl produced no output:\n%s" \
-            % (check_cmd)
+        assert output, "The following call of GraphProt.pl produced no output:\n%s" % (
+            check_cmd
+        )
         if args.gp_output:
             print(output)
         cv_results_file = args.data_id + ".cv_results"
-        assert os.path.exists(cv_results_file), \
-            "CV output .cv_results file \"%s\" not found" % (cv_results_file)
+        assert os.path.exists(
+            cv_results_file
+        ), 'CV output .cv_results file "%s" not found' % (cv_results_file)
 
     """
     Do the motif generation.
@@ -374,75 +437,108 @@
     """
     if not args.disable_motifs:
         print("Starting motif generation (-action motif) ... ")
-        check_cmd = "GraphProt.pl -action motif -prefix " + args.data_id \
-            + " -fasta " + pos_train_fa + " -negfasta " + neg_train_fa \
-            + " " + param_string + " -model " + model_file
+        check_cmd = (
+            "GraphProt.pl -action motif -prefix "
+            + args.data_id
+            + " -fasta "
+            + pos_train_fa
+            + " -negfasta "
+            + neg_train_fa
+            + " "
+            + param_string
+            + " -model "
+            + model_file
+        )
         print(check_cmd)
         output = subprocess.getoutput(check_cmd)
-        assert output, \
-            "The following call of GraphProt.pl produced no output:\n%s" \
-            % (check_cmd)
+        assert output, "The following call of GraphProt.pl produced no output:\n%s" % (
+            check_cmd
+        )
         if args.gp_output:
             print(output)
         seq_motif_file = args.data_id + ".sequence_motif"
         seq_motif_png_file = args.data_id + ".sequence_motif.png"
-        assert os.path.exists(seq_motif_file), \
-            "Motif output .sequence_motif file \"%s\" not found" \
-            % (seq_motif_file)
-        assert os.path.exists(seq_motif_png_file), \
-            "Motif output .sequence_motif.png file \"%s\" not found" \
-            % (seq_motif_png_file)
+        assert os.path.exists(
+            seq_motif_file
+        ), 'Motif output .sequence_motif file "%s" not found' % (seq_motif_file)
+        assert os.path.exists(
+            seq_motif_png_file
+        ), 'Motif output .sequence_motif.png file "%s" not found' % (seq_motif_png_file)
         if args.train_str_model:
             str_motif_file = args.data_id + ".structure_motif"
             str_motif_png_file = args.data_id + ".structure_motif.png"
-            assert os.path.exists(str_motif_file), \
-                "Motif output .structure_motif file \"%s\" not found" \
-                % (str_motif_file)
-            assert os.path.exists(str_motif_png_file), \
-                "Motif output .structure_motif.png file \"%s\" not found" \
-                % (str_motif_png_file)
+            assert os.path.exists(
+                str_motif_file
+            ), 'Motif output .structure_motif file "%s" not found' % (str_motif_file)
+            assert os.path.exists(
+                str_motif_png_file
+            ), 'Motif output .structure_motif.png file "%s" not found' % (
+                str_motif_png_file
+            )
 
     """
     Do whole site predictions on positive training set.
 
     """
-    print("Starting whole site predictions on positive training set "
-          " (-action predict) ... ")
-    check_cmd = "GraphProt.pl -action predict -prefix " + args.data_id \
-        + " -fasta " + pos_train_fa + " " + param_string \
-        + " -model " + model_file
+    print(
+        "Starting whole site predictions on positive training set "
+        " (-action predict) ... "
+    )
+    check_cmd = (
+        "GraphProt.pl -action predict -prefix "
+        + args.data_id
+        + " -fasta "
+        + pos_train_fa
+        + " "
+        + param_string
+        + " -model "
+        + model_file
+    )
     print(check_cmd)
     output = subprocess.getoutput(check_cmd)
-    assert output, \
-        "The following call of GraphProt.pl produced no output:\n%s" \
-        % (check_cmd)
+    assert output, "The following call of GraphProt.pl produced no output:\n%s" % (
+        check_cmd
+    )
     if args.gp_output:
         print(output)
     ws_predictions_file = args.data_id + ".predictions"
-    assert os.path.exists(ws_predictions_file), \
-        "Whole site prediction output .predictions file \"%s\" not found" \
-        % (ws_predictions_file)
+    assert os.path.exists(
+        ws_predictions_file
+    ), 'Whole site prediction output .predictions file "%s" not found' % (
+        ws_predictions_file
+    )
 
     """
     Do profile predictions on positive training set.
 
     """
-    print("Starting profile predictions on positive training set "
-          "-action predict_profile) ... ")
-    check_cmd = "GraphProt.pl -action predict_profile -prefix " \
-        + args.data_id + " -fasta " + pos_train_fa + " " \
-        + param_string + " -model " + model_file
+    print(
+        "Starting profile predictions on positive training set "
+        "-action predict_profile) ... "
+    )
+    check_cmd = (
+        "GraphProt.pl -action predict_profile -prefix "
+        + args.data_id
+        + " -fasta "
+        + pos_train_fa
+        + " "
+        + param_string
+        + " -model "
+        + model_file
+    )
     print(check_cmd)
     output = subprocess.getoutput(check_cmd)
-    assert output, \
-        "The following call of GraphProt.pl produced no output:\n%s" \
-        % (check_cmd)
+    assert output, "The following call of GraphProt.pl produced no output:\n%s" % (
+        check_cmd
+    )
     if args.gp_output:
         print(output)
     profile_predictions_file = args.data_id + ".profile"
-    assert os.path.exists(profile_predictions_file), \
-        "Profile prediction output .profile file \"%s\" not found" \
-        % (profile_predictions_file)
+    assert os.path.exists(
+        profile_predictions_file
+    ), 'Profile prediction output .profile file "%s" not found' % (
+        profile_predictions_file
+    )
 
     """
     Get 50 % score (median) for .predictions and .profile file.
@@ -454,12 +550,11 @@
     print("Getting .profile and .predictions median scores ... ")
 
     # Whole site scores median.
-    ws_pred_median = \
-        gplib.graphprot_predictions_get_median(ws_predictions_file)
+    ws_pred_median = gplib.graphprot_predictions_get_median(ws_predictions_file)
     # Profile top site scores median.
-    profile_median = \
-        gplib.graphprot_profile_get_tsm(profile_predictions_file,
-                                        profile_type="profile")
+    profile_median = gplib.graphprot_profile_get_tsm(
+        profile_predictions_file, profile_type="profile"
+    )
     ws_pred_string = "pos_train_ws_pred_median: %f" % (ws_pred_median)
     profile_string = "pos_train_profile_median: %f" % (profile_median)
     gplib.echo_add_to_file(ws_pred_string, params_file)
@@ -467,13 +562,14 @@
     # Average profile top site scores median for extlr 1 to 10.
     for i in range(10):
         i += 1
-        avg_profile_median = \
-            gplib.graphprot_profile_get_tsm(profile_predictions_file,
-                                            profile_type="avg_profile",
-                                            avg_profile_extlr=i)
+        avg_profile_median = gplib.graphprot_profile_get_tsm(
+            profile_predictions_file, profile_type="avg_profile", avg_profile_extlr=i
+        )
 
-        avg_profile_string = "pos_train_avg_profile_median_%i: %f" \
-            % (i, avg_profile_median)
+        avg_profile_string = "pos_train_avg_profile_median_%i: %f" % (
+            i,
+            avg_profile_median,
+        )
         gplib.echo_add_to_file(avg_profile_string, params_file)
 
     print("Script: I'm done.")