changeset 3:ace92c9a4653 draft

"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/rna_tools/graphprot commit efcac98677c3ea9039c1c61eaa9e58f78287ccb3"
author bgruening
date Wed, 27 Jan 2021 19:27:47 +0000
parents 7bbb7bf6304f
children 4ad83aed5c3c
files gplib.py graphprot_predict_wrapper.py graphprot_train_wrapper.py test-data/test4.fa
diffstat 4 files changed, 707 insertions(+), 491 deletions(-) [+]
line wrap: on
line diff
--- a/gplib.py	Mon Jan 27 18:37:05 2020 -0500
+++ b/gplib.py	Wed Jan 27 19:27:47 2021 +0000
@@ -1,13 +1,11 @@
 
-from distutils.spawn import find_executable
-import subprocess
-import statistics
+import gzip
 import random
-import gzip
-import uuid
-import sys
 import re
-import os
+import statistics
+import subprocess
+from distutils.spawn import find_executable
+
 
 """
 
@@ -19,11 +17,11 @@
 """
 
 
-################################################################################
+###############################################################################
 
 def graphprot_predictions_get_median(predictions_file):
     """
-    Given a GraphProt .predictions file, read in site scores and return 
+    Given a GraphProt .predictions file, read in site scores and return
     the median value.
 
     >>> test_file = "test-data/test.predictions"
@@ -43,29 +41,29 @@
     return statistics.median(sc_list)
 
 
-################################################################################
+###############################################################################
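For reference, the median computed above can be sketched on toy rows (the
three-column site/class/score layout is an assumption, borrowed from the
score filter further below):

    import statistics

    # Hypothetical .predictions rows: "site_id<TAB>class<TAB>score".
    rows = ["s1\t1\t0.5", "s2\t1\t1.5", "s3\t1\t2.5"]
    print(statistics.median(float(r.split("\t")[2]) for r in rows))  # 1.5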
 
-def graphprot_profile_get_top_scores_median(profile_file,
-                                            profile_type="profile",
-                                            avg_profile_extlr=5):
+def graphprot_profile_get_tsm(profile_file,
+                              profile_type="profile",
+                              avg_profile_extlr=5):
 
     """
-    Given a GraphProt .profile file, extract for each site (identified by 
-    column 1 ID) the top (= highest) score. Then return the median of these 
+    Given a GraphProt .profile file, extract for each site (identified by
+    column 1 ID) the top (= highest) score. Then return the median of these
     top scores.
-    
+
     profile_type can be either "profile" or "avg_profile".
-    "avg_profile means that the position-wise scores will first get smoothed 
-    out by calculating for each position a new score through taking a 
-    sequence window -avg_profile_extlr to +avg_profile_extlr of the position 
-    and calculate the mean score over this window and assign it to the position.
-    After that, the maximum score of each site is chosen, and the median over 
-    all maximum scores is returned.
-    "profile" leaves the position-wise scores as they are, directly extracting 
+    "avg_profile means that the position-wise scores will first get smoothed
+    out by calculating for each position a new score through taking a
+    sequence window -avg_profile_extlr to +avg_profile_extlr of the position
+    and calculate the mean score over this window and assign it to the
+    position. After that, the maximum score of each site is chosen, and the
+    median over all maximum scores is returned.
+    "profile" leaves the position-wise scores as they are, directly extracting
     the maximum for each site and then reporting the median.
-    
+
     >>> test_file = "test-data/test.profile"
-    >>> graphprot_profile_get_top_scores_median(test_file)
+    >>> graphprot_profile_get_tsm(test_file)
     3.2
 
     """
@@ -90,25 +88,27 @@
             max_list.append(max_sc)
         elif profile_type == "avg_profile":
             # Convert profile score list to average profile scores list.
-            aps_list = list_moving_window_average_values(lists_dic[seq_id],
-                                                         win_extlr=avg_profile_extlr)
+            aps_list = \
+                list_moving_window_average_values(lists_dic[seq_id],
+                                                  win_extlr=avg_profile_extlr)
             max_sc = max(aps_list)
             max_list.append(max_sc)
         else:
-            assert 0, "invalid profile_type argument given: \"%s\"" %(profile_type)
+            assert 0, "invalid profile_type argument given: \"%s\"" \
+                % (profile_type)
     # Return the median.
     return statistics.median(max_list)
 
 
-################################################################################
+###############################################################################
 
-def list_moving_window_average_values(in_list, 
+def list_moving_window_average_values(in_list,
                                       win_extlr=5,
                                       method=1):
     """
-    Take a list of numeric values, and calculate for each position a new value, 
-    by taking the mean value of the window of positions -win_extlr and 
-    +win_extlr. If full extension is not possible (at list ends), it just 
+    Take a list of numeric values, and calculate for each position a new value,
+    by taking the mean value of the window of positions -win_extlr and
+    +win_extlr. If full extension is not possible (at list ends), it just
     takes what it gets.
     Two implementations of the task are given, chosen by method=1 or method=2.
 
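The windowed mean is easy to restate in isolation; a sketch of the
method=1 logic under the end-clipping behavior described above:

    def window_mean(in_list, win_extlr=5):
        # Average over [i - win_extlr, i + win_extlr], clipped to the ends.
        n = len(in_list)
        out = []
        for i in range(n):
            s = max(i - win_extlr, 0)
            e = min(i + win_extlr + 1, n)
            out.append(sum(in_list[s:e]) / (e - s))
        return out

    # window_mean([1, 2, 4, 2, 1], win_extlr=1)
    # -> [1.5, 2.333..., 2.666..., 2.333..., 1.5]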
@@ -142,17 +142,17 @@
                 s = 0
             if e > l_list:
                 e = l_list
-            l = e-s
+            ln = e - s
             sc_sum = 0
-            for j in range(l):
-                sc_sum += in_list[s+j]
-            new_list[i] = sc_sum / l
+            for j in range(ln):
+                sc_sum += in_list[s + j]
+            new_list[i] = sc_sum / ln
     else:
-        assert 0, "invalid method ID given (%i)" %(method)
+        assert 0, "invalid method ID given (%i)" % (method)
     return new_list
 
 
-################################################################################
+###############################################################################
 
 def echo_add_to_file(echo_string, out_file):
     """
@@ -164,17 +164,17 @@
     error = False
     if output:
         error = True
-    assert error == False, "echo is complaining:\n%s\n%s" %(check_cmd, output)
+    assert not error, "echo is complaining:\n%s\n%s" % (check_cmd, output)
 
 
-################################################################################
+###############################################################################
 
 def is_tool(name):
     """Check whether tool "name" is in PATH."""
     return find_executable(name) is not None
 
 
-################################################################################
+###############################################################################
 
 def count_fasta_headers(fasta_file):
     """
@@ -194,7 +194,7 @@
     return row_count
 
 
-################################################################################
+###############################################################################
 
 def make_file_copy(in_file, out_file):
     """
@@ -202,17 +202,20 @@
 
     """
     check_cmd = "cat " + in_file + " > " + out_file
-    assert in_file != out_file, "cat does not like to cat file into same file (%s)" %(check_cmd)
+    assert in_file != out_file, \
+        "cat does not like to cat file into same file (%s)" % (check_cmd)
     output = subprocess.getoutput(check_cmd)
     error = False
     if output:
         error = True
-    assert error == False, "cat did not like your input (in_file: %s, out_file: %s):\n%s" %(in_file, out_file, output)
+    assert not error, \
+        "cat did not like your input (in_file: %s, out_file: %s):\n%s" \
+        % (in_file, out_file, output)
 
 
-################################################################################
+###############################################################################
 
-def split_fasta_into_test_train_files(in_fasta, test_out_fa, train_out_fa, 
+def split_fasta_into_test_train_files(in_fasta, test_out_fa, train_out_fa,
                                       test_size=500):
     """
     Split in_fasta .fa file into two files (e.g. test, train).
@@ -236,19 +239,48 @@
     TRAINOUT.close()
 
 
-################################################################################
+###############################################################################
+
+def check_seqs_dic_format(seqs_dic):
+    """
+    Check sequence dictionary for lowercase-only sequences or sequences
+    which have lowercase nts in between uppercase nts.
+    Return the suspicious IDs as a list, or an empty list if there are
+    no hits.
+
+    >>> seqs_dic = {"id1" : "acguACGU", "id2" : "acgua", "id3" : "acgUUaUcc"}
+    >>> check_seqs_dic_format(seqs_dic)
+    ['id2', 'id3']
+    >>> seqs_dic = {"id1" : "acgAUaa", "id2" : "ACGUACUA"}
+    >>> check_seqs_dic_format(seqs_dic)
+    []
+
+    """
+    assert seqs_dic, "given seqs_dic empty"
+    bad_seq_ids = []
+    for seq_id in seqs_dic:
+        seq = seqs_dic[seq_id]
+        if re.search("^[acgtun]+$", seq):
+            bad_seq_ids.append(seq_id)
+        if re.search("[ACGTUN][acgtun]+[ACGTUN]", seq):
+            bad_seq_ids.append(seq_id)
+    return bad_seq_ids
+
+
+###############################################################################
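The two patterns above read as "all-lowercase" and "lowercase run framed
by uppercase"; a standalone restatement for a single sequence:

    import re

    def has_bad_case(seq):
        # Mirrors the two re.search() tests in check_seqs_dic_format().
        return bool(re.search("^[acgtun]+$", seq)
                    or re.search("[ACGTUN][acgtun]+[ACGTUN]", seq))

    # has_bad_case("acgua")     -> True   (lowercase-only)
    # has_bad_case("acgUUaUcc") -> True   (lowercase framed by uppercase)
    # has_bad_case("acgAUaa")   -> False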
 
 def read_fasta_into_dic(fasta_file,
                         seqs_dic=False,
                         ids_dic=False,
                         read_dna=False,
+                        short_ensembl=False,
                         reject_lc=False,
                         convert_to_uc=False,
                         skip_n_seqs=True):
     """
-    Read in FASTA sequences, convert to RNA, store in dictionary 
+    Read in FASTA sequences, convert to RNA, store in dictionary
     and return dictionary.
-    
+
     >>> test_fasta = "test-data/test.fa"
     >>> read_fasta_into_dic(test_fasta)
     {'seq1': 'acguACGUacgu', 'seq2': 'ugcaUGCAugcaACGUacgu'}
@@ -256,8 +288,11 @@
     >>> read_fasta_into_dic(test_fasta)
     {}
     >>> test_fasta = "test-data/test.ensembl.fa"
-    >>> read_fasta_into_dic(test_fasta, read_dna=True)
+    >>> read_fasta_into_dic(test_fasta, read_dna=True, short_ensembl=True)
     {'ENST00000415118': 'GAAATAGT', 'ENST00000448914': 'ACTGGGGGATACGAAAA'}
+    >>> test_fasta = "test-data/test4.fa"
+    >>> read_fasta_into_dic(test_fasta)
+    {'1': 'gccuAUGUuuua', '2': 'cugaAACUaugu'}
 
     """
     if not seqs_dic:
@@ -266,9 +301,9 @@
     seq = ""
 
     # Go through FASTA file, extract sequences.
-    if re.search(".+\.gz$", fasta_file):
+    if re.search(r".+\.gz$", fasta_file):
         f = gzip.open(fasta_file, 'rt')
-    else: 
+    else:
         f = open(fasta_file, "r")
     for line in f:
         if re.search(">.+", line):
@@ -276,10 +311,13 @@
             seq_id = m.group(1)
             # If there is a ".", take only first part of header.
             # This assumes ENSEMBL header format ">ENST00000631435.1 cdna ..."
-            if re.search(".+\..+", seq_id):
-                m = re.search("(.+?)\..+", seq_id)
-                seq_id = m.group(1)
-            assert seq_id not in seqs_dic, "non-unique FASTA header \"%s\" in \"%s\"" % (seq_id, fasta_file)
+            if short_ensembl:
+                if re.search(r".+\..+", seq_id):
+                    m = re.search(r"(.+?)\..+", seq_id)
+                    seq_id = m.group(1)
+            assert seq_id not in seqs_dic, \
+                "non-unique FASTA header \"%s\" in \"%s\"" \
+                % (seq_id, fasta_file)
             if ids_dic:
                 if seq_id in ids_dic:
                     seqs_dic[seq_id] = ""
@@ -290,25 +328,31 @@
                 m = re.search("([ACGTUN]+)", line, re.I)
                 seq = m.group(1)
                 if reject_lc:
-                    assert not re.search("[a-z]", seq), "lowercase characters detected in sequence \"%i\" (reject_lc=True)" %(seq_id)
+                    assert \
+                        not re.search("[a-z]", seq), \
+                        "lc char detected in seq \"%i\" (reject_lc=True)" \
+                        % (seq_id)
                 if convert_to_uc:
                     seq = seq.upper()
                 # If sequences with N nucleotides should be skipped.
                 if skip_n_seqs:
                     if "n" in m.group(1) or "N" in m.group(1):
-                        print ("WARNING: \"%s\" contains N nucleotides. Discarding sequence ... " % (seq_id))
+                        print("WARNING: \"%s\" contains N. Discarding "
+                              "sequence ... " % (seq_id))
                         del seqs_dic[seq_id]
                         continue
                 # Convert to RNA, concatenate sequence.
                 if read_dna:
-                    seqs_dic[seq_id] += m.group(1).replace("U","T").replace("u","t")
+                    seqs_dic[seq_id] += \
+                        m.group(1).replace("U", "T").replace("u", "t")
                 else:
-                    seqs_dic[seq_id] += m.group(1).replace("T","U").replace("t","u")
+                    seqs_dic[seq_id] += \
+                        m.group(1).replace("T", "U").replace("t", "u")
     f.close()
     return seqs_dic
 
 
-################################################################################
+###############################################################################
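The read_dna/RNA branches above amount to a case-preserving alphabet swap:

    seq = "ACGTacgu"
    as_rna = seq.replace("T", "U").replace("t", "u")  # 'ACGUacgu'
    as_dna = seq.replace("U", "T").replace("u", "t")  # 'ACGTacgt'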
 
 def random_order_dic_keys_into_list(in_dic):
     """
@@ -322,7 +366,7 @@
     return id_list
 
 
-################################################################################
+###############################################################################
 
 def graphprot_get_param_string(params_file):
     """
@@ -348,19 +392,19 @@
                     if setting == "sequence":
                         param_string += "-onlyseq "
                 else:
-                    param_string += "-%s %s " %(par, setting)
+                    param_string += "-%s %s " % (par, setting)
             else:
-                assert 0, "pattern matching failed for string \"%s\"" %(param)
+                assert 0, "pattern matching failed for string \"%s\"" % (param)
     return param_string
 
 
-################################################################################
+###############################################################################
 
 def seqs_dic_count_uc_nts(seqs_dic):
     """
-    Count number of uppercase nucleotides in sequences stored in sequence 
+    Count number of uppercase nucleotides in sequences stored in sequence
     dictionary.
-    
+
     >>> seqs_dic = {'seq1': "acgtACGTacgt", 'seq2': 'acgtACacgt'}
     >>> seqs_dic_count_uc_nts(seqs_dic)
     6
@@ -376,13 +420,13 @@
     return c_uc
 
 
-################################################################################
+###############################################################################
 
 def seqs_dic_count_lc_nts(seqs_dic):
     """
-    Count number of lowercase nucleotides in sequences stored in sequence 
+    Count number of lowercase nucleotides in sequences stored in sequence
     dictionary.
-    
+
     >>> seqs_dic = {'seq1': "gtACGTac", 'seq2': 'cgtACacg'}
     >>> seqs_dic_count_lc_nts(seqs_dic)
     10
@@ -398,12 +442,12 @@
     return c_uc
 
 
-################################################################################
+###############################################################################
 
 def count_file_rows(in_file):
     """
     Count number of file rows for given input file.
-    
+
     >>> test_file = "test-data/test1.bed"
     >>> count_file_rows(test_file)
     7
@@ -418,7 +462,7 @@
     return row_count
 
 
-################################################################################
+###############################################################################
 
 def bed_check_six_col_format(bed_file):
     """
@@ -444,13 +488,13 @@
     return six_col_format
 
 
-################################################################################
+###############################################################################
 
 def bed_check_unique_ids(bed_file):
     """
-    Check whether .bed file (6 column format with IDs in column 4) 
+    Check whether .bed file (6 column format with IDs in column 4)
     has unique column 4 IDs.
-    
+
     >>> test_bed = "test-data/test1.bed"
     >>> bed_check_unique_ids(test_bed)
     True
@@ -468,7 +512,7 @@
         return True
 
 
-################################################################################
+###############################################################################
 
 def get_seq_lengths_from_seqs_dic(seqs_dic):
     """
@@ -483,7 +527,7 @@
     return seq_len_dic
 
 
-################################################################################
+###############################################################################
 
 def bed_get_region_lengths(bed_file):
     """
@@ -499,20 +543,24 @@
     id2len_dic = {}
     with open(bed_file) as f:
         for line in f:
-            row = line.strip()
             cols = line.strip().split("\t")
             site_s = int(cols[1])
             site_e = int(cols[2])
             site_id = cols[3]
             site_l = site_e - site_s
-            assert site_id not in id2len_dic, "column 4 IDs not unique in given .bed file \"%s\"" %(bed_file)
+            assert site_id \
+                not in id2len_dic, \
+                "column 4 IDs not unique in given .bed file \"%s\"" \
+                % (bed_file)
             id2len_dic[site_id] = site_l
     f.closed
-    assert id2len_dic, "No IDs read into dictionary (input file \"%s\" empty or malformatted?)" % (in_bed)
+    assert id2len_dic, \
+        "No IDs read into dic (input file \"%s\" empty or malformatted?)" \
+        % (bed_file)
     return id2len_dic
 
 
-################################################################################
+###############################################################################
 
 def graphprot_get_param_dic(params_file):
     """
@@ -522,7 +570,19 @@
 
     >>> params_file = "test-data/test.params"
     >>> graphprot_get_param_dic(params_file)
-    {'epochs': '20', 'lambda': '0.01', 'R': '1', 'D': '3', 'bitsize': '14', 'model_type': 'sequence', 'pos_train_ws_pred_median': '0.760321', 'pos_train_profile_median': '5.039610', 'pos_train_avg_profile_median_1': '4.236340', 'pos_train_avg_profile_median_2': '3.868431', 'pos_train_avg_profile_median_3': '3.331277', 'pos_train_avg_profile_median_4': '2.998667', 'pos_train_avg_profile_median_5': '2.829782', 'pos_train_avg_profile_median_6': '2.626623', 'pos_train_avg_profile_median_7': '2.447083', 'pos_train_avg_profile_median_8': '2.349919', 'pos_train_avg_profile_median_9': '2.239829', 'pos_train_avg_profile_median_10': '2.161676'}
+    {'epochs': '20', 'lambda': '0.01', 'R': '1', 'D': '3', 'bitsize': '14', \
+'model_type': 'sequence', 'pos_train_ws_pred_median': '0.760321', \
+'pos_train_profile_median': '5.039610', \
+'pos_train_avg_profile_median_1': '4.236340', \
+'pos_train_avg_profile_median_2': '3.868431', \
+'pos_train_avg_profile_median_3': '3.331277', \
+'pos_train_avg_profile_median_4': '2.998667', \
+'pos_train_avg_profile_median_5': '2.829782', \
+'pos_train_avg_profile_median_6': '2.626623', \
+'pos_train_avg_profile_median_7': '2.447083', \
+'pos_train_avg_profile_median_8': '2.349919', \
+'pos_train_avg_profile_median_9': '2.239829', \
+'pos_train_avg_profile_median_10': '2.161676'}
 
     """
     param_dic = {}
@@ -539,7 +599,7 @@
     return param_dic
 
 
-################################################################################
+###############################################################################
 
 def graphprot_filter_predictions_file(in_file, out_file,
                                       sc_thr=0):
@@ -554,16 +614,16 @@
             score = float(cols[2])
             if score < sc_thr:
                 continue
-            OUTPRED.write("%s\n" %(row))
+            OUTPRED.write("%s\n" % (row))
     f.close()
     OUTPRED.close()
 
 
-################################################################################
+###############################################################################
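What the filter above keeps, sketched on hypothetical rows (threshold
borrowed from the pos_train_ws_pred_median value in the .params doctest
above):

    rows = ["s1\t1\t0.8", "s2\t-1\t-0.3", "s3\t1\t1.2"]
    kept = [r for r in rows if float(r.split("\t")[2]) >= 0.760321]
    # kept -> ['s1\t1\t0.8', 's3\t1\t1.2']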
 
 def fasta_read_in_ids(fasta_file):
     """
-    Given a .fa file, read in header IDs in order appearing in file, 
+    Given a .fa file, read in header IDs in their order of appearance,
     and store in list.
 
     >>> test_file = "test-data/test3.fa"
@@ -582,19 +642,19 @@
     return ids_list
 
 
-################################################################################
+###############################################################################
 
-def graphprot_profile_calculate_avg_profile(in_file, out_file,
-                                            ap_extlr=5,
-                                            seq_ids_list=False,
-                                            method=1):
+def graphprot_profile_calc_avg_profile(in_file, out_file,
+                                       ap_extlr=5,
+                                       seq_ids_list=False,
+                                       method=1):
     """
-    Given a GraphProt .profile file, calculate average profiles and output 
+    Given a GraphProt .profile file, calculate average profiles and output
     average profile file.
-    Average profile means that the position-wise scores will get smoothed 
-    out by calculating for each position a new score, taking a sequence 
-    window -ap_extlr to +ap_extlr relative to the position 
-    and calculate the mean score over this window. The mean score then 
+    Average profile means that the position-wise scores will get smoothed
+    out by calculating for each position a new score, taking a sequence
+    window -ap_extlr to +ap_extlr relative to the position
+    and calculating the mean score over this window. The mean score then
     becomes the new average profile score at this position.
     Two different implementations of the task are given:
     method=1 (new python implementation, slower + more memory but easy to read)
@@ -604,14 +664,17 @@
     >>> out_file1 = "test-data/test2_1.avg_profile"
     >>> out_file2 = "test-data/test2_2.avg_profile"
     >>> out_file4 = "test-data/test2_3.avg_profile"
-    >>> graphprot_profile_calculate_avg_profile(in_file, out_file1, ap_extlr=2, method=1)
-    >>> graphprot_profile_calculate_avg_profile(in_file, out_file2, ap_extlr=2, method=2)
+    >>> graphprot_profile_calc_avg_profile(in_file, \
+    out_file1, ap_extlr=2, method=1)
+    >>> graphprot_profile_calc_avg_profile(in_file, \
+    out_file2, ap_extlr=2, method=2)
     >>> diff_two_files_identical(out_file1, out_file2)
     True
     >>> test_list = ["s1", "s2", "s3", "s4"]
     >>> out_file3_exp = "test-data/test3_added_ids_exp.avg_profile"
     >>> out_file3 = "test-data/test3_added_ids_out.avg_profile"
-    >>> graphprot_profile_calculate_avg_profile(in_file, out_file3, ap_extlr=2, method=1, seq_ids_list=test_list)
+    >>> graphprot_profile_calc_avg_profile(in_file, out_file3, \
+    ap_extlr=2, method=1, seq_ids_list=test_list)
     >>> diff_two_files_identical(out_file3_exp, out_file3)
     True
 
@@ -624,7 +687,7 @@
             for line in f:
                 cols = line.strip().split("\t")
                 site_id = int(cols[0])
-                pos = int(cols[1]) # 0-based.
+                pos = int(cols[1])  # 0-based.
                 score = float(cols[2])
                 # Store first position of site.
                 if site_id not in site_starts_dic:
@@ -635,14 +698,15 @@
                     lists_dic[site_id] = []
                     lists_dic[site_id].append(score)
         f.close()
-        # Check number of IDs (# FASTA sequence IDs has to be same as # site IDs).
+        # Check number of IDs (# FASTA IDs has to be same as # site IDs).
         if seq_ids_list:
             c_seq_ids = len(seq_ids_list)
             c_site_ids = len(site_starts_dic)
-            assert c_seq_ids == c_site_ids, "# sequence IDs != # site IDs (%i != %i)" %(c_seq_ids, c_site_ids)
+            assert c_seq_ids == c_site_ids, \
+                "# sequence IDs != # site IDs (%i != %i)" \
+                % (c_seq_ids, c_site_ids)
         OUTPROF = open(out_file, "w")
         # For each site, calculate average profile scores list.
-        max_list = []
         for site_id in lists_dic:
             # Convert profile score list to average profile scores list.
             aps_list = list_moving_window_average_values(lists_dic[site_id],
@@ -652,8 +716,8 @@
             if seq_ids_list:
                 site_id = seq_ids_list[site_id]
             for i, sc in enumerate(aps_list):
-                pos = i + start_pos + 1 # make 1-based.
-                OUTPROF.write("%s\t%i\t%f\n" %(site_id, pos, sc))
+                pos = i + start_pos + 1  # make 1-based.
+                OUTPROF.write("%s\t%i\t%f\n" % (site_id, pos, sc))
         OUTPROF.close()
     elif method == 2:
         OUTPROF = open(out_file, "w")
@@ -668,7 +732,7 @@
             for line in f:
                 cols = line.strip().split("\t")
                 cur_id = int(cols[0])
-                pos = int(cols[1]) # 0-based.
+                pos = int(cols[1])  # 0-based.
                 score = float(cols[2])
                 # Store first position of site.
                 if cur_id not in site_starts_dic:
@@ -677,16 +741,18 @@
                 if cur_id != old_id:
                     # Process old id scores.
                     if scores_list:
-                        aps_list = list_moving_window_average_values(scores_list,
-                                                                     win_extlr=ap_extlr)
+                        aps_list = \
+                            list_moving_window_average_values(
+                                scores_list,
+                                win_extlr=ap_extlr)
                         start_pos = site_starts_dic[old_id]
                         seq_id = old_id
                         # Get original FASTA sequence ID.
                         if seq_ids_list:
                             seq_id = seq_ids_list[old_id]
                         for i, sc in enumerate(aps_list):
-                            pos = i + start_pos + 1 # make 1-based.
-                            OUTPROF.write("%s\t%i\t%f\n" %(seq_id, pos, sc))
+                            pos = i + start_pos + 1  # make 1-based.
+                            OUTPROF.write("%s\t%i\t%f\n" % (seq_id, pos, sc))
                         # Reset list.
                         scores_list = []
                     old_id = cur_id
@@ -705,19 +771,19 @@
             if seq_ids_list:
                 seq_id = seq_ids_list[old_id]
             for i, sc in enumerate(aps_list):
-                pos = i + start_pos + 1 # make 1-based.
-                OUTPROF.write("%s\t%i\t%f\n" %(seq_id, pos, sc))
+                pos = i + start_pos + 1  # make 1-based.
+                OUTPROF.write("%s\t%i\t%f\n" % (seq_id, pos, sc))
         OUTPROF.close()
 
 
-################################################################################
+###############################################################################
 
 def graphprot_profile_extract_peak_regions(in_file, out_file,
                                            max_merge_dist=0,
                                            sc_thr=0):
     """
     Extract peak regions from GraphProt .profile file.
-    Store the peak regions (defined as regions with scores >= sc_thr) 
+    Store the peak regions (defined as regions with scores >= sc_thr)
     to out_file in 6-column .bed format.
 
     TODO:
@@ -735,7 +801,8 @@
     >>> graphprot_profile_extract_peak_regions(in_file, out_file, sc_thr=10)
     >>> diff_two_files_identical(out_file, empty_file)
     True
-    >>> graphprot_profile_extract_peak_regions(in_file, out_file, max_merge_dist=2)
+    >>> graphprot_profile_extract_peak_regions(in_file, out_file, \
+    max_merge_dist=2)
     >>> diff_two_files_identical(out_file, exp2_file)
     True
 
@@ -753,7 +820,7 @@
         for line in f:
             cols = line.strip().split("\t")
             cur_id = cols[0]
-            pos = int(cols[1]) # 0-based.
+            pos = int(cols[1])  # 0-based.
             score = float(cols[2])
             # Store first position of site.
             if cur_id not in site_starts_dic:
@@ -768,17 +835,21 @@
                 # Process old id scores.
                 if scores_list:
                     # Extract peaks from region.
-                    peak_list = list_extract_peaks(scores_list, 
-                                                   max_merge_dist=max_merge_dist,
-                                                   coords="bed",
-                                                   sc_thr=sc_thr)
+                    peak_list = \
+                        list_extract_peaks(scores_list,
+                                           max_merge_dist=max_merge_dist,
+                                           coords="bed",
+                                           sc_thr=sc_thr)
                     start_pos = site_starts_dic[old_id]
                     # Print out peaks in .bed format.
-                    for l in peak_list:
-                        peak_s = start_pos + l[0]
-                        peak_e = start_pos + l[1]
-                        site_id = "%s,%i" %(old_id, l[2])
-                        OUTPEAKS.write("%s\t%i\t%i\t%s\t%f\t+\n" %(old_id, peak_s, peak_e, site_id, l[3]))
+                    for ln in peak_list:
+                        peak_s = start_pos + ln[0]
+                        peak_e = start_pos + ln[1]
+                        site_id = "%s,%i" % (old_id, ln[2])
+                        OUTPEAKS.write("%s\t%i\t%i"
+                                       "\t%s\t%f\t+\n"
+                                       % (old_id, peak_s,
+                                          peak_e, site_id, ln[3]))
                     # Reset list.
                     scores_list = []
                 old_id = cur_id
@@ -790,33 +861,34 @@
     # Process last block.
     if scores_list:
         # Extract peaks from region.
-        peak_list = list_extract_peaks(scores_list, 
+        peak_list = list_extract_peaks(scores_list,
                                        max_merge_dist=max_merge_dist,
                                        coords="bed",
                                        sc_thr=sc_thr)
         start_pos = site_starts_dic[old_id]
         # Print out peaks in .bed format.
-        for l in peak_list:
-            peak_s = start_pos + l[0]
-            peak_e = start_pos + l[1]
-            site_id = "%s,%i" %(old_id, l[2]) # best score also 1-based.
-            OUTPEAKS.write("%s\t%i\t%i\t%s\t%f\t+\n" %(old_id, peak_s, peak_e, site_id, l[3]))
+        for ln in peak_list:
+            peak_s = start_pos + ln[0]
+            peak_e = start_pos + ln[1]
+            site_id = "%s,%i" % (old_id, ln[2])  # best score also 1-based.
+            OUTPEAKS.write("%s\t%i\t%i\t%s\t%f\t+\n"
+                           % (old_id, peak_s, peak_e, site_id, ln[3]))
     OUTPEAKS.close()
 
 
-################################################################################
+###############################################################################
 
 def list_extract_peaks(in_list,
                        max_merge_dist=0,
                        coords="list",
                        sc_thr=0):
     """
-    Extract peak regions from list. 
+    Extract peak regions from list.
     Peak region is defined as region >= score threshold.
-    
+
     coords=bed  :  peak start 0-based, peak end 1-based.
     coords=list :  peak start 0-based, peak end 0-based.
-    
+
     >>> test_list = [-1, 0, 2, 4.5, 1, -1, 5, 6.5]
     >>> list_extract_peaks(test_list)
     [[1, 4, 3, 4.5], [6, 7, 7, 6.5]]
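Each reported peak is [start, end, top_position, top_score]. A hand-worked
check of the doctest above (the runs of scores >= 0 precomputed for
brevity):

    test_list = [-1, 0, 2, 4.5, 1, -1, 5, 6.5]
    runs = [[1, 4], [6, 7]]  # positions with score >= 0
    peaks = []
    for s, e in runs:
        top = max(range(s, e + 1), key=lambda i: test_list[i])
        peaks.append([s, e, top, test_list[top]])
    print(peaks)  # [[1, 4, 3, 4.5], [6, 7, 7, 6.5]]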
@@ -862,7 +934,6 @@
             # Before was peak region?
             if inside:
                 # Store peak region.
-                #peak_infos = "%i,%i,%i,%f" %(pr_s, pr_e, pr_top_pos, pr_top_sc)
                 peak_infos = [pr_s, pr_e, pr_top_pos, pr_top_sc]
                 peak_list.append(peak_infos)
                 inside = False
@@ -898,7 +969,8 @@
                     if peak_list[i][3] < peak_list[j][3]:
                         new_top_pos = peak_list[j][2]
                         new_top_sc = peak_list[j][3]
-                    new_peak = [peak_list[i][0], peak_list[j][1], new_top_pos, new_top_sc]
+                    new_peak = [peak_list[i][0], peak_list[j][1],
+                                new_top_pos, new_top_sc]
                 # If two peaks were merged.
                 if new_peak:
                     merged_peak_list.append(new_peak)
@@ -915,15 +987,16 @@
     if coords == "bed":
         for i in range(len(peak_list)):
             peak_list[i][1] += 1
-            peak_list[i][2] += 1 # 1-base best score position too.
+            peak_list[i][2] += 1  # 1-base best score position too.
     return peak_list
 
 
-################################################################################
+###############################################################################
 
-def bed_peaks_to_genomic_peaks(peak_file, genomic_peak_file, genomic_sites_bed, print_rows=False):
+def bed_peaks_to_genomic_peaks(peak_file, genomic_peak_file, genomic_sites_bed,
+                               print_rows=False):
     """
-    Given a .bed file of sequence peak regions (possible coordinates from 
+    Given a .bed file of sequence peak regions (possible coordinates from
     0 to the sequence length), convert peak coordinates to genomic coordinates.
     Do this by taking genomic regions of sequences as input.
 
@@ -944,7 +1017,10 @@
             row = line.strip()
             cols = line.strip().split("\t")
             site_id = cols[3]
-            assert site_id not in id2row_dic, "column 4 IDs not unique in given .bed file \"%s\"" %(args.genomic_sites_bed)
+            assert site_id \
+                not in id2row_dic, \
+                "column 4 IDs not unique in given .bed file \"%s\"" \
+                % (genomic_sites_bed)
             id2row_dic[site_id] = row
     f.close()
 
@@ -958,10 +1034,13 @@
             site_e = int(cols[2])
             site_id2 = cols[3]
             site_sc = float(cols[4])
-            assert re.search(".+,.+", site_id2), "regular expression failed for ID \"%s\"" %(site_id2)
-            m = re.search(".+,(\d+)", site_id2)
-            sc_pos = int(m.group(1)) # 1-based.
-            assert site_id in id2row_dic, "site ID \"%s\" not found in genomic sites dictionary" %(site_id)
+            assert re.search(".+,.+", site_id2), \
+                "regular expression failed for ID \"%s\"" % (site_id2)
+            m = re.search(r".+,(\d+)", site_id2)
+            sc_pos = int(m.group(1))  # 1-based.
+            assert site_id in id2row_dic, \
+                "site ID \"%s\" not found in genomic sites dictionary" \
+                % (site_id)
             row = id2row_dic[site_id]
             rowl = row.split("\t")
             gen_chr = rowl[0]
@@ -974,21 +1053,23 @@
             if gen_pol == "-":
                 new_s = gen_e - site_e
                 new_e = gen_e - site_s
-                new_sc_pos = gen_e - sc_pos + 1 # keep 1-based.
-            new_row = "%s\t%i\t%i\t%s,%i\t%f\t%s" %(gen_chr, new_s, new_e, site_id, new_sc_pos, site_sc, gen_pol)
-            OUTPEAKS.write("%s\n" %(new_row))
+                new_sc_pos = gen_e - sc_pos + 1  # keep 1-based.
+            new_row = "%s\t%i\t%i\t%s,%i\t%f\t%s" \
+                      % (gen_chr, new_s, new_e,
+                         site_id, new_sc_pos, site_sc, gen_pol)
+            OUTPEAKS.write("%s\n" % (new_row))
             if print_rows:
                 print(new_row)
     OUTPEAKS.close()
 
 
-################################################################################
+###############################################################################
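The minus-strand arithmetic above mirrors the peak inside its genomic
window; with toy numbers (the plus-strand shift is assumed as the obvious
counterpart, it is not shown in this hunk):

    gen_s, gen_e = 1000, 1100   # genomic region (BED: 0-based s, 1-based e)
    site_s, site_e = 10, 20     # peak in sequence-local coordinates
    plus = (gen_s + site_s, gen_s + site_e)   # (1010, 1020), assumed
    minus = (gen_e - site_e, gen_e - site_s)  # (1080, 1090), as in the code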
 
 def diff_two_files_identical(file1, file2):
     """
-    Check whether two files are identical. Return true if diff reports no 
+    Check whether two files are identical. Return true if diff reports no
     differences.
-    
+
     >>> file1 = "test-data/file1"
     >>> file2 = "test-data/file2"
     >>> diff_two_files_identical(file1, file2)
@@ -1006,6 +1087,4 @@
     return same
 
 
-################################################################################
-
-
+###############################################################################
--- a/graphprot_predict_wrapper.py	Mon Jan 27 18:37:05 2020 -0500
+++ b/graphprot_predict_wrapper.py	Wed Jan 27 19:27:47 2021 +0000
@@ -1,12 +1,11 @@
 #!/usr/bin/env python3
 
+import argparse as ap
+import os
 import subprocess
-import argparse
-import shutil
+import sys
+
 import gplib
-import gzip
-import sys
-import os
 
 
 """
@@ -48,46 +47,63 @@
 EXAMPLE CALLS
 =============
 
-python graphprot_predict_wrapper.py --model test2.model --params test2.params --fasta gp_data/test10_predict.fa --data-id test2pred --gp-output
-python graphprot_predict_wrapper.py --model test2.model --params test2.params --fasta gp_data/test10_predict.fa --data-id test2pred --gen-site-bed gp_data/test10_predict.bed
-python graphprot_predict_wrapper.py --model test2.model --params test2.params --fasta gp_data/test10_predict.fa --data-id test2pred --gen-site-bed gp_data/test10_predict.bed --conf-out
-python graphprot_predict_wrapper.py --model test2.model --params test2.params --fasta gp_data/test10_predict.fa --data-id test2pred --conf-out --ws-pred
-
-python graphprot_predict_wrapper.py --model test-data/test.model --params test-data/test.params --fasta test-data/test_predict.fa --data-id predtest
-
-python graphprot_predict_wrapper.py --model test-data/test.model --params test-data/test.params --fasta test-data/test_predict.fa --data-id predtest --gen-site-bed test-data/test_predict.bed --sc-thr 0.0 --max-merge-dist 0 --conf-out  --ap-extlr 5
-
-python graphprot_predict_wrapper.py --data-id GraphProt --fasta test-data/test_predict.fa --model test-data/test.model --params test-data/test.params --gen-site-bed test-data/test_predict.bed --sc-thr 0.0 --max-merge-dist 0 --conf-out  --ap-extlr 5
 
 
-pwd && python '/home/uhlm/Dokumente/Projekte/GraphProt_galaxy_new/galaxytools/tools/rna_tools/graphprot/graphprot_predict_wrapper.py' --data-id GraphProt --fasta /tmp/tmpmuslpc1h/files/0/8/c/dataset_08c48d88-e3b5-423b-acf6-bf89b8c60660.dat --model /tmp/tmpmuslpc1h/files/e/6/4/dataset_e6471bb4-e74c-4372-bc49-656f900e7191.dat --params /tmp/tmpmuslpc1h/files/b/6/5/dataset_b65e8cf4-d3e6-429e-8d57-1d401adf4b3c.dat --gen-site-bed /tmp/tmpmuslpc1h/files/5/1/a/dataset_51a38b65-5943-472d-853e-5d845fa8ac3e.dat --sc-thr 0.0 --max-merge-dist 0 --conf-out  --ap-extlr 5
+python graphprot_predict_wrapper.py --model test2.model --params test2.params
+--fasta gp_data/test10_predict.fa --data-id test2pred --gp-output
+python graphprot_predict_wrapper.py --model test2.model --params test2.params
+--fasta gp_data/test10_predict.fa --data-id test2pred
+--gen-site-bed gp_data/test10_predict.bed
+
+python graphprot_predict_wrapper.py --model test2.model --params test2.params
+--fasta gp_data/test10_predict.fa --data-id test2pred
+--gen-site-bed gp_data/test10_predict.bed --conf-out
+
+python graphprot_predict_wrapper.py --model test2.model --params test2.params
+--fasta gp_data/test10_predict.fa --data-id test2pred --conf-out --ws-pred
 
+python graphprot_predict_wrapper.py --model test-data/test.model
+--params test-data/test.params --fasta test-data/test_predict.fa
+--data-id predtest
+
+python graphprot_predict_wrapper.py --model test-data/test.model
+--params test-data/test.params --fasta test-data/test_predict.fa
+--data-id predtest --gen-site-bed test-data/test_predict.bed
+--sc-thr 0.0 --max-merge-dist 0 --conf-out  --ap-extlr 5
+
+python graphprot_predict_wrapper.py --data-id GraphProt
+--fasta test-data/test_predict.fa --model test-data/test.model
+--params test-data/test.params --gen-site-bed test-data/test_predict.bed
+--sc-thr 0.0 --max-merge-dist 0 --conf-out  --ap-extlr 5
 
 """
 
-################################################################################
+
+###############################################################################
 
 def setup_argument_parser():
     """Setup argparse parser."""
     help_description = """
-    Galaxy wrapper script for GraphProt (-action predict and -action 
-    predict_profile) to compute whole site or position-wise scores for input 
+    Galaxy wrapper script for GraphProt (-action predict and -action
+    predict_profile) to compute whole site or position-wise scores for input
     FASTA sequences.
-    By default, profile predictions are calculated, followed by average 
+    By default, profile predictions are calculated, followed by average
     profile computations and peak region extraction from average profiles.
     If --ws-pred is set, whole site score predictions on input sequences
     will be run instead.
-    If --conf-out is set, sites or peak regions with a score >= the median 
+    If --conf-out is set, sites or peak regions with a score >= the median
     score of positive training sites will be output.
-    If --gen-site-bed .bed file is provided, peak regions will be output 
+    If --gen-site-bed .bed file is provided, peak regions will be output
     with genomic coordinates too.
 
     """
     # Define argument parser.
-    p = argparse.ArgumentParser(add_help=False,
-                                prog="graphprot_predict_wrapper.py",
-                                description=help_description,
-                                formatter_class=argparse.MetavarTypeHelpFormatter)
+    p = ap.ArgumentParser(add_help=False,
+                          prog="graphprot_predict_wrapper.py",
+                          description=help_description,
+                          formatter_class=ap.MetavarTypeHelpFormatter)
 
     # Argument groups.
     p_man = p.add_argument_group("REQUIRED ARGUMENTS")
@@ -95,70 +111,87 @@
 
     # Required arguments.
     p_opt.add_argument("-h", "--help",
-           action="help",
-           help="Print help message")
+                       action="help",
+                       help="Print help message")
     p_man.add_argument("--fasta",
-           dest="in_fa",
-           type=str,
-           required = True,
-           help = "Sequences .fa file to predict on (option -fasta)")
+                       dest="in_fa",
+                       type=str,
+                       required=True,
+                       help="Sequences .fa file to predict"
+                            " on (option -fasta)")
     p_man.add_argument("--model",
-           dest="in_model",
-           type=str,
-           required = True,
-           help = "GraphProt model file to use for predictions (option -model)")
+                       dest="in_model",
+                       type=str,
+                       required=True,
+                       help="GraphProt model file to use for predictions"
+                            " (option -model)")
     p_man.add_argument("--params",
-           dest="in_params",
-           type=str,
-           required = True,
-           help = "Parameter file for given model")
+                       dest="in_params",
+                       type=str,
+                       required=True,
+                       help="Parameter file for given model")
     p_man.add_argument("--data-id",
-           dest="data_id",
-           type=str,
-           required = True,
-           help = "Data ID (option -prefix)")
+                       dest="data_id",
+                       type=str,
+                       required=True,
+                       help="Data ID (option -prefix)")
     # ---> I'm a conditional argument <---
     p_opt.add_argument("--ws-pred",
-           dest = "ws_pred",
-           default = False,
-           action = "store_true",
-           help = "Run a whole site prediction instead of calculating profiles (default: false)")
+                       dest="ws_pred",
+                       default=False,
+                       action="store_true",
+                       help="Run a whole site prediction instead "
+                            "of calculating profiles (default: false)")
     # Additional arguments.
     p_opt.add_argument("--sc-thr",
-           dest="score_thr",
-           type = float,
-           default = 0,
-           help = "Score threshold for extracting average profile peak regions (default: 0)")
+                       dest="score_thr",
+                       type=float,
+                       default=0,
+                       help="Score threshold for extracting "
+                            "average profile peak regions (default: 0)")
     p_opt.add_argument("--max-merge-dist",
-           dest="max_merge_dist",
-           type = int,
-           default = 0,
-           choices = [0,1,2,3,4,5,6,7,8,9,10],
-           help = "Maximum merge distance for nearby peak regions (default: report all non-overlapping regions)")
+                       dest="max_merge_dist",
+                       type=int,
+                       default=0,
+                       choices=[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
+                       help="Maximum merge distance for nearby peak regions"
+                            " (default: report all non-overlapping regions)")
     p_opt.add_argument("--gen-site-bed",
-           dest="genomic_sites_bed",
-           type=str,
-           help = ".bed file specifying the genomic regions of the input .fa sequences. Corrupt .bed information will be punished (default: false)")
+                       dest="genomic_sites_bed",
+                       type=str,
+                       help=".bed file specifying the genomic regions of "
+                            "the input .fa sequences. Corrupt .bed "
+                            "information will be punished (default: false)")
     p_opt.add_argument("--conf-out",
-           dest="conf_out",
-           default = False,
-           action = "store_true",
-           help = "Output filtered peak regions BED file or predictions file (if --ws-pred) using the median positive training site score for filtering (default: false)")
+                       dest="conf_out",
+                       default=False,
+                       action="store_true",
+                       help="Output filtered peak regions BED file or "
+                            "predictions file (if --ws-pred) using the median "
+                            "positive training site score for filtering "
+                            "(default: false)")
     p_opt.add_argument("--gp-output",
-           dest = "gp_output",
-           default = False,
-           action = "store_true",
-           help = "Print output produced by GraphProt (default: false)")
+                       dest="gp_output",
+                       default=False,
+                       action="store_true",
+                       help="Print output produced by GraphProt "
+                            "(default: false)")
     p_opt.add_argument("--ap-extlr",
-           dest="ap_extlr",
-           type = int,
-           default = 5,
-           choices = [0,1,2,3,4,5,6,7,8,9,10],
-           help = "Define average profile up- and downstream extension to produce the average profile. The mean over small sequence windows (window length = --ap-extlr*2 + 1) is used to get position scores, thus the average profile is more smooth than the initial profile output by GraphProt (default: 5)")
+                       dest="ap_extlr",
+                       type=int,
+                       default=5,
+                       choices=[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
+                       help="Define average profile up- and downstream "
+                            "extension to produce the average profile. "
+                            "The mean over small sequence windows "
+                            "(window length = --ap-extlr*2 + 1) is used to "
+                            "get position scores, thus the average profile "
+                            "is more smooth than the initial profile output "
+                            "by GraphProt (default: 5)")
     return p
 
 
-################################################################################
+###############################################################################
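A small usage sketch for the parser above (file names hypothetical):

    p = setup_argument_parser()
    args = p.parse_args(["--fasta", "in.fa", "--model", "m.model",
                         "--params", "m.params", "--data-id", "run1"])
    # Defaults: args.ap_extlr -> 5, args.score_thr -> 0, args.ws_pred -> False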
 
 if __name__ == '__main__':
 
@@ -166,49 +199,68 @@
     parser = setup_argument_parser()
     # Read in command line arguments.
     args = parser.parse_args()
-    
+
     """
     Do all sorts of sanity checking.
-    
+
     """
     # Check for Linux.
     assert "linux" in sys.platform, "please use Linux"
     # Check tool availability.
     assert gplib.is_tool("GraphProt.pl"), "GraphProt.pl not in PATH"
     # Check file inputs.
-    assert os.path.exists(args.in_fa), "input .fa file \"%s\" not found" %(args.in_fa)
-    assert os.path.exists(args.in_model), "input .model file \"%s\" not found" %(args.in_model)
-    assert os.path.exists(args.in_params), "input .params file \"%s\" not found" %(args.in_params)
+    assert os.path.exists(args.in_fa), \
+        "input .fa file \"%s\" not found" % (args.in_fa)
+    assert os.path.exists(args.in_model), \
+        "input .model file \"%s\" not found" % (args.in_model)
+    assert os.path.exists(args.in_params), \
+        "input .params file \"%s\" not found" % (args.in_params)
     # Count .fa entries.
     c_in_fa = gplib.count_fasta_headers(args.in_fa)
-    assert c_in_fa, "input .fa file \"%s\" no headers found" %(args.in_fa)
-    print("# input .fa sequences:   %i" %(c_in_fa))
+    assert c_in_fa, "input .fa file \"%s\" no headers found" % (args.in_fa)
+    print("# input .fa sequences:   %i" % (c_in_fa))
     # Read in FASTA sequences to check for uppercase sequences.
     seqs_dic = gplib.read_fasta_into_dic(args.in_fa)
     c_uc_nt = gplib.seqs_dic_count_uc_nts(seqs_dic)
-    assert c_uc_nt, "no uppercase nucleotides in input .fa sequences. Please change sequences to uppercase (keep in mind GraphProt only scores uppercase regions (according to its viewpoint concept)"
+    assert c_uc_nt, "no uppercase nucleotides in input .fa sequences. "\
+                    "Please change sequences to uppercase (keep in mind "\
+                    "GraphProt only scores uppercase regions (according "\
+                    "to its viewpoint concept)"
     if not args.ws_pred:
         # Check for lowercase sequences.
         c_lc_nt = gplib.seqs_dic_count_lc_nts(seqs_dic)
-        assert not c_lc_nt, "lowercase nucleotides in input .fa not allowed in profile predictions, since GraphProt only scores uppercase regions (according to its viewpoint concept)"
+        assert not c_lc_nt, "lowercase nucleotides in input .fa not allowed"\
+            " in profile predictions, since GraphProt only scores "\
+            "uppercase regions (according to its viewpoint concept)"
     # Check .bed.
     if args.genomic_sites_bed:
         # An array of checks, marvelous.
-        assert os.path.exists(args.genomic_sites_bed), "genomic .bed file \"%s\" not found" %(args.genomic_sites_bed)
+        assert \
+            os.path.exists(args.genomic_sites_bed), \
+            "genomic .bed file \"%s\" not found" % (args.genomic_sites_bed)
         # Check .bed for content.
-        assert gplib.count_file_rows(args.genomic_sites_bed), "genomic .bed file \"%s\" is empty" %(args.genomic_sites_bed)
+        assert gplib.count_file_rows(args.genomic_sites_bed), \
+            "genomic .bed file \"%s\" is empty" % (args.genomic_sites_bed)
         # Check .bed for 6-column format.
-        assert gplib.bed_check_six_col_format(args.genomic_sites_bed), "genomic .bed file \"%s\" appears to not be in 6-column .bed format" %(args.genomic_sites_bed)
+        assert gplib.bed_check_six_col_format(args.genomic_sites_bed), \
+            "genomic .bed file \"%s\" appears to not "\
+            "be in 6-column .bed format" % (args.genomic_sites_bed)
         # Check for unique column 4 IDs.
-        assert gplib.bed_check_unique_ids(args.genomic_sites_bed), "genomic .bed file \"%s\" column 4 IDs not unique" %(args.genomic_sites_bed)
-        # Read in .bed regions, compare to FASTA sequences (compare IDs + lengths)
+        assert gplib.bed_check_unique_ids(args.genomic_sites_bed), \
+            "genomic .bed file \"%s\" column 4 IDs not unique" % \
+            (args.genomic_sites_bed)
+        # Read in .bed regions, compare to FASTA sequences.
         seq_len_dic = gplib.get_seq_lengths_from_seqs_dic(seqs_dic)
         reg_len_dic = gplib.bed_get_region_lengths(args.genomic_sites_bed)
         for seq_id in seq_len_dic:
             seq_l = seq_len_dic[seq_id]
-            assert seq_id in reg_len_dic, "sequence ID \"\" missing in input .bed \"\"" %(seq_id, args.genomic_sites_bed)
+            assert seq_id in reg_len_dic, \
+                "sequence ID \"%s\" missing in input .bed \"%s\"" % \
+                (seq_id, args.genomic_sites_bed)
             reg_l = reg_len_dic[seq_id]
-            assert seq_l == reg_l, "sequence length differs from .bed region length (%i != %i)" %(seq_l, reg_l)
+            assert seq_l == reg_l, \
+                "sequence length differs from .bed region length (%i != %i)" \
+                % (seq_l, reg_l)
     # Read in model parameters.
     param_dic = gplib.graphprot_get_param_dic(args.in_params)
     # Create GraphProt parameter string.
@@ -216,83 +268,114 @@
 
     """
     Run predictions.
-    
+
     """
     if args.ws_pred:
         # Do whole site prediction.
-        print("Starting whole site predictions on input .fa file (-action predict) ... ")
-        check_cmd = "GraphProt.pl -action predict -prefix " + args.data_id + " -fasta " + args.in_fa + " " + param_string + " -model " + args.in_model
+        print("Starting whole site predictions on input .fa file"
+              " (-action predict) ... ")
+        check_cmd = "GraphProt.pl -action predict -prefix " \
+            + args.data_id + " -fasta " + args.in_fa + " " \
+            + param_string + " -model " + args.in_model
         output = subprocess.getoutput(check_cmd)
-        assert output, "the following call of GraphProt.pl produced no output:\n%s" %(check_cmd)
+        assert output, "the following call of GraphProt.pl "\
+            "produced no output:\n%s" % (check_cmd)
         if args.gp_output:
             print(output)
         ws_predictions_file = args.data_id + ".predictions"
-        assert os.path.exists(ws_predictions_file), "Whole site prediction output .predictions file \"%s\" not found" %(ws_predictions_file)
+        assert os.path.exists(ws_predictions_file), \
+            "Whole site prediction output .predictions file \"%s\" not found" \
+            % (ws_predictions_file)
         if args.conf_out:
             # Filter by pos_train_ws_pred_median median.
-            assert "pos_train_ws_pred_median" in param_dic, "whole site top scores median information missing in .params file"
-            pos_train_ws_pred_median = float(param_dic["pos_train_ws_pred_median"])
+            assert "pos_train_ws_pred_median" in param_dic, \
+                "whole site top scores median information "\
+                "missing in .params file"
+            pos_tr_ws_pred_med = \
+                float(param_dic["pos_train_ws_pred_median"])
             # Filtered file.
             filt_ws_predictions_file = args.data_id + ".p50.predictions"
-            print("Extracting p50 sites from whole site predictions (score threshold = %f) ... " %(pos_train_ws_pred_median))
-            gplib.graphprot_filter_predictions_file(ws_predictions_file, filt_ws_predictions_file,
-                                                      sc_thr=pos_train_ws_pred_median)
+            print("Extracting p50 sites from whole site predictions"
+                  " (score threshold = %f) ... " % (pos_tr_ws_pred_med))
+            gplib.graphprot_filter_predictions_file(ws_predictions_file,
+                                                    filt_ws_predictions_file,
+                                                    sc_thr=pos_tr_ws_pred_med)
     else:
         # Do profile prediction.
-        print("Starting profile predictions on on input .fa file (-action predict_profile) ... ")
-        check_cmd = "GraphProt.pl -action predict_profile -prefix " + args.data_id + " -fasta " + args.in_fa + " " + param_string + " -model " + args.in_model
+        print("Starting profile predictions on on input .fa file "
+              "(-action predict_profile) ... ")
+        check_cmd = "GraphProt.pl -action predict_profile -prefix " \
+                    + args.data_id + " -fasta " + args.in_fa + " " + \
+                    param_string + " -model " + args.in_model
         output = subprocess.getoutput(check_cmd)
-        assert output, "the following call of GraphProt.pl produced no output:\n%s" %(check_cmd)
+        assert output, "the following call of GraphProt.pl produced "\
+                       "no output:\n%s" % (check_cmd)
         if args.gp_output:
             print(output)
         profile_predictions_file = args.data_id + ".profile"
-        assert os.path.exists(profile_predictions_file), "Profile prediction output .profile file \"%s\" not found" %(profile_predictions_file)
+        assert os.path.exists(profile_predictions_file), \
+            "Profile prediction output .profile file \"%s\" not found" % \
+            (profile_predictions_file)
 
         # Profile prediction output files.
         avg_prof_file = args.data_id + ".avg_profile"
         avg_prof_peaks_file = args.data_id + ".avg_profile.peaks.bed"
-        avg_prof_gen_peaks_file = args.data_id + ".avg_profile.genomic_peaks.bed"
-        avg_prof_peaks_p50_file = args.data_id + ".avg_profile.p50.peaks.bed"
-        avg_prof_gen_peaks_p50_file = args.data_id + ".avg_profile.p50.genomic_peaks.bed"
+        avg_prof_gen_peaks_file = args.data_id + \
+            ".avg_profile.genomic_peaks.bed"
+        avg_prof_peaks_p50_file = args.data_id + \
+            ".avg_profile.p50.peaks.bed"
+        avg_prof_gen_peaks_p50_file = args.data_id + \
+            ".avg_profile.p50.genomic_peaks.bed"
 
         # Get sequence IDs in order from input .fa file.
         seq_ids_list = gplib.fasta_read_in_ids(args.in_fa)
         # Calculate average profiles.
-        print("Getting average profile from profile (extlr for smoothing: %i) ... " %(args.ap_extlr))
-        gplib.graphprot_profile_calculate_avg_profile(profile_predictions_file,
-                                                      avg_prof_file,
-                                                      ap_extlr=args.ap_extlr,
-                                                      seq_ids_list=seq_ids_list,
-                                                      method=2)
+        print("Getting average profile from profile "
+              "(extlr for smoothing: %i) ... " % (args.ap_extlr))
+        gplib.graphprot_profile_calc_avg_profile(profile_predictions_file,
+                                                 avg_prof_file,
+                                                 ap_extlr=args.ap_extlr,
+                                                 seq_ids_list=seq_ids_list,
+                                                 method=2)
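The smoothing follows the --ap-extlr idea described further below: each position's score is replaced by the mean over a window extending ap_extlr positions up- and downstream. A minimal sketch of such a windowed mean, with windows truncated at the sequence ends (illustrative only, not gplib.graphprot_profile_calc_avg_profile):

    def avg_profile(scores, extlr=5):
        # Mean over positions i-extlr .. i+extlr, truncated at the ends.
        n = len(scores)
        avg = []
        for i in range(n):
            s = max(0, i - extlr)
            e = min(n, i + extlr + 1)
            avg.append(sum(scores[s:e]) / (e - s))
        return avg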
         # Extract peak regions on sequences with threshold score 0.
-        print("Extracting peak regions from average profile (score threshold = 0) ... ")
-        gplib.graphprot_profile_extract_peak_regions(avg_prof_file, avg_prof_peaks_file,
-                                               max_merge_dist=args.max_merge_dist,
-                                               sc_thr=args.score_thr)
+        print("Extracting peak regions from average profile "
+              "(score threshold = 0) ... ")
+        gplib.graphprot_profile_extract_peak_regions(
+            avg_prof_file, avg_prof_peaks_file,
+            max_merge_dist=args.max_merge_dist, sc_thr=args.score_thr)
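Peak extraction reports stretches of the average profile that pass the score threshold, merging neighbouring stretches that lie within max_merge_dist of each other. A rough sketch of that logic, assuming half-open coordinates and a strictly-greater comparison (both are assumptions; gplib may define them differently):

    def extract_peaks(scores, sc_thr=0.0, max_merge_dist=0):
        # Return [start, end) pairs of above-threshold stretches, merging
        # stretches separated by <= max_merge_dist positions.
        peaks = []
        start = None
        for i, sc in enumerate(scores + [sc_thr]):  # sentinel closes last peak
            if sc > sc_thr and start is None:
                start = i
            elif sc <= sc_thr and start is not None:
                if peaks and start - peaks[-1][1] <= max_merge_dist:
                    peaks[-1][1] = i
                else:
                    peaks.append([start, i])
                start = None
        return peaks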
         # Convert peaks to genomic coordinates.
         if args.genomic_sites_bed:
             print("Converting peak regions to genomic coordinates ... ")
-            gplib.bed_peaks_to_genomic_peaks(avg_prof_peaks_file, avg_prof_gen_peaks_file,
+            gen_sites_bed = args.genomic_sites_bed
+            gplib.bed_peaks_to_genomic_peaks(avg_prof_peaks_file,
+                                             avg_prof_gen_peaks_file,
                                              print_rows=False,
-                                             genomic_sites_bed=args.genomic_sites_bed)
-            # gplib.make_file_copy(avg_prof_gen_peaks_file, avg_prof_peaks_file)
+                                             genomic_sites_bed=gen_sites_bed)
         # Extract peak regions with threshold score p50.
         if args.conf_out:
-            sc_id = "pos_train_avg_profile_median_%i" %(args.ap_extlr)
-            # Filter by pos_train_ws_pred_median median.
-            assert sc_id in param_dic, "average profile extlr %i median information missing in .params file" %(args.ap_extlr)
+            sc_id = "pos_train_avg_profile_median_%i" % (args.ap_extlr)
+            # Filter by pos_train_avg_profile_median.
+            assert sc_id in param_dic, "average profile extlr %i median "\
+                "information missing in .params file" % (args.ap_extlr)
             p50_sc_thr = float(param_dic[sc_id])
-            print("Extracting p50 peak regions from average profile (score threshold = %f) ... " %(p50_sc_thr))
-            gplib.graphprot_profile_extract_peak_regions(avg_prof_file, avg_prof_peaks_p50_file,
-                                                         max_merge_dist=args.max_merge_dist,
+            print("Extracting p50 peak regions from average profile "
+                  "(score threshold = %f) ... " % (p50_sc_thr))
+            peaks_p50_file = avg_prof_peaks_p50_file
+            m_dist = args.max_merge_dist
+            gplib.graphprot_profile_extract_peak_regions(avg_prof_file,
+                                                         peaks_p50_file,
+                                                         max_merge_dist=m_dist,
                                                          sc_thr=p50_sc_thr)
             # Convert peaks to genomic coordinates.
             if args.genomic_sites_bed:
-                print("Converting p50 peak regions to genomic coordinates ... ")
-                gplib.bed_peaks_to_genomic_peaks(avg_prof_peaks_p50_file, avg_prof_gen_peaks_p50_file,
-                                                 genomic_sites_bed=args.genomic_sites_bed)
+                print("Converting p50 peak regions to "
+                      "genomic coordinates ... ")
+                gplib.bed_peaks_to_genomic_peaks(
+                    avg_prof_peaks_p50_file, avg_prof_gen_peaks_p50_file,
+                    genomic_sites_bed=args.genomic_sites_bed)
     # Done.
     print("Script: I'm done.")
     print("Author: ... ")
-
-
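For reference, the p50 filtering performed above keeps only whole-site predictions scoring at least the positive-training median stored in the .params file. A minimal sketch of such a filter, assuming a tab-separated .predictions layout with the site score in the last column (the column position is an assumption, not taken from gplib):

    def filter_predictions(in_file, out_file, sc_thr):
        # Copy only rows whose score column passes the threshold.
        with open(in_file) as fin, open(out_file, "w") as fout:
            for line in fin:
                if float(line.rstrip("\n").split("\t")[-1]) >= sc_thr:
                    fout.write(line)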
--- a/graphprot_train_wrapper.py	Mon Jan 27 18:37:05 2020 -0500
+++ b/graphprot_train_wrapper.py	Wed Jan 27 19:27:47 2021 +0000
@@ -1,12 +1,11 @@
 #!/usr/bin/env python3
 
+import argparse as ap
+import os
 import subprocess
-import argparse
-import shutil
+import sys
+
 import gplib
-import gzip
-import sys
-import os
 
 
 """
@@ -38,58 +37,57 @@
     data_id.profile
 
 
-  --opt-set-size int  Hyperparameter optimization set size (taken away from both --pos and --neg) (default: 500)
-  --opt-pos str       Positive (= binding site) sequences .fa file for hyperparameter optimization (default: take
-                      --opt-set-size from --pos)
-  --opt-neg str       Negative sequences .fa file for hyperparameter optimization (default: take --opt-set-size
-                      from --neg)
-  --min-train int     Minimum amount of training sites demanded (default: 500)
-  --disable-cv        Disable cross validation step (default: false)
-  --disable-motifs    Disable motif generation step (default: false)
-  --gp-output         Print output produced by GraphProt (default: false)
-  --str-model         Train a structure model (default: train a sequence model)
-
-
 EXAMPLE CALLS
 =============
 
-python graphprot_train_wrapper.py --pos gp_data/SERBP1_positives.train.fa --neg gp_data/SERBP1_negatives.train.fa --data-id test2 --disable-cv --gp-output --opt-set-size 200 --min-train 400
+python graphprot_train_wrapper.py --pos gp_data/SERBP1_positives.train.fa \
+    --neg gp_data/SERBP1_negatives.train.fa --data-id test2 --disable-cv \
+    --gp-output --opt-set-size 200 --min-train 400

-python graphprot_train_wrapper.py --pos gp_data/SERBP1_positives.train.fa --neg gp_data/SERBP1_negatives.train.fa --data-id test2 --disable-cv --opt-set-size 100 --min-train 200
+python graphprot_train_wrapper.py --pos gp_data/SERBP1_positives.train.fa \
+    --neg gp_data/SERBP1_negatives.train.fa --data-id test2 --disable-cv \
+    --opt-set-size 100 --min-train 200

-python graphprot_train_wrapper.py --pos test-data/test_positives.train.fa --neg test-data/test_negatives.train.fa --data-id gptest2 --disable-cv --opt-pos test-data/test_positives.parop.fa --opt-neg test-data/test_negatives.parop.fa
+python graphprot_train_wrapper.py --pos test-data/test_positives.train.fa \
+    --neg test-data/test_negatives.train.fa --data-id gptest2 --disable-cv \
+    --opt-pos test-data/test_positives.parop.fa \
+    --opt-neg test-data/test_negatives.parop.fa

-python graphprot_train_wrapper.py --pos test-data/test_positives.train.fa --neg test-data/test_negatives.train.fa --data-id gptest2 --disable-cv --disable-motifs --opt-pos test-data/test_positives.parop.fa --opt-neg test-data/test_negatives.parop.fa
+python graphprot_train_wrapper.py --pos test-data/test_positives.train.fa \
+    --neg test-data/test_negatives.train.fa --data-id gptest2 --disable-cv \
+    --disable-motifs --opt-pos test-data/test_positives.parop.fa \
+    --opt-neg test-data/test_negatives.parop.fa
 
 
 """
 
-################################################################################
+
+###############################################################################
 
 def setup_argument_parser():
     """Setup argparse parser."""
     help_description = """
-    Galaxy wrapper script for GraphProt to train a GraphProt model on 
-    a given set of input sequences (positives and negatives .fa). By 
-    default a sequence model is trained (due to structure models 
-    being much slower to train). Also by default take a portion of 
-    the input sequences for hyperparameter optimization (HPO) prior to 
-    model training, and run a 10-fold cross validation and motif 
-    generation after model training. Thus the following output 
-    files are produced: 
-    .model model file, .params model parameter file, .png motif files 
+    Galaxy wrapper script for GraphProt to train a GraphProt model on
+    a given set of input sequences (positives and negatives .fa). By
+    default a sequence model is trained (due to structure models
+    being much slower to train). Also by default take a portion of
+    the input sequences for hyperparameter optimization (HPO) prior to
+    model training, and run a 10-fold cross validation and motif
+    generation after model training. Thus the following output
+    files are produced:
+    .model model file, .params model parameter file, .png motif files
     (sequence, or sequence+structure), .cv_results CV results file.
-    After model training, predict on positives to get highest whole 
+    After model training, predict on positives to get highest whole
     site and profile scores found in binding sites. Take the median
     score out of these to store in .params file, using it later
     for outputting binding sites or peaks with higher confidence.
 
     """
     # Define argument parser.
-    p = argparse.ArgumentParser(add_help=False,
-                                prog="graphprot_train_wrapper.py",
-                                description=help_description,
-                                formatter_class=argparse.MetavarTypeHelpFormatter)
+    p = ap.ArgumentParser(add_help=False,
+                          prog="graphprot_train_wrapper.py",
+                          description=help_description,
+                          formatter_class=ap.MetavarTypeHelpFormatter)
 
     # Argument groups.
     p_man = p.add_argument_group("REQUIRED ARGUMENTS")
@@ -97,66 +95,76 @@
 
     # Required arguments.
     p_opt.add_argument("-h", "--help",
-           action="help",
-           help="Print help message")
+                       action="help",
+                       help="Print help message")
     p_man.add_argument("--pos",
-           dest="in_pos_fa",
-           type=str,
-           required = True,
-           help = "Positive (= binding site) sequences .fa file for model training (option -fasta)")
+                       dest="in_pos_fa",
+                       type=str,
+                       required=True,
+                       help="Positive (= binding site) sequences .fa file "
+                            "for model training (option -fasta)")
     p_man.add_argument("--neg",
-           dest="in_neg_fa",
-           type=str,
-           required = True,
-           help = "Negative sequences .fa file for model training (option -negfasta)")
+                       dest="in_neg_fa",
+                       type=str,
+                       required=True,
+                       help="Negative sequences .fa file for model "
+                            "training (option -negfasta)")
     p_man.add_argument("--data-id",
-           dest="data_id",
-           type=str,
-           required = True,
-           help = "Data ID (option -prefix)")
+                       dest="data_id",
+                       type=str,
+                       required=True,
+                       help="Data ID (option -prefix)")
     # Additional arguments.
     p_opt.add_argument("--opt-set-size",
-           dest="opt_set_size",
-           type = int,
-           default = 500,
-           help = "Hyperparameter optimization set size (taken away from both --pos and --neg) (default: 500)")
+                       dest="opt_set_size",
+                       type=int,
+                       default=500,
+                       help="Hyperparameter optimization set size (taken "
+                            "away from both --pos and --neg) (default: 500)")
     p_opt.add_argument("--opt-pos",
-           dest="opt_pos_fa",
-           type=str,
-           help = "Positive (= binding site) sequences .fa file for hyperparameter optimization (default: take --opt-set-size from --pos)")
+                       dest="opt_pos_fa",
+                       type=str,
+                       help="Positive (= binding site) sequences .fa file "
+                            "for hyperparameter optimization (default: take "
+                            "--opt-set-size from --pos)")
     p_opt.add_argument("--opt-neg",
-           dest="opt_neg_fa",
-           type=str,
-           help = "Negative sequences .fa file for hyperparameter optimization (default: take --opt-set-size from --neg)")
+                       dest="opt_neg_fa",
+                       type=str,
+                       help="Negative sequences .fa file for hyperparameter "
+                            "optimization (default: take --opt-set-size "
+                            "from --neg)")
     p_opt.add_argument("--min-train",
-           dest="min_train",
-           type = int,
-           default = 500,
-           help = "Minimum amount of training sites demanded (default: 500)")
+                       dest="min_train",
+                       type=int,
+                       default=500,
+                       help="Minimum amount of training sites demanded "
+                            "(default: 500)")
     p_opt.add_argument("--disable-cv",
-           dest = "disable_cv",
-           default = False,
-           action = "store_true",
-           help = "Disable cross validation step (default: false)")
+                       dest="disable_cv",
+                       default=False,
+                       action="store_true",
+                       help="Disable cross validation step (default: false)")
     p_opt.add_argument("--disable-motifs",
-           dest = "disable_motifs",
-           default = False,
-           action = "store_true",
-           help = "Disable motif generation step (default: false)")
+                       dest="disable_motifs",
+                       default=False,
+                       action="store_true",
+                       help="Disable motif generation step (default: false)")
     p_opt.add_argument("--gp-output",
-           dest = "gp_output",
-           default = False,
-           action = "store_true",
-           help = "Print output produced by GraphProt (default: false)")
+                       dest="gp_output",
+                       default=False,
+                       action="store_true",
+                       help="Print output produced by GraphProt "
+                            "(default: false)")
     p_opt.add_argument("--str-model",
-           dest = "train_str_model",
-           default = False,
-           action = "store_true",
-           help = "Train a structure model (default: train a sequence model)")
+                       dest="train_str_model",
+                       default=False,
+                       action="store_true",
+                       help="Train a structure model (default: train "
+                            "a sequence model)")
     return p
 
 
-################################################################################
+###############################################################################
 
 if __name__ == '__main__':
 
@@ -164,63 +172,117 @@
     parser = setup_argument_parser()
     # Read in command line arguments.
     args = parser.parse_args()
-    
+
     """
     Do all sorts of sanity checking.
-    
+
     """
     # Check for Linux.
     assert "linux" in sys.platform, "please use Linux"
     # Check tool availability.
     assert gplib.is_tool("GraphProt.pl"), "GraphProt.pl not in PATH"
     # Check file inputs.
-    assert os.path.exists(args.in_pos_fa), "positives .fa file \"%s\" not found" %(args.in_pos_fa)
-    assert os.path.exists(args.in_neg_fa), "negatives .fa file \"%s\" not found" %(args.in_neg_fa)
+    assert os.path.exists(args.in_pos_fa), \
+        "positives .fa file \"%s\" not found" % (args.in_pos_fa)
+    assert os.path.exists(args.in_neg_fa), \
+        "negatives .fa file \"%s\" not found" % (args.in_neg_fa)
     # Count .fa entries.
     c_pos_fa = gplib.count_fasta_headers(args.in_pos_fa)
     c_neg_fa = gplib.count_fasta_headers(args.in_neg_fa)
-    assert c_pos_fa, "positives .fa file \"%s\" no headers found" %(args.in_pos_fa)
-    assert c_neg_fa, "negatives .fa file \"%s\" no headers found" %(args.in_neg_fa)
-    print("# positive .fa sequences:   %i" %(c_pos_fa))
-    print("# negative .fa sequences:   %i" %(c_neg_fa))
+    assert c_pos_fa, "positives .fa file \"%s\" no headers found" % \
+        (args.in_pos_fa)
+    assert c_neg_fa, "negatives .fa file \"%s\" no headers found" % \
+        (args.in_neg_fa)
+    print("# positive .fa sequences:   %i" % (c_pos_fa))
+    print("# negative .fa sequences:   %i" % (c_neg_fa))
     # Check additional files.
     if args.opt_pos_fa:
         assert args.opt_neg_fa, "--opt-pos but no --opt-neg given"
     if args.opt_neg_fa:
         assert args.opt_pos_fa, "--opt-neg but no --opt-pos given"
+    # Check for lowercase-only sequences, which cause GraphProt to crash.
+    error_mess = "encountered input sequences containing "\
+        "only lowercase characters or lowercase characters in between "\
+        "uppercase characters. Please provide either all-uppercase "\
+        "sequences or sequences with uppercase regions surrounded "\
+        "by lowercase context regions for structure calculation (see "\
+        "the viewpoint concept in the original GraphProt publication "\
+        "for more details)"
+    check_fa_files = [args.in_pos_fa, args.in_neg_fa]
+    if args.opt_pos_fa:
+        check_fa_files.append(args.opt_pos_fa)
+    if args.opt_neg_fa:
+        check_fa_files.append(args.opt_neg_fa)
+    for fa_file in check_fa_files:
+        seqs_dic = gplib.read_fasta_into_dic(fa_file)
+        bad_ids = gplib.check_seqs_dic_format(seqs_dic)
+        assert not bad_ids, "%s" % (error_mess)
+
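The format rule enforced here accepts sequences that are either all uppercase or a single uppercase viewpoint region flanked by lowercase context. A compact way to express that rule (a hypothetical re-implementation for illustration, not gplib.check_seqs_dic_format):

    import re

    # One optional lowercase flank, at least one uppercase block, one
    # optional lowercase flank; anything else (e.g. lowercase between
    # uppercase blocks, or all-lowercase) is rejected.
    VALID_CASE = re.compile(r"[a-z]*[A-Z]+[a-z]*")

    def bad_case_ids(seqs_dic):
        return [sid for sid, seq in seqs_dic.items()
                if not VALID_CASE.fullmatch(seq)]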
     # If parop .fa files given.
     if args.opt_pos_fa and args.opt_neg_fa:
         c_parop_pos_fa = gplib.count_fasta_headers(args.opt_pos_fa)
         c_parop_neg_fa = gplib.count_fasta_headers(args.opt_neg_fa)
-        assert c_parop_pos_fa, "--opt-pos .fa file \"%s\" no headers found" %(args.opt_pos_fa)
-        assert c_parop_neg_fa, "--opt-neg .fa file \"%s\" no headers found" %(args.opt_neg_fa)
+        assert c_parop_pos_fa, "--opt-pos .fa file \"%s\" no headers found" \
+            % (args.opt_pos_fa)
+        assert c_parop_neg_fa, "--opt-neg .fa file \"%s\" no headers found" \
+            % (args.opt_neg_fa)
         # Less than 500 for training?? You gotta be kidding.
-        assert c_pos_fa >= args.min_train, "--pos for training < %i, please provide more (try at least > 1000, the more the better)" %(args.min_train)
-        assert c_neg_fa >= args.min_train, "--neg for training < %i, please provide more (try at least > 1000, the more the better)" %(args.min_train)
+        assert c_pos_fa >= args.min_train, \
+            "--pos for training < %i, please provide more (try at least "\
+            "> 1000, the more the better)" % (args.min_train)
+        assert c_neg_fa >= args.min_train, \
+            "--neg for training < %i, please provide more (try at least "\
+            "> 1000, the more the better)" % (args.min_train)
         # Looking closer at ratios.
         pos_neg_ratio = c_parop_pos_fa / c_parop_neg_fa
         if pos_neg_ratio < 0.8 or pos_neg_ratio > 1.25:
-            assert 0, "ratio of --opt-pos to --opt-neg < 0.8 or > 1.25 (ratio = %f). Try to keep ratio closer to 1 or better use identical numbers (keep in mind that performance measures such as accuracy or AUROC are not suitable for imbalanced datasets!)" %(pos_neg_ratio)
+            assert 0, "ratio of --opt-pos to --opt-neg < 0.8 or > 1.25 "\
+                "(ratio = %f). Try to keep ratio closer to 1 or better use "\
+                "identical numbers (keep in mind that performance measures "\
+                "such as accuracy or AUROC are not suitable for imbalanced "\
+                " datasets!)" % (pos_neg_ratio)
     else:
         # Define some minimum amount of training sites for the sake of sanity.
         c_pos_train = c_pos_fa - args.opt_set_size
         c_neg_train = c_neg_fa - args.opt_set_size
         # Start complaining.
-        assert c_pos_fa >= args.opt_set_size, "# positives < --opt-set-size (%i < %i)" %(c_pos_fa, args.opt_set_size)
-        assert c_neg_fa >= args.opt_set_size, "# negatives < --opt-set-size (%i < %i)" %(c_neg_fa, args.opt_set_size)
-        assert c_pos_train >= args.opt_set_size, "# positives remaining for training < --opt-set-size (%i < %i)" %(c_pos_train, args.opt_set_size)
-        assert c_neg_train >= args.opt_set_size, "# negatives remaining for training < --opt-set-size (%i < %i)" %(c_neg_train, args.opt_set_size)
+        assert c_pos_fa >= args.opt_set_size, \
+            "# positives < --opt-set-size (%i < %i)" \
+            % (c_pos_fa, args.opt_set_size)
+        assert c_neg_fa >= args.opt_set_size, \
+            "# negatives < --opt-set-size (%i < %i)" \
+            % (c_neg_fa, args.opt_set_size)
+        assert c_pos_train >= args.opt_set_size, \
+            "# positives remaining for training < --opt-set-size "\
+            "(%i < %i)" % (c_pos_train, args.opt_set_size)
+        assert c_neg_train >= args.opt_set_size, "# negatives remaining "\
+            "for training < --opt-set-size (%i < %i)" \
+            % (c_neg_train, args.opt_set_size)
         # Less than 500?? You gotta be kidding.
-        assert c_pos_train >= args.min_train, "# positives remaining for training < %i, please provide more (try at least > 1000, the more the better)" %(args.min_train)
-        assert c_neg_train >= args.min_train, "# negatives remaining for training < %i, please provide more (try at least > 1000, the more the better)" %(args.min_train)
+        assert c_pos_train >= args.min_train, \
+            "# positives remaining for training < %i, please provide more "\
+            " (try at least > 1000, the more the better)" % (args.min_train)
+        assert c_neg_train >= args.min_train, \
+            "# negatives remaining for training < %i, please provide more "\
+            "(try at least > 1000, the more the better)" % (args.min_train)
         # Looking closer at ratios.
         pos_neg_ratio = c_pos_train / c_neg_train
         if pos_neg_ratio < 0.8 or pos_neg_ratio > 1.25:
-            assert 0, "ratio of --pos to --neg < 0.8 or > 1.25 (ratio = %f). Try to keep ratio closer to 1 or better use identical numbers (keep in mind that performance measures such as accuracy or AUROC are not suitable for imbalanced datasets!)" %(pos_neg_ratio)
+            assert 0, "ratio of --pos to --neg < 0.8 or > 1.25 "\
+                "(ratio = %f). Try to keep ratio closer to 1 or better use "\
+                "identical numbers (keep in mind that performance measures "\
+                "such as accuracy or AUROC are not suitable for imbalanced "\
+                "datasets!)" % (pos_neg_ratio)
 
     """
-    Generate parop + train .fa output files for hyperparameter optimization + training.
-    
+    Generate parop + train .fa output files for hyperparameter
+    optimization + training.
+
     """
     # Output files for training.
     pos_parop_fa = args.data_id + ".positives.parop.fa"
@@ -237,27 +299,28 @@
         gplib.make_file_copy(args.in_neg_fa, neg_train_fa)
     else:
         # Generate parop + train .fa files from input .fa files.
-        gplib.split_fasta_into_test_train_files(args.in_pos_fa, pos_parop_fa, pos_train_fa,
-                                                  test_size=args.opt_set_size)
-        gplib.split_fasta_into_test_train_files(args.in_neg_fa, neg_parop_fa, neg_train_fa,
-                                                  test_size=args.opt_set_size)
+        gplib.split_fasta_into_test_train_files(args.in_pos_fa, pos_parop_fa,
+                                                pos_train_fa,
+                                                test_size=args.opt_set_size)
+        gplib.split_fasta_into_test_train_files(args.in_neg_fa, neg_parop_fa,
+                                                neg_train_fa,
+                                                test_size=args.opt_set_size)
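The split sets aside opt_set_size sequences per input file for hyperparameter optimization and keeps the rest for training. A sketch of the idea, assuming a random selection (the actual strategy, plus writing the FASTA records back out, lives in gplib.split_fasta_into_test_train_files):

    import random

    def split_ids(seq_ids, test_size=500):
        # Shuffle, peel off test_size IDs for optimization,
        # train on the remainder.
        ids = list(seq_ids)
        random.shuffle(ids)
        return ids[:test_size], ids[test_size:]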
 
     """
     Do the hyperparameter optimization.
-    
+
     """
     print("Starting hyperparameter optimization (-action ls) ... ")
-    check_cmd = "GraphProt.pl -action ls -prefix " + args.data_id + " -fasta " + pos_parop_fa + " -negfasta " + neg_parop_fa
+    check_cmd = "GraphProt.pl -action ls -prefix " + args.data_id + \
+        " -fasta " + pos_parop_fa + " -negfasta " + neg_parop_fa
     # If sequence model should be trained (default).
     if not args.train_str_model:
         check_cmd += " -onlyseq"
     print(check_cmd)
     output = subprocess.getoutput(check_cmd)
-    #assert output, "The following call of GraphProt.pl produced no output:\n%s" %(check_cmd)
-    #if args.gp_output:
-    #    print(output)
     params_file = args.data_id + ".params"
-    assert os.path.exists(params_file), "Hyperparameter optimization output .params file \"%s\" not found" %(params_file)
+    assert os.path.exists(params_file), "Hyperparameter optimization output "\
+        " .params file \"%s\" not found" % (params_file)
     # Add model type to params file.
     if args.train_str_model:
         gplib.echo_add_to_file("model_type: structure", params_file)
@@ -268,164 +331,151 @@
 
     """
     Do the model training. (Yowza!)
-    
+
     """
     print("Starting model training (-action train) ... ")
-    check_cmd = "GraphProt.pl -action train -prefix " + args.data_id + " -fasta " + pos_train_fa + " -negfasta " + neg_train_fa + " " + param_string
+    check_cmd = "GraphProt.pl -action train -prefix " + args.data_id \
+        + " -fasta " + pos_train_fa + " -negfasta " + neg_train_fa \
+        + " " + param_string
     print(check_cmd)
     output = subprocess.getoutput(check_cmd)
-    assert output, "The following call of GraphProt.pl produced no output:\n%s" %(check_cmd)
+    assert output, \
+        "The following call of GraphProt.pl produced no output:\n%s" \
+        % (check_cmd)
     if args.gp_output:
         print(output)
     model_file = args.data_id + ".model"
-    assert os.path.exists(model_file), "Training output .model file \"%s\" not found" %(model_file)
+    assert os.path.exists(model_file), \
+        "Training output .model file \"%s\" not found" % (model_file)
 
     """
     Do the 10-fold cross validation.
-    
+
     """
     if not args.disable_cv:
         print("Starting 10-fold cross validation (-action cv) ... ")
-        check_cmd = "GraphProt.pl -action cv -prefix " + args.data_id + " -fasta " + pos_train_fa + " -negfasta " + neg_train_fa + " " + param_string + " -model " + model_file
+        check_cmd = "GraphProt.pl -action cv -prefix " + args.data_id \
+            + " -fasta " + pos_train_fa + " -negfasta " + neg_train_fa \
+            + " " + param_string + " -model " + model_file
         print(check_cmd)
         output = subprocess.getoutput(check_cmd)
-        assert output, "The following call of GraphProt.pl produced no output:\n%s" %(check_cmd)
+        assert output, \
+            "The following call of GraphProt.pl produced no output:\n%s" \
+            % (check_cmd)
         if args.gp_output:
             print(output)
         cv_results_file = args.data_id + ".cv_results"
-        assert os.path.exists(cv_results_file), "CV output .cv_results file \"%s\" not found" %(cv_results_file)
+        assert os.path.exists(cv_results_file), \
+            "CV output .cv_results file \"%s\" not found" % (cv_results_file)
 
     """
     Do the motif generation.
-    
+
     """
     if not args.disable_motifs:
         print("Starting motif generation (-action motif) ... ")
-        check_cmd = "GraphProt.pl -action motif -prefix " + args.data_id + " -fasta " + pos_train_fa + " -negfasta " + neg_train_fa + " " + param_string + " -model " + model_file
+        check_cmd = "GraphProt.pl -action motif -prefix " + args.data_id \
+            + " -fasta " + pos_train_fa + " -negfasta " + neg_train_fa \
+            + " " + param_string + " -model " + model_file
         print(check_cmd)
         output = subprocess.getoutput(check_cmd)
-        assert output, "The following call of GraphProt.pl produced no output:\n%s" %(check_cmd)
+        assert output, \
+            "The following call of GraphProt.pl produced no output:\n%s" \
+            % (check_cmd)
         if args.gp_output:
             print(output)
         seq_motif_file = args.data_id + ".sequence_motif"
         seq_motif_png_file = args.data_id + ".sequence_motif.png"
-        assert os.path.exists(seq_motif_file), "Motif output .sequence_motif file \"%s\" not found" %(seq_motif_file)
-        assert os.path.exists(seq_motif_png_file), "Motif output .sequence_motif.png file \"%s\" not found" %(seq_motif_png_file)
+        assert os.path.exists(seq_motif_file), \
+            "Motif output .sequence_motif file \"%s\" not found" \
+            % (seq_motif_file)
+        assert os.path.exists(seq_motif_png_file), \
+            "Motif output .sequence_motif.png file \"%s\" not found" \
+            % (seq_motif_png_file)
         if args.train_str_model:
             str_motif_file = args.data_id + ".structure_motif"
             str_motif_png_file = args.data_id + ".structure_motif.png"
-            assert os.path.exists(str_motif_file), "Motif output .structure_motif file \"%s\" not found" %(str_motif_file)
-            assert os.path.exists(str_motif_png_file), "Motif output .structure_motif.png file \"%s\" not found" %(str_motif_png_file)
+            assert os.path.exists(str_motif_file), \
+                "Motif output .structure_motif file \"%s\" not found" \
+                % (str_motif_file)
+            assert os.path.exists(str_motif_png_file), \
+                "Motif output .structure_motif.png file \"%s\" not found" \
+                % (str_motif_png_file)
 
     """
     Do whole site predictions on positive training set.
-    
+
     """
-    print("Starting whole site predictions on positive training set (-action predict) ... ")
-    check_cmd = "GraphProt.pl -action predict -prefix " + args.data_id + " -fasta " + pos_train_fa + " " + param_string + " -model " + model_file
+    print("Starting whole site predictions on positive training set "
+          " (-action predict) ... ")
+    check_cmd = "GraphProt.pl -action predict -prefix " + args.data_id \
+        + " -fasta " + pos_train_fa + " " + param_string \
+        + " -model " + model_file
     print(check_cmd)
     output = subprocess.getoutput(check_cmd)
-    assert output, "The following call of GraphProt.pl produced no output:\n%s" %(check_cmd)
+    assert output, \
+        "The following call of GraphProt.pl produced no output:\n%s" \
+        % (check_cmd)
     if args.gp_output:
         print(output)
     ws_predictions_file = args.data_id + ".predictions"
-    assert os.path.exists(ws_predictions_file), "Whole site prediction output .predictions file \"%s\" not found" %(ws_predictions_file)
+    assert os.path.exists(ws_predictions_file), \
+        "Whole site prediction output .predictions file \"%s\" not found" \
+        % (ws_predictions_file)
 
     """
     Do profile predictions on positive training set.
-    
+
     """
-    print("Starting profile predictions on positive training set (-action predict_profile) ... ")
-    check_cmd = "GraphProt.pl -action predict_profile -prefix " + args.data_id + " -fasta " + pos_train_fa + " " + param_string + " -model " + model_file
+    print("Starting profile predictions on positive training set "
+          "-action predict_profile) ... ")
+    check_cmd = "GraphProt.pl -action predict_profile -prefix " \
+        + args.data_id + " -fasta " + pos_train_fa + " " \
+        + param_string + " -model " + model_file
     print(check_cmd)
     output = subprocess.getoutput(check_cmd)
-    assert output, "The following call of GraphProt.pl produced no output:\n%s" %(check_cmd)
+    assert output, \
+        "The following call of GraphProt.pl produced no output:\n%s" \
+        % (check_cmd)
     if args.gp_output:
         print(output)
     profile_predictions_file = args.data_id + ".profile"
-    assert os.path.exists(profile_predictions_file), "Profile prediction output .profile file \"%s\" not found" %(profile_predictions_file)
+    assert os.path.exists(profile_predictions_file), \
+        "Profile prediction output .profile file \"%s\" not found" \
+        % (profile_predictions_file)
 
     """
     Get 50 % score (median) for .predictions and .profile file.
-    For .profile, first extract for each site the maximum score, and then 
+    For .profile, first extract for each site the maximum score, and then
     from the list of maximum site scores get the median.
     For whole site .predictions, get the median from the site scores list.
-    
+
     """
     print("Getting .profile and .predictions median scores ... ")
 
     # Whole site scores median.
-    ws_pred_median = gplib.graphprot_predictions_get_median(ws_predictions_file)
+    ws_pred_median = \
+        gplib.graphprot_predictions_get_median(ws_predictions_file)
     # Profile top site scores median.
-    profile_median = gplib.graphprot_profile_get_top_scores_median(profile_predictions_file, 
-                                                                     profile_type="profile")
-    ws_pred_string = "pos_train_ws_pred_median: %f" %(ws_pred_median)
-    profile_string = "pos_train_profile_median: %f" %(profile_median)
+    profile_median = \
+        gplib.graphprot_profile_get_tsm(profile_predictions_file,
+                                        profile_type="profile")
+    ws_pred_string = "pos_train_ws_pred_median: %f" % (ws_pred_median)
+    profile_string = "pos_train_profile_median: %f" % (profile_median)
     gplib.echo_add_to_file(ws_pred_string, params_file)
     gplib.echo_add_to_file(profile_string, params_file)
     # Average profile top site scores median for extlr 1 to 10.
     for i in range(10):
         i += 1
-        avg_profile_median = gplib.graphprot_profile_get_top_scores_median(profile_predictions_file,
-                                                                             profile_type="avg_profile",
-                                                                             avg_profile_extlr=i)
-                                                                        
-        avg_profile_string = "pos_train_avg_profile_median_%i: %f" %(i, avg_profile_median)
+        avg_profile_median = \
+            gplib.graphprot_profile_get_tsm(profile_predictions_file,
+                                            profile_type="avg_profile",
+                                            avg_profile_extlr=i)
+
+        avg_profile_string = "pos_train_avg_profile_median_%i: %f" \
+            % (i, avg_profile_median)
         gplib.echo_add_to_file(avg_profile_string, params_file)
 
     print("Script: I'm done.")
     print("Author: Good. Now go back to your file system directory.")
     print("Script: Ok.")
-
-
-"""
-
-OLD CODE ...
-
-    p.add_argument("--ap-extlr",
-                   dest="ap_extlr",
-                   type = int,
-                   default = 5,
-                   help = "Define average profile up- and downstream extension for averaging scores to produce the average profile. This is used to get the median average profile score, which will be stored in the .params file to later be used in a prediction setting as a second filter value to get more confident peak regions. NOTE that you have to use the same value in model training and prediction! (default: 5)")
-
-
-    p.add_argument("--disable-opt",
-                   dest = "disable_opt",
-                   default = False,
-                   action = "store_true",
-                   help = "Disable hyperparameter optimization (HPO) (default: optimize hyperparameters)")
-    p.add_argument("--R",
-                   dest = "param_r",
-                   type = int,
-                   default = False,
-                   help = "GraphProt model R parameter (default: determined by HPO)")
-    p.add_argument("--D",
-                   dest = "param_d",
-                   type = int,
-                   default = False,
-                   help = "GraphProt model D parameter (default: determined by HPO)")
-    p.add_argument("--epochs",
-                   dest = "param_epochs",
-                   type = int,
-                   default = False,
-                   help = "GraphProt model epochs parameter (default: determined by HPO)")
-    p.add_argument("--lambda",
-                   dest = "param_lambda",
-                   type = float,
-                   default = False,
-                   help = "GraphProt model lambda parameter (default: determined by HPO)")
-    p.add_argument("--bitsize",
-                   dest = "param_bitsize",
-                   type = int,
-                   default = False,
-                   help = "GraphProt model bitsize parameter (default: determined by HPO)")
-    p.add_argument("--abstraction",
-                   dest = "param_abstraction",
-                   type = int,
-                   default = False,
-                   help = "GraphProt model RNAshapes abstraction level parameter for training structure models (default: determined by HPO)")
-
-"""
-
-
-
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/test4.fa	Wed Jan 27 19:27:47 2021 +0000
@@ -0,0 +1,4 @@
+>1
+gccuAUGUuuua
+>2
+ctgaAACTatgt