Mercurial > repos > rnateam > graphprot_predict_profile
changeset 5:ddcf35a868b8 draft
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/rna_tools/graphprot commit ad60258f5759eaa205fec4af6143c728ea131419
author | bgruening |
---|---|
date | Wed, 05 Jun 2024 16:40:51 +0000 |
parents | 4ad83aed5c3c |
children | 33b590aa07c1 |
files | gplib.py graphprot_predict_wrapper.py graphprot_train_predict.xml graphprot_train_wrapper.py |
diffstat | 4 files changed, 723 insertions(+), 518 deletions(-) [+] |
line wrap: on
line diff
--- a/gplib.py Thu Jan 28 15:06:14 2021 +0000 +++ b/gplib.py Wed Jun 05 16:40:51 2024 +0000 @@ -1,4 +1,3 @@ - import gzip import random import re @@ -6,7 +5,6 @@ import subprocess from distutils.spawn import find_executable - """ Run doctests: @@ -17,7 +15,8 @@ """ -############################################################################### +####################################################################### + def graphprot_predictions_get_median(predictions_file): """ @@ -41,11 +40,12 @@ return statistics.median(sc_list) -############################################################################### +####################################################################### + -def graphprot_profile_get_tsm(profile_file, - profile_type="profile", - avg_profile_extlr=5): +def graphprot_profile_get_tsm( + profile_file, profile_type="profile", avg_profile_extlr=5 +): """ Given a GraphProt .profile file, extract for each site (identified by @@ -88,23 +88,21 @@ max_list.append(max_sc) elif profile_type == "avg_profile": # Convert profile score list to average profile scores list. - aps_list = \ - list_moving_window_average_values(lists_dic[seq_id], - win_extlr=avg_profile_extlr) + aps_list = list_moving_window_average_values( + lists_dic[seq_id], win_extlr=avg_profile_extlr + ) max_sc = max(aps_list) max_list.append(max_sc) else: - assert 0, "invalid profile_type argument given: \"%s\"" \ - % (profile_type) + assert 0, 'invalid profile_type argument given: "%s"' % (profile_type) # Return the median. return statistics.median(max_list) -############################################################################### +####################################################################### -def list_moving_window_average_values(in_list, - win_extlr=5, - method=1): + +def list_moving_window_average_values(in_list, win_extlr=5, method=1): """ Take a list of numeric values, and calculate for each position a new value, by taking the mean value of the window of positions -win_extlr and @@ -152,7 +150,8 @@ return new_list -############################################################################### +####################################################################### + def echo_add_to_file(echo_string, out_file): """ @@ -167,14 +166,16 @@ assert not error, "echo is complaining:\n%s\n%s" % (check_cmd, output) -############################################################################### +####################################################################### + def is_tool(name): """Check whether tool "name" is in PATH.""" return find_executable(name) is not None -############################################################################### +####################################################################### + def count_fasta_headers(fasta_file): """ @@ -194,7 +195,8 @@ return row_count -############################################################################### +####################################################################### + def make_file_copy(in_file, out_file): """ @@ -202,21 +204,26 @@ """ check_cmd = "cat " + in_file + " > " + out_file - assert in_file != out_file, \ - "cat does not like to cat file into same file (%s)" % (check_cmd) + assert in_file != out_file, "cat does not like to cat file into same file (%s)" % ( + check_cmd + ) output = subprocess.getoutput(check_cmd) error = False if output: error = True - assert not error, \ - "cat did not like your input (in_file: %s, out_file: %s):\n%s" \ - % (in_file, out_file, output) + assert not error, "cat did not like your input (in_file: %s, out_file: %s):\n%s" % ( + in_file, + out_file, + output, + ) -############################################################################### +####################################################################### + -def split_fasta_into_test_train_files(in_fasta, test_out_fa, train_out_fa, - test_size=500): +def split_fasta_into_test_train_files( + in_fasta, test_out_fa, train_out_fa, test_size=500 +): """ Split in_fasta .fa file into two files (e.g. test, train). @@ -230,7 +237,7 @@ TRAINOUT = open(train_out_fa, "w") for seq_id in rand_ids_list: seq = seqs_dic[seq_id] - if (c_out >= test_size): + if c_out >= test_size: TRAINOUT.write(">%s\n%s\n" % (seq_id, seq)) else: TESTOUT.write(">%s\n%s\n" % (seq_id, seq)) @@ -239,7 +246,8 @@ TRAINOUT.close() -############################################################################### +####################################################################### + def check_seqs_dic_format(seqs_dic): """ @@ -267,16 +275,19 @@ return bad_seq_ids -############################################################################### +####################################################################### + -def read_fasta_into_dic(fasta_file, - seqs_dic=False, - ids_dic=False, - read_dna=False, - short_ensembl=False, - reject_lc=False, - convert_to_uc=False, - skip_n_seqs=True): +def read_fasta_into_dic( + fasta_file, + seqs_dic=False, + ids_dic=False, + read_dna=False, + short_ensembl=False, + reject_lc=False, + convert_to_uc=False, + skip_n_seqs=True, +): """ Read in FASTA sequences, convert to RNA, store in dictionary and return dictionary. @@ -302,7 +313,7 @@ # Go through FASTA file, extract sequences. if re.search(r".+\.gz$", fasta_file): - f = gzip.open(fasta_file, 'rt') + f = gzip.open(fasta_file, "rt") else: f = open(fasta_file, "r") for line in f: @@ -315,9 +326,10 @@ if re.search(r".+\..+", seq_id): m = re.search(r"(.+?)\..+", seq_id) seq_id = m.group(1) - assert seq_id not in seqs_dic, \ - "non-unique FASTA header \"%s\" in \"%s\"" \ - % (seq_id, fasta_file) + assert seq_id not in seqs_dic, 'non-unique FASTA header "%s" in "%s"' % ( + seq_id, + fasta_file, + ) if ids_dic: if seq_id in ids_dic: seqs_dic[seq_id] = "" @@ -328,31 +340,31 @@ m = re.search("([ACGTUN]+)", line, re.I) seq = m.group(1) if reject_lc: - assert \ - not re.search("[a-z]", seq), \ - "lc char detected in seq \"%i\" (reject_lc=True)" \ - % (seq_id) + assert not re.search( + "[a-z]", seq + ), 'lc char detected in seq "%i" (reject_lc=True)' % (seq_id) if convert_to_uc: seq = seq.upper() # If sequences with N nucleotides should be skipped. if skip_n_seqs: if "n" in m.group(1) or "N" in m.group(1): - print("WARNING: \"%s\" contains N. Discarding " - "sequence ... " % (seq_id)) + print( + 'WARNING: "%s" contains N. Discarding ' + "sequence ... " % (seq_id) + ) del seqs_dic[seq_id] continue # Convert to RNA, concatenate sequence. if read_dna: - seqs_dic[seq_id] += \ - m.group(1).replace("U", "T").replace("u", "t") + seqs_dic[seq_id] += m.group(1).replace("U", "T").replace("u", "t") else: - seqs_dic[seq_id] += \ - m.group(1).replace("T", "U").replace("t", "u") + seqs_dic[seq_id] += m.group(1).replace("T", "U").replace("t", "u") f.close() return seqs_dic -############################################################################### +####################################################################### + def random_order_dic_keys_into_list(in_dic): """ @@ -366,7 +378,8 @@ return id_list -############################################################################### +####################################################################### + def graphprot_get_param_string(params_file): """ @@ -394,11 +407,12 @@ else: param_string += "-%s %s " % (par, setting) else: - assert 0, "pattern matching failed for string \"%s\"" % (param) + assert 0, 'pattern matching failed for string "%s"' % (param) return param_string -############################################################################### +####################################################################### + def seqs_dic_count_uc_nts(seqs_dic): """ @@ -416,11 +430,12 @@ assert seqs_dic, "Given sequence dictionary empty" c_uc = 0 for seq_id in seqs_dic: - c_uc += len(re.findall(r'[A-Z]', seqs_dic[seq_id])) + c_uc += len(re.findall(r"[A-Z]", seqs_dic[seq_id])) return c_uc -############################################################################### +####################################################################### + def seqs_dic_count_lc_nts(seqs_dic): """ @@ -438,11 +453,12 @@ assert seqs_dic, "Given sequence dictionary empty" c_uc = 0 for seq_id in seqs_dic: - c_uc += len(re.findall(r'[a-z]', seqs_dic[seq_id])) + c_uc += len(re.findall(r"[a-z]", seqs_dic[seq_id])) return c_uc -############################################################################### +####################################################################### + def count_file_rows(in_file): """ @@ -462,7 +478,8 @@ return row_count -############################################################################### +####################################################################### + def bed_check_six_col_format(bed_file): """ @@ -488,7 +505,8 @@ return six_col_format -############################################################################### +####################################################################### + def bed_check_unique_ids(bed_file): """ @@ -512,7 +530,8 @@ return True -############################################################################### +####################################################################### + def get_seq_lengths_from_seqs_dic(seqs_dic): """ @@ -527,7 +546,8 @@ return seq_len_dic -############################################################################### +####################################################################### + def bed_get_region_lengths(bed_file): """ @@ -548,19 +568,19 @@ site_e = int(cols[2]) site_id = cols[3] site_l = site_e - site_s - assert site_id \ - not in id2len_dic, \ - "column 4 IDs not unique in given .bed file \"%s\"" \ - % (bed_file) + assert ( + site_id not in id2len_dic + ), 'column 4 IDs not unique in given .bed file "%s"' % (bed_file) id2len_dic[site_id] = site_l f.closed - assert id2len_dic, \ - "No IDs read into dic (input file \"%s\" empty or malformatted?)" \ - % (bed_file) + assert ( + id2len_dic + ), 'No IDs read into dic (input file "%s" empty or malformatted?)' % (bed_file) return id2len_dic -############################################################################### +####################################################################### + def graphprot_get_param_dic(params_file): """ @@ -599,10 +619,10 @@ return param_dic -############################################################################### +####################################################################### -def graphprot_filter_predictions_file(in_file, out_file, - sc_thr=0): + +def graphprot_filter_predictions_file(in_file, out_file, sc_thr=0): """ Filter GraphProt .predictions file by given score thr_sc. """ @@ -619,7 +639,8 @@ OUTPRED.close() -############################################################################### +####################################################################### + def fasta_read_in_ids(fasta_file): """ @@ -642,12 +663,12 @@ return ids_list -############################################################################### +####################################################################### + -def graphprot_profile_calc_avg_profile(in_file, out_file, - ap_extlr=5, - seq_ids_list=False, - method=1): +def graphprot_profile_calc_avg_profile( + in_file, out_file, ap_extlr=5, seq_ids_list=False, method=1 +): """ Given a GraphProt .profile file, calculate average profiles and output average profile file. @@ -702,15 +723,16 @@ if seq_ids_list: c_seq_ids = len(seq_ids_list) c_site_ids = len(site_starts_dic) - assert c_seq_ids == c_site_ids, \ - "# sequence IDs != # site IDs (%i != %i)" \ - % (c_seq_ids, c_site_ids) + assert ( + c_seq_ids == c_site_ids + ), "# sequence IDs != # site IDs (%i != %i)" % (c_seq_ids, c_site_ids) OUTPROF = open(out_file, "w") # For each site, calculate average profile scores list. for site_id in lists_dic: # Convert profile score list to average profile scores list. - aps_list = list_moving_window_average_values(lists_dic[site_id], - win_extlr=ap_extlr) + aps_list = list_moving_window_average_values( + lists_dic[site_id], win_extlr=ap_extlr + ) start_pos = site_starts_dic[site_id] # Get original FASTA sequence ID. if seq_ids_list: @@ -741,10 +763,9 @@ if cur_id != old_id: # Process old id scores. if scores_list: - aps_list = \ - list_moving_window_average_values( - scores_list, - win_extlr=ap_extlr) + aps_list = list_moving_window_average_values( + scores_list, win_extlr=ap_extlr + ) start_pos = site_starts_dic[old_id] seq_id = old_id # Get original FASTA sequence ID. @@ -763,8 +784,9 @@ f.close() # Process last block. if scores_list: - aps_list = list_moving_window_average_values(scores_list, - win_extlr=ap_extlr) + aps_list = list_moving_window_average_values( + scores_list, win_extlr=ap_extlr + ) start_pos = site_starts_dic[old_id] seq_id = old_id # Get original FASTA sequence ID. @@ -776,11 +798,12 @@ OUTPROF.close() -############################################################################### +####################################################################### + -def graphprot_profile_extract_peak_regions(in_file, out_file, - max_merge_dist=0, - sc_thr=0): +def graphprot_profile_extract_peak_regions( + in_file, out_file, max_merge_dist=0, sc_thr=0 +): """ Extract peak regions from GraphProt .profile file. Store the peak regions (defined as regions with scores >= sc_thr) @@ -835,21 +858,22 @@ # Process old id scores. if scores_list: # Extract peaks from region. - peak_list = \ - list_extract_peaks(scores_list, - max_merge_dist=max_merge_dist, - coords="bed", - sc_thr=sc_thr) + peak_list = list_extract_peaks( + scores_list, + max_merge_dist=max_merge_dist, + coords="bed", + sc_thr=sc_thr, + ) start_pos = site_starts_dic[old_id] # Print out peaks in .bed format. for ln in peak_list: peak_s = start_pos + ln[0] peak_e = start_pos + ln[1] site_id = "%s,%i" % (old_id, ln[2]) - OUTPEAKS.write("%s\t%i\t%i" - "\t%s\t%f\t+\n" - % (old_id, peak_s, - peak_e, site_id, ln[3])) + OUTPEAKS.write( + "%s\t%i\t%i" + "\t%s\t%f\t+\n" % (old_id, peak_s, peak_e, site_id, ln[3]) + ) # Reset list. scores_list = [] old_id = cur_id @@ -861,27 +885,25 @@ # Process last block. if scores_list: # Extract peaks from region. - peak_list = list_extract_peaks(scores_list, - max_merge_dist=max_merge_dist, - coords="bed", - sc_thr=sc_thr) + peak_list = list_extract_peaks( + scores_list, max_merge_dist=max_merge_dist, coords="bed", sc_thr=sc_thr + ) start_pos = site_starts_dic[old_id] # Print out peaks in .bed format. for ln in peak_list: peak_s = start_pos + ln[0] peak_e = start_pos + ln[1] site_id = "%s,%i" % (old_id, ln[2]) # best score also 1-based. - OUTPEAKS.write("%s\t%i\t%i\t%s\t%f\t+\n" - % (old_id, peak_s, peak_e, site_id, ln[3])) + OUTPEAKS.write( + "%s\t%i\t%i\t%s\t%f\t+\n" % (old_id, peak_s, peak_e, site_id, ln[3]) + ) OUTPEAKS.close() -############################################################################### +####################################################################### -def list_extract_peaks(in_list, - max_merge_dist=0, - coords="list", - sc_thr=0): + +def list_extract_peaks(in_list, max_merge_dist=0, coords="list", sc_thr=0): """ Extract peak regions from list. Peak region is defined as region >= score threshold. @@ -969,8 +991,12 @@ if peak_list[i][3] < peak_list[j][3]: new_top_pos = peak_list[j][2] new_top_sc = peak_list[j][3] - new_peak = [peak_list[i][0], peak_list[j][1], - new_top_pos, new_top_sc] + new_peak = [ + peak_list[i][0], + peak_list[j][1], + new_top_pos, + new_top_sc, + ] # If two peaks were merged. if new_peak: merged_peak_list.append(new_peak) @@ -991,10 +1017,12 @@ return peak_list -############################################################################### +####################################################################### + -def bed_peaks_to_genomic_peaks(peak_file, genomic_peak_file, genomic_sites_bed, - print_rows=False): +def bed_peaks_to_genomic_peaks( + peak_file, genomic_peak_file, genomic_sites_bed, print_rows=False +): """ Given a .bed file of sequence peak regions (possible coordinates from 0 to length of s), convert peak coordinates to genomic coordinates. @@ -1017,10 +1045,9 @@ row = line.strip() cols = line.strip().split("\t") site_id = cols[3] - assert site_id \ - not in id2row_dic, \ - "column 4 IDs not unique in given .bed file \"%s\"" \ - % (genomic_sites_bed) + assert ( + site_id not in id2row_dic + ), 'column 4 IDs not unique in given .bed file "%s"' % (genomic_sites_bed) id2row_dic[site_id] = row f.close() @@ -1034,13 +1061,14 @@ site_e = int(cols[2]) site_id2 = cols[3] site_sc = float(cols[4]) - assert re.search(".+,.+", site_id2), \ - "regular expression failed for ID \"%s\"" % (site_id2) + assert re.search( + ".+,.+", site_id2 + ), 'regular expression failed for ID "%s"' % (site_id2) m = re.search(r".+,(\d+)", site_id2) sc_pos = int(m.group(1)) # 1-based. - assert site_id in id2row_dic, \ - "site ID \"%s\" not found in genomic sites dictionary" \ - % (site_id) + assert ( + site_id in id2row_dic + ), 'site ID "%s" not found in genomic sites dictionary' % (site_id) row = id2row_dic[site_id] rowl = row.split("\t") gen_chr = rowl[0] @@ -1054,16 +1082,23 @@ new_s = gen_e - site_e new_e = gen_e - site_s new_sc_pos = gen_e - sc_pos + 1 # keep 1-based. - new_row = "%s\t%i\t%i\t%s,%i\t%f\t%s" \ - % (gen_chr, new_s, new_e, - site_id, new_sc_pos, site_sc, gen_pol) + new_row = "%s\t%i\t%i\t%s,%i\t%f\t%s" % ( + gen_chr, + new_s, + new_e, + site_id, + new_sc_pos, + site_sc, + gen_pol, + ) OUTPEAKS.write("%s\n" % (new_row)) if print_rows: print(new_row) OUTPEAKS.close() -############################################################################### +####################################################################### + def diff_two_files_identical(file1, file2): """ @@ -1087,4 +1122,4 @@ return same -############################################################################### +#######################################################################
--- a/graphprot_predict_wrapper.py Thu Jan 28 15:06:14 2021 +0000 +++ b/graphprot_predict_wrapper.py Wed Jun 05 16:40:51 2024 +0000 @@ -7,7 +7,6 @@ import gplib - """ TOOL DEPENDENCIES @@ -81,7 +80,8 @@ """ -############################################################################### +####################################################################### + def setup_argument_parser(): """Setup argparse parser.""" @@ -100,100 +100,119 @@ """ # Define argument parser. - p = ap.ArgumentParser(add_help=False, - prog="graphprot_predict_wrapper.py", - description=help_description, - formatter_class=ap.MetavarTypeHelpFormatter) + p = ap.ArgumentParser( + add_help=False, + prog="graphprot_predict_wrapper.py", + description=help_description, + formatter_class=ap.MetavarTypeHelpFormatter, + ) # Argument groups. p_man = p.add_argument_group("REQUIRED ARGUMENTS") p_opt = p.add_argument_group("OPTIONAL ARGUMENTS") # Required arguments. - p_opt.add_argument("-h", "--help", - action="help", - help="Print help message") - p_man.add_argument("--fasta", - dest="in_fa", - type=str, - required=True, - help="Sequences .fa file to predict" - " on (option -fasta)") - p_man.add_argument("--model", - dest="in_model", - type=str, - required=True, - help="GraphProt model file to use for predictions" - " (option -model)") - p_man.add_argument("--params", - dest="in_params", - type=str, - required=True, - help="Parameter file for given model") - p_man.add_argument("--data-id", - dest="data_id", - type=str, - required=True, - help="Data ID (option -prefix)") + p_opt.add_argument("-h", "--help", action="help", help="Print help message") + p_man.add_argument( + "--fasta", + dest="in_fa", + type=str, + required=True, + help="Sequences .fa file to predict" " on (option -fasta)", + ) + p_man.add_argument( + "--model", + dest="in_model", + type=str, + required=True, + help="GraphProt model file to use for predictions" " (option -model)", + ) + p_man.add_argument( + "--params", + dest="in_params", + type=str, + required=True, + help="Parameter file for given model", + ) + p_man.add_argument( + "--data-id", + dest="data_id", + type=str, + required=True, + help="Data ID (option -prefix)", + ) # ---> I'm a conditional argument <--- - p_opt.add_argument("--ws-pred", - dest="ws_pred", - default=False, - action="store_true", - help="Run a whole site prediction instead " - "of calculating profiles (default: false)") + p_opt.add_argument( + "--ws-pred", + dest="ws_pred", + default=False, + action="store_true", + help="Run a whole site prediction instead " + "of calculating profiles (default: false)", + ) # Additional arguments. - p_opt.add_argument("--sc-thr", - dest="score_thr", - type=float, - default=0, - help="Score threshold for extracting " - "average profile peak regions (default: 0)") - p_opt.add_argument("--max-merge-dist", - dest="max_merge_dist", - type=int, - default=0, - choices=[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10], - help="Maximum merge distance for nearby peak regions" - " (default: report all non-overlapping regions)") - p_opt.add_argument("--gen-site-bed", - dest="genomic_sites_bed", - type=str, - help=".bed file specifying the genomic regions of " - "the input .fa sequences. Corrupt .bed " - "information will be punished (default: false)") - p_opt.add_argument("--conf-out", - dest="conf_out", - default=False, - action="store_true", - help="Output filtered peak regions BED file or " - "predictions file (if --ws-pred) using the median " - "positive training site score for filtering " - "(default: false)") - p_opt.add_argument("--gp-output", - dest="gp_output", - default=False, - action="store_true", - help="Print output produced by GraphProt " - "(default: false)") - p_opt.add_argument("--ap-extlr", - dest="ap_extlr", - type=int, - default=5, - choices=[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10], - help="Define average profile up- and downstream " - "extension to produce the average profile. " - "The mean over small sequence windows " - "(window length = --ap-extlr*2 + 1) is used to " - "get position scores, thus the average profile " - "is more smooth than the initial profile output " - "by GraphProt (default: 5)") + p_opt.add_argument( + "--sc-thr", + dest="score_thr", + type=float, + default=0, + help="Score threshold for extracting " + "average profile peak regions (default: 0)", + ) + p_opt.add_argument( + "--max-merge-dist", + dest="max_merge_dist", + type=int, + default=0, + choices=[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10], + help="Maximum merge distance for nearby peak regions" + " (default: report all non-overlapping regions)", + ) + p_opt.add_argument( + "--gen-site-bed", + dest="genomic_sites_bed", + type=str, + help=".bed file specifying the genomic regions of " + "the input .fa sequences. Corrupt .bed " + "information will be punished (default: false)", + ) + p_opt.add_argument( + "--conf-out", + dest="conf_out", + default=False, + action="store_true", + help="Output filtered peak regions BED file or " + "predictions file (if --ws-pred) using the median " + "positive training site score for filtering " + "(default: false)", + ) + p_opt.add_argument( + "--gp-output", + dest="gp_output", + default=False, + action="store_true", + help="Print output produced by GraphProt " "(default: false)", + ) + p_opt.add_argument( + "--ap-extlr", + dest="ap_extlr", + type=int, + default=5, + choices=[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10], + help="Define average profile up- and downstream " + "extension to produce the average profile. " + "The mean over small sequence windows " + "(window length = --ap-extlr*2 + 1) is used to " + "get position scores, thus the average profile " + "is more smooth than the initial profile output " + "by GraphProt (default: 5)", + ) return p -############################################################################### +####################################################################### -if __name__ == '__main__': +if __name__ == "__main__": # Setup argparse. parser = setup_argument_parser() @@ -209,70 +228,86 @@ # Check tool availability. assert gplib.is_tool("GraphProt.pl"), "GraphProt.pl not in PATH" # Check file inputs. - assert os.path.exists(args.in_fa), \ - "input .fa file \"%s\" not found" % (args.in_fa) - assert os.path.exists(args.in_model), \ - "input .model file \"%s\" not found" % (args.in_model) - assert os.path.exists(args.in_params), \ - "input .params file \"%s\" not found" % (args.in_params) + assert os.path.exists(args.in_fa), 'input .fa file "%s" not found' % (args.in_fa) + assert os.path.exists(args.in_model), 'input .model file "%s" not found' % ( + args.in_model + ) + assert os.path.exists(args.in_params), 'input .params file "%s" not found' % ( + args.in_params + ) # Count .fa entries. c_in_fa = gplib.count_fasta_headers(args.in_fa) - assert c_in_fa, "input .fa file \"%s\" no headers found" % (args.in_fa) + assert c_in_fa, 'input .fa file "%s" no headers found' % (args.in_fa) print("# input .fa sequences: %i" % (c_in_fa)) # Read in FASTA sequences to check for uppercase sequences. seqs_dic = gplib.read_fasta_into_dic(args.in_fa) # Check for lowercase only sequences, which cause GP to crash. - error_mess = "input sequences encountered containing "\ - "only lowercase characters or lowercase characters in between "\ - "uppercase characters. Please provide either all uppercase "\ - "sequences or sequences containing uppercase regions surrounded "\ - "by lowercase context regions for structure calculation (see "\ - "viewpoint concept in original GraphProt publication "\ + error_mess = ( + "input sequences encountered containing " + "only lowercase characters or lowercase characters in between " + "uppercase characters. Please provide either all uppercase " + "sequences or sequences containing uppercase regions surrounded " + "by lowercase context regions for structure calculation (see " + "viewpoint concept in original GraphProt publication " "for more details)" + ) if args.ws_pred: bad_ids = gplib.check_seqs_dic_format(seqs_dic) assert not bad_ids, "%s" % (error_mess) c_uc_nt = gplib.seqs_dic_count_uc_nts(seqs_dic) - assert c_uc_nt, "no uppercase nucleotides in input .fa sequences. "\ - "Please change sequences to uppercase (keep in mind "\ - "GraphProt only scores uppercase regions (according "\ - "to its viewpoint concept)" + assert c_uc_nt, ( + "no uppercase nucleotides in input .fa sequences. " + "Please change sequences to uppercase (keep in mind " + "GraphProt only scores uppercase regions (according " + "to its viewpoint concept)" + ) if not args.ws_pred: # Check for lowercase sequences. c_lc_nt = gplib.seqs_dic_count_lc_nts(seqs_dic) - assert not c_lc_nt, "lowercase nucleotides in input .fa not allowed"\ - " in profile predictions, since GraphProt only scores "\ + assert not c_lc_nt, ( + "lowercase nucleotides in input .fa not allowed" + " in profile predictions, since GraphProt only scores " "uppercase regions (according to its viewpoint concept)" + ) # Check .bed. if args.genomic_sites_bed: # An array of checks, marvelous. - assert \ - os.path.exists(args.genomic_sites_bed), \ - "genomic .bed file \"%s\" not found" % (args.genomic_sites_bed) + assert os.path.exists( + args.genomic_sites_bed + ), 'genomic .bed file "%s" not found' % (args.genomic_sites_bed) # Check .bed for content. - assert gplib.count_file_rows(args.genomic_sites_bed), \ - "genomic .bed file \"%s\" is empty" % (args.genomic_sites_bed) + assert gplib.count_file_rows( + args.genomic_sites_bed + ), 'genomic .bed file "%s" is empty' % (args.genomic_sites_bed) # Check .bed for 6-column format. - assert gplib.bed_check_six_col_format(args.genomic_sites_bed), \ - "genomic .bed file \"%s\" appears to not "\ - "be in 6-column .bed format" % (args.genomic_sites_bed) + assert gplib.bed_check_six_col_format( + args.genomic_sites_bed + ), 'genomic .bed file "%s" appears to not ' "be in 6-column .bed format" % ( + args.genomic_sites_bed + ) # Check for unique column 4 IDs. - assert gplib.bed_check_unique_ids(args.genomic_sites_bed), \ - "genomic .bed file \"%s\" column 4 IDs not unique" % \ - (args.genomic_sites_bed) + assert gplib.bed_check_unique_ids( + args.genomic_sites_bed + ), 'genomic .bed file "%s" column 4 IDs not unique' % (args.genomic_sites_bed) # Read in .bed regions, compare to FASTA sequences. seq_len_dic = gplib.get_seq_lengths_from_seqs_dic(seqs_dic) reg_len_dic = gplib.bed_get_region_lengths(args.genomic_sites_bed) for seq_id in seq_len_dic: seq_l = seq_len_dic[seq_id] - assert seq_id in reg_len_dic, \ - "sequence ID \"%s\" missing in input .bed \"%s\"" % \ - (seq_id, args.genomic_sites_bed) + assert ( + seq_id in reg_len_dic + ), 'sequence ID "%s" missing in input .bed "%s"' % ( + seq_id, + args.genomic_sites_bed, + ) reg_l = reg_len_dic[seq_id] - assert seq_l == reg_l, \ - "sequence length differs from .bed region length (%i != %i)" \ - % (seq_l, reg_l) + assert ( + seq_l == reg_l + ), "sequence length differs from .bed region length (%i != %i)" % ( + seq_l, + reg_l, + ) # Read in model parameters. param_dic = gplib.graphprot_get_param_dic(args.in_params) # Create GraphProt parameter string. @@ -284,110 +319,147 @@ """ if args.ws_pred: # Do whole site prediction. - print("Starting whole site predictions on input .fa file" - " (-action predict) ... ") - check_cmd = "GraphProt.pl -action predict -prefix " \ - + args.data_id + " -fasta " + args.in_fa + " " \ - + param_string + " -model " + args.in_model + print( + "Starting whole site predictions on input .fa file" + " (-action predict) ... " + ) + check_cmd = ( + "GraphProt.pl -action predict -prefix " + + args.data_id + + " -fasta " + + args.in_fa + + " " + + param_string + + " -model " + + args.in_model + ) output = subprocess.getoutput(check_cmd) - assert output, "the following call of GraphProt.pl "\ - "produced no output:\n%s" % (check_cmd) + assert ( + output + ), "the following call of GraphProt.pl " "produced no output:\n%s" % (check_cmd) if args.gp_output: print(output) ws_predictions_file = args.data_id + ".predictions" - assert os.path.exists(ws_predictions_file), \ - "Whole site prediction output .predictions file \"%s\" not found" \ - % (ws_predictions_file) + assert os.path.exists( + ws_predictions_file + ), 'Whole site prediction output .predictions file "%s" not found' % ( + ws_predictions_file + ) if args.conf_out: # Filter by pos_train_ws_pred_median median. - assert "pos_train_ws_pred_median" in param_dic, \ - "whole site top scores median information "\ - "missing in .params file" - pos_tr_ws_pred_med = \ - float(param_dic["pos_train_ws_pred_median"]) + assert "pos_train_ws_pred_median" in param_dic, ( + "whole site top scores median information " "missing in .params file" + ) + pos_tr_ws_pred_med = float(param_dic["pos_train_ws_pred_median"]) # Filtered file. filt_ws_predictions_file = args.data_id + ".p50.predictions" - print("Extracting p50 sites from whole site predictions" - " (score threshold = %f) ... " % (pos_tr_ws_pred_med)) - gplib.graphprot_filter_predictions_file(ws_predictions_file, - filt_ws_predictions_file, - sc_thr=pos_tr_ws_pred_med) + print( + "Extracting p50 sites from whole site predictions" + " (score threshold = %f) ... " % (pos_tr_ws_pred_med) + ) + gplib.graphprot_filter_predictions_file( + ws_predictions_file, filt_ws_predictions_file, sc_thr=pos_tr_ws_pred_med + ) else: # Do profile prediction. - print("Starting profile predictions on on input .fa file " - "(-action predict_profile) ... ") - check_cmd = "GraphProt.pl -action predict_profile -prefix " \ - + args.data_id + " -fasta " + args.in_fa + " " + \ - param_string + " -model " + args.in_model + print( + "Starting profile predictions on on input .fa file " + "(-action predict_profile) ... " + ) + check_cmd = ( + "GraphProt.pl -action predict_profile -prefix " + + args.data_id + + " -fasta " + + args.in_fa + + " " + + param_string + + " -model " + + args.in_model + ) output = subprocess.getoutput(check_cmd) - assert output, "the following call of GraphProt.pl produced "\ - "no output:\n%s" % (check_cmd) + assert ( + output + ), "the following call of GraphProt.pl produced " "no output:\n%s" % (check_cmd) if args.gp_output: print(output) profile_predictions_file = args.data_id + ".profile" - assert os.path.exists(profile_predictions_file), \ - "Profile prediction output .profile file \"%s\" not found" % \ - (profile_predictions_file) + assert os.path.exists( + profile_predictions_file + ), 'Profile prediction output .profile file "%s" not found' % ( + profile_predictions_file + ) # Profile prediction output files. avg_prof_file = args.data_id + ".avg_profile" avg_prof_peaks_file = args.data_id + ".avg_profile.peaks.bed" - avg_prof_gen_peaks_file = args.data_id + \ - ".avg_profile.genomic_peaks.bed" - avg_prof_peaks_p50_file = args.data_id + \ - ".avg_profile.p50.peaks.bed" - avg_prof_gen_peaks_p50_file = args.data_id + \ - ".avg_profile.p50.genomic_peaks.bed" + avg_prof_gen_peaks_file = args.data_id + ".avg_profile.genomic_peaks.bed" + avg_prof_peaks_p50_file = args.data_id + ".avg_profile.p50.peaks.bed" + avg_prof_gen_peaks_p50_file = ( + args.data_id + ".avg_profile.p50.genomic_peaks.bed" + ) # Get sequence IDs in order from input .fa file. seq_ids_list = gplib.fasta_read_in_ids(args.in_fa) # Calculate average profiles. - print("Getting average profile from profile " - "(extlr for smoothing: %i) ... " % (args.ap_extlr)) - gplib.graphprot_profile_calc_avg_profile(profile_predictions_file, - avg_prof_file, - ap_extlr=args.ap_extlr, - seq_ids_list=seq_ids_list, - method=2) + print( + "Getting average profile from profile " + "(extlr for smoothing: %i) ... " % (args.ap_extlr) + ) + gplib.graphprot_profile_calc_avg_profile( + profile_predictions_file, + avg_prof_file, + ap_extlr=args.ap_extlr, + seq_ids_list=seq_ids_list, + method=2, + ) # Extract peak regions on sequences with threshold score 0. - print("Extracting peak regions from average profile " - "(score threshold = 0) ... ") + print( + "Extracting peak regions from average profile " "(score threshold = 0) ... " + ) killpep8 = args.max_merge_dist - gplib.graphprot_profile_extract_peak_regions(avg_prof_file, - avg_prof_peaks_file, - max_merge_dist=killpep8, - sc_thr=args.score_thr) + gplib.graphprot_profile_extract_peak_regions( + avg_prof_file, + avg_prof_peaks_file, + max_merge_dist=killpep8, + sc_thr=args.score_thr, + ) # Convert peaks to genomic coordinates. if args.genomic_sites_bed: print("Converting peak regions to genomic coordinates ... ") killit = args.genomic_sites_bed - gplib.bed_peaks_to_genomic_peaks(avg_prof_peaks_file, - avg_prof_gen_peaks_file, - print_rows=False, - genomic_sites_bed=killit) + gplib.bed_peaks_to_genomic_peaks( + avg_prof_peaks_file, + avg_prof_gen_peaks_file, + print_rows=False, + genomic_sites_bed=killit, + ) # Extract peak regions with threshold score p50. if args.conf_out: sc_id = "pos_train_avg_profile_median_%i" % (args.ap_extlr) # Filter by pos_tr_ws_pred_med median. - assert sc_id in param_dic, "average profile extlr %i median "\ + assert sc_id in param_dic, ( + "average profile extlr %i median " "information missing in .params file" % (args.ap_extlr) + ) p50_sc_thr = float(param_dic[sc_id]) - print("Extracting p50 peak regions from average profile " - "(score threshold = %f) ... " % (p50_sc_thr)) + print( + "Extracting p50 peak regions from average profile " + "(score threshold = %f) ... " % (p50_sc_thr) + ) despair = avg_prof_peaks_p50_file pain = args.max_merge_dist - gplib.graphprot_profile_extract_peak_regions(avg_prof_file, - despair, - max_merge_dist=pain, - sc_thr=p50_sc_thr) + gplib.graphprot_profile_extract_peak_regions( + avg_prof_file, despair, max_merge_dist=pain, sc_thr=p50_sc_thr + ) # Convert peaks to genomic coordinates. if args.genomic_sites_bed: - print("Converting p50 peak regions to " - "genomic coordinates ... ") + print("Converting p50 peak regions to " "genomic coordinates ... ") madness = args.genomic_sites_bed - gplib.bed_peaks_to_genomic_peaks(avg_prof_peaks_p50_file, - avg_prof_gen_peaks_p50_file, - genomic_sites_bed=madness) + gplib.bed_peaks_to_genomic_peaks( + avg_prof_peaks_p50_file, + avg_prof_gen_peaks_p50_file, + genomic_sites_bed=madness, + ) # Done. print("Script: I'm done.") print("Author: ... ")
--- a/graphprot_train_predict.xml Thu Jan 28 15:06:14 2021 +0000 +++ b/graphprot_train_predict.xml Wed Jun 05 16:40:51 2024 +0000 @@ -20,6 +20,7 @@ $action_type.training_options.disable_cv $action_type.training_options.disable_motifs --min-train $action_type.training_options.min_train + --gp-output #elif $action_type.action_type_selector == 'predict': python '$__tool_directory__/graphprot_predict_wrapper.py' @@ -35,6 +36,7 @@ --ap-extlr $action_type.prediction_options.ap_extlr $action_type.prediction_options.conf_out $action_type.prediction_options.ws_pred_out + --gp-output #end if
--- a/graphprot_train_wrapper.py Thu Jan 28 15:06:14 2021 +0000 +++ b/graphprot_train_wrapper.py Wed Jun 05 16:40:51 2024 +0000 @@ -7,7 +7,6 @@ import gplib - """ TOOL DEPENDENCIES @@ -62,7 +61,8 @@ """ -############################################################################### +####################################################################### + def setup_argument_parser(): """Setup argparse parser.""" @@ -84,89 +84,107 @@ """ # Define argument parser. - p = ap.ArgumentParser(add_help=False, - prog="graphprot_train_wrapper.py", - description=help_description, - formatter_class=ap.MetavarTypeHelpFormatter) + p = ap.ArgumentParser( + add_help=False, + prog="graphprot_train_wrapper.py", + description=help_description, + formatter_class=ap.MetavarTypeHelpFormatter, + ) # Argument groups. p_man = p.add_argument_group("REQUIRED ARGUMENTS") p_opt = p.add_argument_group("OPTIONAL ARGUMENTS") # Required arguments. - p_opt.add_argument("-h", "--help", - action="help", - help="Print help message") - p_man.add_argument("--pos", - dest="in_pos_fa", - type=str, - required=True, - help="Positive (= binding site) sequences .fa file " - "for model training (option -fasta)") - p_man.add_argument("--neg", - dest="in_neg_fa", - type=str, - required=True, - help="Negative sequences .fa file for model " - "training (option -negfasta)") - p_man.add_argument("--data-id", - dest="data_id", - type=str, - required=True, - help="Data ID (option -prefix)") + p_opt.add_argument("-h", "--help", action="help", help="Print help message") + p_man.add_argument( + "--pos", + dest="in_pos_fa", + type=str, + required=True, + help="Positive (= binding site) sequences .fa file " + "for model training (option -fasta)", + ) + p_man.add_argument( + "--neg", + dest="in_neg_fa", + type=str, + required=True, + help="Negative sequences .fa file for model " "training (option -negfasta)", + ) + p_man.add_argument( + "--data-id", + dest="data_id", + type=str, + required=True, + help="Data ID (option -prefix)", + ) # Additional arguments. - p_opt.add_argument("--opt-set-size", - dest="opt_set_size", - type=int, - default=500, - help="Hyperparameter optimization set size (taken " - "away from both --pos and --neg) (default: 500)") - p_opt.add_argument("--opt-pos", - dest="opt_pos_fa", - type=str, - help="Positive (= binding site) sequences .fa file " - "for hyperparameter optimization (default: take " - "--opt-set-size from --pos)") - p_opt.add_argument("--opt-neg", - dest="opt_neg_fa", - type=str, - help="Negative sequences .fa file for hyperparameter " - "optimization (default: take --opt-set-size " - "from --neg)") - p_opt.add_argument("--min-train", - dest="min_train", - type=int, - default=500, - help="Minimum amount of training sites demanded " - "(default: 500)") - p_opt.add_argument("--disable-cv", - dest="disable_cv", - default=False, - action="store_true", - help="Disable cross validation step (default: false)") - p_opt.add_argument("--disable-motifs", - dest="disable_motifs", - default=False, - action="store_true", - help="Disable motif generation step (default: false)") - p_opt.add_argument("--gp-output", - dest="gp_output", - default=False, - action="store_true", - help="Print output produced by GraphProt " - "(default: false)") - p_opt.add_argument("--str-model", - dest="train_str_model", - default=False, - action="store_true", - help="Train a structure model (default: train " - "a sequence model)") + p_opt.add_argument( + "--opt-set-size", + dest="opt_set_size", + type=int, + default=500, + help="Hyperparameter optimization set size (taken " + "away from both --pos and --neg) (default: 500)", + ) + p_opt.add_argument( + "--opt-pos", + dest="opt_pos_fa", + type=str, + help="Positive (= binding site) sequences .fa file " + "for hyperparameter optimization (default: take " + "--opt-set-size from --pos)", + ) + p_opt.add_argument( + "--opt-neg", + dest="opt_neg_fa", + type=str, + help="Negative sequences .fa file for hyperparameter " + "optimization (default: take --opt-set-size " + "from --neg)", + ) + p_opt.add_argument( + "--min-train", + dest="min_train", + type=int, + default=500, + help="Minimum amount of training sites demanded " "(default: 500)", + ) + p_opt.add_argument( + "--disable-cv", + dest="disable_cv", + default=False, + action="store_true", + help="Disable cross validation step (default: false)", + ) + p_opt.add_argument( + "--disable-motifs", + dest="disable_motifs", + default=False, + action="store_true", + help="Disable motif generation step (default: false)", + ) + p_opt.add_argument( + "--gp-output", + dest="gp_output", + default=False, + action="store_true", + help="Print output produced by GraphProt " "(default: false)", + ) + p_opt.add_argument( + "--str-model", + dest="train_str_model", + default=False, + action="store_true", + help="Train a structure model (default: train " "a sequence model)", + ) return p -############################################################################### +####################################################################### -if __name__ == '__main__': +if __name__ == "__main__": # Setup argparse. parser = setup_argument_parser() @@ -182,17 +200,17 @@ # Check tool availability. assert gplib.is_tool("GraphProt.pl"), "GraphProt.pl not in PATH" # Check file inputs. - assert os.path.exists(args.in_pos_fa), \ - "positives .fa file \"%s\" not found" % (args.in_pos_fa) - assert os.path.exists(args.in_neg_fa), \ - "negatives .fa file \"%s\" not found" % (args.in_neg_fa) + assert os.path.exists(args.in_pos_fa), 'positives .fa file "%s" not found' % ( + args.in_pos_fa + ) + assert os.path.exists(args.in_neg_fa), 'negatives .fa file "%s" not found' % ( + args.in_neg_fa + ) # Count .fa entries. c_pos_fa = gplib.count_fasta_headers(args.in_pos_fa) c_neg_fa = gplib.count_fasta_headers(args.in_neg_fa) - assert c_pos_fa, "positives .fa file \"%s\" no headers found" % \ - (args.in_pos_fa) - assert c_neg_fa, "negatives .fa file \"%s\" no headers found" % \ - (args.in_neg_fa) + assert c_pos_fa, 'positives .fa file "%s" no headers found' % (args.in_pos_fa) + assert c_neg_fa, 'negatives .fa file "%s" no headers found' % (args.in_neg_fa) print("# positive .fa sequences: %i" % (c_pos_fa)) print("# negative .fa sequences: %i" % (c_neg_fa)) # Check additional files. @@ -201,13 +219,15 @@ if args.opt_neg_fa: assert args.opt_pos_fa, "--opt-neg but no --opt-pos given" # Check for lowercase only sequences, which cause GP to crash. - error_mess = "input sequences encountered containing "\ - "only lowercase characters or lowercase characters in between "\ - "uppercase characters. Please provide either all uppercase "\ - "sequences or sequences containing uppercase regions surrounded "\ - "by lowercase context regions for structure calculation (see "\ - "viewpoint concept in original GraphProt publication "\ + error_mess = ( + "input sequences encountered containing " + "only lowercase characters or lowercase characters in between " + "uppercase characters. Please provide either all uppercase " + "sequences or sequences containing uppercase regions surrounded " + "by lowercase context regions for structure calculation (see " + "viewpoint concept in original GraphProt publication " "for more details)" + ) seqs_dic = gplib.read_fasta_into_dic(args.in_pos_fa) bad_ids = gplib.check_seqs_dic_format(seqs_dic) assert not bad_ids, "%s" % (error_mess) @@ -227,57 +247,73 @@ if args.opt_pos_fa and args.opt_neg_fa: c_parop_pos_fa = gplib.count_fasta_headers(args.opt_pos_fa) c_parop_neg_fa = gplib.count_fasta_headers(args.opt_neg_fa) - assert c_parop_pos_fa, "--opt-pos .fa file \"%s\" no headers found" \ - % (args.opt_pos_fa) - assert c_parop_neg_fa, "--opt-neg .fa file \"%s\" no headers found" \ - % (args.opt_neg_fa) + assert c_parop_pos_fa, '--opt-pos .fa file "%s" no headers found' % ( + args.opt_pos_fa + ) + assert c_parop_neg_fa, '--opt-neg .fa file "%s" no headers found' % ( + args.opt_neg_fa + ) # Less than 500 for training?? You gotta be kidding. - assert c_pos_fa >= args.min_train, \ - "--pos for training < %i, please provide more (try at least "\ + assert c_pos_fa >= args.min_train, ( + "--pos for training < %i, please provide more (try at least " "> 1000, the more the better)" % (args.min_train) - assert c_neg_fa >= args.min_train, \ - "--neg for training < %i, please provide more (try at least "\ + ) + assert c_neg_fa >= args.min_train, ( + "--neg for training < %i, please provide more (try at least " "> 1000, the more the better)" % (args.min_train) + ) # Looking closer at ratios. pos_neg_ratio = c_parop_pos_fa / c_parop_neg_fa if pos_neg_ratio < 0.8 or pos_neg_ratio > 1.25: - assert 0, "ratio of --opt-pos to --opt-neg < 0.8 or > 1.25 "\ - "(ratio = %f). Try to keep ratio closer to 1 or better use "\ - "identical numbers (keep in mind that performance measures "\ - "such as accuracy or AUROC are not suitable for imbalanced "\ + assert 0, ( + "ratio of --opt-pos to --opt-neg < 0.8 or > 1.25 " + "(ratio = %f). Try to keep ratio closer to 1 or better use " + "identical numbers (keep in mind that performance measures " + "such as accuracy or AUROC are not suitable for imbalanced " " datasets!)" % (pos_neg_ratio) + ) else: # Define some minimum amount of training sites for the sake of sanity. c_pos_train = c_pos_fa - args.opt_set_size c_neg_train = c_neg_fa - args.opt_set_size # Start complaining. - assert c_pos_fa >= args.opt_set_size, \ - "# positives < --opt-set-size (%i < %i)" \ - % (c_pos_fa, args.opt_set_size) - assert c_neg_fa >= args.opt_set_size, \ - "# negatives < --opt-set-size (%i < %i)" \ - % (c_neg_fa, args.opt_set_size) - assert c_pos_train >= args.opt_set_size, \ - "# positives remaining for training < --opt-set-size "\ - "(%i < %i)" % (c_pos_train, args.opt_set_size) - assert c_neg_train >= args.opt_set_size, "# negatives remaining "\ - "for training < --opt-set-size (%i < %i)" \ - % (c_neg_train, args.opt_set_size) + assert ( + c_pos_fa >= args.opt_set_size + ), "# positives < --opt-set-size (%i < %i)" % (c_pos_fa, args.opt_set_size) + assert ( + c_neg_fa >= args.opt_set_size + ), "# negatives < --opt-set-size (%i < %i)" % (c_neg_fa, args.opt_set_size) + assert ( + c_pos_train >= args.opt_set_size + ), "# positives remaining for training < --opt-set-size " "(%i < %i)" % ( + c_pos_train, + args.opt_set_size, + ) + assert ( + c_neg_train >= args.opt_set_size + ), "# negatives remaining " "for training < --opt-set-size (%i < %i)" % ( + c_neg_train, + args.opt_set_size, + ) # Less than 500?? You gotta be kidding. - assert c_pos_train >= args.min_train, \ - "# positives remaining for training < %i, please provide more "\ + assert c_pos_train >= args.min_train, ( + "# positives remaining for training < %i, please provide more " " (try at least > 1000, the more the better)" % (args.min_train) - assert c_neg_train >= args.min_train, \ - "# negatives remaining for training < %i, please provide more "\ + ) + assert c_neg_train >= args.min_train, ( + "# negatives remaining for training < %i, please provide more " "(try at least > 1000, the more the better)" % (args.min_train) + ) # Looking closer at ratios. pos_neg_ratio = c_pos_train / c_neg_train if pos_neg_ratio < 0.8 or pos_neg_ratio > 1.25: - assert 0, "ratio of --pos to --neg < 0.8 or > 1.25 "\ - "(ratio = %f). Try to keep ratio closer to 1 or better use "\ - "identical numbers (keep in mind that performance measures "\ - "such as accuracy or AUROC are not suitable for imbalanced "\ + assert 0, ( + "ratio of --pos to --neg < 0.8 or > 1.25 " + "(ratio = %f). Try to keep ratio closer to 1 or better use " + "identical numbers (keep in mind that performance measures " + "such as accuracy or AUROC are not suitable for imbalanced " "datasets!)" % (pos_neg_ratio) + ) """ Generate parop + train .fa output files for hyperparameter @@ -299,28 +335,37 @@ gplib.make_file_copy(args.in_neg_fa, neg_train_fa) else: # Generate parop + train .fa files from input .fa files. - gplib.split_fasta_into_test_train_files(args.in_pos_fa, pos_parop_fa, - pos_train_fa, - test_size=args.opt_set_size) - gplib.split_fasta_into_test_train_files(args.in_neg_fa, neg_parop_fa, - neg_train_fa, - test_size=args.opt_set_size) + gplib.split_fasta_into_test_train_files( + args.in_pos_fa, pos_parop_fa, pos_train_fa, test_size=args.opt_set_size + ) + gplib.split_fasta_into_test_train_files( + args.in_neg_fa, neg_parop_fa, neg_train_fa, test_size=args.opt_set_size + ) """ Do the hyperparameter optimization. """ print("Starting hyperparameter optimization (-action ls) ... ") - check_cmd = "GraphProt.pl -action ls -prefix " + args.data_id + \ - " -fasta " + pos_parop_fa + " -negfasta " + neg_parop_fa + check_cmd = ( + "GraphProt.pl -action ls -prefix " + + args.data_id + + " -fasta " + + pos_parop_fa + + " -negfasta " + + neg_parop_fa + ) # If sequence model should be trained (default). if not args.train_str_model: check_cmd += " -onlyseq" print(check_cmd) output = subprocess.getoutput(check_cmd) params_file = args.data_id + ".params" - assert os.path.exists(params_file), "Hyperparameter optimization output "\ - " .params file \"%s\" not found" % (params_file) + assert os.path.exists( + params_file + ), "Hyperparameter optimization output " ' .params file "%s" not found' % ( + params_file + ) # Add model type to params file. if args.train_str_model: gplib.echo_add_to_file("model_type: structure", params_file) @@ -334,19 +379,27 @@ """ print("Starting model training (-action train) ... ") - check_cmd = "GraphProt.pl -action train -prefix " + args.data_id \ - + " -fasta " + pos_train_fa + " -negfasta " + neg_train_fa \ - + " " + param_string + check_cmd = ( + "GraphProt.pl -action train -prefix " + + args.data_id + + " -fasta " + + pos_train_fa + + " -negfasta " + + neg_train_fa + + " " + + param_string + ) print(check_cmd) output = subprocess.getoutput(check_cmd) - assert output, \ - "The following call of GraphProt.pl produced no output:\n%s" \ - % (check_cmd) + assert output, "The following call of GraphProt.pl produced no output:\n%s" % ( + check_cmd + ) if args.gp_output: print(output) model_file = args.data_id + ".model" - assert os.path.exists(model_file), \ - "Training output .model file \"%s\" not found" % (model_file) + assert os.path.exists(model_file), 'Training output .model file "%s" not found' % ( + model_file + ) """ Do the 10-fold cross validation. @@ -354,19 +407,29 @@ """ if not args.disable_cv: print("Starting 10-fold cross validation (-action cv) ... ") - check_cmd = "GraphProt.pl -action cv -prefix " + args.data_id \ - + " -fasta " + pos_train_fa + " -negfasta " + neg_train_fa \ - + " " + param_string + " -model " + model_file + check_cmd = ( + "GraphProt.pl -action cv -prefix " + + args.data_id + + " -fasta " + + pos_train_fa + + " -negfasta " + + neg_train_fa + + " " + + param_string + + " -model " + + model_file + ) print(check_cmd) output = subprocess.getoutput(check_cmd) - assert output, \ - "The following call of GraphProt.pl produced no output:\n%s" \ - % (check_cmd) + assert output, "The following call of GraphProt.pl produced no output:\n%s" % ( + check_cmd + ) if args.gp_output: print(output) cv_results_file = args.data_id + ".cv_results" - assert os.path.exists(cv_results_file), \ - "CV output .cv_results file \"%s\" not found" % (cv_results_file) + assert os.path.exists( + cv_results_file + ), 'CV output .cv_results file "%s" not found' % (cv_results_file) """ Do the motif generation. @@ -374,75 +437,108 @@ """ if not args.disable_motifs: print("Starting motif generation (-action motif) ... ") - check_cmd = "GraphProt.pl -action motif -prefix " + args.data_id \ - + " -fasta " + pos_train_fa + " -negfasta " + neg_train_fa \ - + " " + param_string + " -model " + model_file + check_cmd = ( + "GraphProt.pl -action motif -prefix " + + args.data_id + + " -fasta " + + pos_train_fa + + " -negfasta " + + neg_train_fa + + " " + + param_string + + " -model " + + model_file + ) print(check_cmd) output = subprocess.getoutput(check_cmd) - assert output, \ - "The following call of GraphProt.pl produced no output:\n%s" \ - % (check_cmd) + assert output, "The following call of GraphProt.pl produced no output:\n%s" % ( + check_cmd + ) if args.gp_output: print(output) seq_motif_file = args.data_id + ".sequence_motif" seq_motif_png_file = args.data_id + ".sequence_motif.png" - assert os.path.exists(seq_motif_file), \ - "Motif output .sequence_motif file \"%s\" not found" \ - % (seq_motif_file) - assert os.path.exists(seq_motif_png_file), \ - "Motif output .sequence_motif.png file \"%s\" not found" \ - % (seq_motif_png_file) + assert os.path.exists( + seq_motif_file + ), 'Motif output .sequence_motif file "%s" not found' % (seq_motif_file) + assert os.path.exists( + seq_motif_png_file + ), 'Motif output .sequence_motif.png file "%s" not found' % (seq_motif_png_file) if args.train_str_model: str_motif_file = args.data_id + ".structure_motif" str_motif_png_file = args.data_id + ".structure_motif.png" - assert os.path.exists(str_motif_file), \ - "Motif output .structure_motif file \"%s\" not found" \ - % (str_motif_file) - assert os.path.exists(str_motif_png_file), \ - "Motif output .structure_motif.png file \"%s\" not found" \ - % (str_motif_png_file) + assert os.path.exists( + str_motif_file + ), 'Motif output .structure_motif file "%s" not found' % (str_motif_file) + assert os.path.exists( + str_motif_png_file + ), 'Motif output .structure_motif.png file "%s" not found' % ( + str_motif_png_file + ) """ Do whole site predictions on positive training set. """ - print("Starting whole site predictions on positive training set " - " (-action predict) ... ") - check_cmd = "GraphProt.pl -action predict -prefix " + args.data_id \ - + " -fasta " + pos_train_fa + " " + param_string \ - + " -model " + model_file + print( + "Starting whole site predictions on positive training set " + " (-action predict) ... " + ) + check_cmd = ( + "GraphProt.pl -action predict -prefix " + + args.data_id + + " -fasta " + + pos_train_fa + + " " + + param_string + + " -model " + + model_file + ) print(check_cmd) output = subprocess.getoutput(check_cmd) - assert output, \ - "The following call of GraphProt.pl produced no output:\n%s" \ - % (check_cmd) + assert output, "The following call of GraphProt.pl produced no output:\n%s" % ( + check_cmd + ) if args.gp_output: print(output) ws_predictions_file = args.data_id + ".predictions" - assert os.path.exists(ws_predictions_file), \ - "Whole site prediction output .predictions file \"%s\" not found" \ - % (ws_predictions_file) + assert os.path.exists( + ws_predictions_file + ), 'Whole site prediction output .predictions file "%s" not found' % ( + ws_predictions_file + ) """ Do profile predictions on positive training set. """ - print("Starting profile predictions on positive training set " - "-action predict_profile) ... ") - check_cmd = "GraphProt.pl -action predict_profile -prefix " \ - + args.data_id + " -fasta " + pos_train_fa + " " \ - + param_string + " -model " + model_file + print( + "Starting profile predictions on positive training set " + "-action predict_profile) ... " + ) + check_cmd = ( + "GraphProt.pl -action predict_profile -prefix " + + args.data_id + + " -fasta " + + pos_train_fa + + " " + + param_string + + " -model " + + model_file + ) print(check_cmd) output = subprocess.getoutput(check_cmd) - assert output, \ - "The following call of GraphProt.pl produced no output:\n%s" \ - % (check_cmd) + assert output, "The following call of GraphProt.pl produced no output:\n%s" % ( + check_cmd + ) if args.gp_output: print(output) profile_predictions_file = args.data_id + ".profile" - assert os.path.exists(profile_predictions_file), \ - "Profile prediction output .profile file \"%s\" not found" \ - % (profile_predictions_file) + assert os.path.exists( + profile_predictions_file + ), 'Profile prediction output .profile file "%s" not found' % ( + profile_predictions_file + ) """ Get 50 % score (median) for .predictions and .profile file. @@ -454,12 +550,11 @@ print("Getting .profile and .predictions median scores ... ") # Whole site scores median. - ws_pred_median = \ - gplib.graphprot_predictions_get_median(ws_predictions_file) + ws_pred_median = gplib.graphprot_predictions_get_median(ws_predictions_file) # Profile top site scores median. - profile_median = \ - gplib.graphprot_profile_get_tsm(profile_predictions_file, - profile_type="profile") + profile_median = gplib.graphprot_profile_get_tsm( + profile_predictions_file, profile_type="profile" + ) ws_pred_string = "pos_train_ws_pred_median: %f" % (ws_pred_median) profile_string = "pos_train_profile_median: %f" % (profile_median) gplib.echo_add_to_file(ws_pred_string, params_file) @@ -467,13 +562,14 @@ # Average profile top site scores median for extlr 1 to 10. for i in range(10): i += 1 - avg_profile_median = \ - gplib.graphprot_profile_get_tsm(profile_predictions_file, - profile_type="avg_profile", - avg_profile_extlr=i) + avg_profile_median = gplib.graphprot_profile_get_tsm( + profile_predictions_file, profile_type="avg_profile", avg_profile_extlr=i + ) - avg_profile_string = "pos_train_avg_profile_median_%i: %f" \ - % (i, avg_profile_median) + avg_profile_string = "pos_train_avg_profile_median_%i: %f" % ( + i, + avg_profile_median, + ) gplib.echo_add_to_file(avg_profile_string, params_file) print("Script: I'm done.")