Previous changeset 4:4ad83aed5c3c (2021-01-28) Next changeset 6:33b590aa07c1 (2024-08-06) |
Commit message:
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/rna_tools/graphprot commit ad60258f5759eaa205fec4af6143c728ea131419 |
modified:
gplib.py graphprot_predict_wrapper.py graphprot_train_predict.xml graphprot_train_wrapper.py |
b |
diff -r 4ad83aed5c3c -r ddcf35a868b8 gplib.py --- a/gplib.py Thu Jan 28 15:06:14 2021 +0000 +++ b/gplib.py Wed Jun 05 16:40:51 2024 +0000 |
[ |
b'@@ -1,4 +1,3 @@\n-\n import gzip\n import random\n import re\n@@ -6,7 +5,6 @@\n import subprocess\n from distutils.spawn import find_executable\n \n-\n """\n \n Run doctests:\n@@ -17,7 +15,8 @@\n """\n \n \n-###############################################################################\n+#######################################################################\n+\n \n def graphprot_predictions_get_median(predictions_file):\n """\n@@ -41,11 +40,12 @@\n return statistics.median(sc_list)\n \n \n-###############################################################################\n+#######################################################################\n+\n \n-def graphprot_profile_get_tsm(profile_file,\n- profile_type="profile",\n- avg_profile_extlr=5):\n+def graphprot_profile_get_tsm(\n+ profile_file, profile_type="profile", avg_profile_extlr=5\n+):\n \n """\n Given a GraphProt .profile file, extract for each site (identified by\n@@ -88,23 +88,21 @@\n max_list.append(max_sc)\n elif profile_type == "avg_profile":\n # Convert profile score list to average profile scores list.\n- aps_list = \\\n- list_moving_window_average_values(lists_dic[seq_id],\n- win_extlr=avg_profile_extlr)\n+ aps_list = list_moving_window_average_values(\n+ lists_dic[seq_id], win_extlr=avg_profile_extlr\n+ )\n max_sc = max(aps_list)\n max_list.append(max_sc)\n else:\n- assert 0, "invalid profile_type argument given: \\"%s\\"" \\\n- % (profile_type)\n+ assert 0, \'invalid profile_type argument given: "%s"\' % (profile_type)\n # Return the median.\n return statistics.median(max_list)\n \n \n-###############################################################################\n+#######################################################################\n \n-def list_moving_window_average_values(in_list,\n- win_extlr=5,\n- method=1):\n+\n+def list_moving_window_average_values(in_list, win_extlr=5, method=1):\n """\n Take a list of numeric values, and calculate for each position a new value,\n by taking the mean value of the window of positions -win_extlr and\n@@ -152,7 +150,8 @@\n return new_list\n \n \n-###############################################################################\n+#######################################################################\n+\n \n def echo_add_to_file(echo_string, out_file):\n """\n@@ -167,14 +166,16 @@\n assert not error, "echo is complaining:\\n%s\\n%s" % (check_cmd, output)\n \n \n-###############################################################################\n+#######################################################################\n+\n \n def is_tool(name):\n """Check whether tool "name" is in PATH."""\n return find_executable(name) is not None\n \n \n-###############################################################################\n+#######################################################################\n+\n \n def count_fasta_headers(fasta_file):\n """\n@@ -194,7 +195,8 @@\n return row_count\n \n \n-###############################################################################\n+#######################################################################\n+\n \n def make_file_copy(in_file, out_file):\n """\n@@ -202,21 +204,26 @@\n \n """\n check_cmd = "cat " + in_file + " > " + out_file\n- assert in_file != out_file, \\\n- "cat does not like to cat file into same file (%s)" % (check_cmd)\n+ assert in_file != out_file, "cat does not like to cat file into same file (%s)" % (\n+ check_cmd\n+ )\n output = subprocess.getoutput(check_cmd)\n error = False\n if output:\n error = True\n- assert not error, \\\n- "cat did not like your input (in_file: %s, out_file: %s):\\n%s" \\\n- % (in_file, out_file, output)\n'..b' sc_thr=0):\n+\n+def list_extract_peaks(in_list, max_merge_dist=0, coords="list", sc_thr=0):\n """\n Extract peak regions from list.\n Peak region is defined as region >= score threshold.\n@@ -969,8 +991,12 @@\n if peak_list[i][3] < peak_list[j][3]:\n new_top_pos = peak_list[j][2]\n new_top_sc = peak_list[j][3]\n- new_peak = [peak_list[i][0], peak_list[j][1],\n- new_top_pos, new_top_sc]\n+ new_peak = [\n+ peak_list[i][0],\n+ peak_list[j][1],\n+ new_top_pos,\n+ new_top_sc,\n+ ]\n # If two peaks were merged.\n if new_peak:\n merged_peak_list.append(new_peak)\n@@ -991,10 +1017,12 @@\n return peak_list\n \n \n-###############################################################################\n+#######################################################################\n+\n \n-def bed_peaks_to_genomic_peaks(peak_file, genomic_peak_file, genomic_sites_bed,\n- print_rows=False):\n+def bed_peaks_to_genomic_peaks(\n+ peak_file, genomic_peak_file, genomic_sites_bed, print_rows=False\n+):\n """\n Given a .bed file of sequence peak regions (possible coordinates from\n 0 to length of s), convert peak coordinates to genomic coordinates.\n@@ -1017,10 +1045,9 @@\n row = line.strip()\n cols = line.strip().split("\\t")\n site_id = cols[3]\n- assert site_id \\\n- not in id2row_dic, \\\n- "column 4 IDs not unique in given .bed file \\"%s\\"" \\\n- % (genomic_sites_bed)\n+ assert (\n+ site_id not in id2row_dic\n+ ), \'column 4 IDs not unique in given .bed file "%s"\' % (genomic_sites_bed)\n id2row_dic[site_id] = row\n f.close()\n \n@@ -1034,13 +1061,14 @@\n site_e = int(cols[2])\n site_id2 = cols[3]\n site_sc = float(cols[4])\n- assert re.search(".+,.+", site_id2), \\\n- "regular expression failed for ID \\"%s\\"" % (site_id2)\n+ assert re.search(\n+ ".+,.+", site_id2\n+ ), \'regular expression failed for ID "%s"\' % (site_id2)\n m = re.search(r".+,(\\d+)", site_id2)\n sc_pos = int(m.group(1)) # 1-based.\n- assert site_id in id2row_dic, \\\n- "site ID \\"%s\\" not found in genomic sites dictionary" \\\n- % (site_id)\n+ assert (\n+ site_id in id2row_dic\n+ ), \'site ID "%s" not found in genomic sites dictionary\' % (site_id)\n row = id2row_dic[site_id]\n rowl = row.split("\\t")\n gen_chr = rowl[0]\n@@ -1054,16 +1082,23 @@\n new_s = gen_e - site_e\n new_e = gen_e - site_s\n new_sc_pos = gen_e - sc_pos + 1 # keep 1-based.\n- new_row = "%s\\t%i\\t%i\\t%s,%i\\t%f\\t%s" \\\n- % (gen_chr, new_s, new_e,\n- site_id, new_sc_pos, site_sc, gen_pol)\n+ new_row = "%s\\t%i\\t%i\\t%s,%i\\t%f\\t%s" % (\n+ gen_chr,\n+ new_s,\n+ new_e,\n+ site_id,\n+ new_sc_pos,\n+ site_sc,\n+ gen_pol,\n+ )\n OUTPEAKS.write("%s\\n" % (new_row))\n if print_rows:\n print(new_row)\n OUTPEAKS.close()\n \n \n-###############################################################################\n+#######################################################################\n+\n \n def diff_two_files_identical(file1, file2):\n """\n@@ -1087,4 +1122,4 @@\n return same\n \n \n-###############################################################################\n+#######################################################################\n' |
b |
diff -r 4ad83aed5c3c -r ddcf35a868b8 graphprot_predict_wrapper.py --- a/graphprot_predict_wrapper.py Thu Jan 28 15:06:14 2021 +0000 +++ b/graphprot_predict_wrapper.py Wed Jun 05 16:40:51 2024 +0000 |
[ |
b'@@ -7,7 +7,6 @@\n \n import gplib\n \n-\n """\n \n TOOL DEPENDENCIES\n@@ -81,7 +80,8 @@\n """\n \n \n-###############################################################################\n+#######################################################################\n+\n \n def setup_argument_parser():\n """Setup argparse parser."""\n@@ -100,100 +100,119 @@\n \n """\n # Define argument parser.\n- p = ap.ArgumentParser(add_help=False,\n- prog="graphprot_predict_wrapper.py",\n- description=help_description,\n- formatter_class=ap.MetavarTypeHelpFormatter)\n+ p = ap.ArgumentParser(\n+ add_help=False,\n+ prog="graphprot_predict_wrapper.py",\n+ description=help_description,\n+ formatter_class=ap.MetavarTypeHelpFormatter,\n+ )\n \n # Argument groups.\n p_man = p.add_argument_group("REQUIRED ARGUMENTS")\n p_opt = p.add_argument_group("OPTIONAL ARGUMENTS")\n \n # Required arguments.\n- p_opt.add_argument("-h", "--help",\n- action="help",\n- help="Print help message")\n- p_man.add_argument("--fasta",\n- dest="in_fa",\n- type=str,\n- required=True,\n- help="Sequences .fa file to predict"\n- " on (option -fasta)")\n- p_man.add_argument("--model",\n- dest="in_model",\n- type=str,\n- required=True,\n- help="GraphProt model file to use for predictions"\n- " (option -model)")\n- p_man.add_argument("--params",\n- dest="in_params",\n- type=str,\n- required=True,\n- help="Parameter file for given model")\n- p_man.add_argument("--data-id",\n- dest="data_id",\n- type=str,\n- required=True,\n- help="Data ID (option -prefix)")\n+ p_opt.add_argument("-h", "--help", action="help", help="Print help message")\n+ p_man.add_argument(\n+ "--fasta",\n+ dest="in_fa",\n+ type=str,\n+ required=True,\n+ help="Sequences .fa file to predict" " on (option -fasta)",\n+ )\n+ p_man.add_argument(\n+ "--model",\n+ dest="in_model",\n+ type=str,\n+ required=True,\n+ help="GraphProt model file to use for predictions" " (option -model)",\n+ )\n+ p_man.add_argument(\n+ "--params",\n+ dest="in_params",\n+ type=str,\n+ required=True,\n+ help="Parameter file for given model",\n+ )\n+ p_man.add_argument(\n+ "--data-id",\n+ dest="data_id",\n+ type=str,\n+ required=True,\n+ help="Data ID (option -prefix)",\n+ )\n # ---> I\'m a conditional argument <---\n- p_opt.add_argument("--ws-pred",\n- dest="ws_pred",\n- default=False,\n- action="store_true",\n- help="Run a whole site prediction instead "\n- "of calculating profiles (default: false)")\n+ p_opt.add_argument(\n+ "--ws-pred",\n+ dest="ws_pred",\n+ default=False,\n+ action="store_true",\n+ help="Run a whole site prediction instead "\n+ "of calculating profiles (default: false)",\n+ )\n # Additional arguments.\n- p_opt.add_argument("--sc-thr",\n- dest="score_thr",\n- type=float,\n- default=0,\n- help="Score threshold for extracting "\n- "average profile peak regions (default: 0)")\n- p_opt.add_argument("--max-merge-dist",\n- dest="max_merge_dist",\n- type=int,\n- default=0,\n- choices=[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10],\n- '..b'args.ap_extlr)\n+ )\n+ gplib.graphprot_profile_calc_avg_profile(\n+ profile_predictions_file,\n+ avg_prof_file,\n+ ap_extlr=args.ap_extlr,\n+ seq_ids_list=seq_ids_list,\n+ method=2,\n+ )\n # Extract peak regions on sequences with threshold score 0.\n- print("Extracting peak regions from average profile "\n- "(score threshold = 0) ... ")\n+ print(\n+ "Extracting peak regions from average profile " "(score threshold = 0) ... "\n+ )\n killpep8 = args.max_merge_dist\n- gplib.graphprot_profile_extract_peak_regions(avg_prof_file,\n- avg_prof_peaks_file,\n- max_merge_dist=killpep8,\n- sc_thr=args.score_thr)\n+ gplib.graphprot_profile_extract_peak_regions(\n+ avg_prof_file,\n+ avg_prof_peaks_file,\n+ max_merge_dist=killpep8,\n+ sc_thr=args.score_thr,\n+ )\n # Convert peaks to genomic coordinates.\n if args.genomic_sites_bed:\n print("Converting peak regions to genomic coordinates ... ")\n killit = args.genomic_sites_bed\n- gplib.bed_peaks_to_genomic_peaks(avg_prof_peaks_file,\n- avg_prof_gen_peaks_file,\n- print_rows=False,\n- genomic_sites_bed=killit)\n+ gplib.bed_peaks_to_genomic_peaks(\n+ avg_prof_peaks_file,\n+ avg_prof_gen_peaks_file,\n+ print_rows=False,\n+ genomic_sites_bed=killit,\n+ )\n # Extract peak regions with threshold score p50.\n if args.conf_out:\n sc_id = "pos_train_avg_profile_median_%i" % (args.ap_extlr)\n # Filter by pos_tr_ws_pred_med median.\n- assert sc_id in param_dic, "average profile extlr %i median "\\\n+ assert sc_id in param_dic, (\n+ "average profile extlr %i median "\n "information missing in .params file" % (args.ap_extlr)\n+ )\n p50_sc_thr = float(param_dic[sc_id])\n- print("Extracting p50 peak regions from average profile "\n- "(score threshold = %f) ... " % (p50_sc_thr))\n+ print(\n+ "Extracting p50 peak regions from average profile "\n+ "(score threshold = %f) ... " % (p50_sc_thr)\n+ )\n despair = avg_prof_peaks_p50_file\n pain = args.max_merge_dist\n- gplib.graphprot_profile_extract_peak_regions(avg_prof_file,\n- despair,\n- max_merge_dist=pain,\n- sc_thr=p50_sc_thr)\n+ gplib.graphprot_profile_extract_peak_regions(\n+ avg_prof_file, despair, max_merge_dist=pain, sc_thr=p50_sc_thr\n+ )\n # Convert peaks to genomic coordinates.\n if args.genomic_sites_bed:\n- print("Converting p50 peak regions to "\n- "genomic coordinates ... ")\n+ print("Converting p50 peak regions to " "genomic coordinates ... ")\n madness = args.genomic_sites_bed\n- gplib.bed_peaks_to_genomic_peaks(avg_prof_peaks_p50_file,\n- avg_prof_gen_peaks_p50_file,\n- genomic_sites_bed=madness)\n+ gplib.bed_peaks_to_genomic_peaks(\n+ avg_prof_peaks_p50_file,\n+ avg_prof_gen_peaks_p50_file,\n+ genomic_sites_bed=madness,\n+ )\n # Done.\n print("Script: I\'m done.")\n print("Author: ... ")\n' |
b |
diff -r 4ad83aed5c3c -r ddcf35a868b8 graphprot_train_predict.xml --- a/graphprot_train_predict.xml Thu Jan 28 15:06:14 2021 +0000 +++ b/graphprot_train_predict.xml Wed Jun 05 16:40:51 2024 +0000 |
b |
@@ -20,6 +20,7 @@ $action_type.training_options.disable_cv $action_type.training_options.disable_motifs --min-train $action_type.training_options.min_train + --gp-output #elif $action_type.action_type_selector == 'predict': python '$__tool_directory__/graphprot_predict_wrapper.py' @@ -35,6 +36,7 @@ --ap-extlr $action_type.prediction_options.ap_extlr $action_type.prediction_options.conf_out $action_type.prediction_options.ws_pred_out + --gp-output #end if |
b |
diff -r 4ad83aed5c3c -r ddcf35a868b8 graphprot_train_wrapper.py --- a/graphprot_train_wrapper.py Thu Jan 28 15:06:14 2021 +0000 +++ b/graphprot_train_wrapper.py Wed Jun 05 16:40:51 2024 +0000 |
b |
b'@@ -7,7 +7,6 @@\n \n import gplib\n \n-\n """\n \n TOOL DEPENDENCIES\n@@ -62,7 +61,8 @@\n """\n \n \n-###############################################################################\n+#######################################################################\n+\n \n def setup_argument_parser():\n """Setup argparse parser."""\n@@ -84,89 +84,107 @@\n \n """\n # Define argument parser.\n- p = ap.ArgumentParser(add_help=False,\n- prog="graphprot_train_wrapper.py",\n- description=help_description,\n- formatter_class=ap.MetavarTypeHelpFormatter)\n+ p = ap.ArgumentParser(\n+ add_help=False,\n+ prog="graphprot_train_wrapper.py",\n+ description=help_description,\n+ formatter_class=ap.MetavarTypeHelpFormatter,\n+ )\n \n # Argument groups.\n p_man = p.add_argument_group("REQUIRED ARGUMENTS")\n p_opt = p.add_argument_group("OPTIONAL ARGUMENTS")\n \n # Required arguments.\n- p_opt.add_argument("-h", "--help",\n- action="help",\n- help="Print help message")\n- p_man.add_argument("--pos",\n- dest="in_pos_fa",\n- type=str,\n- required=True,\n- help="Positive (= binding site) sequences .fa file "\n- "for model training (option -fasta)")\n- p_man.add_argument("--neg",\n- dest="in_neg_fa",\n- type=str,\n- required=True,\n- help="Negative sequences .fa file for model "\n- "training (option -negfasta)")\n- p_man.add_argument("--data-id",\n- dest="data_id",\n- type=str,\n- required=True,\n- help="Data ID (option -prefix)")\n+ p_opt.add_argument("-h", "--help", action="help", help="Print help message")\n+ p_man.add_argument(\n+ "--pos",\n+ dest="in_pos_fa",\n+ type=str,\n+ required=True,\n+ help="Positive (= binding site) sequences .fa file "\n+ "for model training (option -fasta)",\n+ )\n+ p_man.add_argument(\n+ "--neg",\n+ dest="in_neg_fa",\n+ type=str,\n+ required=True,\n+ help="Negative sequences .fa file for model " "training (option -negfasta)",\n+ )\n+ p_man.add_argument(\n+ "--data-id",\n+ dest="data_id",\n+ type=str,\n+ required=True,\n+ help="Data ID (option -prefix)",\n+ )\n # Additional arguments.\n- p_opt.add_argument("--opt-set-size",\n- dest="opt_set_size",\n- type=int,\n- default=500,\n- help="Hyperparameter optimization set size (taken "\n- "away from both --pos and --neg) (default: 500)")\n- p_opt.add_argument("--opt-pos",\n- dest="opt_pos_fa",\n- type=str,\n- help="Positive (= binding site) sequences .fa file "\n- "for hyperparameter optimization (default: take "\n- "--opt-set-size from --pos)")\n- p_opt.add_argument("--opt-neg",\n- dest="opt_neg_fa",\n- type=str,\n- help="Negative sequences .fa file for hyperparameter "\n- "optimization (default: take --opt-set-size "\n- "from --neg)")\n- p_opt.add_argument("--min-train",\n- dest="min_train",\n- type=int,\n- default=500,\n- help="Minimum amount of training sites demanded "\n- "(default: 500)")\n- p_opt.add_argument("--disable-cv",\n- dest="disable_cv",\n- default=False,\n- action="store_true",\n- '..b' " -model "\n+ + model_file\n+ )\n print(check_cmd)\n output = subprocess.getoutput(check_cmd)\n- assert output, \\\n- "The following call of GraphProt.pl produced no output:\\n%s" \\\n- % (check_cmd)\n+ assert output, "The following call of GraphProt.pl produced no output:\\n%s" % (\n+ check_cmd\n+ )\n if args.gp_output:\n print(output)\n ws_predictions_file = args.data_id + ".predictions"\n- assert os.path.exists(ws_predictions_file), \\\n- "Whole site prediction output .predictions file \\"%s\\" not found" \\\n- % (ws_predictions_file)\n+ assert os.path.exists(\n+ ws_predictions_file\n+ ), \'Whole site prediction output .predictions file "%s" not found\' % (\n+ ws_predictions_file\n+ )\n \n """\n Do profile predictions on positive training set.\n \n """\n- print("Starting profile predictions on positive training set "\n- "-action predict_profile) ... ")\n- check_cmd = "GraphProt.pl -action predict_profile -prefix " \\\n- + args.data_id + " -fasta " + pos_train_fa + " " \\\n- + param_string + " -model " + model_file\n+ print(\n+ "Starting profile predictions on positive training set "\n+ "-action predict_profile) ... "\n+ )\n+ check_cmd = (\n+ "GraphProt.pl -action predict_profile -prefix "\n+ + args.data_id\n+ + " -fasta "\n+ + pos_train_fa\n+ + " "\n+ + param_string\n+ + " -model "\n+ + model_file\n+ )\n print(check_cmd)\n output = subprocess.getoutput(check_cmd)\n- assert output, \\\n- "The following call of GraphProt.pl produced no output:\\n%s" \\\n- % (check_cmd)\n+ assert output, "The following call of GraphProt.pl produced no output:\\n%s" % (\n+ check_cmd\n+ )\n if args.gp_output:\n print(output)\n profile_predictions_file = args.data_id + ".profile"\n- assert os.path.exists(profile_predictions_file), \\\n- "Profile prediction output .profile file \\"%s\\" not found" \\\n- % (profile_predictions_file)\n+ assert os.path.exists(\n+ profile_predictions_file\n+ ), \'Profile prediction output .profile file "%s" not found\' % (\n+ profile_predictions_file\n+ )\n \n """\n Get 50 % score (median) for .predictions and .profile file.\n@@ -454,12 +550,11 @@\n print("Getting .profile and .predictions median scores ... ")\n \n # Whole site scores median.\n- ws_pred_median = \\\n- gplib.graphprot_predictions_get_median(ws_predictions_file)\n+ ws_pred_median = gplib.graphprot_predictions_get_median(ws_predictions_file)\n # Profile top site scores median.\n- profile_median = \\\n- gplib.graphprot_profile_get_tsm(profile_predictions_file,\n- profile_type="profile")\n+ profile_median = gplib.graphprot_profile_get_tsm(\n+ profile_predictions_file, profile_type="profile"\n+ )\n ws_pred_string = "pos_train_ws_pred_median: %f" % (ws_pred_median)\n profile_string = "pos_train_profile_median: %f" % (profile_median)\n gplib.echo_add_to_file(ws_pred_string, params_file)\n@@ -467,13 +562,14 @@\n # Average profile top site scores median for extlr 1 to 10.\n for i in range(10):\n i += 1\n- avg_profile_median = \\\n- gplib.graphprot_profile_get_tsm(profile_predictions_file,\n- profile_type="avg_profile",\n- avg_profile_extlr=i)\n+ avg_profile_median = gplib.graphprot_profile_get_tsm(\n+ profile_predictions_file, profile_type="avg_profile", avg_profile_extlr=i\n+ )\n \n- avg_profile_string = "pos_train_avg_profile_median_%i: %f" \\\n- % (i, avg_profile_median)\n+ avg_profile_string = "pos_train_avg_profile_median_%i: %f" % (\n+ i,\n+ avg_profile_median,\n+ )\n gplib.echo_add_to_file(avg_profile_string, params_file)\n \n print("Script: I\'m done.")\n' |