Galaxy |

Changeset 3:ace92c9a4653 (2021-01-27)

Previous changeset 2:7bbb7bf6304f (2020-01-27) Next changeset 4:4ad83aed5c3c (2021-01-28)

Commit message:
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/rna_tools/graphprot commit efcac98677c3ea9039c1c61eaa9e58f78287ccb3"

modified:
gplib.py
graphprot_predict_wrapper.py
graphprot_train_wrapper.py

added:
test-data/test4.fa

diff -r 7bbb7bf6304f -r ace92c9a4653 gplib.py
--- a/gplib.py Mon Jan 27 18:37:05 2020 -0500
+++ b/gplib.py Wed Jan 27 19:27:47 2021 +0000

[

b'@@ -1,13 +1,11 @@\n \n-from distutils.spawn import find_executable\n-import subprocess\n-import statistics\n+import gzip\n import random\n-import gzip\n-import uuid\n-import sys\n import re\n-import os\n+import statistics\n+import subprocess\n+from distutils.spawn import find_executable\n+\n \n """\n \n@@ -19,11 +17,11 @@\n """\n \n \n-################################################################################\n+###############################################################################\n \n def graphprot_predictions_get_median(predictions_file):\n """\n- Given a GraphProt .predictions file, read in site scores and return \n+ Given a GraphProt .predictions file, read in site scores and return\n the median value.\n \n >>> test_file = "test-data/test.predictions"\n@@ -43,29 +41,29 @@\n return statistics.median(sc_list)\n \n \n-################################################################################\n+###############################################################################\n \n-def graphprot_profile_get_top_scores_median(profile_file,\n- profile_type="profile",\n- avg_profile_extlr=5):\n+def graphprot_profile_get_tsm(profile_file,\n+ profile_type="profile",\n+ avg_profile_extlr=5):\n \n """\n- Given a GraphProt .profile file, extract for each site (identified by \n- column 1 ID) the top (= highest) score. Then return the median of these \n+ Given a GraphProt .profile file, extract for each site (identified by\n+ column 1 ID) the top (= highest) score. Then return the median of these\n top scores.\n- \n+\n profile_type can be either "profile" or "avg_profile".\n- "avg_profile means that the position-wise scores will first get smoothed \n- out by calculating for each position a new score through taking a \n- sequence window -avg_profile_extlr to +avg_profile_extlr of the position \n- and calculate the mean score over this window and assign it to the position.\n- After that, the maximum score of each site is chosen, and the median over \n- all maximum scores is returned.\n- "profile" leaves the position-wise scores as they are, directly extracting \n+ "avg_profile means that the position-wise scores will first get smoothed\n+ out by calculating for each position a new score through taking a\n+ sequence window -avg_profile_extlr to +avg_profile_extlr of the position\n+ and calculate the mean score over this window and assign it to the\n+ position. After that, the maximum score of each site is chosen, and the\n+ median over all maximum scores is returned.\n+ "profile" leaves the position-wise scores as they are, directly extracting\n the maximum for each site and then reporting the median.\n- \n+\n >>> test_file = "test-data/test.profile"\n- >>> graphprot_profile_get_top_scores_median(test_file)\n+ >>> graphprot_profile_get_tsm(test_file)\n 3.2\n \n """\n@@ -90,25 +88,27 @@\n max_list.append(max_sc)\n elif profile_type == "avg_profile":\n # Convert profile score list to average profile scores list.\n- aps_list = list_moving_window_average_values(lists_dic[seq_id],\n- win_extlr=avg_profile_extlr)\n+ aps_list = \\\n+ list_moving_window_average_values(lists_dic[seq_id],\n+ win_extlr=avg_profile_extlr)\n max_sc = max(aps_list)\n max_list.append(max_sc)\n else:\n- assert 0, "invalid profile_type argument given: \\"%s\\"" %(profile_type)\n+ assert 0, "invalid profile_type argument given: \\"%s\\"" \\\n+ % (profile_type)\n # Return the median.\n return statistics.median(max_list)\n \n \n-################################################################################\n+##################################################'..b' new_top_pos, new_top_sc]\n # If two peaks were merged.\n if new_peak:\n merged_peak_list.append(new_peak)\n@@ -915,15 +987,16 @@\n if coords == "bed":\n for i in range(len(peak_list)):\n peak_list[i][1] += 1\n- peak_list[i][2] += 1 # 1-base best score position too.\n+ peak_list[i][2] += 1 # 1-base best score position too.\n return peak_list\n \n \n-################################################################################\n+###############################################################################\n \n-def bed_peaks_to_genomic_peaks(peak_file, genomic_peak_file, genomic_sites_bed, print_rows=False):\n+def bed_peaks_to_genomic_peaks(peak_file, genomic_peak_file, genomic_sites_bed,\n+ print_rows=False):\n """\n- Given a .bed file of sequence peak regions (possible coordinates from \n+ Given a .bed file of sequence peak regions (possible coordinates from\n 0 to length of s), convert peak coordinates to genomic coordinates.\n Do this by taking genomic regions of sequences as input.\n \n@@ -944,7 +1017,10 @@\n row = line.strip()\n cols = line.strip().split("\\t")\n site_id = cols[3]\n- assert site_id not in id2row_dic, "column 4 IDs not unique in given .bed file \\"%s\\"" %(args.genomic_sites_bed)\n+ assert site_id \\\n+ not in id2row_dic, \\\n+ "column 4 IDs not unique in given .bed file \\"%s\\"" \\\n+ % (genomic_sites_bed)\n id2row_dic[site_id] = row\n f.close()\n \n@@ -958,10 +1034,13 @@\n site_e = int(cols[2])\n site_id2 = cols[3]\n site_sc = float(cols[4])\n- assert re.search(".+,.+", site_id2), "regular expression failed for ID \\"%s\\"" %(site_id2)\n- m = re.search(".+,(\\d+)", site_id2)\n- sc_pos = int(m.group(1)) # 1-based.\n- assert site_id in id2row_dic, "site ID \\"%s\\" not found in genomic sites dictionary" %(site_id)\n+ assert re.search(".+,.+", site_id2), \\\n+ "regular expression failed for ID \\"%s\\"" % (site_id2)\n+ m = re.search(r".+,(\\d+)", site_id2)\n+ sc_pos = int(m.group(1)) # 1-based.\n+ assert site_id in id2row_dic, \\\n+ "site ID \\"%s\\" not found in genomic sites dictionary" \\\n+ % (site_id)\n row = id2row_dic[site_id]\n rowl = row.split("\\t")\n gen_chr = rowl[0]\n@@ -974,21 +1053,23 @@\n if gen_pol == "-":\n new_s = gen_e - site_e\n new_e = gen_e - site_s\n- new_sc_pos = gen_e - sc_pos + 1 # keep 1-based.\n- new_row = "%s\\t%i\\t%i\\t%s,%i\\t%f\\t%s" %(gen_chr, new_s, new_e, site_id, new_sc_pos, site_sc, gen_pol)\n- OUTPEAKS.write("%s\\n" %(new_row))\n+ new_sc_pos = gen_e - sc_pos + 1 # keep 1-based.\n+ new_row = "%s\\t%i\\t%i\\t%s,%i\\t%f\\t%s" \\\n+ % (gen_chr, new_s, new_e,\n+ site_id, new_sc_pos, site_sc, gen_pol)\n+ OUTPEAKS.write("%s\\n" % (new_row))\n if print_rows:\n print(new_row)\n OUTPEAKS.close()\n \n \n-################################################################################\n+###############################################################################\n \n def diff_two_files_identical(file1, file2):\n """\n- Check whether two files are identical. Return true if diff reports no \n+ Check whether two files are identical. Return true if diff reports no\n differences.\n- \n+\n >>> file1 = "test-data/file1"\n >>> file2 = "test-data/file2"\n >>> diff_two_files_identical(file1, file2)\n@@ -1006,6 +1087,4 @@\n return same\n \n \n-################################################################################\n-\n-\n+###############################################################################\n'

diff -r 7bbb7bf6304f -r ace92c9a4653 graphprot_predict_wrapper.py
--- a/graphprot_predict_wrapper.py Mon Jan 27 18:37:05 2020 -0500
+++ b/graphprot_predict_wrapper.py Wed Jan 27 19:27:47 2021 +0000

[

b'@@ -1,12 +1,11 @@\n #!/usr/bin/env python3\n \n+import argparse as ap\n+import os\n import subprocess\n-import argparse\n-import shutil\n+import sys\n+\n import gplib\n-import gzip\n-import sys\n-import os\n \n \n """\n@@ -48,46 +47,63 @@\n EXAMPLE CALLS\n =============\n \n-python graphprot_predict_wrapper.py --model test2.model --params test2.params --fasta gp_data/test10_predict.fa --data-id test2pred --gp-output\n-python graphprot_predict_wrapper.py --model test2.model --params test2.params --fasta gp_data/test10_predict.fa --data-id test2pred --gen-site-bed gp_data/test10_predict.bed\n-python graphprot_predict_wrapper.py --model test2.model --params test2.params --fasta gp_data/test10_predict.fa --data-id test2pred --gen-site-bed gp_data/test10_predict.bed --conf-out\n-python graphprot_predict_wrapper.py --model test2.model --params test2.params --fasta gp_data/test10_predict.fa --data-id test2pred --conf-out --ws-pred\n-\n-python graphprot_predict_wrapper.py --model test-data/test.model --params test-data/test.params --fasta test-data/test_predict.fa --data-id predtest\n-\n-python graphprot_predict_wrapper.py --model test-data/test.model --params test-data/test.params --fasta test-data/test_predict.fa --data-id predtest --gen-site-bed test-data/test_predict.bed --sc-thr 0.0 --max-merge-dist 0 --conf-out --ap-extlr 5\n-\n-python graphprot_predict_wrapper.py --data-id GraphProt --fasta test-data/test_predict.fa --model test-data/test.model --params test-data/test.params --gen-site-bed test-data/test_predict.bed --sc-thr 0.0 --max-merge-dist 0 --conf-out --ap-extlr 5\n+flake8 coming out of hotel room. FB enters.\n+FB: Who is this f*** ???\n \n \n-pwd && python \'/home/uhlm/Dokumente/Projekte/GraphProt_galaxy_new/galaxytools/tools/rna_tools/graphprot/graphprot_predict_wrapper.py\' --data-id GraphProt --fasta /tmp/tmpmuslpc1h/files/0/8/c/dataset_08c48d88-e3b5-423b-acf6-bf89b8c60660.dat --model /tmp/tmpmuslpc1h/files/e/6/4/dataset_e6471bb4-e74c-4372-bc49-656f900e7191.dat --params /tmp/tmpmuslpc1h/files/b/6/5/dataset_b65e8cf4-d3e6-429e-8d57-1d401adf4b3c.dat --gen-site-bed /tmp/tmpmuslpc1h/files/5/1/a/dataset_51a38b65-5943-472d-853e-5d845fa8ac3e.dat --sc-thr 0.0 --max-merge-dist 0 --conf-out --ap-extlr 5\n+python graphprot_predict_wrapper.py --model test2.model --params test2.params\n+--fasta gp_data/test10_predict.fa --data-id test2pred --gp-output\n+python graphprot_predict_wrapper.py --model test2.model --params test2.params\n+--fasta gp_data/test10_predict.fa --data-id test2pred\n+--gen-site-bed gp_data/test10_predict.bed\n+\n+python graphprot_predict_wrapper.py --model test2.model --params test2.params\n+--fasta gp_data/test10_predict.fa --data-id test2pred\n+--gen-site-bed gp_data/test10_predict.bed --conf-out\n+\n+python graphprot_predict_wrapper.py --model test2.model --params test2.params\n+--fasta gp_data/test10_predict.fa --data-id test2pred --conf-out --ws-pred\n \n+python graphprot_predict_wrapper.py --model test-data/test.model\n+--params test-data/test.params --fasta test-data/test_predict.fa\n+--data-id predtest\n+\n+python graphprot_predict_wrapper.py --model test-data/test.model\n+--params test-data/test.params --fasta test-data/test_predict.fa\n+--data-id predtest --gen-site-bed test-data/test_predict.bed\n+--sc-thr 0.0 --max-merge-dist 0 --conf-out --ap-extlr 5\n+\n+python graphprot_predict_wrapper.py --data-id GraphProt\n+--fasta test-data/test_predict.fa --model test-data/test.model\n+--params test-data/test.params --gen-site-bed test-data/test_predict.bed\n+--sc-thr 0.0 --max-merge-dist 0 --conf-out --ap-extlr 5\n \n """\n \n-################################################################################\n+\n+###############################################################################\n \n def setup_argument_parser():\n """Setup argparse parser."""\n help_description = """\n- Galaxy wrapper script for GraphProt (-action predict and -action \n- predict_profile) to compute whole site or position-wise scores for input \n+ Galaxy wrapper script for GraphProt'..b's_list=seq_ids_list,\n+ method=2)\n # Extract peak regions on sequences with threshold score 0.\n- print("Extracting peak regions from average profile (score threshold = 0) ... ")\n- gplib.graphprot_profile_extract_peak_regions(avg_prof_file, avg_prof_peaks_file,\n- max_merge_dist=args.max_merge_dist,\n- sc_thr=args.score_thr)\n+ print("Extracting peak regions from average profile "\n+ "(score threshold = 0) ... ")\n+ killpep8 = args.max_merge_dist\n+ gplib.graphprot_profile_extract_peak_regions(avg_prof_file,\n+ avg_prof_peaks_file,\n+ max_merge_dist=killpep8,\n+ sc_thr=args.score_thr)\n # Convert peaks to genomic coordinates.\n if args.genomic_sites_bed:\n print("Converting peak regions to genomic coordinates ... ")\n- gplib.bed_peaks_to_genomic_peaks(avg_prof_peaks_file, avg_prof_gen_peaks_file,\n+ killit = args.genomic_sites_bed\n+ gplib.bed_peaks_to_genomic_peaks(avg_prof_peaks_file,\n+ avg_prof_gen_peaks_file,\n print_rows=False,\n- genomic_sites_bed=args.genomic_sites_bed)\n- # gplib.make_file_copy(avg_prof_gen_peaks_file, avg_prof_peaks_file)\n+ genomic_sites_bed=killit)\n # Extract peak regions with threshold score p50.\n if args.conf_out:\n- sc_id = "pos_train_avg_profile_median_%i" %(args.ap_extlr)\n- # Filter by pos_train_ws_pred_median median.\n- assert sc_id in param_dic, "average profile extlr %i median information missing in .params file" %(args.ap_extlr)\n+ sc_id = "pos_train_avg_profile_median_%i" % (args.ap_extlr)\n+ # Filter by pos_tr_ws_pred_med median.\n+ assert sc_id in param_dic, "average profile extlr %i median "\\\n+ "information missing in .params file" % (args.ap_extlr)\n p50_sc_thr = float(param_dic[sc_id])\n- print("Extracting p50 peak regions from average profile (score threshold = %f) ... " %(p50_sc_thr))\n- gplib.graphprot_profile_extract_peak_regions(avg_prof_file, avg_prof_peaks_p50_file,\n- max_merge_dist=args.max_merge_dist,\n+ print("Extracting p50 peak regions from average profile "\n+ "(score threshold = %f) ... " % (p50_sc_thr))\n+ despair = avg_prof_peaks_p50_file\n+ pain = args.max_merge_dist\n+ gplib.graphprot_profile_extract_peak_regions(avg_prof_file,\n+ despair,\n+ max_merge_dist=pain,\n sc_thr=p50_sc_thr)\n # Convert peaks to genomic coordinates.\n if args.genomic_sites_bed:\n- print("Converting p50 peak regions to genomic coordinates ... ")\n- gplib.bed_peaks_to_genomic_peaks(avg_prof_peaks_p50_file, avg_prof_gen_peaks_p50_file,\n- genomic_sites_bed=args.genomic_sites_bed)\n+ print("Converting p50 peak regions to "\n+ "genomic coordinates ... ")\n+ madness = args.genomic_sites_bed\n+ gplib.bed_peaks_to_genomic_peaks(avg_prof_peaks_p50_file,\n+ avg_prof_gen_peaks_p50_file,\n+ genomic_sites_bed=madness)\n # Done.\n print("Script: I\'m done.")\n print("Author: ... ")\n-\n-\n'

diff -r 7bbb7bf6304f -r ace92c9a4653 graphprot_train_wrapper.py
--- a/graphprot_train_wrapper.py Mon Jan 27 18:37:05 2020 -0500
+++ b/graphprot_train_wrapper.py Wed Jan 27 19:27:47 2021 +0000

b'@@ -1,12 +1,11 @@\n #!/usr/bin/env python3\n \n+import argparse as ap\n+import os\n import subprocess\n-import argparse\n-import shutil\n+import sys\n+\n import gplib\n-import gzip\n-import sys\n-import os\n \n \n """\n@@ -38,58 +37,57 @@\n data_id.profile\n \n \n- --opt-set-size int Hyperparameter optimization set size (taken away from both --pos and --neg) (default: 500)\n- --opt-pos str Positive (= binding site) sequences .fa file for hyperparameter optimization (default: take\n- --opt-set-size from --pos)\n- --opt-neg str Negative sequences .fa file for hyperparameter optimization (default: take --opt-set-size\n- from --neg)\n- --min-train int Minimum amount of training sites demanded (default: 500)\n- --disable-cv Disable cross validation step (default: false)\n- --disable-motifs Disable motif generation step (default: false)\n- --gp-output Print output produced by GraphProt (default: false)\n- --str-model Train a structure model (default: train a sequence model)\n-\n-\n EXAMPLE CALLS\n =============\n \n-python graphprot_train_wrapper.py --pos gp_data/SERBP1_positives.train.fa --neg gp_data/SERBP1_negatives.train.fa --data-id test2 --disable-cv --gp-output --opt-set-size 200 --min-train 400\n+python graphprot_train_wrapper.py --pos gp_data/SERBP1_positives.train.fa\n+ --neg gp_data/SERBP1_negatives.train.fa --data-id test2 --disable-cv\n+ --gp-output --opt-set-size 200 --min-train 400\n \n-python graphprot_train_wrapper.py --pos gp_data/SERBP1_positives.train.fa --neg gp_data/SERBP1_negatives.train.fa --data-id test2 --disable-cv --opt-set-size 100 --min-train 200\n+python graphprot_train_wrapper.py --pos gp_data/SERBP1_positives.train.fa\n+ --neg gp_data/SERBP1_negatives.train.fa --data-id test2 --disable-cv\n+ --opt-set-size 100 --min-train 200\n \n-python graphprot_train_wrapper.py --pos test-data/test_positives.train.fa --neg test-data/test_negatives.train.fa --data-id gptest2 --disable-cv --opt-pos test-data/test_positives.parop.fa --opt-neg test-data/test_negatives.parop.fa\n+python graphprot_train_wrapper.py --pos test-data/test_positives.train.fa\n+ --neg test-data/test_negatives.train.fa --data-id gptest2 --disable-cv\n+ --opt-pos test-data/test_positives.parop.fa\n+ --opt-neg test-data/test_negatives.parop.fa\n \n-python graphprot_train_wrapper.py --pos test-data/test_positives.train.fa --neg test-data/test_negatives.train.fa --data-id gptest2 --disable-cv --disable-motifs --opt-pos test-data/test_positives.parop.fa --opt-neg test-data/test_negatives.parop.fa\n+python graphprot_train_wrapper.py --pos test-data/test_positives.train.fa\n+ --neg test-data/test_negatives.train.fa --data-id gptest2 --disable-cv\n+ --disable-motifs --opt-pos test-data/test_positives.parop.fa --opt-neg\n+ test-data/test_negatives.parop.fa\n \n \n """\n \n-################################################################################\n+\n+###############################################################################\n \n def setup_argument_parser():\n """Setup argparse parser."""\n help_description = """\n- Galaxy wrapper script for GraphProt to train a GraphProt model on \n- a given set of input sequences (positives and negatives .fa). By \n- default a sequence model is trained (due to structure models \n- being much slower to train). Also by default take a portion of \n- the input sequences for hyperparameter optimization (HPO) prior to \n- model training, and run a 10-fold cross validation and motif \n- generation after model training. Thus the following output \n- files are produced: \n- .model model file, .params model parameter file, .png motif files \n+ Galaxy wrapper script for GraphProt to train a GraphProt model on\n+ a given set of input sequences (positives and negatives .fa). By\n+ default a sequence model is trained (due to structure models\n+ being much slower to train). Also by default take a portion of\n+ the input sequences for hyperparameter'..b'ws_pred_string = "pos_train_ws_pred_median: %f" %(ws_pred_median)\n- profile_string = "pos_train_profile_median: %f" %(profile_median)\n+ profile_median = \\\n+ gplib.graphprot_profile_get_tsm(profile_predictions_file,\n+ profile_type="profile")\n+ ws_pred_string = "pos_train_ws_pred_median: %f" % (ws_pred_median)\n+ profile_string = "pos_train_profile_median: %f" % (profile_median)\n gplib.echo_add_to_file(ws_pred_string, params_file)\n gplib.echo_add_to_file(profile_string, params_file)\n # Average profile top site scores median for extlr 1 to 10.\n for i in range(10):\n i += 1\n- avg_profile_median = gplib.graphprot_profile_get_top_scores_median(profile_predictions_file,\n- profile_type="avg_profile",\n- avg_profile_extlr=i)\n- \n- avg_profile_string = "pos_train_avg_profile_median_%i: %f" %(i, avg_profile_median)\n+ avg_profile_median = \\\n+ gplib.graphprot_profile_get_tsm(profile_predictions_file,\n+ profile_type="avg_profile",\n+ avg_profile_extlr=i)\n+\n+ avg_profile_string = "pos_train_avg_profile_median_%i: %f" \\\n+ % (i, avg_profile_median)\n gplib.echo_add_to_file(avg_profile_string, params_file)\n \n print("Script: I\'m done.")\n print("Author: Good. Now go back to your file system directory.")\n print("Script: Ok.")\n-\n-\n-"""\n-\n-OLD CODE ...\n-\n- p.add_argument("--ap-extlr",\n- dest="ap_extlr",\n- type = int,\n- default = 5,\n- help = "Define average profile up- and downstream extension for averaging scores to produce the average profile. This is used to get the median average profile score, which will be stored in the .params file to later be used in a prediction setting as a second filter value to get more confident peak regions. NOTE that you have to use the same value in model training and prediction! (default: 5)")\n-\n-\n- p.add_argument("--disable-opt",\n- dest = "disable_opt",\n- default = False,\n- action = "store_true",\n- help = "Disable hyperparameter optimization (HPO) (default: optimize hyperparameters)")\n- p.add_argument("--R",\n- dest = "param_r",\n- type = int,\n- default = False,\n- help = "GraphProt model R parameter (default: determined by HPO)")\n- p.add_argument("--D",\n- dest = "param_d",\n- type = int,\n- default = False,\n- help = "GraphProt model D parameter (default: determined by HPO)")\n- p.add_argument("--epochs",\n- dest = "param_epochs",\n- type = int,\n- default = False,\n- help = "GraphProt model epochs parameter (default: determined by HPO)")\n- p.add_argument("--lambda",\n- dest = "param_lambda",\n- type = float,\n- default = False,\n- help = "GraphProt model lambda parameter (default: determined by HPO)")\n- p.add_argument("--bitsize",\n- dest = "param_bitsize",\n- type = int,\n- default = False,\n- help = "GraphProt model bitsize parameter (default: determined by HPO)")\n- p.add_argument("--abstraction",\n- dest = "param_abstraction",\n- type = int,\n- default = False,\n- help = "GraphProt model RNAshapes abstraction level parameter for training structure models (default: determined by HPO)")\n-\n-"""\n-\n-\n-\n'

diff -r 7bbb7bf6304f -r ace92c9a4653 test-data/test4.fa
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/test4.fa Wed Jan 27 19:27:47 2021 +0000

@@ -0,0 +1,4 @@
+>1
+gccuAUGUuuua
+>2
+ctgaAACTatgt