diff graphprot_predict_wrapper.py @ 3:ace92c9a4653 draft
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/rna_tools/graphprot commit efcac98677c3ea9039c1c61eaa9e58f78287ccb3"
author | bgruening |
---|---|
date | Wed, 27 Jan 2021 19:27:47 +0000 |
parents | 7bbb7bf6304f |
children | 4ad83aed5c3c |
--- a/graphprot_predict_wrapper.py	Mon Jan 27 18:37:05 2020 -0500
+++ b/graphprot_predict_wrapper.py	Wed Jan 27 19:27:47 2021 +0000
@@ -1,12 +1,11 @@
 #!/usr/bin/env python3
 
+import argparse as ap
+import os
 import subprocess
-import argparse
-import shutil
+import sys
+
 import gplib
-import gzip
-import sys
-import os
 
 
 """
@@ -48,46 +47,63 @@
 EXAMPLE CALLS
 =============
-python graphprot_predict_wrapper.py --model test2.model --params test2.params --fasta gp_data/test10_predict.fa --data-id test2pred --gp-output
-python graphprot_predict_wrapper.py --model test2.model --params test2.params --fasta gp_data/test10_predict.fa --data-id test2pred --gen-site-bed gp_data/test10_predict.bed
-python graphprot_predict_wrapper.py --model test2.model --params test2.params --fasta gp_data/test10_predict.fa --data-id test2pred --gen-site-bed gp_data/test10_predict.bed --conf-out
-python graphprot_predict_wrapper.py --model test2.model --params test2.params --fasta gp_data/test10_predict.fa --data-id test2pred --conf-out --ws-pred
-
-python graphprot_predict_wrapper.py --model test-data/test.model --params test-data/test.params --fasta test-data/test_predict.fa --data-id predtest
-
-python graphprot_predict_wrapper.py --model test-data/test.model --params test-data/test.params --fasta test-data/test_predict.fa --data-id predtest --gen-site-bed test-data/test_predict.bed --sc-thr 0.0 --max-merge-dist 0 --conf-out --ap-extlr 5
-
-python graphprot_predict_wrapper.py --data-id GraphProt --fasta test-data/test_predict.fa --model test-data/test.model --params test-data/test.params --gen-site-bed test-data/test_predict.bed --sc-thr 0.0 --max-merge-dist 0 --conf-out --ap-extlr 5
+flake8 coming out of hotel room. FB enters.
+FB: Who is this f*** ???
 
-pwd && python '/home/uhlm/Dokumente/Projekte/GraphProt_galaxy_new/galaxytools/tools/rna_tools/graphprot/graphprot_predict_wrapper.py' --data-id GraphProt --fasta /tmp/tmpmuslpc1h/files/0/8/c/dataset_08c48d88-e3b5-423b-acf6-bf89b8c60660.dat --model /tmp/tmpmuslpc1h/files/e/6/4/dataset_e6471bb4-e74c-4372-bc49-656f900e7191.dat --params /tmp/tmpmuslpc1h/files/b/6/5/dataset_b65e8cf4-d3e6-429e-8d57-1d401adf4b3c.dat --gen-site-bed /tmp/tmpmuslpc1h/files/5/1/a/dataset_51a38b65-5943-472d-853e-5d845fa8ac3e.dat --sc-thr 0.0 --max-merge-dist 0 --conf-out --ap-extlr 5
+python graphprot_predict_wrapper.py --model test2.model --params test2.params
+--fasta gp_data/test10_predict.fa --data-id test2pred --gp-output
+python graphprot_predict_wrapper.py --model test2.model --params test2.params
+--fasta gp_data/test10_predict.fa --data-id test2pred
+--gen-site-bed gp_data/test10_predict.bed
+
+python graphprot_predict_wrapper.py --model test2.model --params test2.params
+--fasta gp_data/test10_predict.fa --data-id test2pred
+--gen-site-bed gp_data/test10_predict.bed --conf-out
+
+python graphprot_predict_wrapper.py --model test2.model --params test2.params
+--fasta gp_data/test10_predict.fa --data-id test2pred --conf-out --ws-pred
+python graphprot_predict_wrapper.py --model test-data/test.model
+--params test-data/test.params --fasta test-data/test_predict.fa
+--data-id predtest
+
+python graphprot_predict_wrapper.py --model test-data/test.model
+--params test-data/test.params --fasta test-data/test_predict.fa
+--data-id predtest --gen-site-bed test-data/test_predict.bed
+--sc-thr 0.0 --max-merge-dist 0 --conf-out --ap-extlr 5
+
+python graphprot_predict_wrapper.py --data-id GraphProt
+--fasta test-data/test_predict.fa --model test-data/test.model
+--params test-data/test.params
+--gen-site-bed test-data/test_predict.bed
+--sc-thr 0.0 --max-merge-dist 0 --conf-out --ap-extlr 5
 
 """
 
-################################################################################
+
+###############################################################################
 
 
 def setup_argument_parser():
     """Setup argparse parser."""
     help_description = """
-    Galaxy wrapper script for GraphProt (-action predict and -action
-    predict_profile) to compute whole site or position-wise scores for input
+    Galaxy wrapper script for GraphProt (-action predict and -action
+    predict_profile) to compute whole site or position-wise scores for input
     FASTA sequences.
-    By default, profile predictions are calculated, followed by average
+    By default, profile predictions are calculated, followed by average
     profiles computions and peak regions extraction from average profiles.
     If --ws-pred is set, whole site score predictions on input sequences
     will be run instead.
-    If --conf-out is set, sites or peak regions with a score >= the median
+    If --conf-out is set, sites or peak regions with a score >= the median
    score of positive training sites will be output.
-    If --gen-site-bed .bed file is provided, peak regions will be output
+    If --gen-site-bed .bed file is provided, peak regions will be output
     with genomic coordinates too.
 
     """
     # Define argument parser.
-    p = argparse.ArgumentParser(add_help=False,
-                                prog="graphprot_predict_wrapper.py",
-                                description=help_description,
-                                formatter_class=argparse.MetavarTypeHelpFormatter)
+    p = ap.ArgumentParser(add_help=False,
+                          prog="graphprot_predict_wrapper.py",
+                          description=help_description,
+                          formatter_class=ap.MetavarTypeHelpFormatter)
 
     # Argument groups.
     p_man = p.add_argument_group("REQUIRED ARGUMENTS")
@@ -95,70 +111,87 @@
 
     # Required arguments.
     p_opt.add_argument("-h", "--help",
-                       action="help",
-                       help="Print help message")
+                       action="help",
+                       help="Print help message")
     p_man.add_argument("--fasta",
-                       dest="in_fa",
-                       type=str,
-                       required = True,
-                       help = "Sequences .fa file to predict on (option -fasta)")
+                       dest="in_fa",
+                       type=str,
+                       required=True,
+                       help="Sequences .fa file to predict"
+                            " on (option -fasta)")
     p_man.add_argument("--model",
-                       dest="in_model",
-                       type=str,
-                       required = True,
-                       help = "GraphProt model file to use for predictions (option -model)")
+                       dest="in_model",
+                       type=str,
+                       required=True,
+                       help="GraphProt model file to use for predictions"
+                            " (option -model)")
     p_man.add_argument("--params",
-                       dest="in_params",
-                       type=str,
-                       required = True,
-                       help = "Parameter file for given model")
+                       dest="in_params",
+                       type=str,
+                       required=True,
+                       help="Parameter file for given model")
     p_man.add_argument("--data-id",
-                       dest="data_id",
-                       type=str,
-                       required = True,
-                       help = "Data ID (option -prefix)")
+                       dest="data_id",
+                       type=str,
+                       required=True,
+                       help="Data ID (option -prefix)")
     # ---> I'm a conditional argument <---
     p_opt.add_argument("--ws-pred",
-                       dest = "ws_pred",
-                       default = False,
-                       action = "store_true",
-                       help = "Run a whole site prediction instead of calculating profiles (default: false)")
+                       dest="ws_pred",
+                       default=False,
+                       action="store_true",
+                       help="Run a whole site prediction instead "
+                            "of calculating profiles (default: false)")
     # Additional arguments.
     p_opt.add_argument("--sc-thr",
-                       dest="score_thr",
-                       type = float,
-                       default = 0,
-                       help = "Score threshold for extracting average profile peak regions (default: 0)")
+                       dest="score_thr",
+                       type=float,
+                       default=0,
+                       help="Score threshold for extracting "
+                            "average profile peak regions (default: 0)")
     p_opt.add_argument("--max-merge-dist",
-                       dest="max_merge_dist",
-                       type = int,
-                       default = 0,
-                       choices = [0,1,2,3,4,5,6,7,8,9,10],
-                       help = "Maximum merge distance for nearby peak regions (default: report all non-overlapping regions)")
+                       dest="max_merge_dist",
+                       type=int,
+                       default=0,
+                       choices=[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
+                       help="Maximum merge distance for nearby peak regions"
+                            " (default: report all non-overlapping regions)")
     p_opt.add_argument("--gen-site-bed",
-                       dest="genomic_sites_bed",
-                       type=str,
-                       help = ".bed file specifying the genomic regions of the input .fa sequences. Corrupt .bed information will be punished (default: false)")
+                       dest="genomic_sites_bed",
+                       type=str,
+                       help=".bed file specifying the genomic regions of "
+                            "the input .fa sequences. Corrupt .bed "
+                            "information will be punished (default: false)")
     p_opt.add_argument("--conf-out",
-                       dest="conf_out",
-                       default = False,
-                       action = "store_true",
-                       help = "Output filtered peak regions BED file or predictions file (if --ws-pred) using the median positive training site score for filtering (default: false)")
+                       dest="conf_out",
+                       default=False,
+                       action="store_true",
+                       help="Output filtered peak regions BED file or "
+                            "predictions file (if --ws-pred) using the median "
+                            "positive training site score for filtering "
+                            "(default: false)")
     p_opt.add_argument("--gp-output",
-                       dest = "gp_output",
-                       default = False,
-                       action = "store_true",
-                       help = "Print output produced by GraphProt (default: false)")
+                       dest="gp_output",
+                       default=False,
+                       action="store_true",
+                       help="Print output produced by GraphProt "
+                            "(default: false)")
    p_opt.add_argument("--ap-extlr",
-                       dest="ap_extlr",
-                       type = int,
-                       default = 5,
-                       choices = [0,1,2,3,4,5,6,7,8,9,10],
-                       help = "Define average profile up- and downstream extension to produce the average profile. The mean over small sequence windows (window length = --ap-extlr*2 + 1) is used to get position scores, thus the average profile is more smooth than the initial profile output by GraphProt (default: 5)")
+                       dest="ap_extlr",
+                       type=int,
+                       default=5,
+                       choices=[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
+                       help="Define average profile up- and downstream "
+                            "extension to produce the average profile. "
+                            "The mean over small sequence windows "
+                            "(window length = --ap-extlr*2 + 1) is used to "
+                            "get position scores, thus the average profile "
+                            "is more smooth than the initial profile output "
+                            "by GraphProt (default: 5)")
     return p
 
 
-################################################################################
+###############################################################################
 
 
 if __name__ == '__main__':
@@ -166,49 +199,68 @@
 
     parser = setup_argument_parser()
     # Read in command line arguments.
     args = parser.parse_args()
-    
+
     """
     Do all sorts of sanity checking.
-    
+
    """
     # Check for Linux.
     assert "linux" in sys.platform, "please use Linux"
     # Check tool availability.
     assert gplib.is_tool("GraphProt.pl"), "GraphProt.pl not in PATH"
     # Check file inputs.
-    assert os.path.exists(args.in_fa), "input .fa file \"%s\" not found" %(args.in_fa)
-    assert os.path.exists(args.in_model), "input .model file \"%s\" not found" %(args.in_model)
-    assert os.path.exists(args.in_params), "input .params file \"%s\" not found" %(args.in_params)
+    assert os.path.exists(args.in_fa), \
+        "input .fa file \"%s\" not found" % (args.in_fa)
+    assert os.path.exists(args.in_model), \
+        "input .model file \"%s\" not found" % (args.in_model)
+    assert os.path.exists(args.in_params), \
+        "input .params file \"%s\" not found" % (args.in_params)
     # Count .fa entries.
     c_in_fa = gplib.count_fasta_headers(args.in_fa)
-    assert c_in_fa, "input .fa file \"%s\" no headers found" %(args.in_fa)
-    print("# input .fa sequences: %i" %(c_in_fa))
+    assert c_in_fa, "input .fa file \"%s\" no headers found" % (args.in_fa)
+    print("# input .fa sequences: %i" % (c_in_fa))
     # Read in FASTA sequences to check for uppercase sequences.
     seqs_dic = gplib.read_fasta_into_dic(args.in_fa)
     c_uc_nt = gplib.seqs_dic_count_uc_nts(seqs_dic)
-    assert c_uc_nt, "no uppercase nucleotides in input .fa sequences. Please change sequences to uppercase (keep in mind GraphProt only scores uppercase regions (according to its viewpoint concept)"
+    assert c_uc_nt, "no uppercase nucleotides in input .fa sequences. "\
+        "Please change sequences to uppercase (keep in mind "\
+        "GraphProt only scores uppercase regions (according "\
+        "to its viewpoint concept)"
     if not args.ws_pred:
         # Check for lowercase sequences.
         c_lc_nt = gplib.seqs_dic_count_lc_nts(seqs_dic)
-        assert not c_lc_nt, "lowercase nucleotides in input .fa not allowed in profile predictions, since GraphProt only scores uppercase regions (according to its viewpoint concept)"
+        assert not c_lc_nt, "lowercase nucleotides in input .fa not allowed"\
+            " in profile predictions, since GraphProt only scores "\
+            "uppercase regions (according to its viewpoint concept)"
     # Check .bed.
     if args.genomic_sites_bed:
         # An array of checks, marvelous.
-        assert os.path.exists(args.genomic_sites_bed), "genomic .bed file \"%s\" not found" %(args.genomic_sites_bed)
+        assert \
+            os.path.exists(args.genomic_sites_bed), \
+            "genomic .bed file \"%s\" not found" % (args.genomic_sites_bed)
        # Check .bed for content.
-        assert gplib.count_file_rows(args.genomic_sites_bed), "genomic .bed file \"%s\" is empty" %(args.genomic_sites_bed)
+        assert gplib.count_file_rows(args.genomic_sites_bed), \
+            "genomic .bed file \"%s\" is empty" % (args.genomic_sites_bed)
        # Check .bed for 6-column format.
-        assert gplib.bed_check_six_col_format(args.genomic_sites_bed), "genomic .bed file \"%s\" appears to not be in 6-column .bed format" %(args.genomic_sites_bed)
+        assert gplib.bed_check_six_col_format(args.genomic_sites_bed), \
+            "genomic .bed file \"%s\" appears to not "\
+            "be in 6-column .bed format" % (args.genomic_sites_bed)
        # Check for unique column 4 IDs.
-        assert gplib.bed_check_unique_ids(args.genomic_sites_bed), "genomic .bed file \"%s\" column 4 IDs not unique" %(args.genomic_sites_bed)
-        # Read in .bed regions, compare to FASTA sequences (compare IDs + lengths)
+        assert gplib.bed_check_unique_ids(args.genomic_sites_bed), \
+            "genomic .bed file \"%s\" column 4 IDs not unique" % \
+            (args.genomic_sites_bed)
+        # Read in .bed regions, compare to FASTA sequences.
         seq_len_dic = gplib.get_seq_lengths_from_seqs_dic(seqs_dic)
         reg_len_dic = gplib.bed_get_region_lengths(args.genomic_sites_bed)
         for seq_id in seq_len_dic:
             seq_l = seq_len_dic[seq_id]
-            assert seq_id in reg_len_dic, "sequence ID \"\" missing in input .bed \"\"" %(seq_id, args.genomic_sites_bed)
+            assert seq_id in reg_len_dic, \
+                "sequence ID \"%s\" missing in input .bed \"%s\"" % \
+                (seq_id, args.genomic_sites_bed)
             reg_l = reg_len_dic[seq_id]
-            assert seq_l == reg_l, "sequence length differs from .bed region length (%i != %i)" %(seq_l, reg_l)
+            assert seq_l == reg_l, \
+                "sequence length differs from .bed region length (%i != %i)" \
+                % (seq_l, reg_l)
     # Read in model parameters.
     param_dic = gplib.graphprot_get_param_dic(args.in_params)
     # Create GraphProt parameter string.
@@ -216,83 +268,114 @@
 
     """
     Run predictions.
-    
+
    """
    if args.ws_pred:
        # Do whole site prediction.
-        print("Starting whole site predictions on input .fa file (-action predict) ... ")
-        check_cmd = "GraphProt.pl -action predict -prefix " + args.data_id + " -fasta " + args.in_fa + " " + param_string + " -model " + args.in_model
+        print("Starting whole site predictions on input .fa file"
+              " (-action predict) ... ")
+        check_cmd = "GraphProt.pl -action predict -prefix " \
+                    + args.data_id + " -fasta " + args.in_fa + " " \
+                    + param_string + " -model " + args.in_model
        output = subprocess.getoutput(check_cmd)
-        assert output, "the following call of GraphProt.pl produced no output:\n%s" %(check_cmd)
+        assert output, "the following call of GraphProt.pl "\
+            "produced no output:\n%s" % (check_cmd)
        if args.gp_output:
            print(output)
        ws_predictions_file = args.data_id + ".predictions"
-        assert os.path.exists(ws_predictions_file), "Whole site prediction output .predictions file \"%s\" not found" %(ws_predictions_file)
+        assert os.path.exists(ws_predictions_file), \
+            "Whole site prediction output .predictions file \"%s\" not found" \
+            % (ws_predictions_file)
        if args.conf_out:
            # Filter by pos_train_ws_pred_median median.
-            assert "pos_train_ws_pred_median" in param_dic, "whole site top scores median information missing in .params file"
-            pos_train_ws_pred_median = float(param_dic["pos_train_ws_pred_median"])
+            assert "pos_train_ws_pred_median" in param_dic, \
+                "whole site top scores median information "\
+                "missing in .params file"
+            pos_tr_ws_pred_med = \
+                float(param_dic["pos_train_ws_pred_median"])
            # Filtered file.
            filt_ws_predictions_file = args.data_id + ".p50.predictions"
-            print("Extracting p50 sites from whole site predictions (score threshold = %f) ... " %(pos_train_ws_pred_median))
-            gplib.graphprot_filter_predictions_file(ws_predictions_file, filt_ws_predictions_file,
-                                                    sc_thr=pos_train_ws_pred_median)
+            print("Extracting p50 sites from whole site predictions"
+                  " (score threshold = %f) ... " % (pos_tr_ws_pred_med))
+            gplib.graphprot_filter_predictions_file(ws_predictions_file,
+                                                    filt_ws_predictions_file,
+                                                    sc_thr=pos_tr_ws_pred_med)
    else:
        # Do profile prediction.
-        print("Starting profile predictions on on input .fa file (-action predict_profile) ... ")
-        check_cmd = "GraphProt.pl -action predict_profile -prefix " + args.data_id + " -fasta " + args.in_fa + " " + param_string + " -model " + args.in_model
+        print("Starting profile predictions on on input .fa file "
+              "(-action predict_profile) ... ")
+        check_cmd = "GraphProt.pl -action predict_profile -prefix " \
+                    + args.data_id + " -fasta " + args.in_fa + " " + \
+                    param_string + " -model " + args.in_model
        output = subprocess.getoutput(check_cmd)
-        assert output, "the following call of GraphProt.pl produced no output:\n%s" %(check_cmd)
+        assert output, "the following call of GraphProt.pl produced "\
+            "no output:\n%s" % (check_cmd)
        if args.gp_output:
            print(output)
        profile_predictions_file = args.data_id + ".profile"
-        assert os.path.exists(profile_predictions_file), "Profile prediction output .profile file \"%s\" not found" %(profile_predictions_file)
+        assert os.path.exists(profile_predictions_file), \
+            "Profile prediction output .profile file \"%s\" not found" % \
+            (profile_predictions_file)
 
        # Profile prediction output files.
        avg_prof_file = args.data_id + ".avg_profile"
        avg_prof_peaks_file = args.data_id + ".avg_profile.peaks.bed"
-        avg_prof_gen_peaks_file = args.data_id + ".avg_profile.genomic_peaks.bed"
-        avg_prof_peaks_p50_file = args.data_id + ".avg_profile.p50.peaks.bed"
-        avg_prof_gen_peaks_p50_file = args.data_id + ".avg_profile.p50.genomic_peaks.bed"
+        avg_prof_gen_peaks_file = args.data_id + \
+            ".avg_profile.genomic_peaks.bed"
+        avg_prof_peaks_p50_file = args.data_id + \
+            ".avg_profile.p50.peaks.bed"
+        avg_prof_gen_peaks_p50_file = args.data_id + \
+            ".avg_profile.p50.genomic_peaks.bed"
        # Get sequence IDs in order from input .fa file.
        seq_ids_list = gplib.fasta_read_in_ids(args.in_fa)
        # Calculate average profiles.
-        print("Getting average profile from profile (extlr for smoothing: %i) ... " %(args.ap_extlr))
-        gplib.graphprot_profile_calculate_avg_profile(profile_predictions_file,
-                                                      avg_prof_file,
-                                                      ap_extlr=args.ap_extlr,
-                                                      seq_ids_list=seq_ids_list,
-                                                      method=2)
+        print("Getting average profile from profile "
+              "(extlr for smoothing: %i) ... " % (args.ap_extlr))
+        gplib.graphprot_profile_calc_avg_profile(profile_predictions_file,
+                                                 avg_prof_file,
+                                                 ap_extlr=args.ap_extlr,
+                                                 seq_ids_list=seq_ids_list,
+                                                 method=2)
        # Extract peak regions on sequences with threshold score 0.
-        print("Extracting peak regions from average profile (score threshold = 0) ... ")
-        gplib.graphprot_profile_extract_peak_regions(avg_prof_file, avg_prof_peaks_file,
-                                                     max_merge_dist=args.max_merge_dist,
-                                                     sc_thr=args.score_thr)
+        print("Extracting peak regions from average profile "
+              "(score threshold = 0) ... ")
+        killpep8 = args.max_merge_dist
+        gplib.graphprot_profile_extract_peak_regions(avg_prof_file,
+                                                     avg_prof_peaks_file,
+                                                     max_merge_dist=killpep8,
+                                                     sc_thr=args.score_thr)
        # Convert peaks to genomic coordinates.
        if args.genomic_sites_bed:
            print("Converting peak regions to genomic coordinates ... ")
-            gplib.bed_peaks_to_genomic_peaks(avg_prof_peaks_file, avg_prof_gen_peaks_file,
+            killit = args.genomic_sites_bed
+            gplib.bed_peaks_to_genomic_peaks(avg_prof_peaks_file,
+                                             avg_prof_gen_peaks_file,
                                              print_rows=False,
-                                             genomic_sites_bed=args.genomic_sites_bed)
-        # gplib.make_file_copy(avg_prof_gen_peaks_file, avg_prof_peaks_file)
+                                             genomic_sites_bed=killit)
        # Extract peak regions with threshold score p50.
        if args.conf_out:
-            sc_id = "pos_train_avg_profile_median_%i" %(args.ap_extlr)
-            # Filter by pos_train_ws_pred_median median.
-            assert sc_id in param_dic, "average profile extlr %i median information missing in .params file" %(args.ap_extlr)
+            sc_id = "pos_train_avg_profile_median_%i" % (args.ap_extlr)
+            # Filter by pos_tr_ws_pred_med median.
+            assert sc_id in param_dic, "average profile extlr %i median "\
+                "information missing in .params file" % (args.ap_extlr)
            p50_sc_thr = float(param_dic[sc_id])
-            print("Extracting p50 peak regions from average profile (score threshold = %f) ... " %(p50_sc_thr))
-            gplib.graphprot_profile_extract_peak_regions(avg_prof_file, avg_prof_peaks_p50_file,
-                                                         max_merge_dist=args.max_merge_dist,
+            print("Extracting p50 peak regions from average profile "
+                  "(score threshold = %f) ... " % (p50_sc_thr))
+            despair = avg_prof_peaks_p50_file
+            pain = args.max_merge_dist
+            gplib.graphprot_profile_extract_peak_regions(avg_prof_file,
+                                                         despair,
+                                                         max_merge_dist=pain,
                                                          sc_thr=p50_sc_thr)
            # Convert peaks to genomic coordinates.
            if args.genomic_sites_bed:
-                print("Converting p50 peak regions to genomic coordinates ... ")
-                gplib.bed_peaks_to_genomic_peaks(avg_prof_peaks_p50_file, avg_prof_gen_peaks_p50_file,
-                                                 genomic_sites_bed=args.genomic_sites_bed)
+                print("Converting p50 peak regions to "
+                      "genomic coordinates ... ")
+                madness = args.genomic_sites_bed
+                gplib.bed_peaks_to_genomic_peaks(avg_prof_peaks_p50_file,
+                                                 avg_prof_gen_peaks_p50_file,
+                                                 genomic_sites_bed=madness)
    # Done.
    print("Script: I'm done.")
    print("Author: ... ")
-
-
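The help texts in this diff describe two post-processing steps only in words: the average profile is obtained by taking the mean over windows of length --ap-extlr*2 + 1 around each position, and --conf-out keeps only sites or peak regions scoring at least the median score of the positive training sites (the "p50" threshold read from the .params file). The sketch below illustrates both ideas for orientation only; the function names, data layout, and example numbers are assumptions for this illustration and are not part of gplib or GraphProt.pl.

```python
# Orientation sketch only -- illustrative helpers, not the gplib API used by the wrapper.

def average_profile(scores, ap_extlr=5):
    """Mean over a centered window of length ap_extlr*2 + 1, clipped at the sequence ends."""
    smoothed = []
    for i in range(len(scores)):
        start = max(i - ap_extlr, 0)
        end = min(i + ap_extlr + 1, len(scores))
        window = scores[start:end]
        smoothed.append(sum(window) / len(window))
    return smoothed


def filter_by_threshold(site_scores, sc_thr):
    """Keep only (site_id, score) pairs with score >= sc_thr (e.g. the p50 median)."""
    return [(site_id, sc) for site_id, sc in site_scores if sc >= sc_thr]


if __name__ == "__main__":
    # Hypothetical per-position scores for one sequence, smoothed with ap_extlr=2
    # (window length 5).
    print(average_profile([0.1, 0.4, 0.9, 1.2, 0.8, 0.3, 0.0], ap_extlr=2))
    # Hypothetical whole-site scores filtered at a median threshold of 0.7.
    print(filter_by_threshold([("s1", 1.3), ("s2", -0.2), ("s3", 0.7)], 0.7))
```

In the wrapper itself these steps are carried out by gplib.graphprot_profile_calc_avg_profile and the peak-extraction/filtering calls shown in the diff, with the score threshold taken from the model's .params file.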