diff graphprot_predict_wrapper.py @ 5:ddcf35a868b8 draft
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/rna_tools/graphprot commit ad60258f5759eaa205fec4af6143c728ea131419
author:   bgruening
date:     Wed, 05 Jun 2024 16:40:51 +0000
parents:  4ad83aed5c3c
children: (none)
--- a/graphprot_predict_wrapper.py Thu Jan 28 15:06:14 2021 +0000 +++ b/graphprot_predict_wrapper.py Wed Jun 05 16:40:51 2024 +0000 @@ -7,7 +7,6 @@ import gplib - """ TOOL DEPENDENCIES @@ -81,7 +80,8 @@ """ -############################################################################### +####################################################################### + def setup_argument_parser(): """Setup argparse parser.""" @@ -100,100 +100,119 @@ """ # Define argument parser. - p = ap.ArgumentParser(add_help=False, - prog="graphprot_predict_wrapper.py", - description=help_description, - formatter_class=ap.MetavarTypeHelpFormatter) + p = ap.ArgumentParser( + add_help=False, + prog="graphprot_predict_wrapper.py", + description=help_description, + formatter_class=ap.MetavarTypeHelpFormatter, + ) # Argument groups. p_man = p.add_argument_group("REQUIRED ARGUMENTS") p_opt = p.add_argument_group("OPTIONAL ARGUMENTS") # Required arguments. - p_opt.add_argument("-h", "--help", - action="help", - help="Print help message") - p_man.add_argument("--fasta", - dest="in_fa", - type=str, - required=True, - help="Sequences .fa file to predict" - " on (option -fasta)") - p_man.add_argument("--model", - dest="in_model", - type=str, - required=True, - help="GraphProt model file to use for predictions" - " (option -model)") - p_man.add_argument("--params", - dest="in_params", - type=str, - required=True, - help="Parameter file for given model") - p_man.add_argument("--data-id", - dest="data_id", - type=str, - required=True, - help="Data ID (option -prefix)") + p_opt.add_argument("-h", "--help", action="help", help="Print help message") + p_man.add_argument( + "--fasta", + dest="in_fa", + type=str, + required=True, + help="Sequences .fa file to predict" " on (option -fasta)", + ) + p_man.add_argument( + "--model", + dest="in_model", + type=str, + required=True, + help="GraphProt model file to use for predictions" " (option -model)", + ) + p_man.add_argument( + "--params", + dest="in_params", + type=str, + required=True, + help="Parameter file for given model", + ) + p_man.add_argument( + "--data-id", + dest="data_id", + type=str, + required=True, + help="Data ID (option -prefix)", + ) # ---> I'm a conditional argument <--- - p_opt.add_argument("--ws-pred", - dest="ws_pred", - default=False, - action="store_true", - help="Run a whole site prediction instead " - "of calculating profiles (default: false)") + p_opt.add_argument( + "--ws-pred", + dest="ws_pred", + default=False, + action="store_true", + help="Run a whole site prediction instead " + "of calculating profiles (default: false)", + ) # Additional arguments. - p_opt.add_argument("--sc-thr", - dest="score_thr", - type=float, - default=0, - help="Score threshold for extracting " - "average profile peak regions (default: 0)") - p_opt.add_argument("--max-merge-dist", - dest="max_merge_dist", - type=int, - default=0, - choices=[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10], - help="Maximum merge distance for nearby peak regions" - " (default: report all non-overlapping regions)") - p_opt.add_argument("--gen-site-bed", - dest="genomic_sites_bed", - type=str, - help=".bed file specifying the genomic regions of " - "the input .fa sequences. 
Corrupt .bed " - "information will be punished (default: false)") - p_opt.add_argument("--conf-out", - dest="conf_out", - default=False, - action="store_true", - help="Output filtered peak regions BED file or " - "predictions file (if --ws-pred) using the median " - "positive training site score for filtering " - "(default: false)") - p_opt.add_argument("--gp-output", - dest="gp_output", - default=False, - action="store_true", - help="Print output produced by GraphProt " - "(default: false)") - p_opt.add_argument("--ap-extlr", - dest="ap_extlr", - type=int, - default=5, - choices=[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10], - help="Define average profile up- and downstream " - "extension to produce the average profile. " - "The mean over small sequence windows " - "(window length = --ap-extlr*2 + 1) is used to " - "get position scores, thus the average profile " - "is more smooth than the initial profile output " - "by GraphProt (default: 5)") + p_opt.add_argument( + "--sc-thr", + dest="score_thr", + type=float, + default=0, + help="Score threshold for extracting " + "average profile peak regions (default: 0)", + ) + p_opt.add_argument( + "--max-merge-dist", + dest="max_merge_dist", + type=int, + default=0, + choices=[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10], + help="Maximum merge distance for nearby peak regions" + " (default: report all non-overlapping regions)", + ) + p_opt.add_argument( + "--gen-site-bed", + dest="genomic_sites_bed", + type=str, + help=".bed file specifying the genomic regions of " + "the input .fa sequences. Corrupt .bed " + "information will be punished (default: false)", + ) + p_opt.add_argument( + "--conf-out", + dest="conf_out", + default=False, + action="store_true", + help="Output filtered peak regions BED file or " + "predictions file (if --ws-pred) using the median " + "positive training site score for filtering " + "(default: false)", + ) + p_opt.add_argument( + "--gp-output", + dest="gp_output", + default=False, + action="store_true", + help="Print output produced by GraphProt " "(default: false)", + ) + p_opt.add_argument( + "--ap-extlr", + dest="ap_extlr", + type=int, + default=5, + choices=[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10], + help="Define average profile up- and downstream " + "extension to produce the average profile. " + "The mean over small sequence windows " + "(window length = --ap-extlr*2 + 1) is used to " + "get position scores, thus the average profile " + "is more smooth than the initial profile output " + "by GraphProt (default: 5)", + ) return p -############################################################################### +####################################################################### -if __name__ == '__main__': +if __name__ == "__main__": # Setup argparse. parser = setup_argument_parser() @@ -209,70 +228,86 @@ # Check tool availability. assert gplib.is_tool("GraphProt.pl"), "GraphProt.pl not in PATH" # Check file inputs. - assert os.path.exists(args.in_fa), \ - "input .fa file \"%s\" not found" % (args.in_fa) - assert os.path.exists(args.in_model), \ - "input .model file \"%s\" not found" % (args.in_model) - assert os.path.exists(args.in_params), \ - "input .params file \"%s\" not found" % (args.in_params) + assert os.path.exists(args.in_fa), 'input .fa file "%s" not found' % (args.in_fa) + assert os.path.exists(args.in_model), 'input .model file "%s" not found' % ( + args.in_model + ) + assert os.path.exists(args.in_params), 'input .params file "%s" not found' % ( + args.in_params + ) # Count .fa entries. 
c_in_fa = gplib.count_fasta_headers(args.in_fa) - assert c_in_fa, "input .fa file \"%s\" no headers found" % (args.in_fa) + assert c_in_fa, 'input .fa file "%s" no headers found' % (args.in_fa) print("# input .fa sequences: %i" % (c_in_fa)) # Read in FASTA sequences to check for uppercase sequences. seqs_dic = gplib.read_fasta_into_dic(args.in_fa) # Check for lowercase only sequences, which cause GP to crash. - error_mess = "input sequences encountered containing "\ - "only lowercase characters or lowercase characters in between "\ - "uppercase characters. Please provide either all uppercase "\ - "sequences or sequences containing uppercase regions surrounded "\ - "by lowercase context regions for structure calculation (see "\ - "viewpoint concept in original GraphProt publication "\ + error_mess = ( + "input sequences encountered containing " + "only lowercase characters or lowercase characters in between " + "uppercase characters. Please provide either all uppercase " + "sequences or sequences containing uppercase regions surrounded " + "by lowercase context regions for structure calculation (see " + "viewpoint concept in original GraphProt publication " "for more details)" + ) if args.ws_pred: bad_ids = gplib.check_seqs_dic_format(seqs_dic) assert not bad_ids, "%s" % (error_mess) c_uc_nt = gplib.seqs_dic_count_uc_nts(seqs_dic) - assert c_uc_nt, "no uppercase nucleotides in input .fa sequences. "\ - "Please change sequences to uppercase (keep in mind "\ - "GraphProt only scores uppercase regions (according "\ - "to its viewpoint concept)" + assert c_uc_nt, ( + "no uppercase nucleotides in input .fa sequences. " + "Please change sequences to uppercase (keep in mind " + "GraphProt only scores uppercase regions (according " + "to its viewpoint concept)" + ) if not args.ws_pred: # Check for lowercase sequences. c_lc_nt = gplib.seqs_dic_count_lc_nts(seqs_dic) - assert not c_lc_nt, "lowercase nucleotides in input .fa not allowed"\ - " in profile predictions, since GraphProt only scores "\ + assert not c_lc_nt, ( + "lowercase nucleotides in input .fa not allowed" + " in profile predictions, since GraphProt only scores " "uppercase regions (according to its viewpoint concept)" + ) # Check .bed. if args.genomic_sites_bed: # An array of checks, marvelous. - assert \ - os.path.exists(args.genomic_sites_bed), \ - "genomic .bed file \"%s\" not found" % (args.genomic_sites_bed) + assert os.path.exists( + args.genomic_sites_bed + ), 'genomic .bed file "%s" not found' % (args.genomic_sites_bed) # Check .bed for content. - assert gplib.count_file_rows(args.genomic_sites_bed), \ - "genomic .bed file \"%s\" is empty" % (args.genomic_sites_bed) + assert gplib.count_file_rows( + args.genomic_sites_bed + ), 'genomic .bed file "%s" is empty' % (args.genomic_sites_bed) # Check .bed for 6-column format. - assert gplib.bed_check_six_col_format(args.genomic_sites_bed), \ - "genomic .bed file \"%s\" appears to not "\ - "be in 6-column .bed format" % (args.genomic_sites_bed) + assert gplib.bed_check_six_col_format( + args.genomic_sites_bed + ), 'genomic .bed file "%s" appears to not ' "be in 6-column .bed format" % ( + args.genomic_sites_bed + ) # Check for unique column 4 IDs. - assert gplib.bed_check_unique_ids(args.genomic_sites_bed), \ - "genomic .bed file \"%s\" column 4 IDs not unique" % \ - (args.genomic_sites_bed) + assert gplib.bed_check_unique_ids( + args.genomic_sites_bed + ), 'genomic .bed file "%s" column 4 IDs not unique' % (args.genomic_sites_bed) # Read in .bed regions, compare to FASTA sequences. 
seq_len_dic = gplib.get_seq_lengths_from_seqs_dic(seqs_dic) reg_len_dic = gplib.bed_get_region_lengths(args.genomic_sites_bed) for seq_id in seq_len_dic: seq_l = seq_len_dic[seq_id] - assert seq_id in reg_len_dic, \ - "sequence ID \"%s\" missing in input .bed \"%s\"" % \ - (seq_id, args.genomic_sites_bed) + assert ( + seq_id in reg_len_dic + ), 'sequence ID "%s" missing in input .bed "%s"' % ( + seq_id, + args.genomic_sites_bed, + ) reg_l = reg_len_dic[seq_id] - assert seq_l == reg_l, \ - "sequence length differs from .bed region length (%i != %i)" \ - % (seq_l, reg_l) + assert ( + seq_l == reg_l + ), "sequence length differs from .bed region length (%i != %i)" % ( + seq_l, + reg_l, + ) # Read in model parameters. param_dic = gplib.graphprot_get_param_dic(args.in_params) # Create GraphProt parameter string. @@ -284,110 +319,147 @@ """ if args.ws_pred: # Do whole site prediction. - print("Starting whole site predictions on input .fa file" - " (-action predict) ... ") - check_cmd = "GraphProt.pl -action predict -prefix " \ - + args.data_id + " -fasta " + args.in_fa + " " \ - + param_string + " -model " + args.in_model + print( + "Starting whole site predictions on input .fa file" + " (-action predict) ... " + ) + check_cmd = ( + "GraphProt.pl -action predict -prefix " + + args.data_id + + " -fasta " + + args.in_fa + + " " + + param_string + + " -model " + + args.in_model + ) output = subprocess.getoutput(check_cmd) - assert output, "the following call of GraphProt.pl "\ - "produced no output:\n%s" % (check_cmd) + assert ( + output + ), "the following call of GraphProt.pl " "produced no output:\n%s" % (check_cmd) if args.gp_output: print(output) ws_predictions_file = args.data_id + ".predictions" - assert os.path.exists(ws_predictions_file), \ - "Whole site prediction output .predictions file \"%s\" not found" \ - % (ws_predictions_file) + assert os.path.exists( + ws_predictions_file + ), 'Whole site prediction output .predictions file "%s" not found' % ( + ws_predictions_file + ) if args.conf_out: # Filter by pos_train_ws_pred_median median. - assert "pos_train_ws_pred_median" in param_dic, \ - "whole site top scores median information "\ - "missing in .params file" - pos_tr_ws_pred_med = \ - float(param_dic["pos_train_ws_pred_median"]) + assert "pos_train_ws_pred_median" in param_dic, ( + "whole site top scores median information " "missing in .params file" + ) + pos_tr_ws_pred_med = float(param_dic["pos_train_ws_pred_median"]) # Filtered file. filt_ws_predictions_file = args.data_id + ".p50.predictions" - print("Extracting p50 sites from whole site predictions" - " (score threshold = %f) ... " % (pos_tr_ws_pred_med)) - gplib.graphprot_filter_predictions_file(ws_predictions_file, - filt_ws_predictions_file, - sc_thr=pos_tr_ws_pred_med) + print( + "Extracting p50 sites from whole site predictions" + " (score threshold = %f) ... " % (pos_tr_ws_pred_med) + ) + gplib.graphprot_filter_predictions_file( + ws_predictions_file, filt_ws_predictions_file, sc_thr=pos_tr_ws_pred_med + ) else: # Do profile prediction. - print("Starting profile predictions on on input .fa file " - "(-action predict_profile) ... ") - check_cmd = "GraphProt.pl -action predict_profile -prefix " \ - + args.data_id + " -fasta " + args.in_fa + " " + \ - param_string + " -model " + args.in_model + print( + "Starting profile predictions on on input .fa file " + "(-action predict_profile) ... 
" + ) + check_cmd = ( + "GraphProt.pl -action predict_profile -prefix " + + args.data_id + + " -fasta " + + args.in_fa + + " " + + param_string + + " -model " + + args.in_model + ) output = subprocess.getoutput(check_cmd) - assert output, "the following call of GraphProt.pl produced "\ - "no output:\n%s" % (check_cmd) + assert ( + output + ), "the following call of GraphProt.pl produced " "no output:\n%s" % (check_cmd) if args.gp_output: print(output) profile_predictions_file = args.data_id + ".profile" - assert os.path.exists(profile_predictions_file), \ - "Profile prediction output .profile file \"%s\" not found" % \ - (profile_predictions_file) + assert os.path.exists( + profile_predictions_file + ), 'Profile prediction output .profile file "%s" not found' % ( + profile_predictions_file + ) # Profile prediction output files. avg_prof_file = args.data_id + ".avg_profile" avg_prof_peaks_file = args.data_id + ".avg_profile.peaks.bed" - avg_prof_gen_peaks_file = args.data_id + \ - ".avg_profile.genomic_peaks.bed" - avg_prof_peaks_p50_file = args.data_id + \ - ".avg_profile.p50.peaks.bed" - avg_prof_gen_peaks_p50_file = args.data_id + \ - ".avg_profile.p50.genomic_peaks.bed" + avg_prof_gen_peaks_file = args.data_id + ".avg_profile.genomic_peaks.bed" + avg_prof_peaks_p50_file = args.data_id + ".avg_profile.p50.peaks.bed" + avg_prof_gen_peaks_p50_file = ( + args.data_id + ".avg_profile.p50.genomic_peaks.bed" + ) # Get sequence IDs in order from input .fa file. seq_ids_list = gplib.fasta_read_in_ids(args.in_fa) # Calculate average profiles. - print("Getting average profile from profile " - "(extlr for smoothing: %i) ... " % (args.ap_extlr)) - gplib.graphprot_profile_calc_avg_profile(profile_predictions_file, - avg_prof_file, - ap_extlr=args.ap_extlr, - seq_ids_list=seq_ids_list, - method=2) + print( + "Getting average profile from profile " + "(extlr for smoothing: %i) ... " % (args.ap_extlr) + ) + gplib.graphprot_profile_calc_avg_profile( + profile_predictions_file, + avg_prof_file, + ap_extlr=args.ap_extlr, + seq_ids_list=seq_ids_list, + method=2, + ) # Extract peak regions on sequences with threshold score 0. - print("Extracting peak regions from average profile " - "(score threshold = 0) ... ") + print( + "Extracting peak regions from average profile " "(score threshold = 0) ... " + ) killpep8 = args.max_merge_dist - gplib.graphprot_profile_extract_peak_regions(avg_prof_file, - avg_prof_peaks_file, - max_merge_dist=killpep8, - sc_thr=args.score_thr) + gplib.graphprot_profile_extract_peak_regions( + avg_prof_file, + avg_prof_peaks_file, + max_merge_dist=killpep8, + sc_thr=args.score_thr, + ) # Convert peaks to genomic coordinates. if args.genomic_sites_bed: print("Converting peak regions to genomic coordinates ... ") killit = args.genomic_sites_bed - gplib.bed_peaks_to_genomic_peaks(avg_prof_peaks_file, - avg_prof_gen_peaks_file, - print_rows=False, - genomic_sites_bed=killit) + gplib.bed_peaks_to_genomic_peaks( + avg_prof_peaks_file, + avg_prof_gen_peaks_file, + print_rows=False, + genomic_sites_bed=killit, + ) # Extract peak regions with threshold score p50. if args.conf_out: sc_id = "pos_train_avg_profile_median_%i" % (args.ap_extlr) # Filter by pos_tr_ws_pred_med median. - assert sc_id in param_dic, "average profile extlr %i median "\ + assert sc_id in param_dic, ( + "average profile extlr %i median " "information missing in .params file" % (args.ap_extlr) + ) p50_sc_thr = float(param_dic[sc_id]) - print("Extracting p50 peak regions from average profile " - "(score threshold = %f) ... 
" % (p50_sc_thr)) + print( + "Extracting p50 peak regions from average profile " + "(score threshold = %f) ... " % (p50_sc_thr) + ) despair = avg_prof_peaks_p50_file pain = args.max_merge_dist - gplib.graphprot_profile_extract_peak_regions(avg_prof_file, - despair, - max_merge_dist=pain, - sc_thr=p50_sc_thr) + gplib.graphprot_profile_extract_peak_regions( + avg_prof_file, despair, max_merge_dist=pain, sc_thr=p50_sc_thr + ) # Convert peaks to genomic coordinates. if args.genomic_sites_bed: - print("Converting p50 peak regions to " - "genomic coordinates ... ") + print("Converting p50 peak regions to " "genomic coordinates ... ") madness = args.genomic_sites_bed - gplib.bed_peaks_to_genomic_peaks(avg_prof_peaks_p50_file, - avg_prof_gen_peaks_p50_file, - genomic_sites_bed=madness) + gplib.bed_peaks_to_genomic_peaks( + avg_prof_peaks_p50_file, + avg_prof_gen_peaks_p50_file, + genomic_sites_bed=madness, + ) # Done. print("Script: I'm done.") print("Author: ... ")