comparison graphprot_predict_wrapper.py @ 5:ddcf35a868b8 draft

planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/rna_tools/graphprot commit ad60258f5759eaa205fec4af6143c728ea131419
author bgruening
date Wed, 05 Jun 2024 16:40:51 +0000
parents 4ad83aed5c3c
children
--- graphprot_predict_wrapper.py (parent 4:4ad83aed5c3c)
+++ graphprot_predict_wrapper.py (this changeset 5:ddcf35a868b8)
@@ old 4-14 / new 4-13 @@
 import os
 import subprocess
 import sys

 import gplib
-

 """

 TOOL DEPENDENCIES
 =================
@@ old 79-89 / new 78-89 @@
 --sc-thr 0.0 --max-merge-dist 0 --conf-out --ap-extlr 5

 """


-###############################################################################
+#######################################################################
+

 def setup_argument_parser():
     """Setup argparse parser."""
     help_description = """
     Galaxy wrapper script for GraphProt (-action predict and -action
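The example options carried over from the module docstring above (--sc-thr 0.0 --max-merge-dist 0 --conf-out --ap-extlr 5) map directly onto the argparse interface defined in the next hunk. As a minimal sketch of how such a call is parsed, assuming the wrapper module and gplib are importable and using placeholder file names (this sketch is not part of the changeset):

from graphprot_predict_wrapper import setup_argument_parser

# Placeholder inputs for illustration; any existing .fa/.model/.params files would do.
argv = [
    "--fasta", "test.fa",
    "--model", "test.model",
    "--params", "test.params",
    "--data-id", "test_run",
    "--sc-thr", "0.0",
    "--max-merge-dist", "0",
    "--conf-out",
    "--ap-extlr", "5",
]
args = setup_argument_parser().parse_args(argv)
print(args.score_thr, args.max_merge_dist, args.conf_out, args.ap_extlr)
# 0.0 0 True 5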
@@ old 98-201 / new 98-220 @@
     If --gen-site-bed .bed file is provided, peak regions will be output
     with genomic coordinates too.

     """
     # Define argument parser.
-    p = ap.ArgumentParser(add_help=False,
-                          prog="graphprot_predict_wrapper.py",
-                          description=help_description,
-                          formatter_class=ap.MetavarTypeHelpFormatter)
+    p = ap.ArgumentParser(
+        add_help=False,
+        prog="graphprot_predict_wrapper.py",
+        description=help_description,
+        formatter_class=ap.MetavarTypeHelpFormatter,
+    )

     # Argument groups.
     p_man = p.add_argument_group("REQUIRED ARGUMENTS")
     p_opt = p.add_argument_group("OPTIONAL ARGUMENTS")

     # Required arguments.
-    p_opt.add_argument("-h", "--help",
-                       action="help",
-                       help="Print help message")
-    p_man.add_argument("--fasta",
-                       dest="in_fa",
-                       type=str,
-                       required=True,
-                       help="Sequences .fa file to predict"
-                            " on (option -fasta)")
-    p_man.add_argument("--model",
-                       dest="in_model",
-                       type=str,
-                       required=True,
-                       help="GraphProt model file to use for predictions"
-                            " (option -model)")
-    p_man.add_argument("--params",
-                       dest="in_params",
-                       type=str,
-                       required=True,
-                       help="Parameter file for given model")
-    p_man.add_argument("--data-id",
-                       dest="data_id",
-                       type=str,
-                       required=True,
-                       help="Data ID (option -prefix)")
+    p_opt.add_argument("-h", "--help", action="help", help="Print help message")
+    p_man.add_argument(
+        "--fasta",
+        dest="in_fa",
+        type=str,
+        required=True,
+        help="Sequences .fa file to predict" " on (option -fasta)",
+    )
+    p_man.add_argument(
+        "--model",
+        dest="in_model",
+        type=str,
+        required=True,
+        help="GraphProt model file to use for predictions" " (option -model)",
+    )
+    p_man.add_argument(
+        "--params",
+        dest="in_params",
+        type=str,
+        required=True,
+        help="Parameter file for given model",
+    )
+    p_man.add_argument(
+        "--data-id",
+        dest="data_id",
+        type=str,
+        required=True,
+        help="Data ID (option -prefix)",
+    )
     # ---> I'm a conditional argument <---
-    p_opt.add_argument("--ws-pred",
-                       dest="ws_pred",
-                       default=False,
-                       action="store_true",
-                       help="Run a whole site prediction instead "
-                            "of calculating profiles (default: false)")
+    p_opt.add_argument(
+        "--ws-pred",
+        dest="ws_pred",
+        default=False,
+        action="store_true",
+        help="Run a whole site prediction instead "
+        "of calculating profiles (default: false)",
+    )
     # Additional arguments.
-    p_opt.add_argument("--sc-thr",
-                       dest="score_thr",
-                       type=float,
-                       default=0,
-                       help="Score threshold for extracting "
-                            "average profile peak regions (default: 0)")
-    p_opt.add_argument("--max-merge-dist",
-                       dest="max_merge_dist",
-                       type=int,
-                       default=0,
-                       choices=[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
-                       help="Maximum merge distance for nearby peak regions"
-                            " (default: report all non-overlapping regions)")
-    p_opt.add_argument("--gen-site-bed",
-                       dest="genomic_sites_bed",
-                       type=str,
-                       help=".bed file specifying the genomic regions of "
-                            "the input .fa sequences. Corrupt .bed "
-                            "information will be punished (default: false)")
-    p_opt.add_argument("--conf-out",
-                       dest="conf_out",
-                       default=False,
-                       action="store_true",
-                       help="Output filtered peak regions BED file or "
-                            "predictions file (if --ws-pred) using the median "
-                            "positive training site score for filtering "
-                            "(default: false)")
-    p_opt.add_argument("--gp-output",
-                       dest="gp_output",
-                       default=False,
-                       action="store_true",
-                       help="Print output produced by GraphProt "
-                            "(default: false)")
-    p_opt.add_argument("--ap-extlr",
-                       dest="ap_extlr",
-                       type=int,
-                       default=5,
-                       choices=[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
-                       help="Define average profile up- and downstream "
-                            "extension to produce the average profile. "
-                            "The mean over small sequence windows "
-                            "(window length = --ap-extlr*2 + 1) is used to "
-                            "get position scores, thus the average profile "
-                            "is more smooth than the initial profile output "
-                            "by GraphProt (default: 5)")
+    p_opt.add_argument(
+        "--sc-thr",
+        dest="score_thr",
+        type=float,
+        default=0,
+        help="Score threshold for extracting "
+        "average profile peak regions (default: 0)",
+    )
+    p_opt.add_argument(
+        "--max-merge-dist",
+        dest="max_merge_dist",
+        type=int,
+        default=0,
+        choices=[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
+        help="Maximum merge distance for nearby peak regions"
+        " (default: report all non-overlapping regions)",
+    )
+    p_opt.add_argument(
+        "--gen-site-bed",
+        dest="genomic_sites_bed",
+        type=str,
+        help=".bed file specifying the genomic regions of "
+        "the input .fa sequences. Corrupt .bed "
+        "information will be punished (default: false)",
+    )
+    p_opt.add_argument(
+        "--conf-out",
+        dest="conf_out",
+        default=False,
+        action="store_true",
+        help="Output filtered peak regions BED file or "
+        "predictions file (if --ws-pred) using the median "
+        "positive training site score for filtering "
+        "(default: false)",
+    )
+    p_opt.add_argument(
+        "--gp-output",
+        dest="gp_output",
+        default=False,
+        action="store_true",
+        help="Print output produced by GraphProt " "(default: false)",
+    )
+    p_opt.add_argument(
+        "--ap-extlr",
+        dest="ap_extlr",
+        type=int,
+        default=5,
+        choices=[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
+        help="Define average profile up- and downstream "
+        "extension to produce the average profile. "
+        "The mean over small sequence windows "
+        "(window length = --ap-extlr*2 + 1) is used to "
+        "get position scores, thus the average profile "
+        "is more smooth than the initial profile output "
+        "by GraphProt (default: 5)",
+    )
     return p


-###############################################################################
+#######################################################################

-if __name__ == '__main__':
+if __name__ == "__main__":

     # Setup argparse.
     parser = setup_argument_parser()
     # Read in command line arguments.
     args = parser.parse_args()
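The --ap-extlr help text above describes the smoothing that turns GraphProt's raw profile into the average profile: each position gets the mean score over a window of length 2*ap_extlr + 1. A minimal illustrative sketch of that idea (this is not gplib.graphprot_profile_calc_avg_profile; truncating the window at the sequence ends is an assumption made here):

def average_profile(scores, ap_extlr=5):
    """Smooth per-position scores with a centered window of length 2*ap_extlr + 1."""
    n = len(scores)
    smoothed = []
    for i in range(n):
        start = max(0, i - ap_extlr)      # truncate window at sequence start (assumption)
        end = min(n, i + ap_extlr + 1)    # truncate window at sequence end (assumption)
        window = scores[start:end]
        smoothed.append(sum(window) / len(window))
    return smoothed

# A single high-scoring position gets spread over its neighborhood.
print(average_profile([0, 0, 0, 10, 0, 0, 0], ap_extlr=2))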
@@ old 207-393 / new 226-465 @@
     # Check for Linux.
     assert "linux" in sys.platform, "please use Linux"
     # Check tool availability.
     assert gplib.is_tool("GraphProt.pl"), "GraphProt.pl not in PATH"
     # Check file inputs.
-    assert os.path.exists(args.in_fa), \
-        "input .fa file \"%s\" not found" % (args.in_fa)
-    assert os.path.exists(args.in_model), \
-        "input .model file \"%s\" not found" % (args.in_model)
-    assert os.path.exists(args.in_params), \
-        "input .params file \"%s\" not found" % (args.in_params)
+    assert os.path.exists(args.in_fa), 'input .fa file "%s" not found' % (args.in_fa)
+    assert os.path.exists(args.in_model), 'input .model file "%s" not found' % (
+        args.in_model
+    )
+    assert os.path.exists(args.in_params), 'input .params file "%s" not found' % (
+        args.in_params
+    )
     # Count .fa entries.
     c_in_fa = gplib.count_fasta_headers(args.in_fa)
-    assert c_in_fa, "input .fa file \"%s\" no headers found" % (args.in_fa)
+    assert c_in_fa, 'input .fa file "%s" no headers found' % (args.in_fa)
     print("# input .fa sequences: %i" % (c_in_fa))
     # Read in FASTA sequences to check for uppercase sequences.
     seqs_dic = gplib.read_fasta_into_dic(args.in_fa)
     # Check for lowercase only sequences, which cause GP to crash.
-    error_mess = "input sequences encountered containing "\
-        "only lowercase characters or lowercase characters in between "\
-        "uppercase characters. Please provide either all uppercase "\
-        "sequences or sequences containing uppercase regions surrounded "\
-        "by lowercase context regions for structure calculation (see "\
-        "viewpoint concept in original GraphProt publication "\
-        "for more details)"
+    error_mess = (
+        "input sequences encountered containing "
+        "only lowercase characters or lowercase characters in between "
+        "uppercase characters. Please provide either all uppercase "
+        "sequences or sequences containing uppercase regions surrounded "
+        "by lowercase context regions for structure calculation (see "
+        "viewpoint concept in original GraphProt publication "
+        "for more details)"
+    )
     if args.ws_pred:
         bad_ids = gplib.check_seqs_dic_format(seqs_dic)
         assert not bad_ids, "%s" % (error_mess)

     c_uc_nt = gplib.seqs_dic_count_uc_nts(seqs_dic)
-    assert c_uc_nt, "no uppercase nucleotides in input .fa sequences. "\
-        "Please change sequences to uppercase (keep in mind "\
-        "GraphProt only scores uppercase regions (according "\
-        "to its viewpoint concept)"
+    assert c_uc_nt, (
+        "no uppercase nucleotides in input .fa sequences. "
+        "Please change sequences to uppercase (keep in mind "
+        "GraphProt only scores uppercase regions (according "
+        "to its viewpoint concept)"
+    )
     if not args.ws_pred:
         # Check for lowercase sequences.
         c_lc_nt = gplib.seqs_dic_count_lc_nts(seqs_dic)
-        assert not c_lc_nt, "lowercase nucleotides in input .fa not allowed"\
-            " in profile predictions, since GraphProt only scores "\
-            "uppercase regions (according to its viewpoint concept)"
+        assert not c_lc_nt, (
+            "lowercase nucleotides in input .fa not allowed"
+            " in profile predictions, since GraphProt only scores "
+            "uppercase regions (according to its viewpoint concept)"
+        )
     # Check .bed.
     if args.genomic_sites_bed:
         # An array of checks, marvelous.
-        assert \
-            os.path.exists(args.genomic_sites_bed), \
-            "genomic .bed file \"%s\" not found" % (args.genomic_sites_bed)
+        assert os.path.exists(
+            args.genomic_sites_bed
+        ), 'genomic .bed file "%s" not found' % (args.genomic_sites_bed)
         # Check .bed for content.
-        assert gplib.count_file_rows(args.genomic_sites_bed), \
-            "genomic .bed file \"%s\" is empty" % (args.genomic_sites_bed)
+        assert gplib.count_file_rows(
+            args.genomic_sites_bed
+        ), 'genomic .bed file "%s" is empty' % (args.genomic_sites_bed)
         # Check .bed for 6-column format.
-        assert gplib.bed_check_six_col_format(args.genomic_sites_bed), \
-            "genomic .bed file \"%s\" appears to not "\
-            "be in 6-column .bed format" % (args.genomic_sites_bed)
+        assert gplib.bed_check_six_col_format(
+            args.genomic_sites_bed
+        ), 'genomic .bed file "%s" appears to not ' "be in 6-column .bed format" % (
+            args.genomic_sites_bed
+        )
         # Check for unique column 4 IDs.
-        assert gplib.bed_check_unique_ids(args.genomic_sites_bed), \
-            "genomic .bed file \"%s\" column 4 IDs not unique" % \
-            (args.genomic_sites_bed)
+        assert gplib.bed_check_unique_ids(
+            args.genomic_sites_bed
+        ), 'genomic .bed file "%s" column 4 IDs not unique' % (args.genomic_sites_bed)
         # Read in .bed regions, compare to FASTA sequences.
         seq_len_dic = gplib.get_seq_lengths_from_seqs_dic(seqs_dic)
         reg_len_dic = gplib.bed_get_region_lengths(args.genomic_sites_bed)
         for seq_id in seq_len_dic:
             seq_l = seq_len_dic[seq_id]
-            assert seq_id in reg_len_dic, \
-                "sequence ID \"%s\" missing in input .bed \"%s\"" % \
-                (seq_id, args.genomic_sites_bed)
+            assert (
+                seq_id in reg_len_dic
+            ), 'sequence ID "%s" missing in input .bed "%s"' % (
+                seq_id,
+                args.genomic_sites_bed,
+            )
             reg_l = reg_len_dic[seq_id]
-            assert seq_l == reg_l, \
-                "sequence length differs from .bed region length (%i != %i)" \
-                % (seq_l, reg_l)
+            assert (
+                seq_l == reg_l
+            ), "sequence length differs from .bed region length (%i != %i)" % (
+                seq_l,
+                reg_l,
+            )
     # Read in model parameters.
     param_dic = gplib.graphprot_get_param_dic(args.in_params)
     # Create GraphProt parameter string.
     param_string = gplib.graphprot_get_param_string(args.in_params)

     """
     Run predictions.

     """
     if args.ws_pred:
         # Do whole site prediction.
-        print("Starting whole site predictions on input .fa file"
-              " (-action predict) ... ")
-        check_cmd = "GraphProt.pl -action predict -prefix " \
-                    + args.data_id + " -fasta " + args.in_fa + " " \
-                    + param_string + " -model " + args.in_model
+        print(
+            "Starting whole site predictions on input .fa file"
+            " (-action predict) ... "
+        )
+        check_cmd = (
+            "GraphProt.pl -action predict -prefix "
+            + args.data_id
+            + " -fasta "
+            + args.in_fa
+            + " "
+            + param_string
+            + " -model "
+            + args.in_model
+        )
         output = subprocess.getoutput(check_cmd)
-        assert output, "the following call of GraphProt.pl "\
-            "produced no output:\n%s" % (check_cmd)
+        assert (
+            output
+        ), "the following call of GraphProt.pl " "produced no output:\n%s" % (check_cmd)
         if args.gp_output:
             print(output)
         ws_predictions_file = args.data_id + ".predictions"
-        assert os.path.exists(ws_predictions_file), \
-            "Whole site prediction output .predictions file \"%s\" not found" \
-            % (ws_predictions_file)
+        assert os.path.exists(
+            ws_predictions_file
+        ), 'Whole site prediction output .predictions file "%s" not found' % (
+            ws_predictions_file
+        )
         if args.conf_out:
             # Filter by pos_train_ws_pred_median median.
-            assert "pos_train_ws_pred_median" in param_dic, \
-                "whole site top scores median information "\
-                "missing in .params file"
-            pos_tr_ws_pred_med = \
-                float(param_dic["pos_train_ws_pred_median"])
+            assert "pos_train_ws_pred_median" in param_dic, (
+                "whole site top scores median information " "missing in .params file"
+            )
+            pos_tr_ws_pred_med = float(param_dic["pos_train_ws_pred_median"])
             # Filtered file.
             filt_ws_predictions_file = args.data_id + ".p50.predictions"
-            print("Extracting p50 sites from whole site predictions"
-                  " (score threshold = %f) ... " % (pos_tr_ws_pred_med))
-            gplib.graphprot_filter_predictions_file(ws_predictions_file,
-                                                    filt_ws_predictions_file,
-                                                    sc_thr=pos_tr_ws_pred_med)
+            print(
+                "Extracting p50 sites from whole site predictions"
+                " (score threshold = %f) ... " % (pos_tr_ws_pred_med)
+            )
+            gplib.graphprot_filter_predictions_file(
+                ws_predictions_file, filt_ws_predictions_file, sc_thr=pos_tr_ws_pred_med
+            )
     else:
         # Do profile prediction.
-        print("Starting profile predictions on on input .fa file "
-              "(-action predict_profile) ... ")
-        check_cmd = "GraphProt.pl -action predict_profile -prefix " \
-            + args.data_id + " -fasta " + args.in_fa + " " + \
-            param_string + " -model " + args.in_model
+        print(
+            "Starting profile predictions on on input .fa file "
+            "(-action predict_profile) ... "
+        )
+        check_cmd = (
+            "GraphProt.pl -action predict_profile -prefix "
+            + args.data_id
+            + " -fasta "
+            + args.in_fa
+            + " "
+            + param_string
+            + " -model "
+            + args.in_model
+        )
         output = subprocess.getoutput(check_cmd)
-        assert output, "the following call of GraphProt.pl produced "\
-            "no output:\n%s" % (check_cmd)
+        assert (
+            output
+        ), "the following call of GraphProt.pl produced " "no output:\n%s" % (check_cmd)
         if args.gp_output:
             print(output)
         profile_predictions_file = args.data_id + ".profile"
-        assert os.path.exists(profile_predictions_file), \
-            "Profile prediction output .profile file \"%s\" not found" % \
-            (profile_predictions_file)
+        assert os.path.exists(
+            profile_predictions_file
+        ), 'Profile prediction output .profile file "%s" not found' % (
+            profile_predictions_file
+        )

         # Profile prediction output files.
         avg_prof_file = args.data_id + ".avg_profile"
         avg_prof_peaks_file = args.data_id + ".avg_profile.peaks.bed"
-        avg_prof_gen_peaks_file = args.data_id + \
-            ".avg_profile.genomic_peaks.bed"
-        avg_prof_peaks_p50_file = args.data_id + \
-            ".avg_profile.p50.peaks.bed"
-        avg_prof_gen_peaks_p50_file = args.data_id + \
-            ".avg_profile.p50.genomic_peaks.bed"
+        avg_prof_gen_peaks_file = args.data_id + ".avg_profile.genomic_peaks.bed"
+        avg_prof_peaks_p50_file = args.data_id + ".avg_profile.p50.peaks.bed"
+        avg_prof_gen_peaks_p50_file = (
+            args.data_id + ".avg_profile.p50.genomic_peaks.bed"
+        )

         # Get sequence IDs in order from input .fa file.
         seq_ids_list = gplib.fasta_read_in_ids(args.in_fa)
         # Calculate average profiles.
-        print("Getting average profile from profile "
-              "(extlr for smoothing: %i) ... " % (args.ap_extlr))
-        gplib.graphprot_profile_calc_avg_profile(profile_predictions_file,
-                                                 avg_prof_file,
-                                                 ap_extlr=args.ap_extlr,
-                                                 seq_ids_list=seq_ids_list,
-                                                 method=2)
+        print(
+            "Getting average profile from profile "
+            "(extlr for smoothing: %i) ... " % (args.ap_extlr)
+        )
+        gplib.graphprot_profile_calc_avg_profile(
+            profile_predictions_file,
+            avg_prof_file,
+            ap_extlr=args.ap_extlr,
+            seq_ids_list=seq_ids_list,
+            method=2,
+        )
         # Extract peak regions on sequences with threshold score 0.
-        print("Extracting peak regions from average profile "
-              "(score threshold = 0) ... ")
+        print(
+            "Extracting peak regions from average profile " "(score threshold = 0) ... "
+        )
         killpep8 = args.max_merge_dist
-        gplib.graphprot_profile_extract_peak_regions(avg_prof_file,
-                                                     avg_prof_peaks_file,
-                                                     max_merge_dist=killpep8,
-                                                     sc_thr=args.score_thr)
+        gplib.graphprot_profile_extract_peak_regions(
+            avg_prof_file,
+            avg_prof_peaks_file,
+            max_merge_dist=killpep8,
+            sc_thr=args.score_thr,
+        )
         # Convert peaks to genomic coordinates.
         if args.genomic_sites_bed:
             print("Converting peak regions to genomic coordinates ... ")
             killit = args.genomic_sites_bed
-            gplib.bed_peaks_to_genomic_peaks(avg_prof_peaks_file,
-                                             avg_prof_gen_peaks_file,
-                                             print_rows=False,
-                                             genomic_sites_bed=killit)
+            gplib.bed_peaks_to_genomic_peaks(
+                avg_prof_peaks_file,
+                avg_prof_gen_peaks_file,
+                print_rows=False,
+                genomic_sites_bed=killit,
+            )
         # Extract peak regions with threshold score p50.
         if args.conf_out:
             sc_id = "pos_train_avg_profile_median_%i" % (args.ap_extlr)
             # Filter by pos_tr_ws_pred_med median.
-            assert sc_id in param_dic, "average profile extlr %i median "\
-                "information missing in .params file" % (args.ap_extlr)
+            assert sc_id in param_dic, (
+                "average profile extlr %i median "
+                "information missing in .params file" % (args.ap_extlr)
+            )
             p50_sc_thr = float(param_dic[sc_id])
-            print("Extracting p50 peak regions from average profile "
-                  "(score threshold = %f) ... " % (p50_sc_thr))
+            print(
+                "Extracting p50 peak regions from average profile "
+                "(score threshold = %f) ... " % (p50_sc_thr)
+            )
             despair = avg_prof_peaks_p50_file
             pain = args.max_merge_dist
-            gplib.graphprot_profile_extract_peak_regions(avg_prof_file,
-                                                         despair,
-                                                         max_merge_dist=pain,
-                                                         sc_thr=p50_sc_thr)
+            gplib.graphprot_profile_extract_peak_regions(
+                avg_prof_file, despair, max_merge_dist=pain, sc_thr=p50_sc_thr
+            )
             # Convert peaks to genomic coordinates.
             if args.genomic_sites_bed:
-                print("Converting p50 peak regions to "
-                      "genomic coordinates ... ")
+                print("Converting p50 peak regions to " "genomic coordinates ... ")
                 madness = args.genomic_sites_bed
-                gplib.bed_peaks_to_genomic_peaks(avg_prof_peaks_p50_file,
-                                                 avg_prof_gen_peaks_p50_file,
-                                                 genomic_sites_bed=madness)
+                gplib.bed_peaks_to_genomic_peaks(
+                    avg_prof_peaks_p50_file,
+                    avg_prof_gen_peaks_p50_file,
+                    genomic_sites_bed=madness,
+                )
     # Done.
     print("Script: I'm done.")
     print("Author: ... ")
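For context on the peak extraction calls above: --sc-thr sets the score threshold above which average-profile positions count as peak positions, and --max-merge-dist merges nearby peak regions whose gap does not exceed that distance (0 reports all non-overlapping regions). A minimal sketch of that logic, illustrative only and not gplib.graphprot_profile_extract_peak_regions (whether the real code uses > or >= at the threshold is an assumption):

def extract_peak_regions(scores, sc_thr=0.0, max_merge_dist=0):
    """Return [start, end] index pairs of above-threshold stretches, merging close ones."""
    regions = []
    start = None
    for i, sc in enumerate(scores):
        if sc > sc_thr and start is None:
            start = i                      # open a new candidate region
        elif sc <= sc_thr and start is not None:
            regions.append([start, i - 1])  # close the current region
            start = None
    if start is not None:
        regions.append([start, len(scores) - 1])
    merged = []
    for reg in regions:
        # Merge if the gap to the previous region is <= max_merge_dist.
        if merged and reg[0] - merged[-1][1] - 1 <= max_merge_dist:
            merged[-1][1] = reg[1]
        else:
            merged.append(reg)
    return merged

# Two above-threshold stretches separated by one low-scoring position get merged.
print(extract_peak_regions([0.2, 0.5, -0.1, 0.4, 0.3, -0.2], sc_thr=0.0, max_merge_dist=1))
# [[0, 4]]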