Mercurial > repos > rnateam > graphprot_predict_profile
view graphprot_predict_wrapper.py @ 6:33b590aa07c1 draft default tip
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/rna_tools/graphprot commit 902e994cb04db968ce797afa676f5fa6512ab6d3
author | bgruening |
---|---|
date | Tue, 06 Aug 2024 14:55:18 +0000 |
parents | ddcf35a868b8 |
children |
line wrap: on
line source
#!/usr/bin/env python3 import argparse as ap import os import subprocess import sys import gplib """ TOOL DEPENDENCIES ================= GraphProt 1.1.7 Best install via: https://anaconda.org/bioconda/graphprot Tested with: miniconda3, conda 4.7.12 Script: What's my job this time, master? Author: It'll be a though one. Script: I take this as a given. Author: Oh yeah? Script: ... I'm ready. OUTPUT FILES ============ data_id.avg_profile data_id.avg_profile.peaks.bed --conf-out data_id.avg_profile.p50.peaks.bed --gen-site-bed data_id.avg_profile.genomic_peaks.bed --conf-out --gen-site-bed data_id.avg_profile.p50.genomic_peaks.bed --ws-pred data_id.predictions --ws-pred --conf-out data_id.predictions data_id.p50.predictions EXAMPLE CALLS ============= flake8 coming out of hotel room. FB enters. FB: Who is this f*** ??? python graphprot_predict_wrapper.py --model test2.model --params test2.params --fasta gp_data/test10_predict.fa --data-id test2pred --gp-output python graphprot_predict_wrapper.py --model test2.model --params test2.params --fasta gp_data/test10_predict.fa --data-id test2pred --gen-site-bed gp_data/test10_predict.bed python graphprot_predict_wrapper.py --model test2.model --params test2.params --fasta gp_data/test10_predict.fa --data-id test2pred --gen-site-bed gp_data/test10_predict.bed --conf-out python graphprot_predict_wrapper.py --model test2.model --params test2.params --fasta gp_data/test10_predict.fa --data-id test2pred --conf-out --ws-pred python graphprot_predict_wrapper.py --model test-data/test.model --params test-data/test.params --fasta test-data/test_predict.fa --data-id predtest python graphprot_predict_wrapper.py --model test-data/test.model --params test-data/test.params --fasta test-data/test_predict.fa --data-id predtest --gen-site-bed test-data/test_predict.bed --sc-thr 0.0 --max-merge-dist 0 --conf-out --ap-extlr 5 python graphprot_predict_wrapper.py --data-id GraphProt --fasta test-data/test_predict.fa --model test-data/test.model --params test-data/test.params --gen-site-bed test-data/test_predict.bed --sc-thr 0.0 --max-merge-dist 0 --conf-out --ap-extlr 5 """ ####################################################################### def setup_argument_parser(): """Setup argparse parser.""" help_description = """ Galaxy wrapper script for GraphProt (-action predict and -action predict_profile) to compute whole site or position-wise scores for input FASTA sequences. By default, profile predictions are calculated, followed by average profiles computions and peak regions extraction from average profiles. If --ws-pred is set, whole site score predictions on input sequences will be run instead. If --conf-out is set, sites or peak regions with a score >= the median score of positive training sites will be output. If --gen-site-bed .bed file is provided, peak regions will be output with genomic coordinates too. """ # Define argument parser. p = ap.ArgumentParser( add_help=False, prog="graphprot_predict_wrapper.py", description=help_description, formatter_class=ap.MetavarTypeHelpFormatter, ) # Argument groups. p_man = p.add_argument_group("REQUIRED ARGUMENTS") p_opt = p.add_argument_group("OPTIONAL ARGUMENTS") # Required arguments. p_opt.add_argument("-h", "--help", action="help", help="Print help message") p_man.add_argument( "--fasta", dest="in_fa", type=str, required=True, help="Sequences .fa file to predict" " on (option -fasta)", ) p_man.add_argument( "--model", dest="in_model", type=str, required=True, help="GraphProt model file to use for predictions" " (option -model)", ) p_man.add_argument( "--params", dest="in_params", type=str, required=True, help="Parameter file for given model", ) p_man.add_argument( "--data-id", dest="data_id", type=str, required=True, help="Data ID (option -prefix)", ) # ---> I'm a conditional argument <--- p_opt.add_argument( "--ws-pred", dest="ws_pred", default=False, action="store_true", help="Run a whole site prediction instead " "of calculating profiles (default: false)", ) # Additional arguments. p_opt.add_argument( "--sc-thr", dest="score_thr", type=float, default=0, help="Score threshold for extracting " "average profile peak regions (default: 0)", ) p_opt.add_argument( "--max-merge-dist", dest="max_merge_dist", type=int, default=0, choices=[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10], help="Maximum merge distance for nearby peak regions" " (default: report all non-overlapping regions)", ) p_opt.add_argument( "--gen-site-bed", dest="genomic_sites_bed", type=str, help=".bed file specifying the genomic regions of " "the input .fa sequences. Corrupt .bed " "information will be punished (default: false)", ) p_opt.add_argument( "--conf-out", dest="conf_out", default=False, action="store_true", help="Output filtered peak regions BED file or " "predictions file (if --ws-pred) using the median " "positive training site score for filtering " "(default: false)", ) p_opt.add_argument( "--gp-output", dest="gp_output", default=False, action="store_true", help="Print output produced by GraphProt " "(default: false)", ) p_opt.add_argument( "--ap-extlr", dest="ap_extlr", type=int, default=5, choices=[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10], help="Define average profile up- and downstream " "extension to produce the average profile. " "The mean over small sequence windows " "(window length = --ap-extlr*2 + 1) is used to " "get position scores, thus the average profile " "is more smooth than the initial profile output " "by GraphProt (default: 5)", ) return p ####################################################################### if __name__ == "__main__": # Setup argparse. parser = setup_argument_parser() # Read in command line arguments. args = parser.parse_args() """ Do all sorts of sanity checking. """ # Check for Linux. assert "linux" in sys.platform, "please use Linux" # Check tool availability. assert gplib.is_tool("GraphProt.pl"), "GraphProt.pl not in PATH" # Check file inputs. assert os.path.exists(args.in_fa), 'input .fa file "%s" not found' % (args.in_fa) assert os.path.exists(args.in_model), 'input .model file "%s" not found' % ( args.in_model ) assert os.path.exists(args.in_params), 'input .params file "%s" not found' % ( args.in_params ) # Count .fa entries. c_in_fa = gplib.count_fasta_headers(args.in_fa) assert c_in_fa, 'input .fa file "%s" no headers found' % (args.in_fa) print("# input .fa sequences: %i" % (c_in_fa)) # Read in FASTA sequences to check for uppercase sequences. seqs_dic = gplib.read_fasta_into_dic(args.in_fa) # Check for lowercase only sequences, which cause GP to crash. error_mess = ( "input sequences encountered containing " "only lowercase characters or lowercase characters in between " "uppercase characters. Please provide either all uppercase " "sequences or sequences containing uppercase regions surrounded " "by lowercase context regions for structure calculation (see " "viewpoint concept in original GraphProt publication " "for more details)" ) if args.ws_pred: bad_ids = gplib.check_seqs_dic_format(seqs_dic) assert not bad_ids, "%s" % (error_mess) c_uc_nt = gplib.seqs_dic_count_uc_nts(seqs_dic) assert c_uc_nt, ( "no uppercase nucleotides in input .fa sequences. " "Please change sequences to uppercase (keep in mind " "GraphProt only scores uppercase regions (according " "to its viewpoint concept)" ) if not args.ws_pred: # Check for lowercase sequences. c_lc_nt = gplib.seqs_dic_count_lc_nts(seqs_dic) assert not c_lc_nt, ( "lowercase nucleotides in input .fa not allowed" " in profile predictions, since GraphProt only scores " "uppercase regions (according to its viewpoint concept)" ) # Check .bed. if args.genomic_sites_bed: # An array of checks, marvelous. assert os.path.exists( args.genomic_sites_bed ), 'genomic .bed file "%s" not found' % (args.genomic_sites_bed) # Check .bed for content. assert gplib.count_file_rows( args.genomic_sites_bed ), 'genomic .bed file "%s" is empty' % (args.genomic_sites_bed) # Check .bed for 6-column format. assert gplib.bed_check_six_col_format( args.genomic_sites_bed ), 'genomic .bed file "%s" appears to not ' "be in 6-column .bed format" % ( args.genomic_sites_bed ) # Check for unique column 4 IDs. assert gplib.bed_check_unique_ids( args.genomic_sites_bed ), 'genomic .bed file "%s" column 4 IDs not unique' % (args.genomic_sites_bed) # Read in .bed regions, compare to FASTA sequences. seq_len_dic = gplib.get_seq_lengths_from_seqs_dic(seqs_dic) reg_len_dic = gplib.bed_get_region_lengths(args.genomic_sites_bed) for seq_id in seq_len_dic: seq_l = seq_len_dic[seq_id] assert ( seq_id in reg_len_dic ), 'sequence ID "%s" missing in input .bed "%s"' % ( seq_id, args.genomic_sites_bed, ) reg_l = reg_len_dic[seq_id] assert ( seq_l == reg_l ), "sequence length differs from .bed region length (%i != %i)" % ( seq_l, reg_l, ) # Read in model parameters. param_dic = gplib.graphprot_get_param_dic(args.in_params) # Create GraphProt parameter string. param_string = gplib.graphprot_get_param_string(args.in_params) """ Run predictions. """ if args.ws_pred: # Do whole site prediction. print( "Starting whole site predictions on input .fa file" " (-action predict) ... " ) check_cmd = ( "GraphProt.pl -action predict -prefix " + args.data_id + " -fasta " + args.in_fa + " " + param_string + " -model " + args.in_model ) output = subprocess.getoutput(check_cmd) assert ( output ), "the following call of GraphProt.pl " "produced no output:\n%s" % (check_cmd) if args.gp_output: print(output) ws_predictions_file = args.data_id + ".predictions" assert os.path.exists( ws_predictions_file ), 'Whole site prediction output .predictions file "%s" not found' % ( ws_predictions_file ) if args.conf_out: # Filter by pos_train_ws_pred_median median. assert "pos_train_ws_pred_median" in param_dic, ( "whole site top scores median information " "missing in .params file" ) pos_tr_ws_pred_med = float(param_dic["pos_train_ws_pred_median"]) # Filtered file. filt_ws_predictions_file = args.data_id + ".p50.predictions" print( "Extracting p50 sites from whole site predictions" " (score threshold = %f) ... " % (pos_tr_ws_pred_med) ) gplib.graphprot_filter_predictions_file( ws_predictions_file, filt_ws_predictions_file, sc_thr=pos_tr_ws_pred_med ) else: # Do profile prediction. print( "Starting profile predictions on on input .fa file " "(-action predict_profile) ... " ) check_cmd = ( "GraphProt.pl -action predict_profile -prefix " + args.data_id + " -fasta " + args.in_fa + " " + param_string + " -model " + args.in_model ) output = subprocess.getoutput(check_cmd) assert ( output ), "the following call of GraphProt.pl produced " "no output:\n%s" % (check_cmd) if args.gp_output: print(output) profile_predictions_file = args.data_id + ".profile" assert os.path.exists( profile_predictions_file ), 'Profile prediction output .profile file "%s" not found' % ( profile_predictions_file ) # Profile prediction output files. avg_prof_file = args.data_id + ".avg_profile" avg_prof_peaks_file = args.data_id + ".avg_profile.peaks.bed" avg_prof_gen_peaks_file = args.data_id + ".avg_profile.genomic_peaks.bed" avg_prof_peaks_p50_file = args.data_id + ".avg_profile.p50.peaks.bed" avg_prof_gen_peaks_p50_file = ( args.data_id + ".avg_profile.p50.genomic_peaks.bed" ) # Get sequence IDs in order from input .fa file. seq_ids_list = gplib.fasta_read_in_ids(args.in_fa) # Calculate average profiles. print( "Getting average profile from profile " "(extlr for smoothing: %i) ... " % (args.ap_extlr) ) gplib.graphprot_profile_calc_avg_profile( profile_predictions_file, avg_prof_file, ap_extlr=args.ap_extlr, seq_ids_list=seq_ids_list, method=2, ) # Extract peak regions on sequences with threshold score 0. print( "Extracting peak regions from average profile " "(score threshold = 0) ... " ) killpep8 = args.max_merge_dist gplib.graphprot_profile_extract_peak_regions( avg_prof_file, avg_prof_peaks_file, max_merge_dist=killpep8, sc_thr=args.score_thr, ) # Convert peaks to genomic coordinates. if args.genomic_sites_bed: print("Converting peak regions to genomic coordinates ... ") killit = args.genomic_sites_bed gplib.bed_peaks_to_genomic_peaks( avg_prof_peaks_file, avg_prof_gen_peaks_file, print_rows=False, genomic_sites_bed=killit, ) # Extract peak regions with threshold score p50. if args.conf_out: sc_id = "pos_train_avg_profile_median_%i" % (args.ap_extlr) # Filter by pos_tr_ws_pred_med median. assert sc_id in param_dic, ( "average profile extlr %i median " "information missing in .params file" % (args.ap_extlr) ) p50_sc_thr = float(param_dic[sc_id]) print( "Extracting p50 peak regions from average profile " "(score threshold = %f) ... " % (p50_sc_thr) ) despair = avg_prof_peaks_p50_file pain = args.max_merge_dist gplib.graphprot_profile_extract_peak_regions( avg_prof_file, despair, max_merge_dist=pain, sc_thr=p50_sc_thr ) # Convert peaks to genomic coordinates. if args.genomic_sites_bed: print("Converting p50 peak regions to " "genomic coordinates ... ") madness = args.genomic_sites_bed gplib.bed_peaks_to_genomic_peaks( avg_prof_peaks_p50_file, avg_prof_gen_peaks_p50_file, genomic_sites_bed=madness, ) # Done. print("Script: I'm done.") print("Author: ... ")