Mercurial > repos > rnateam > graphprot_predict_profile
comparison graphprot_predict_wrapper.py @ 3:ace92c9a4653 draft
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/rna_tools/graphprot commit efcac98677c3ea9039c1c61eaa9e58f78287ccb3"
| author | bgruening |
|---|---|
| date | Wed, 27 Jan 2021 19:27:47 +0000 |
| parents | 7bbb7bf6304f |
| children | 4ad83aed5c3c |
comparison
equal
deleted
inserted
replaced
| 2:7bbb7bf6304f | 3:ace92c9a4653 |
|---|---|
| 1 #!/usr/bin/env python3 | 1 #!/usr/bin/env python3 |
| 2 | 2 |
| 3 import argparse as ap | |
| 4 import os | |
| 3 import subprocess | 5 import subprocess |
| 4 import argparse | 6 import sys |
| 5 import shutil | 7 |
| 6 import gplib | 8 import gplib |
| 7 import gzip | |
| 8 import sys | |
| 9 import os | |
| 10 | 9 |
| 11 | 10 |
| 12 """ | 11 """ |
| 13 | 12 |
| 14 TOOL DEPENDENCIES | 13 TOOL DEPENDENCIES |
| 46 | 45 |
| 47 | 46 |
| 48 EXAMPLE CALLS | 47 EXAMPLE CALLS |
| 49 ============= | 48 ============= |
| 50 | 49 |
| 51 python graphprot_predict_wrapper.py --model test2.model --params test2.params --fasta gp_data/test10_predict.fa --data-id test2pred --gp-output | 50 flake8 coming out of hotel room. FB enters. |
| 52 python graphprot_predict_wrapper.py --model test2.model --params test2.params --fasta gp_data/test10_predict.fa --data-id test2pred --gen-site-bed gp_data/test10_predict.bed | 51 FB: Who is this f*** ??? |
| 53 python graphprot_predict_wrapper.py --model test2.model --params test2.params --fasta gp_data/test10_predict.fa --data-id test2pred --gen-site-bed gp_data/test10_predict.bed --conf-out | 52 |
| 54 python graphprot_predict_wrapper.py --model test2.model --params test2.params --fasta gp_data/test10_predict.fa --data-id test2pred --conf-out --ws-pred | 53 |
| 55 | 54 python graphprot_predict_wrapper.py --model test2.model --params test2.params |
| 56 python graphprot_predict_wrapper.py --model test-data/test.model --params test-data/test.params --fasta test-data/test_predict.fa --data-id predtest | 55 --fasta gp_data/test10_predict.fa --data-id test2pred --gp-output |
| 57 | 56 python graphprot_predict_wrapper.py --model test2.model --params test2.params |
| 58 python graphprot_predict_wrapper.py --model test-data/test.model --params test-data/test.params --fasta test-data/test_predict.fa --data-id predtest --gen-site-bed test-data/test_predict.bed --sc-thr 0.0 --max-merge-dist 0 --conf-out --ap-extlr 5 | 57 --fasta gp_data/test10_predict.fa --data-id test2pred |
| 59 | 58 --gen-site-bed gp_data/test10_predict.bed |
| 60 python graphprot_predict_wrapper.py --data-id GraphProt --fasta test-data/test_predict.fa --model test-data/test.model --params test-data/test.params --gen-site-bed test-data/test_predict.bed --sc-thr 0.0 --max-merge-dist 0 --conf-out --ap-extlr 5 | 59 |
| 61 | 60 python graphprot_predict_wrapper.py --model test2.model --params test2.params |
| 62 | 61 --fasta gp_data/test10_predict.fa --data-id test2pred |
| 63 pwd && python '/home/uhlm/Dokumente/Projekte/GraphProt_galaxy_new/galaxytools/tools/rna_tools/graphprot/graphprot_predict_wrapper.py' --data-id GraphProt --fasta /tmp/tmpmuslpc1h/files/0/8/c/dataset_08c48d88-e3b5-423b-acf6-bf89b8c60660.dat --model /tmp/tmpmuslpc1h/files/e/6/4/dataset_e6471bb4-e74c-4372-bc49-656f900e7191.dat --params /tmp/tmpmuslpc1h/files/b/6/5/dataset_b65e8cf4-d3e6-429e-8d57-1d401adf4b3c.dat --gen-site-bed /tmp/tmpmuslpc1h/files/5/1/a/dataset_51a38b65-5943-472d-853e-5d845fa8ac3e.dat --sc-thr 0.0 --max-merge-dist 0 --conf-out --ap-extlr 5 | 62 --gen-site-bed gp_data/test10_predict.bed --conf-out |
| 64 | 63 |
| 64 python graphprot_predict_wrapper.py --model test2.model --params test2.params | |
| 65 --fasta gp_data/test10_predict.fa --data-id test2pred --conf-out --ws-pred | |
| 66 | |
| 67 python graphprot_predict_wrapper.py --model test-data/test.model | |
| 68 --params test-data/test.params --fasta test-data/test_predict.fa | |
| 69 --data-id predtest | |
| 70 | |
| 71 python graphprot_predict_wrapper.py --model test-data/test.model | |
| 72 --params test-data/test.params --fasta test-data/test_predict.fa | |
| 73 --data-id predtest --gen-site-bed test-data/test_predict.bed | |
| 74 --sc-thr 0.0 --max-merge-dist 0 --conf-out --ap-extlr 5 | |
| 75 | |
| 76 python graphprot_predict_wrapper.py --data-id GraphProt | |
| 77 --fasta test-data/test_predict.fa --model test-data/test.model | |
| 78 --params test-data/test.params --gen-site-bed test-data/test_predict.bed | |
| 79 --sc-thr 0.0 --max-merge-dist 0 --conf-out --ap-extlr 5 | |
| 65 | 80 |
| 66 """ | 81 """ |
| 67 | 82 |
| 68 ################################################################################ | 83 |
| 84 ############################################################################### | |
| 69 | 85 |
| 70 def setup_argument_parser(): | 86 def setup_argument_parser(): |
| 71 """Setup argparse parser.""" | 87 """Setup argparse parser.""" |
| 72 help_description = """ | 88 help_description = """ |
| 73 Galaxy wrapper script for GraphProt (-action predict and -action | 89 Galaxy wrapper script for GraphProt (-action predict and -action |
| 74 predict_profile) to compute whole site or position-wise scores for input | 90 predict_profile) to compute whole site or position-wise scores for input |
| 75 FASTA sequences. | 91 FASTA sequences. |
| 76 By default, profile predictions are calculated, followed by average | 92 By default, profile predictions are calculated, followed by average |
| 77 profiles computions and peak regions extraction from average profiles. | 93 profiles computions and peak regions extraction from average profiles. |
| 78 If --ws-pred is set, whole site score predictions on input sequences | 94 If --ws-pred is set, whole site score predictions on input sequences |
| 79 will be run instead. | 95 will be run instead. |
| 80 If --conf-out is set, sites or peak regions with a score >= the median | 96 If --conf-out is set, sites or peak regions with a score >= the median |
| 81 score of positive training sites will be output. | 97 score of positive training sites will be output. |
| 82 If --gen-site-bed .bed file is provided, peak regions will be output | 98 If --gen-site-bed .bed file is provided, peak regions will be output |
| 83 with genomic coordinates too. | 99 with genomic coordinates too. |
| 84 | 100 |
| 85 """ | 101 """ |
| 86 # Define argument parser. | 102 # Define argument parser. |
| 87 p = argparse.ArgumentParser(add_help=False, | 103 p = ap.ArgumentParser(add_help=False, |
| 88 prog="graphprot_predict_wrapper.py", | 104 prog="graphprot_predict_wrapper.py", |
| 89 description=help_description, | 105 description=help_description, |
| 90 formatter_class=argparse.MetavarTypeHelpFormatter) | 106 formatter_class=ap.MetavarTypeHelpFormatter) |
| 91 | 107 |
| 92 # Argument groups. | 108 # Argument groups. |
| 93 p_man = p.add_argument_group("REQUIRED ARGUMENTS") | 109 p_man = p.add_argument_group("REQUIRED ARGUMENTS") |
| 94 p_opt = p.add_argument_group("OPTIONAL ARGUMENTS") | 110 p_opt = p.add_argument_group("OPTIONAL ARGUMENTS") |
| 95 | 111 |
| 96 # Required arguments. | 112 # Required arguments. |
| 97 p_opt.add_argument("-h", "--help", | 113 p_opt.add_argument("-h", "--help", |
| 98 action="help", | 114 action="help", |
| 99 help="Print help message") | 115 help="Print help message") |
| 100 p_man.add_argument("--fasta", | 116 p_man.add_argument("--fasta", |
| 101 dest="in_fa", | 117 dest="in_fa", |
| 102 type=str, | 118 type=str, |
| 103 required = True, | 119 required=True, |
| 104 help = "Sequences .fa file to predict on (option -fasta)") | 120 help="Sequences .fa file to predict" |
| 121 " on (option -fasta)") | |
| 105 p_man.add_argument("--model", | 122 p_man.add_argument("--model", |
| 106 dest="in_model", | 123 dest="in_model", |
| 107 type=str, | 124 type=str, |
| 108 required = True, | 125 required=True, |
| 109 help = "GraphProt model file to use for predictions (option -model)") | 126 help="GraphProt model file to use for predictions" |
| 127 " (option -model)") | |
| 110 p_man.add_argument("--params", | 128 p_man.add_argument("--params", |
| 111 dest="in_params", | 129 dest="in_params", |
| 112 type=str, | 130 type=str, |
| 113 required = True, | 131 required=True, |
| 114 help = "Parameter file for given model") | 132 help="Parameter file for given model") |
| 115 p_man.add_argument("--data-id", | 133 p_man.add_argument("--data-id", |
| 116 dest="data_id", | 134 dest="data_id", |
| 117 type=str, | 135 type=str, |
| 118 required = True, | 136 required=True, |
| 119 help = "Data ID (option -prefix)") | 137 help="Data ID (option -prefix)") |
| 120 # ---> I'm a conditional argument <--- | 138 # ---> I'm a conditional argument <--- |
| 121 p_opt.add_argument("--ws-pred", | 139 p_opt.add_argument("--ws-pred", |
| 122 dest = "ws_pred", | 140 dest="ws_pred", |
| 123 default = False, | 141 default=False, |
| 124 action = "store_true", | 142 action="store_true", |
| 125 help = "Run a whole site prediction instead of calculating profiles (default: false)") | 143 help="Run a whole site prediction instead " |
| 144 "of calculating profiles (default: false)") | |
| 126 # Additional arguments. | 145 # Additional arguments. |
| 127 p_opt.add_argument("--sc-thr", | 146 p_opt.add_argument("--sc-thr", |
| 128 dest="score_thr", | 147 dest="score_thr", |
| 129 type = float, | 148 type=float, |
| 130 default = 0, | 149 default=0, |
| 131 help = "Score threshold for extracting average profile peak regions (default: 0)") | 150 help="Score threshold for extracting " |
| 151 "average profile peak regions (default: 0)") | |
| 132 p_opt.add_argument("--max-merge-dist", | 152 p_opt.add_argument("--max-merge-dist", |
| 133 dest="max_merge_dist", | 153 dest="max_merge_dist", |
| 134 type = int, | 154 type=int, |
| 135 default = 0, | 155 default=0, |
| 136 choices = [0,1,2,3,4,5,6,7,8,9,10], | 156 choices=[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10], |
| 137 help = "Maximum merge distance for nearby peak regions (default: report all non-overlapping regions)") | 157 help="Maximum merge distance for nearby peak regions" |
| 158 " (default: report all non-overlapping regions)") | |
| 138 p_opt.add_argument("--gen-site-bed", | 159 p_opt.add_argument("--gen-site-bed", |
| 139 dest="genomic_sites_bed", | 160 dest="genomic_sites_bed", |
| 140 type=str, | 161 type=str, |
| 141 help = ".bed file specifying the genomic regions of the input .fa sequences. Corrupt .bed information will be punished (default: false)") | 162 help=".bed file specifying the genomic regions of " |
| 163 "the input .fa sequences. Corrupt .bed " | |
| 164 "information will be punished (default: false)") | |
| 142 p_opt.add_argument("--conf-out", | 165 p_opt.add_argument("--conf-out", |
| 143 dest="conf_out", | 166 dest="conf_out", |
| 144 default = False, | 167 default=False, |
| 145 action = "store_true", | 168 action="store_true", |
| 146 help = "Output filtered peak regions BED file or predictions file (if --ws-pred) using the median positive training site score for filtering (default: false)") | 169 help="Output filtered peak regions BED file or " |
| 170 "predictions file (if --ws-pred) using the median " | |
| 171 "positive training site score for filtering " | |
| 172 "(default: false)") | |
| 147 p_opt.add_argument("--gp-output", | 173 p_opt.add_argument("--gp-output", |
| 148 dest = "gp_output", | 174 dest="gp_output", |
| 149 default = False, | 175 default=False, |
| 150 action = "store_true", | 176 action="store_true", |
| 151 help = "Print output produced by GraphProt (default: false)") | 177 help="Print output produced by GraphProt " |
| 178 "(default: false)") | |
| 152 p_opt.add_argument("--ap-extlr", | 179 p_opt.add_argument("--ap-extlr", |
| 153 dest="ap_extlr", | 180 dest="ap_extlr", |
| 154 type = int, | 181 type=int, |
| 155 default = 5, | 182 default=5, |
| 156 choices = [0,1,2,3,4,5,6,7,8,9,10], | 183 choices=[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10], |
| 157 help = "Define average profile up- and downstream extension to produce the average profile. The mean over small sequence windows (window length = --ap-extlr*2 + 1) is used to get position scores, thus the average profile is more smooth than the initial profile output by GraphProt (default: 5)") | 184 help="Define average profile up- and downstream " |
| 185 "extension to produce the average profile. " | |
| 186 "The mean over small sequence windows " | |
| 187 "(window length = --ap-extlr*2 + 1) is used to " | |
| 188 "get position scores, thus the average profile " | |
| 189 "is more smooth than the initial profile output " | |
| 190 "by GraphProt (default: 5)") | |
| 158 return p | 191 return p |
| 159 | 192 |
| 160 | 193 |
| 161 ################################################################################ | 194 ############################################################################### |
| 162 | 195 |
| 163 if __name__ == '__main__': | 196 if __name__ == '__main__': |
| 164 | 197 |
| 165 # Setup argparse. | 198 # Setup argparse. |
| 166 parser = setup_argument_parser() | 199 parser = setup_argument_parser() |
| 167 # Read in command line arguments. | 200 # Read in command line arguments. |
| 168 args = parser.parse_args() | 201 args = parser.parse_args() |
| 169 | 202 |
| 170 """ | 203 """ |
| 171 Do all sorts of sanity checking. | 204 Do all sorts of sanity checking. |
| 172 | 205 |
| 173 """ | 206 """ |
| 174 # Check for Linux. | 207 # Check for Linux. |
| 175 assert "linux" in sys.platform, "please use Linux" | 208 assert "linux" in sys.platform, "please use Linux" |
| 176 # Check tool availability. | 209 # Check tool availability. |
| 177 assert gplib.is_tool("GraphProt.pl"), "GraphProt.pl not in PATH" | 210 assert gplib.is_tool("GraphProt.pl"), "GraphProt.pl not in PATH" |
| 178 # Check file inputs. | 211 # Check file inputs. |
| 179 assert os.path.exists(args.in_fa), "input .fa file \"%s\" not found" %(args.in_fa) | 212 assert os.path.exists(args.in_fa), \ |
| 180 assert os.path.exists(args.in_model), "input .model file \"%s\" not found" %(args.in_model) | 213 "input .fa file \"%s\" not found" % (args.in_fa) |
| 181 assert os.path.exists(args.in_params), "input .params file \"%s\" not found" %(args.in_params) | 214 assert os.path.exists(args.in_model), \ |
| 215 "input .model file \"%s\" not found" % (args.in_model) | |
| 216 assert os.path.exists(args.in_params), \ | |
| 217 "input .params file \"%s\" not found" % (args.in_params) | |
| 182 # Count .fa entries. | 218 # Count .fa entries. |
| 183 c_in_fa = gplib.count_fasta_headers(args.in_fa) | 219 c_in_fa = gplib.count_fasta_headers(args.in_fa) |
| 184 assert c_in_fa, "input .fa file \"%s\" no headers found" %(args.in_fa) | 220 assert c_in_fa, "input .fa file \"%s\" no headers found" % (args.in_fa) |
| 185 print("# input .fa sequences: %i" %(c_in_fa)) | 221 print("# input .fa sequences: %i" % (c_in_fa)) |
| 186 # Read in FASTA sequences to check for uppercase sequences. | 222 # Read in FASTA sequences to check for uppercase sequences. |
| 187 seqs_dic = gplib.read_fasta_into_dic(args.in_fa) | 223 seqs_dic = gplib.read_fasta_into_dic(args.in_fa) |
| 188 c_uc_nt = gplib.seqs_dic_count_uc_nts(seqs_dic) | 224 c_uc_nt = gplib.seqs_dic_count_uc_nts(seqs_dic) |
| 189 assert c_uc_nt, "no uppercase nucleotides in input .fa sequences. Please change sequences to uppercase (keep in mind GraphProt only scores uppercase regions (according to its viewpoint concept)" | 225 assert c_uc_nt, "no uppercase nucleotides in input .fa sequences. "\ |
| 226 "Please change sequences to uppercase (keep in mind "\ | |
| 227 "GraphProt only scores uppercase regions (according "\ | |
| 228 "to its viewpoint concept)" | |
| 190 if not args.ws_pred: | 229 if not args.ws_pred: |
| 191 # Check for lowercase sequences. | 230 # Check for lowercase sequences. |
| 192 c_lc_nt = gplib.seqs_dic_count_lc_nts(seqs_dic) | 231 c_lc_nt = gplib.seqs_dic_count_lc_nts(seqs_dic) |
| 193 assert not c_lc_nt, "lowercase nucleotides in input .fa not allowed in profile predictions, since GraphProt only scores uppercase regions (according to its viewpoint concept)" | 232 assert not c_lc_nt, "lowercase nucleotides in input .fa not allowed"\ |
| 233 " in profile predictions, since GraphProt only scores "\ | |
| 234 "uppercase regions (according to its viewpoint concept)" | |
| 194 # Check .bed. | 235 # Check .bed. |
| 195 if args.genomic_sites_bed: | 236 if args.genomic_sites_bed: |
| 196 # An array of checks, marvelous. | 237 # An array of checks, marvelous. |
| 197 assert os.path.exists(args.genomic_sites_bed), "genomic .bed file \"%s\" not found" %(args.genomic_sites_bed) | 238 assert \ |
| 239 os.path.exists(args.genomic_sites_bed), \ | |
| 240 "genomic .bed file \"%s\" not found" % (args.genomic_sites_bed) | |
| 198 # Check .bed for content. | 241 # Check .bed for content. |
| 199 assert gplib.count_file_rows(args.genomic_sites_bed), "genomic .bed file \"%s\" is empty" %(args.genomic_sites_bed) | 242 assert gplib.count_file_rows(args.genomic_sites_bed), \ |
| 243 "genomic .bed file \"%s\" is empty" % (args.genomic_sites_bed) | |
| 200 # Check .bed for 6-column format. | 244 # Check .bed for 6-column format. |
| 201 assert gplib.bed_check_six_col_format(args.genomic_sites_bed), "genomic .bed file \"%s\" appears to not be in 6-column .bed format" %(args.genomic_sites_bed) | 245 assert gplib.bed_check_six_col_format(args.genomic_sites_bed), \ |
| 246 "genomic .bed file \"%s\" appears to not "\ | |
| 247 "be in 6-column .bed format" % (args.genomic_sites_bed) | |
| 202 # Check for unique column 4 IDs. | 248 # Check for unique column 4 IDs. |
| 203 assert gplib.bed_check_unique_ids(args.genomic_sites_bed), "genomic .bed file \"%s\" column 4 IDs not unique" %(args.genomic_sites_bed) | 249 assert gplib.bed_check_unique_ids(args.genomic_sites_bed), \ |
| 204 # Read in .bed regions, compare to FASTA sequences (compare IDs + lengths) | 250 "genomic .bed file \"%s\" column 4 IDs not unique" % \ |
| 251 (args.genomic_sites_bed) | |
| 252 # Read in .bed regions, compare to FASTA sequences. | |
| 205 seq_len_dic = gplib.get_seq_lengths_from_seqs_dic(seqs_dic) | 253 seq_len_dic = gplib.get_seq_lengths_from_seqs_dic(seqs_dic) |
| 206 reg_len_dic = gplib.bed_get_region_lengths(args.genomic_sites_bed) | 254 reg_len_dic = gplib.bed_get_region_lengths(args.genomic_sites_bed) |
| 207 for seq_id in seq_len_dic: | 255 for seq_id in seq_len_dic: |
| 208 seq_l = seq_len_dic[seq_id] | 256 seq_l = seq_len_dic[seq_id] |
| 209 assert seq_id in reg_len_dic, "sequence ID \"\" missing in input .bed \"\"" %(seq_id, args.genomic_sites_bed) | 257 assert seq_id in reg_len_dic, \ |
| 258 "sequence ID \"%s\" missing in input .bed \"%s\"" % \ | |
| 259 (seq_id, args.genomic_sites_bed) | |
| 210 reg_l = reg_len_dic[seq_id] | 260 reg_l = reg_len_dic[seq_id] |
| 211 assert seq_l == reg_l, "sequence length differs from .bed region length (%i != %i)" %(seq_l, reg_l) | 261 assert seq_l == reg_l, \ |
| 262 "sequence length differs from .bed region length (%i != %i)" \ | |
| 263 % (seq_l, reg_l) | |
| 212 # Read in model parameters. | 264 # Read in model parameters. |
| 213 param_dic = gplib.graphprot_get_param_dic(args.in_params) | 265 param_dic = gplib.graphprot_get_param_dic(args.in_params) |
| 214 # Create GraphProt parameter string. | 266 # Create GraphProt parameter string. |
| 215 param_string = gplib.graphprot_get_param_string(args.in_params) | 267 param_string = gplib.graphprot_get_param_string(args.in_params) |
| 216 | 268 |
| 217 """ | 269 """ |
| 218 Run predictions. | 270 Run predictions. |
| 219 | 271 |
| 220 """ | 272 """ |
| 221 if args.ws_pred: | 273 if args.ws_pred: |
| 222 # Do whole site prediction. | 274 # Do whole site prediction. |
| 223 print("Starting whole site predictions on input .fa file (-action predict) ... ") | 275 print("Starting whole site predictions on input .fa file" |
| 224 check_cmd = "GraphProt.pl -action predict -prefix " + args.data_id + " -fasta " + args.in_fa + " " + param_string + " -model " + args.in_model | 276 " (-action predict) ... ") |
| 277 check_cmd = "GraphProt.pl -action predict -prefix " \ | |
| 278 + args.data_id + " -fasta " + args.in_fa + " " \ | |
| 279 + param_string + " -model " + args.in_model | |
| 225 output = subprocess.getoutput(check_cmd) | 280 output = subprocess.getoutput(check_cmd) |
| 226 assert output, "the following call of GraphProt.pl produced no output:\n%s" %(check_cmd) | 281 assert output, "the following call of GraphProt.pl "\ |
| 282 "produced no output:\n%s" % (check_cmd) | |
| 227 if args.gp_output: | 283 if args.gp_output: |
| 228 print(output) | 284 print(output) |
| 229 ws_predictions_file = args.data_id + ".predictions" | 285 ws_predictions_file = args.data_id + ".predictions" |
| 230 assert os.path.exists(ws_predictions_file), "Whole site prediction output .predictions file \"%s\" not found" %(ws_predictions_file) | 286 assert os.path.exists(ws_predictions_file), \ |
| 287 "Whole site prediction output .predictions file \"%s\" not found" \ | |
| 288 % (ws_predictions_file) | |
| 231 if args.conf_out: | 289 if args.conf_out: |
| 232 # Filter by pos_train_ws_pred_median median. | 290 # Filter by pos_train_ws_pred_median median. |
| 233 assert "pos_train_ws_pred_median" in param_dic, "whole site top scores median information missing in .params file" | 291 assert "pos_train_ws_pred_median" in param_dic, \ |
| 234 pos_train_ws_pred_median = float(param_dic["pos_train_ws_pred_median"]) | 292 "whole site top scores median information "\ |
| 293 "missing in .params file" | |
| 294 pos_tr_ws_pred_med = \ | |
| 295 float(param_dic["pos_train_ws_pred_median"]) | |
| 235 # Filtered file. | 296 # Filtered file. |
| 236 filt_ws_predictions_file = args.data_id + ".p50.predictions" | 297 filt_ws_predictions_file = args.data_id + ".p50.predictions" |
| 237 print("Extracting p50 sites from whole site predictions (score threshold = %f) ... " %(pos_train_ws_pred_median)) | 298 print("Extracting p50 sites from whole site predictions" |
| 238 gplib.graphprot_filter_predictions_file(ws_predictions_file, filt_ws_predictions_file, | 299 " (score threshold = %f) ... " % (pos_tr_ws_pred_med)) |
| 239 sc_thr=pos_train_ws_pred_median) | 300 gplib.graphprot_filter_predictions_file(ws_predictions_file, |
| 301 filt_ws_predictions_file, | |
| 302 sc_thr=pos_tr_ws_pred_med) | |
| 240 else: | 303 else: |
| 241 # Do profile prediction. | 304 # Do profile prediction. |
| 242 print("Starting profile predictions on on input .fa file (-action predict_profile) ... ") | 305 print("Starting profile predictions on on input .fa file " |
| 243 check_cmd = "GraphProt.pl -action predict_profile -prefix " + args.data_id + " -fasta " + args.in_fa + " " + param_string + " -model " + args.in_model | 306 "(-action predict_profile) ... ") |
| 307 check_cmd = "GraphProt.pl -action predict_profile -prefix " \ | |
| 308 + args.data_id + " -fasta " + args.in_fa + " " + \ | |
| 309 param_string + " -model " + args.in_model | |
| 244 output = subprocess.getoutput(check_cmd) | 310 output = subprocess.getoutput(check_cmd) |
| 245 assert output, "the following call of GraphProt.pl produced no output:\n%s" %(check_cmd) | 311 assert output, "the following call of GraphProt.pl produced "\ |
| 312 "no output:\n%s" % (check_cmd) | |
| 246 if args.gp_output: | 313 if args.gp_output: |
| 247 print(output) | 314 print(output) |
| 248 profile_predictions_file = args.data_id + ".profile" | 315 profile_predictions_file = args.data_id + ".profile" |
| 249 assert os.path.exists(profile_predictions_file), "Profile prediction output .profile file \"%s\" not found" %(profile_predictions_file) | 316 assert os.path.exists(profile_predictions_file), \ |
| 317 "Profile prediction output .profile file \"%s\" not found" % \ | |
| 318 (profile_predictions_file) | |
| 250 | 319 |
| 251 # Profile prediction output files. | 320 # Profile prediction output files. |
| 252 avg_prof_file = args.data_id + ".avg_profile" | 321 avg_prof_file = args.data_id + ".avg_profile" |
| 253 avg_prof_peaks_file = args.data_id + ".avg_profile.peaks.bed" | 322 avg_prof_peaks_file = args.data_id + ".avg_profile.peaks.bed" |
| 254 avg_prof_gen_peaks_file = args.data_id + ".avg_profile.genomic_peaks.bed" | 323 avg_prof_gen_peaks_file = args.data_id + \ |
| 255 avg_prof_peaks_p50_file = args.data_id + ".avg_profile.p50.peaks.bed" | 324 ".avg_profile.genomic_peaks.bed" |
| 256 avg_prof_gen_peaks_p50_file = args.data_id + ".avg_profile.p50.genomic_peaks.bed" | 325 avg_prof_peaks_p50_file = args.data_id + \ |
| 326 ".avg_profile.p50.peaks.bed" | |
| 327 avg_prof_gen_peaks_p50_file = args.data_id + \ | |
| 328 ".avg_profile.p50.genomic_peaks.bed" | |
| 257 | 329 |
| 258 # Get sequence IDs in order from input .fa file. | 330 # Get sequence IDs in order from input .fa file. |
| 259 seq_ids_list = gplib.fasta_read_in_ids(args.in_fa) | 331 seq_ids_list = gplib.fasta_read_in_ids(args.in_fa) |
| 260 # Calculate average profiles. | 332 # Calculate average profiles. |
| 261 print("Getting average profile from profile (extlr for smoothing: %i) ... " %(args.ap_extlr)) | 333 print("Getting average profile from profile " |
| 262 gplib.graphprot_profile_calculate_avg_profile(profile_predictions_file, | 334 "(extlr for smoothing: %i) ... " % (args.ap_extlr)) |
| 263 avg_prof_file, | 335 gplib.graphprot_profile_calc_avg_profile(profile_predictions_file, |
| 264 ap_extlr=args.ap_extlr, | 336 avg_prof_file, |
| 265 seq_ids_list=seq_ids_list, | 337 ap_extlr=args.ap_extlr, |
| 266 method=2) | 338 seq_ids_list=seq_ids_list, |
| 339 method=2) | |
| 267 # Extract peak regions on sequences with threshold score 0. | 340 # Extract peak regions on sequences with threshold score 0. |
| 268 print("Extracting peak regions from average profile (score threshold = 0) ... ") | 341 print("Extracting peak regions from average profile " |
| 269 gplib.graphprot_profile_extract_peak_regions(avg_prof_file, avg_prof_peaks_file, | 342 "(score threshold = 0) ... ") |
| 270 max_merge_dist=args.max_merge_dist, | 343 killpep8 = args.max_merge_dist |
| 271 sc_thr=args.score_thr) | 344 gplib.graphprot_profile_extract_peak_regions(avg_prof_file, |
| 345 avg_prof_peaks_file, | |
| 346 max_merge_dist=killpep8, | |
| 347 sc_thr=args.score_thr) | |
| 272 # Convert peaks to genomic coordinates. | 348 # Convert peaks to genomic coordinates. |
| 273 if args.genomic_sites_bed: | 349 if args.genomic_sites_bed: |
| 274 print("Converting peak regions to genomic coordinates ... ") | 350 print("Converting peak regions to genomic coordinates ... ") |
| 275 gplib.bed_peaks_to_genomic_peaks(avg_prof_peaks_file, avg_prof_gen_peaks_file, | 351 killit = args.genomic_sites_bed |
| 352 gplib.bed_peaks_to_genomic_peaks(avg_prof_peaks_file, | |
| 353 avg_prof_gen_peaks_file, | |
| 276 print_rows=False, | 354 print_rows=False, |
| 277 genomic_sites_bed=args.genomic_sites_bed) | 355 genomic_sites_bed=killit) |
| 278 # gplib.make_file_copy(avg_prof_gen_peaks_file, avg_prof_peaks_file) | |
| 279 # Extract peak regions with threshold score p50. | 356 # Extract peak regions with threshold score p50. |
| 280 if args.conf_out: | 357 if args.conf_out: |
| 281 sc_id = "pos_train_avg_profile_median_%i" %(args.ap_extlr) | 358 sc_id = "pos_train_avg_profile_median_%i" % (args.ap_extlr) |
| 282 # Filter by pos_train_ws_pred_median median. | 359 # Filter by pos_tr_ws_pred_med median. |
| 283 assert sc_id in param_dic, "average profile extlr %i median information missing in .params file" %(args.ap_extlr) | 360 assert sc_id in param_dic, "average profile extlr %i median "\ |
| 361 "information missing in .params file" % (args.ap_extlr) | |
| 284 p50_sc_thr = float(param_dic[sc_id]) | 362 p50_sc_thr = float(param_dic[sc_id]) |
| 285 print("Extracting p50 peak regions from average profile (score threshold = %f) ... " %(p50_sc_thr)) | 363 print("Extracting p50 peak regions from average profile " |
| 286 gplib.graphprot_profile_extract_peak_regions(avg_prof_file, avg_prof_peaks_p50_file, | 364 "(score threshold = %f) ... " % (p50_sc_thr)) |
| 287 max_merge_dist=args.max_merge_dist, | 365 despair = avg_prof_peaks_p50_file |
| 366 pain = args.max_merge_dist | |
| 367 gplib.graphprot_profile_extract_peak_regions(avg_prof_file, | |
| 368 despair, | |
| 369 max_merge_dist=pain, | |
| 288 sc_thr=p50_sc_thr) | 370 sc_thr=p50_sc_thr) |
| 289 # Convert peaks to genomic coordinates. | 371 # Convert peaks to genomic coordinates. |
| 290 if args.genomic_sites_bed: | 372 if args.genomic_sites_bed: |
| 291 print("Converting p50 peak regions to genomic coordinates ... ") | 373 print("Converting p50 peak regions to " |
| 292 gplib.bed_peaks_to_genomic_peaks(avg_prof_peaks_p50_file, avg_prof_gen_peaks_p50_file, | 374 "genomic coordinates ... ") |
| 293 genomic_sites_bed=args.genomic_sites_bed) | 375 madness = args.genomic_sites_bed |
| 376 gplib.bed_peaks_to_genomic_peaks(avg_prof_peaks_p50_file, | |
| 377 avg_prof_gen_peaks_p50_file, | |
| 378 genomic_sites_bed=madness) | |
| 294 # Done. | 379 # Done. |
| 295 print("Script: I'm done.") | 380 print("Script: I'm done.") |
| 296 print("Author: ... ") | 381 print("Author: ... ") |
| 297 | |
| 298 |
