comparison graphprot_predict_wrapper.py @ 5:ddcf35a868b8 draft

planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/rna_tools/graphprot commit ad60258f5759eaa205fec4af6143c728ea131419
author bgruening
date Wed, 05 Jun 2024 16:40:51 +0000
parents 4ad83aed5c3c
children
--- graphprot_predict_wrapper.py (parent 4:4ad83aed5c3c)
+++ graphprot_predict_wrapper.py (this changeset 5:ddcf35a868b8)
@@ old 4-14 / new 4-13 @@
 import os
 import subprocess
 import sys

 import gplib
-

 """

 TOOL DEPENDENCIES
 =================
@@ old 79-89 / new 78-89 @@
 --sc-thr 0.0 --max-merge-dist 0 --conf-out --ap-extlr 5

 """


-###############################################################################
+#######################################################################
+

 def setup_argument_parser():
     """Setup argparse parser."""
     help_description = """
     Galaxy wrapper script for GraphProt (-action predict and -action
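The example options carried over from the module docstring above (--sc-thr 0.0 --max-merge-dist 0 --conf-out --ap-extlr 5) map directly onto the argparse interface defined in the next hunk. As a minimal sketch of how such a call is parsed, assuming the wrapper module and gplib are importable and using placeholder file names (this sketch is not part of the changeset):

from graphprot_predict_wrapper import setup_argument_parser

# Placeholder inputs for illustration; any existing .fa/.model/.params files would do.
argv = [
    "--fasta", "test.fa",
    "--model", "test.model",
    "--params", "test.params",
    "--data-id", "test_run",
    "--sc-thr", "0.0",
    "--max-merge-dist", "0",
    "--conf-out",
    "--ap-extlr", "5",
]
args = setup_argument_parser().parse_args(argv)
print(args.score_thr, args.max_merge_dist, args.conf_out, args.ap_extlr)
# 0.0 0 True 5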
@@ old 98-201 / new 98-220 @@
     If --gen-site-bed .bed file is provided, peak regions will be output
     with genomic coordinates too.

     """
     # Define argument parser.
-    p = ap.ArgumentParser(add_help=False,
-                          prog="graphprot_predict_wrapper.py",
-                          description=help_description,
-                          formatter_class=ap.MetavarTypeHelpFormatter)
+    p = ap.ArgumentParser(
+        add_help=False,
+        prog="graphprot_predict_wrapper.py",
+        description=help_description,
+        formatter_class=ap.MetavarTypeHelpFormatter,
+    )

     # Argument groups.
     p_man = p.add_argument_group("REQUIRED ARGUMENTS")
     p_opt = p.add_argument_group("OPTIONAL ARGUMENTS")

     # Required arguments.
-    p_opt.add_argument("-h", "--help",
-                       action="help",
-                       help="Print help message")
-    p_man.add_argument("--fasta",
-                       dest="in_fa",
-                       type=str,
-                       required=True,
-                       help="Sequences .fa file to predict"
-                            " on (option -fasta)")
-    p_man.add_argument("--model",
-                       dest="in_model",
-                       type=str,
-                       required=True,
-                       help="GraphProt model file to use for predictions"
-                            " (option -model)")
-    p_man.add_argument("--params",
-                       dest="in_params",
-                       type=str,
-                       required=True,
-                       help="Parameter file for given model")
-    p_man.add_argument("--data-id",
-                       dest="data_id",
-                       type=str,
-                       required=True,
-                       help="Data ID (option -prefix)")
+    p_opt.add_argument("-h", "--help", action="help", help="Print help message")
+    p_man.add_argument(
+        "--fasta",
+        dest="in_fa",
+        type=str,
+        required=True,
+        help="Sequences .fa file to predict" " on (option -fasta)",
+    )
+    p_man.add_argument(
+        "--model",
+        dest="in_model",
+        type=str,
+        required=True,
+        help="GraphProt model file to use for predictions" " (option -model)",
+    )
+    p_man.add_argument(
+        "--params",
+        dest="in_params",
+        type=str,
+        required=True,
+        help="Parameter file for given model",
+    )
+    p_man.add_argument(
+        "--data-id",
+        dest="data_id",
+        type=str,
+        required=True,
+        help="Data ID (option -prefix)",
+    )
     # ---> I'm a conditional argument <---
-    p_opt.add_argument("--ws-pred",
-                       dest="ws_pred",
-                       default=False,
-                       action="store_true",
-                       help="Run a whole site prediction instead "
-                            "of calculating profiles (default: false)")
+    p_opt.add_argument(
+        "--ws-pred",
+        dest="ws_pred",
+        default=False,
+        action="store_true",
+        help="Run a whole site prediction instead "
+        "of calculating profiles (default: false)",
+    )
     # Additional arguments.
-    p_opt.add_argument("--sc-thr",
-                       dest="score_thr",
-                       type=float,
-                       default=0,
-                       help="Score threshold for extracting "
-                            "average profile peak regions (default: 0)")
-    p_opt.add_argument("--max-merge-dist",
-                       dest="max_merge_dist",
-                       type=int,
-                       default=0,
-                       choices=[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
-                       help="Maximum merge distance for nearby peak regions"
-                            " (default: report all non-overlapping regions)")
-    p_opt.add_argument("--gen-site-bed",
-                       dest="genomic_sites_bed",
-                       type=str,
-                       help=".bed file specifying the genomic regions of "
-                            "the input .fa sequences. Corrupt .bed "
-                            "information will be punished (default: false)")
-    p_opt.add_argument("--conf-out",
-                       dest="conf_out",
-                       default=False,
-                       action="store_true",
-                       help="Output filtered peak regions BED file or "
-                            "predictions file (if --ws-pred) using the median "
-                            "positive training site score for filtering "
-                            "(default: false)")
-    p_opt.add_argument("--gp-output",
-                       dest="gp_output",
-                       default=False,
-                       action="store_true",
-                       help="Print output produced by GraphProt "
-                            "(default: false)")
-    p_opt.add_argument("--ap-extlr",
-                       dest="ap_extlr",
-                       type=int,
-                       default=5,
-                       choices=[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
-                       help="Define average profile up- and downstream "
-                            "extension to produce the average profile. "
-                            "The mean over small sequence windows "
-                            "(window length = --ap-extlr*2 + 1) is used to "
-                            "get position scores, thus the average profile "
-                            "is more smooth than the initial profile output "
-                            "by GraphProt (default: 5)")
+    p_opt.add_argument(
+        "--sc-thr",
+        dest="score_thr",
+        type=float,
+        default=0,
+        help="Score threshold for extracting "
+        "average profile peak regions (default: 0)",
+    )
+    p_opt.add_argument(
+        "--max-merge-dist",
+        dest="max_merge_dist",
+        type=int,
+        default=0,
+        choices=[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
+        help="Maximum merge distance for nearby peak regions"
+        " (default: report all non-overlapping regions)",
+    )
+    p_opt.add_argument(
+        "--gen-site-bed",
+        dest="genomic_sites_bed",
+        type=str,
+        help=".bed file specifying the genomic regions of "
+        "the input .fa sequences. Corrupt .bed "
+        "information will be punished (default: false)",
+    )
+    p_opt.add_argument(
+        "--conf-out",
+        dest="conf_out",
+        default=False,
+        action="store_true",
+        help="Output filtered peak regions BED file or "
+        "predictions file (if --ws-pred) using the median "
+        "positive training site score for filtering "
+        "(default: false)",
+    )
+    p_opt.add_argument(
+        "--gp-output",
+        dest="gp_output",
+        default=False,
+        action="store_true",
+        help="Print output produced by GraphProt " "(default: false)",
+    )
+    p_opt.add_argument(
+        "--ap-extlr",
+        dest="ap_extlr",
+        type=int,
+        default=5,
+        choices=[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
+        help="Define average profile up- and downstream "
+        "extension to produce the average profile. "
+        "The mean over small sequence windows "
+        "(window length = --ap-extlr*2 + 1) is used to "
+        "get position scores, thus the average profile "
+        "is more smooth than the initial profile output "
+        "by GraphProt (default: 5)",
+    )
     return p


-###############################################################################
+#######################################################################

-if __name__ == '__main__':
+if __name__ == "__main__":

     # Setup argparse.
     parser = setup_argument_parser()
     # Read in command line arguments.
     args = parser.parse_args()
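The --ap-extlr help text above describes the smoothing that turns GraphProt's raw profile into the average profile: each position gets the mean score over a window of length 2*ap_extlr + 1. A minimal illustrative sketch of that idea (this is not gplib.graphprot_profile_calc_avg_profile; truncating the window at the sequence ends is an assumption made here):

def average_profile(scores, ap_extlr=5):
    """Smooth per-position scores with a centered window of length 2*ap_extlr + 1."""
    n = len(scores)
    smoothed = []
    for i in range(n):
        start = max(0, i - ap_extlr)      # truncate window at sequence start (assumption)
        end = min(n, i + ap_extlr + 1)    # truncate window at sequence end (assumption)
        window = scores[start:end]
        smoothed.append(sum(window) / len(window))
    return smoothed

# A single high-scoring position gets spread over its neighborhood.
print(average_profile([0, 0, 0, 10, 0, 0, 0], ap_extlr=2))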
@@ old 207-393 / new 226-465 @@
     # Check for Linux.
     assert "linux" in sys.platform, "please use Linux"
     # Check tool availability.
     assert gplib.is_tool("GraphProt.pl"), "GraphProt.pl not in PATH"
     # Check file inputs.
-    assert os.path.exists(args.in_fa), \
-        "input .fa file \"%s\" not found" % (args.in_fa)
-    assert os.path.exists(args.in_model), \
-        "input .model file \"%s\" not found" % (args.in_model)
-    assert os.path.exists(args.in_params), \
-        "input .params file \"%s\" not found" % (args.in_params)
+    assert os.path.exists(args.in_fa), 'input .fa file "%s" not found' % (args.in_fa)
+    assert os.path.exists(args.in_model), 'input .model file "%s" not found' % (
+        args.in_model
+    )
+    assert os.path.exists(args.in_params), 'input .params file "%s" not found' % (
+        args.in_params
+    )
     # Count .fa entries.
     c_in_fa = gplib.count_fasta_headers(args.in_fa)
-    assert c_in_fa, "input .fa file \"%s\" no headers found" % (args.in_fa)
+    assert c_in_fa, 'input .fa file "%s" no headers found' % (args.in_fa)
     print("# input .fa sequences: %i" % (c_in_fa))
     # Read in FASTA sequences to check for uppercase sequences.
     seqs_dic = gplib.read_fasta_into_dic(args.in_fa)
     # Check for lowercase only sequences, which cause GP to crash.
-    error_mess = "input sequences encountered containing "\
-        "only lowercase characters or lowercase characters in between "\
-        "uppercase characters. Please provide either all uppercase "\
-        "sequences or sequences containing uppercase regions surrounded "\
-        "by lowercase context regions for structure calculation (see "\
-        "viewpoint concept in original GraphProt publication "\
-        "for more details)"
+    error_mess = (
+        "input sequences encountered containing "
+        "only lowercase characters or lowercase characters in between "
+        "uppercase characters. Please provide either all uppercase "
+        "sequences or sequences containing uppercase regions surrounded "
+        "by lowercase context regions for structure calculation (see "
+        "viewpoint concept in original GraphProt publication "
+        "for more details)"
+    )
     if args.ws_pred:
         bad_ids = gplib.check_seqs_dic_format(seqs_dic)
         assert not bad_ids, "%s" % (error_mess)

     c_uc_nt = gplib.seqs_dic_count_uc_nts(seqs_dic)
-    assert c_uc_nt, "no uppercase nucleotides in input .fa sequences. "\
-        "Please change sequences to uppercase (keep in mind "\
-        "GraphProt only scores uppercase regions (according "\
-        "to its viewpoint concept)"
+    assert c_uc_nt, (
+        "no uppercase nucleotides in input .fa sequences. "
+        "Please change sequences to uppercase (keep in mind "
+        "GraphProt only scores uppercase regions (according "
+        "to its viewpoint concept)"
+    )
     if not args.ws_pred:
         # Check for lowercase sequences.
         c_lc_nt = gplib.seqs_dic_count_lc_nts(seqs_dic)
-        assert not c_lc_nt, "lowercase nucleotides in input .fa not allowed"\
-            " in profile predictions, since GraphProt only scores "\
-            "uppercase regions (according to its viewpoint concept)"
+        assert not c_lc_nt, (
+            "lowercase nucleotides in input .fa not allowed"
+            " in profile predictions, since GraphProt only scores "
+            "uppercase regions (according to its viewpoint concept)"
+        )
     # Check .bed.
     if args.genomic_sites_bed:
         # An array of checks, marvelous.
-        assert \
-            os.path.exists(args.genomic_sites_bed), \
-            "genomic .bed file \"%s\" not found" % (args.genomic_sites_bed)
+        assert os.path.exists(
+            args.genomic_sites_bed
+        ), 'genomic .bed file "%s" not found' % (args.genomic_sites_bed)
         # Check .bed for content.
-        assert gplib.count_file_rows(args.genomic_sites_bed), \
-            "genomic .bed file \"%s\" is empty" % (args.genomic_sites_bed)
+        assert gplib.count_file_rows(
+            args.genomic_sites_bed
+        ), 'genomic .bed file "%s" is empty' % (args.genomic_sites_bed)
         # Check .bed for 6-column format.
-        assert gplib.bed_check_six_col_format(args.genomic_sites_bed), \
-            "genomic .bed file \"%s\" appears to not "\
-            "be in 6-column .bed format" % (args.genomic_sites_bed)
+        assert gplib.bed_check_six_col_format(
+            args.genomic_sites_bed
+        ), 'genomic .bed file "%s" appears to not ' "be in 6-column .bed format" % (
+            args.genomic_sites_bed
+        )
         # Check for unique column 4 IDs.
-        assert gplib.bed_check_unique_ids(args.genomic_sites_bed), \
-            "genomic .bed file \"%s\" column 4 IDs not unique" % \
-            (args.genomic_sites_bed)
+        assert gplib.bed_check_unique_ids(
+            args.genomic_sites_bed
+        ), 'genomic .bed file "%s" column 4 IDs not unique' % (args.genomic_sites_bed)
         # Read in .bed regions, compare to FASTA sequences.
         seq_len_dic = gplib.get_seq_lengths_from_seqs_dic(seqs_dic)
         reg_len_dic = gplib.bed_get_region_lengths(args.genomic_sites_bed)
         for seq_id in seq_len_dic:
             seq_l = seq_len_dic[seq_id]
-            assert seq_id in reg_len_dic, \
-                "sequence ID \"%s\" missing in input .bed \"%s\"" % \
-                (seq_id, args.genomic_sites_bed)
+            assert (
+                seq_id in reg_len_dic
+            ), 'sequence ID "%s" missing in input .bed "%s"' % (
+                seq_id,
+                args.genomic_sites_bed,
+            )
             reg_l = reg_len_dic[seq_id]
-            assert seq_l == reg_l, \
-                "sequence length differs from .bed region length (%i != %i)" \
-                % (seq_l, reg_l)
+            assert (
+                seq_l == reg_l
+            ), "sequence length differs from .bed region length (%i != %i)" % (
+                seq_l,
+                reg_l,
+            )
     # Read in model parameters.
     param_dic = gplib.graphprot_get_param_dic(args.in_params)
     # Create GraphProt parameter string.
     param_string = gplib.graphprot_get_param_string(args.in_params)

     """
     Run predictions.

     """
     if args.ws_pred:
         # Do whole site prediction.
-        print("Starting whole site predictions on input .fa file"
-              " (-action predict) ... ")
-        check_cmd = "GraphProt.pl -action predict -prefix " \
-                    + args.data_id + " -fasta " + args.in_fa + " " \
-                    + param_string + " -model " + args.in_model
+        print(
+            "Starting whole site predictions on input .fa file"
+            " (-action predict) ... "
+        )
+        check_cmd = (
+            "GraphProt.pl -action predict -prefix "
+            + args.data_id
+            + " -fasta "
+            + args.in_fa
+            + " "
+            + param_string
+            + " -model "
+            + args.in_model
+        )
         output = subprocess.getoutput(check_cmd)
-        assert output, "the following call of GraphProt.pl "\
-            "produced no output:\n%s" % (check_cmd)
+        assert (
+            output
+        ), "the following call of GraphProt.pl " "produced no output:\n%s" % (check_cmd)
         if args.gp_output:
             print(output)
         ws_predictions_file = args.data_id + ".predictions"
-        assert os.path.exists(ws_predictions_file), \
-            "Whole site prediction output .predictions file \"%s\" not found" \
-            % (ws_predictions_file)
+        assert os.path.exists(
+            ws_predictions_file
+        ), 'Whole site prediction output .predictions file "%s" not found' % (
+            ws_predictions_file
+        )
         if args.conf_out:
             # Filter by pos_train_ws_pred_median median.
-            assert "pos_train_ws_pred_median" in param_dic, \
-                "whole site top scores median information "\
-                "missing in .params file"
-            pos_tr_ws_pred_med = \
-                float(param_dic["pos_train_ws_pred_median"])
+            assert "pos_train_ws_pred_median" in param_dic, (
+                "whole site top scores median information " "missing in .params file"
+            )
+            pos_tr_ws_pred_med = float(param_dic["pos_train_ws_pred_median"])
             # Filtered file.
             filt_ws_predictions_file = args.data_id + ".p50.predictions"
-            print("Extracting p50 sites from whole site predictions"
-                  " (score threshold = %f) ... " % (pos_tr_ws_pred_med))
-            gplib.graphprot_filter_predictions_file(ws_predictions_file,
-                                                    filt_ws_predictions_file,
-                                                    sc_thr=pos_tr_ws_pred_med)
+            print(
+                "Extracting p50 sites from whole site predictions"
+                " (score threshold = %f) ... " % (pos_tr_ws_pred_med)
+            )
+            gplib.graphprot_filter_predictions_file(
+                ws_predictions_file, filt_ws_predictions_file, sc_thr=pos_tr_ws_pred_med
+            )
     else:
         # Do profile prediction.
-        print("Starting profile predictions on on input .fa file "
-              "(-action predict_profile) ... ")
-        check_cmd = "GraphProt.pl -action predict_profile -prefix " \
-            + args.data_id + " -fasta " + args.in_fa + " " + \
-            param_string + " -model " + args.in_model
+        print(
+            "Starting profile predictions on on input .fa file "
+            "(-action predict_profile) ... "
+        )
+        check_cmd = (
+            "GraphProt.pl -action predict_profile -prefix "
+            + args.data_id
+            + " -fasta "
+            + args.in_fa
+            + " "
+            + param_string
+            + " -model "
+            + args.in_model
+        )
         output = subprocess.getoutput(check_cmd)
-        assert output, "the following call of GraphProt.pl produced "\
-            "no output:\n%s" % (check_cmd)
+        assert (
+            output
+        ), "the following call of GraphProt.pl produced " "no output:\n%s" % (check_cmd)
         if args.gp_output:
             print(output)
         profile_predictions_file = args.data_id + ".profile"
-        assert os.path.exists(profile_predictions_file), \
-            "Profile prediction output .profile file \"%s\" not found" % \
-            (profile_predictions_file)
+        assert os.path.exists(
+            profile_predictions_file
+        ), 'Profile prediction output .profile file "%s" not found' % (
+            profile_predictions_file
+        )

         # Profile prediction output files.
         avg_prof_file = args.data_id + ".avg_profile"
         avg_prof_peaks_file = args.data_id + ".avg_profile.peaks.bed"
-        avg_prof_gen_peaks_file = args.data_id + \
-            ".avg_profile.genomic_peaks.bed"
-        avg_prof_peaks_p50_file = args.data_id + \
-            ".avg_profile.p50.peaks.bed"
-        avg_prof_gen_peaks_p50_file = args.data_id + \
-            ".avg_profile.p50.genomic_peaks.bed"
+        avg_prof_gen_peaks_file = args.data_id + ".avg_profile.genomic_peaks.bed"
+        avg_prof_peaks_p50_file = args.data_id + ".avg_profile.p50.peaks.bed"
+        avg_prof_gen_peaks_p50_file = (
+            args.data_id + ".avg_profile.p50.genomic_peaks.bed"
+        )

         # Get sequence IDs in order from input .fa file.
         seq_ids_list = gplib.fasta_read_in_ids(args.in_fa)
         # Calculate average profiles.
-        print("Getting average profile from profile "
-              "(extlr for smoothing: %i) ... " % (args.ap_extlr))
-        gplib.graphprot_profile_calc_avg_profile(profile_predictions_file,
-                                                 avg_prof_file,
-                                                 ap_extlr=args.ap_extlr,
-                                                 seq_ids_list=seq_ids_list,
-                                                 method=2)
+        print(
+            "Getting average profile from profile "
+            "(extlr for smoothing: %i) ... " % (args.ap_extlr)
+        )
+        gplib.graphprot_profile_calc_avg_profile(
+            profile_predictions_file,
+            avg_prof_file,
+            ap_extlr=args.ap_extlr,
+            seq_ids_list=seq_ids_list,
+            method=2,
+        )
         # Extract peak regions on sequences with threshold score 0.
-        print("Extracting peak regions from average profile "
-              "(score threshold = 0) ... ")
+        print(
+            "Extracting peak regions from average profile " "(score threshold = 0) ... "
+        )
         killpep8 = args.max_merge_dist
-        gplib.graphprot_profile_extract_peak_regions(avg_prof_file,
-                                                     avg_prof_peaks_file,
-                                                     max_merge_dist=killpep8,
-                                                     sc_thr=args.score_thr)
+        gplib.graphprot_profile_extract_peak_regions(
+            avg_prof_file,
+            avg_prof_peaks_file,
+            max_merge_dist=killpep8,
+            sc_thr=args.score_thr,
+        )
         # Convert peaks to genomic coordinates.
         if args.genomic_sites_bed:
             print("Converting peak regions to genomic coordinates ... ")
             killit = args.genomic_sites_bed
-            gplib.bed_peaks_to_genomic_peaks(avg_prof_peaks_file,
-                                             avg_prof_gen_peaks_file,
-                                             print_rows=False,
-                                             genomic_sites_bed=killit)
+            gplib.bed_peaks_to_genomic_peaks(
+                avg_prof_peaks_file,
+                avg_prof_gen_peaks_file,
+                print_rows=False,
+                genomic_sites_bed=killit,
+            )
         # Extract peak regions with threshold score p50.
         if args.conf_out:
             sc_id = "pos_train_avg_profile_median_%i" % (args.ap_extlr)
             # Filter by pos_tr_ws_pred_med median.
-            assert sc_id in param_dic, "average profile extlr %i median "\
-                "information missing in .params file" % (args.ap_extlr)
+            assert sc_id in param_dic, (
+                "average profile extlr %i median "
+                "information missing in .params file" % (args.ap_extlr)
+            )
             p50_sc_thr = float(param_dic[sc_id])
-            print("Extracting p50 peak regions from average profile "
-                  "(score threshold = %f) ... " % (p50_sc_thr))
+            print(
+                "Extracting p50 peak regions from average profile "
+                "(score threshold = %f) ... " % (p50_sc_thr)
+            )
             despair = avg_prof_peaks_p50_file
             pain = args.max_merge_dist
-            gplib.graphprot_profile_extract_peak_regions(avg_prof_file,
-                                                         despair,
-                                                         max_merge_dist=pain,
-                                                         sc_thr=p50_sc_thr)
+            gplib.graphprot_profile_extract_peak_regions(
+                avg_prof_file, despair, max_merge_dist=pain, sc_thr=p50_sc_thr
+            )
             # Convert peaks to genomic coordinates.
             if args.genomic_sites_bed:
-                print("Converting p50 peak regions to "
-                      "genomic coordinates ... ")
+                print("Converting p50 peak regions to " "genomic coordinates ... ")
                 madness = args.genomic_sites_bed
-                gplib.bed_peaks_to_genomic_peaks(avg_prof_peaks_p50_file,
-                                                 avg_prof_gen_peaks_p50_file,
-                                                 genomic_sites_bed=madness)
+                gplib.bed_peaks_to_genomic_peaks(
+                    avg_prof_peaks_p50_file,
+                    avg_prof_gen_peaks_p50_file,
+                    genomic_sites_bed=madness,
+                )
     # Done.
     print("Script: I'm done.")
     print("Author: ... ")
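For context on the peak extraction calls above: --sc-thr sets the score threshold above which average-profile positions count as peak positions, and --max-merge-dist merges nearby peak regions whose gap does not exceed that distance (0 reports all non-overlapping regions). A minimal sketch of that logic, illustrative only and not gplib.graphprot_profile_extract_peak_regions (whether the real code uses > or >= at the threshold is an assumption):

def extract_peak_regions(scores, sc_thr=0.0, max_merge_dist=0):
    """Return [start, end] index pairs of above-threshold stretches, merging close ones."""
    regions = []
    start = None
    for i, sc in enumerate(scores):
        if sc > sc_thr and start is None:
            start = i                      # open a new candidate region
        elif sc <= sc_thr and start is not None:
            regions.append([start, i - 1])  # close the current region
            start = None
    if start is not None:
        regions.append([start, len(scores) - 1])
    merged = []
    for reg in regions:
        # Merge if the gap to the previous region is <= max_merge_dist.
        if merged and reg[0] - merged[-1][1] - 1 <= max_merge_dist:
            merged[-1][1] = reg[1]
        else:
            merged.append(reg)
    return merged

# Two above-threshold stretches separated by one low-scoring position get merged.
print(extract_peak_regions([0.2, 0.5, -0.1, 0.4, 0.3, -0.2], sc_thr=0.0, max_merge_dist=1))
# [[0, 4]]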