comparison graphprot_predict_wrapper.py @ 3:ace92c9a4653 draft

"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/rna_tools/graphprot commit efcac98677c3ea9039c1c61eaa9e58f78287ccb3"
author bgruening
date Wed, 27 Jan 2021 19:27:47 +0000
parents 7bbb7bf6304f
children 4ad83aed5c3c
comparison
equal deleted inserted replaced
2:7bbb7bf6304f 3:ace92c9a4653
1 #!/usr/bin/env python3 1 #!/usr/bin/env python3
2 2
3 import argparse as ap
4 import os
3 import subprocess 5 import subprocess
4 import argparse 6 import sys
5 import shutil 7
6 import gplib 8 import gplib
7 import gzip
8 import sys
9 import os
10 9
11 10
12 """ 11 """
13 12
14 TOOL DEPENDENCIES 13 TOOL DEPENDENCIES
46 45
47 46
48 EXAMPLE CALLS 47 EXAMPLE CALLS
49 ============= 48 =============
50 49
51 python graphprot_predict_wrapper.py --model test2.model --params test2.params --fasta gp_data/test10_predict.fa --data-id test2pred --gp-output 50 flake8 coming out of hotel room. FB enters.
52 python graphprot_predict_wrapper.py --model test2.model --params test2.params --fasta gp_data/test10_predict.fa --data-id test2pred --gen-site-bed gp_data/test10_predict.bed 51 FB: Who is this f*** ???
53 python graphprot_predict_wrapper.py --model test2.model --params test2.params --fasta gp_data/test10_predict.fa --data-id test2pred --gen-site-bed gp_data/test10_predict.bed --conf-out 52
54 python graphprot_predict_wrapper.py --model test2.model --params test2.params --fasta gp_data/test10_predict.fa --data-id test2pred --conf-out --ws-pred 53
55 54 python graphprot_predict_wrapper.py --model test2.model --params test2.params
56 python graphprot_predict_wrapper.py --model test-data/test.model --params test-data/test.params --fasta test-data/test_predict.fa --data-id predtest 55 --fasta gp_data/test10_predict.fa --data-id test2pred --gp-output
57 56 python graphprot_predict_wrapper.py --model test2.model --params test2.params
58 python graphprot_predict_wrapper.py --model test-data/test.model --params test-data/test.params --fasta test-data/test_predict.fa --data-id predtest --gen-site-bed test-data/test_predict.bed --sc-thr 0.0 --max-merge-dist 0 --conf-out --ap-extlr 5 57 --fasta gp_data/test10_predict.fa --data-id test2pred
59 58 --gen-site-bed gp_data/test10_predict.bed
60 python graphprot_predict_wrapper.py --data-id GraphProt --fasta test-data/test_predict.fa --model test-data/test.model --params test-data/test.params --gen-site-bed test-data/test_predict.bed --sc-thr 0.0 --max-merge-dist 0 --conf-out --ap-extlr 5 59
61 60 python graphprot_predict_wrapper.py --model test2.model --params test2.params
62 61 --fasta gp_data/test10_predict.fa --data-id test2pred
63 pwd && python '/home/uhlm/Dokumente/Projekte/GraphProt_galaxy_new/galaxytools/tools/rna_tools/graphprot/graphprot_predict_wrapper.py' --data-id GraphProt --fasta /tmp/tmpmuslpc1h/files/0/8/c/dataset_08c48d88-e3b5-423b-acf6-bf89b8c60660.dat --model /tmp/tmpmuslpc1h/files/e/6/4/dataset_e6471bb4-e74c-4372-bc49-656f900e7191.dat --params /tmp/tmpmuslpc1h/files/b/6/5/dataset_b65e8cf4-d3e6-429e-8d57-1d401adf4b3c.dat --gen-site-bed /tmp/tmpmuslpc1h/files/5/1/a/dataset_51a38b65-5943-472d-853e-5d845fa8ac3e.dat --sc-thr 0.0 --max-merge-dist 0 --conf-out --ap-extlr 5 62 --gen-site-bed gp_data/test10_predict.bed --conf-out
64 63
64 python graphprot_predict_wrapper.py --model test2.model --params test2.params
65 --fasta gp_data/test10_predict.fa --data-id test2pred --conf-out --ws-pred
66
67 python graphprot_predict_wrapper.py --model test-data/test.model
68 --params test-data/test.params --fasta test-data/test_predict.fa
69 --data-id predtest
70
71 python graphprot_predict_wrapper.py --model test-data/test.model
72 --params test-data/test.params --fasta test-data/test_predict.fa
73 --data-id predtest --gen-site-bed test-data/test_predict.bed
74 --sc-thr 0.0 --max-merge-dist 0 --conf-out --ap-extlr 5
75
76 python graphprot_predict_wrapper.py --data-id GraphProt
77 --fasta test-data/test_predict.fa --model test-data/test.model
78 --params test-data/test.params --gen-site-bed test-data/test_predict.bed
79 --sc-thr 0.0 --max-merge-dist 0 --conf-out --ap-extlr 5
65 80
66 """ 81 """
67 82
68 ################################################################################ 83
84 ###############################################################################
69 85
70 def setup_argument_parser(): 86 def setup_argument_parser():
71 """Setup argparse parser.""" 87 """Setup argparse parser."""
72 help_description = """ 88 help_description = """
73 Galaxy wrapper script for GraphProt (-action predict and -action 89 Galaxy wrapper script for GraphProt (-action predict and -action
74 predict_profile) to compute whole site or position-wise scores for input 90 predict_profile) to compute whole site or position-wise scores for input
75 FASTA sequences. 91 FASTA sequences.
76 By default, profile predictions are calculated, followed by average 92 By default, profile predictions are calculated, followed by average
77 profiles computions and peak regions extraction from average profiles. 93 profiles computions and peak regions extraction from average profiles.
78 If --ws-pred is set, whole site score predictions on input sequences 94 If --ws-pred is set, whole site score predictions on input sequences
79 will be run instead. 95 will be run instead.
80 If --conf-out is set, sites or peak regions with a score >= the median 96 If --conf-out is set, sites or peak regions with a score >= the median
81 score of positive training sites will be output. 97 score of positive training sites will be output.
82 If --gen-site-bed .bed file is provided, peak regions will be output 98 If --gen-site-bed .bed file is provided, peak regions will be output
83 with genomic coordinates too. 99 with genomic coordinates too.
84 100
85 """ 101 """
86 # Define argument parser. 102 # Define argument parser.
87 p = argparse.ArgumentParser(add_help=False, 103 p = ap.ArgumentParser(add_help=False,
88 prog="graphprot_predict_wrapper.py", 104 prog="graphprot_predict_wrapper.py",
89 description=help_description, 105 description=help_description,
90 formatter_class=argparse.MetavarTypeHelpFormatter) 106 formatter_class=ap.MetavarTypeHelpFormatter)
91 107
92 # Argument groups. 108 # Argument groups.
93 p_man = p.add_argument_group("REQUIRED ARGUMENTS") 109 p_man = p.add_argument_group("REQUIRED ARGUMENTS")
94 p_opt = p.add_argument_group("OPTIONAL ARGUMENTS") 110 p_opt = p.add_argument_group("OPTIONAL ARGUMENTS")
95 111
96 # Required arguments. 112 # Required arguments.
97 p_opt.add_argument("-h", "--help", 113 p_opt.add_argument("-h", "--help",
98 action="help", 114 action="help",
99 help="Print help message") 115 help="Print help message")
100 p_man.add_argument("--fasta", 116 p_man.add_argument("--fasta",
101 dest="in_fa", 117 dest="in_fa",
102 type=str, 118 type=str,
103 required = True, 119 required=True,
104 help = "Sequences .fa file to predict on (option -fasta)") 120 help="Sequences .fa file to predict"
121 " on (option -fasta)")
105 p_man.add_argument("--model", 122 p_man.add_argument("--model",
106 dest="in_model", 123 dest="in_model",
107 type=str, 124 type=str,
108 required = True, 125 required=True,
109 help = "GraphProt model file to use for predictions (option -model)") 126 help="GraphProt model file to use for predictions"
127 " (option -model)")
110 p_man.add_argument("--params", 128 p_man.add_argument("--params",
111 dest="in_params", 129 dest="in_params",
112 type=str, 130 type=str,
113 required = True, 131 required=True,
114 help = "Parameter file for given model") 132 help="Parameter file for given model")
115 p_man.add_argument("--data-id", 133 p_man.add_argument("--data-id",
116 dest="data_id", 134 dest="data_id",
117 type=str, 135 type=str,
118 required = True, 136 required=True,
119 help = "Data ID (option -prefix)") 137 help="Data ID (option -prefix)")
120 # ---> I'm a conditional argument <--- 138 # ---> I'm a conditional argument <---
121 p_opt.add_argument("--ws-pred", 139 p_opt.add_argument("--ws-pred",
122 dest = "ws_pred", 140 dest="ws_pred",
123 default = False, 141 default=False,
124 action = "store_true", 142 action="store_true",
125 help = "Run a whole site prediction instead of calculating profiles (default: false)") 143 help="Run a whole site prediction instead "
144 "of calculating profiles (default: false)")
126 # Additional arguments. 145 # Additional arguments.
127 p_opt.add_argument("--sc-thr", 146 p_opt.add_argument("--sc-thr",
128 dest="score_thr", 147 dest="score_thr",
129 type = float, 148 type=float,
130 default = 0, 149 default=0,
131 help = "Score threshold for extracting average profile peak regions (default: 0)") 150 help="Score threshold for extracting "
151 "average profile peak regions (default: 0)")
132 p_opt.add_argument("--max-merge-dist", 152 p_opt.add_argument("--max-merge-dist",
133 dest="max_merge_dist", 153 dest="max_merge_dist",
134 type = int, 154 type=int,
135 default = 0, 155 default=0,
136 choices = [0,1,2,3,4,5,6,7,8,9,10], 156 choices=[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
137 help = "Maximum merge distance for nearby peak regions (default: report all non-overlapping regions)") 157 help="Maximum merge distance for nearby peak regions"
158 " (default: report all non-overlapping regions)")
138 p_opt.add_argument("--gen-site-bed", 159 p_opt.add_argument("--gen-site-bed",
139 dest="genomic_sites_bed", 160 dest="genomic_sites_bed",
140 type=str, 161 type=str,
141 help = ".bed file specifying the genomic regions of the input .fa sequences. Corrupt .bed information will be punished (default: false)") 162 help=".bed file specifying the genomic regions of "
163 "the input .fa sequences. Corrupt .bed "
164 "information will be punished (default: false)")
142 p_opt.add_argument("--conf-out", 165 p_opt.add_argument("--conf-out",
143 dest="conf_out", 166 dest="conf_out",
144 default = False, 167 default=False,
145 action = "store_true", 168 action="store_true",
146 help = "Output filtered peak regions BED file or predictions file (if --ws-pred) using the median positive training site score for filtering (default: false)") 169 help="Output filtered peak regions BED file or "
170 "predictions file (if --ws-pred) using the median "
171 "positive training site score for filtering "
172 "(default: false)")
147 p_opt.add_argument("--gp-output", 173 p_opt.add_argument("--gp-output",
148 dest = "gp_output", 174 dest="gp_output",
149 default = False, 175 default=False,
150 action = "store_true", 176 action="store_true",
151 help = "Print output produced by GraphProt (default: false)") 177 help="Print output produced by GraphProt "
178 "(default: false)")
152 p_opt.add_argument("--ap-extlr", 179 p_opt.add_argument("--ap-extlr",
153 dest="ap_extlr", 180 dest="ap_extlr",
154 type = int, 181 type=int,
155 default = 5, 182 default=5,
156 choices = [0,1,2,3,4,5,6,7,8,9,10], 183 choices=[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
157 help = "Define average profile up- and downstream extension to produce the average profile. The mean over small sequence windows (window length = --ap-extlr*2 + 1) is used to get position scores, thus the average profile is more smooth than the initial profile output by GraphProt (default: 5)") 184 help="Define average profile up- and downstream "
185 "extension to produce the average profile. "
186 "The mean over small sequence windows "
187 "(window length = --ap-extlr*2 + 1) is used to "
188 "get position scores, thus the average profile "
189 "is more smooth than the initial profile output "
190 "by GraphProt (default: 5)")
158 return p 191 return p
159 192
160 193
161 ################################################################################ 194 ###############################################################################
162 195
163 if __name__ == '__main__': 196 if __name__ == '__main__':
164 197
165 # Setup argparse. 198 # Setup argparse.
166 parser = setup_argument_parser() 199 parser = setup_argument_parser()
167 # Read in command line arguments. 200 # Read in command line arguments.
168 args = parser.parse_args() 201 args = parser.parse_args()
169 202
170 """ 203 """
171 Do all sorts of sanity checking. 204 Do all sorts of sanity checking.
172 205
173 """ 206 """
174 # Check for Linux. 207 # Check for Linux.
175 assert "linux" in sys.platform, "please use Linux" 208 assert "linux" in sys.platform, "please use Linux"
176 # Check tool availability. 209 # Check tool availability.
177 assert gplib.is_tool("GraphProt.pl"), "GraphProt.pl not in PATH" 210 assert gplib.is_tool("GraphProt.pl"), "GraphProt.pl not in PATH"
178 # Check file inputs. 211 # Check file inputs.
179 assert os.path.exists(args.in_fa), "input .fa file \"%s\" not found" %(args.in_fa) 212 assert os.path.exists(args.in_fa), \
180 assert os.path.exists(args.in_model), "input .model file \"%s\" not found" %(args.in_model) 213 "input .fa file \"%s\" not found" % (args.in_fa)
181 assert os.path.exists(args.in_params), "input .params file \"%s\" not found" %(args.in_params) 214 assert os.path.exists(args.in_model), \
215 "input .model file \"%s\" not found" % (args.in_model)
216 assert os.path.exists(args.in_params), \
217 "input .params file \"%s\" not found" % (args.in_params)
182 # Count .fa entries. 218 # Count .fa entries.
183 c_in_fa = gplib.count_fasta_headers(args.in_fa) 219 c_in_fa = gplib.count_fasta_headers(args.in_fa)
184 assert c_in_fa, "input .fa file \"%s\" no headers found" %(args.in_fa) 220 assert c_in_fa, "input .fa file \"%s\" no headers found" % (args.in_fa)
185 print("# input .fa sequences: %i" %(c_in_fa)) 221 print("# input .fa sequences: %i" % (c_in_fa))
186 # Read in FASTA sequences to check for uppercase sequences. 222 # Read in FASTA sequences to check for uppercase sequences.
187 seqs_dic = gplib.read_fasta_into_dic(args.in_fa) 223 seqs_dic = gplib.read_fasta_into_dic(args.in_fa)
188 c_uc_nt = gplib.seqs_dic_count_uc_nts(seqs_dic) 224 c_uc_nt = gplib.seqs_dic_count_uc_nts(seqs_dic)
189 assert c_uc_nt, "no uppercase nucleotides in input .fa sequences. Please change sequences to uppercase (keep in mind GraphProt only scores uppercase regions (according to its viewpoint concept)" 225 assert c_uc_nt, "no uppercase nucleotides in input .fa sequences. "\
226 "Please change sequences to uppercase (keep in mind "\
227 "GraphProt only scores uppercase regions (according "\
228 "to its viewpoint concept)"
190 if not args.ws_pred: 229 if not args.ws_pred:
191 # Check for lowercase sequences. 230 # Check for lowercase sequences.
192 c_lc_nt = gplib.seqs_dic_count_lc_nts(seqs_dic) 231 c_lc_nt = gplib.seqs_dic_count_lc_nts(seqs_dic)
193 assert not c_lc_nt, "lowercase nucleotides in input .fa not allowed in profile predictions, since GraphProt only scores uppercase regions (according to its viewpoint concept)" 232 assert not c_lc_nt, "lowercase nucleotides in input .fa not allowed"\
233 " in profile predictions, since GraphProt only scores "\
234 "uppercase regions (according to its viewpoint concept)"
194 # Check .bed. 235 # Check .bed.
195 if args.genomic_sites_bed: 236 if args.genomic_sites_bed:
196 # An array of checks, marvelous. 237 # An array of checks, marvelous.
197 assert os.path.exists(args.genomic_sites_bed), "genomic .bed file \"%s\" not found" %(args.genomic_sites_bed) 238 assert \
239 os.path.exists(args.genomic_sites_bed), \
240 "genomic .bed file \"%s\" not found" % (args.genomic_sites_bed)
198 # Check .bed for content. 241 # Check .bed for content.
199 assert gplib.count_file_rows(args.genomic_sites_bed), "genomic .bed file \"%s\" is empty" %(args.genomic_sites_bed) 242 assert gplib.count_file_rows(args.genomic_sites_bed), \
243 "genomic .bed file \"%s\" is empty" % (args.genomic_sites_bed)
200 # Check .bed for 6-column format. 244 # Check .bed for 6-column format.
201 assert gplib.bed_check_six_col_format(args.genomic_sites_bed), "genomic .bed file \"%s\" appears to not be in 6-column .bed format" %(args.genomic_sites_bed) 245 assert gplib.bed_check_six_col_format(args.genomic_sites_bed), \
246 "genomic .bed file \"%s\" appears to not "\
247 "be in 6-column .bed format" % (args.genomic_sites_bed)
202 # Check for unique column 4 IDs. 248 # Check for unique column 4 IDs.
203 assert gplib.bed_check_unique_ids(args.genomic_sites_bed), "genomic .bed file \"%s\" column 4 IDs not unique" %(args.genomic_sites_bed) 249 assert gplib.bed_check_unique_ids(args.genomic_sites_bed), \
204 # Read in .bed regions, compare to FASTA sequences (compare IDs + lengths) 250 "genomic .bed file \"%s\" column 4 IDs not unique" % \
251 (args.genomic_sites_bed)
252 # Read in .bed regions, compare to FASTA sequences.
205 seq_len_dic = gplib.get_seq_lengths_from_seqs_dic(seqs_dic) 253 seq_len_dic = gplib.get_seq_lengths_from_seqs_dic(seqs_dic)
206 reg_len_dic = gplib.bed_get_region_lengths(args.genomic_sites_bed) 254 reg_len_dic = gplib.bed_get_region_lengths(args.genomic_sites_bed)
207 for seq_id in seq_len_dic: 255 for seq_id in seq_len_dic:
208 seq_l = seq_len_dic[seq_id] 256 seq_l = seq_len_dic[seq_id]
209 assert seq_id in reg_len_dic, "sequence ID \"\" missing in input .bed \"\"" %(seq_id, args.genomic_sites_bed) 257 assert seq_id in reg_len_dic, \
258 "sequence ID \"%s\" missing in input .bed \"%s\"" % \
259 (seq_id, args.genomic_sites_bed)
210 reg_l = reg_len_dic[seq_id] 260 reg_l = reg_len_dic[seq_id]
211 assert seq_l == reg_l, "sequence length differs from .bed region length (%i != %i)" %(seq_l, reg_l) 261 assert seq_l == reg_l, \
262 "sequence length differs from .bed region length (%i != %i)" \
263 % (seq_l, reg_l)
212 # Read in model parameters. 264 # Read in model parameters.
213 param_dic = gplib.graphprot_get_param_dic(args.in_params) 265 param_dic = gplib.graphprot_get_param_dic(args.in_params)
214 # Create GraphProt parameter string. 266 # Create GraphProt parameter string.
215 param_string = gplib.graphprot_get_param_string(args.in_params) 267 param_string = gplib.graphprot_get_param_string(args.in_params)
216 268
217 """ 269 """
218 Run predictions. 270 Run predictions.
219 271
220 """ 272 """
221 if args.ws_pred: 273 if args.ws_pred:
222 # Do whole site prediction. 274 # Do whole site prediction.
223 print("Starting whole site predictions on input .fa file (-action predict) ... ") 275 print("Starting whole site predictions on input .fa file"
224 check_cmd = "GraphProt.pl -action predict -prefix " + args.data_id + " -fasta " + args.in_fa + " " + param_string + " -model " + args.in_model 276 " (-action predict) ... ")
277 check_cmd = "GraphProt.pl -action predict -prefix " \
278 + args.data_id + " -fasta " + args.in_fa + " " \
279 + param_string + " -model " + args.in_model
225 output = subprocess.getoutput(check_cmd) 280 output = subprocess.getoutput(check_cmd)
226 assert output, "the following call of GraphProt.pl produced no output:\n%s" %(check_cmd) 281 assert output, "the following call of GraphProt.pl "\
282 "produced no output:\n%s" % (check_cmd)
227 if args.gp_output: 283 if args.gp_output:
228 print(output) 284 print(output)
229 ws_predictions_file = args.data_id + ".predictions" 285 ws_predictions_file = args.data_id + ".predictions"
230 assert os.path.exists(ws_predictions_file), "Whole site prediction output .predictions file \"%s\" not found" %(ws_predictions_file) 286 assert os.path.exists(ws_predictions_file), \
287 "Whole site prediction output .predictions file \"%s\" not found" \
288 % (ws_predictions_file)
231 if args.conf_out: 289 if args.conf_out:
232 # Filter by pos_train_ws_pred_median median. 290 # Filter by pos_train_ws_pred_median median.
233 assert "pos_train_ws_pred_median" in param_dic, "whole site top scores median information missing in .params file" 291 assert "pos_train_ws_pred_median" in param_dic, \
234 pos_train_ws_pred_median = float(param_dic["pos_train_ws_pred_median"]) 292 "whole site top scores median information "\
293 "missing in .params file"
294 pos_tr_ws_pred_med = \
295 float(param_dic["pos_train_ws_pred_median"])
235 # Filtered file. 296 # Filtered file.
236 filt_ws_predictions_file = args.data_id + ".p50.predictions" 297 filt_ws_predictions_file = args.data_id + ".p50.predictions"
237 print("Extracting p50 sites from whole site predictions (score threshold = %f) ... " %(pos_train_ws_pred_median)) 298 print("Extracting p50 sites from whole site predictions"
238 gplib.graphprot_filter_predictions_file(ws_predictions_file, filt_ws_predictions_file, 299 " (score threshold = %f) ... " % (pos_tr_ws_pred_med))
239 sc_thr=pos_train_ws_pred_median) 300 gplib.graphprot_filter_predictions_file(ws_predictions_file,
301 filt_ws_predictions_file,
302 sc_thr=pos_tr_ws_pred_med)
240 else: 303 else:
241 # Do profile prediction. 304 # Do profile prediction.
242 print("Starting profile predictions on on input .fa file (-action predict_profile) ... ") 305 print("Starting profile predictions on on input .fa file "
243 check_cmd = "GraphProt.pl -action predict_profile -prefix " + args.data_id + " -fasta " + args.in_fa + " " + param_string + " -model " + args.in_model 306 "(-action predict_profile) ... ")
307 check_cmd = "GraphProt.pl -action predict_profile -prefix " \
308 + args.data_id + " -fasta " + args.in_fa + " " + \
309 param_string + " -model " + args.in_model
244 output = subprocess.getoutput(check_cmd) 310 output = subprocess.getoutput(check_cmd)
245 assert output, "the following call of GraphProt.pl produced no output:\n%s" %(check_cmd) 311 assert output, "the following call of GraphProt.pl produced "\
312 "no output:\n%s" % (check_cmd)
246 if args.gp_output: 313 if args.gp_output:
247 print(output) 314 print(output)
248 profile_predictions_file = args.data_id + ".profile" 315 profile_predictions_file = args.data_id + ".profile"
249 assert os.path.exists(profile_predictions_file), "Profile prediction output .profile file \"%s\" not found" %(profile_predictions_file) 316 assert os.path.exists(profile_predictions_file), \
317 "Profile prediction output .profile file \"%s\" not found" % \
318 (profile_predictions_file)
250 319
251 # Profile prediction output files. 320 # Profile prediction output files.
252 avg_prof_file = args.data_id + ".avg_profile" 321 avg_prof_file = args.data_id + ".avg_profile"
253 avg_prof_peaks_file = args.data_id + ".avg_profile.peaks.bed" 322 avg_prof_peaks_file = args.data_id + ".avg_profile.peaks.bed"
254 avg_prof_gen_peaks_file = args.data_id + ".avg_profile.genomic_peaks.bed" 323 avg_prof_gen_peaks_file = args.data_id + \
255 avg_prof_peaks_p50_file = args.data_id + ".avg_profile.p50.peaks.bed" 324 ".avg_profile.genomic_peaks.bed"
256 avg_prof_gen_peaks_p50_file = args.data_id + ".avg_profile.p50.genomic_peaks.bed" 325 avg_prof_peaks_p50_file = args.data_id + \
326 ".avg_profile.p50.peaks.bed"
327 avg_prof_gen_peaks_p50_file = args.data_id + \
328 ".avg_profile.p50.genomic_peaks.bed"
257 329
258 # Get sequence IDs in order from input .fa file. 330 # Get sequence IDs in order from input .fa file.
259 seq_ids_list = gplib.fasta_read_in_ids(args.in_fa) 331 seq_ids_list = gplib.fasta_read_in_ids(args.in_fa)
260 # Calculate average profiles. 332 # Calculate average profiles.
261 print("Getting average profile from profile (extlr for smoothing: %i) ... " %(args.ap_extlr)) 333 print("Getting average profile from profile "
262 gplib.graphprot_profile_calculate_avg_profile(profile_predictions_file, 334 "(extlr for smoothing: %i) ... " % (args.ap_extlr))
263 avg_prof_file, 335 gplib.graphprot_profile_calc_avg_profile(profile_predictions_file,
264 ap_extlr=args.ap_extlr, 336 avg_prof_file,
265 seq_ids_list=seq_ids_list, 337 ap_extlr=args.ap_extlr,
266 method=2) 338 seq_ids_list=seq_ids_list,
339 method=2)
267 # Extract peak regions on sequences with threshold score 0. 340 # Extract peak regions on sequences with threshold score 0.
268 print("Extracting peak regions from average profile (score threshold = 0) ... ") 341 print("Extracting peak regions from average profile "
269 gplib.graphprot_profile_extract_peak_regions(avg_prof_file, avg_prof_peaks_file, 342 "(score threshold = 0) ... ")
270 max_merge_dist=args.max_merge_dist, 343 killpep8 = args.max_merge_dist
271 sc_thr=args.score_thr) 344 gplib.graphprot_profile_extract_peak_regions(avg_prof_file,
345 avg_prof_peaks_file,
346 max_merge_dist=killpep8,
347 sc_thr=args.score_thr)
272 # Convert peaks to genomic coordinates. 348 # Convert peaks to genomic coordinates.
273 if args.genomic_sites_bed: 349 if args.genomic_sites_bed:
274 print("Converting peak regions to genomic coordinates ... ") 350 print("Converting peak regions to genomic coordinates ... ")
275 gplib.bed_peaks_to_genomic_peaks(avg_prof_peaks_file, avg_prof_gen_peaks_file, 351 killit = args.genomic_sites_bed
352 gplib.bed_peaks_to_genomic_peaks(avg_prof_peaks_file,
353 avg_prof_gen_peaks_file,
276 print_rows=False, 354 print_rows=False,
277 genomic_sites_bed=args.genomic_sites_bed) 355 genomic_sites_bed=killit)
278 # gplib.make_file_copy(avg_prof_gen_peaks_file, avg_prof_peaks_file)
279 # Extract peak regions with threshold score p50. 356 # Extract peak regions with threshold score p50.
280 if args.conf_out: 357 if args.conf_out:
281 sc_id = "pos_train_avg_profile_median_%i" %(args.ap_extlr) 358 sc_id = "pos_train_avg_profile_median_%i" % (args.ap_extlr)
282 # Filter by pos_train_ws_pred_median median. 359 # Filter by pos_tr_ws_pred_med median.
283 assert sc_id in param_dic, "average profile extlr %i median information missing in .params file" %(args.ap_extlr) 360 assert sc_id in param_dic, "average profile extlr %i median "\
361 "information missing in .params file" % (args.ap_extlr)
284 p50_sc_thr = float(param_dic[sc_id]) 362 p50_sc_thr = float(param_dic[sc_id])
285 print("Extracting p50 peak regions from average profile (score threshold = %f) ... " %(p50_sc_thr)) 363 print("Extracting p50 peak regions from average profile "
286 gplib.graphprot_profile_extract_peak_regions(avg_prof_file, avg_prof_peaks_p50_file, 364 "(score threshold = %f) ... " % (p50_sc_thr))
287 max_merge_dist=args.max_merge_dist, 365 despair = avg_prof_peaks_p50_file
366 pain = args.max_merge_dist
367 gplib.graphprot_profile_extract_peak_regions(avg_prof_file,
368 despair,
369 max_merge_dist=pain,
288 sc_thr=p50_sc_thr) 370 sc_thr=p50_sc_thr)
289 # Convert peaks to genomic coordinates. 371 # Convert peaks to genomic coordinates.
290 if args.genomic_sites_bed: 372 if args.genomic_sites_bed:
291 print("Converting p50 peak regions to genomic coordinates ... ") 373 print("Converting p50 peak regions to "
292 gplib.bed_peaks_to_genomic_peaks(avg_prof_peaks_p50_file, avg_prof_gen_peaks_p50_file, 374 "genomic coordinates ... ")
293 genomic_sites_bed=args.genomic_sites_bed) 375 madness = args.genomic_sites_bed
376 gplib.bed_peaks_to_genomic_peaks(avg_prof_peaks_p50_file,
377 avg_prof_gen_peaks_p50_file,
378 genomic_sites_bed=madness)
294 # Done. 379 # Done.
295 print("Script: I'm done.") 380 print("Script: I'm done.")
296 print("Author: ... ") 381 print("Author: ... ")
297
298