comparison graphprot_train_wrapper.py @ 5:ddcf35a868b8 draft default tip

planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/rna_tools/graphprot commit ad60258f5759eaa205fec4af6143c728ea131419
author bgruening
date Wed, 05 Jun 2024 16:40:51 +0000
parents ace92c9a4653
children
comparison
equal deleted inserted replaced
4:4ad83aed5c3c 5:ddcf35a868b8
4 import os 4 import os
5 import subprocess 5 import subprocess
6 import sys 6 import sys
7 7
8 import gplib 8 import gplib
9
10 9
11 """ 10 """
12 11
13 TOOL DEPENDENCIES 12 TOOL DEPENDENCIES
14 ================= 13 =================
60 59
61 60
62 """ 61 """
63 62
64 63
65 ############################################################################### 64 #######################################################################
65
66 66
67 def setup_argument_parser(): 67 def setup_argument_parser():
68 """Setup argparse parser.""" 68 """Setup argparse parser."""
69 help_description = """ 69 help_description = """
70 Galaxy wrapper script for GraphProt to train a GraphProt model on 70 Galaxy wrapper script for GraphProt to train a GraphProt model on
82 score out of these to store in .params file, using it later 82 score out of these to store in .params file, using it later
83 for outputting binding sites or peaks with higher confidence. 83 for outputting binding sites or peaks with higher confidence.
84 84
85 """ 85 """
86 # Define argument parser. 86 # Define argument parser.
87 p = ap.ArgumentParser(add_help=False, 87 p = ap.ArgumentParser(
88 prog="graphprot_train_wrapper.py", 88 add_help=False,
89 description=help_description, 89 prog="graphprot_train_wrapper.py",
90 formatter_class=ap.MetavarTypeHelpFormatter) 90 description=help_description,
91 formatter_class=ap.MetavarTypeHelpFormatter,
92 )
91 93
92 # Argument groups. 94 # Argument groups.
93 p_man = p.add_argument_group("REQUIRED ARGUMENTS") 95 p_man = p.add_argument_group("REQUIRED ARGUMENTS")
94 p_opt = p.add_argument_group("OPTIONAL ARGUMENTS") 96 p_opt = p.add_argument_group("OPTIONAL ARGUMENTS")
95 97
96 # Required arguments. 98 # Required arguments.
97 p_opt.add_argument("-h", "--help", 99 p_opt.add_argument("-h", "--help", action="help", help="Print help message")
98 action="help", 100 p_man.add_argument(
99 help="Print help message") 101 "--pos",
100 p_man.add_argument("--pos", 102 dest="in_pos_fa",
101 dest="in_pos_fa", 103 type=str,
102 type=str, 104 required=True,
103 required=True, 105 help="Positive (= binding site) sequences .fa file "
104 help="Positive (= binding site) sequences .fa file " 106 "for model training (option -fasta)",
105 "for model training (option -fasta)") 107 )
106 p_man.add_argument("--neg", 108 p_man.add_argument(
107 dest="in_neg_fa", 109 "--neg",
108 type=str, 110 dest="in_neg_fa",
109 required=True, 111 type=str,
110 help="Negative sequences .fa file for model " 112 required=True,
111 "training (option -negfasta)") 113 help="Negative sequences .fa file for model " "training (option -negfasta)",
112 p_man.add_argument("--data-id", 114 )
113 dest="data_id", 115 p_man.add_argument(
114 type=str, 116 "--data-id",
115 required=True, 117 dest="data_id",
116 help="Data ID (option -prefix)") 118 type=str,
119 required=True,
120 help="Data ID (option -prefix)",
121 )
117 # Additional arguments. 122 # Additional arguments.
118 p_opt.add_argument("--opt-set-size", 123 p_opt.add_argument(
119 dest="opt_set_size", 124 "--opt-set-size",
120 type=int, 125 dest="opt_set_size",
121 default=500, 126 type=int,
122 help="Hyperparameter optimization set size (taken " 127 default=500,
123 "away from both --pos and --neg) (default: 500)") 128 help="Hyperparameter optimization set size (taken "
124 p_opt.add_argument("--opt-pos", 129 "away from both --pos and --neg) (default: 500)",
125 dest="opt_pos_fa", 130 )
126 type=str, 131 p_opt.add_argument(
127 help="Positive (= binding site) sequences .fa file " 132 "--opt-pos",
128 "for hyperparameter optimization (default: take " 133 dest="opt_pos_fa",
129 "--opt-set-size from --pos)") 134 type=str,
130 p_opt.add_argument("--opt-neg", 135 help="Positive (= binding site) sequences .fa file "
131 dest="opt_neg_fa", 136 "for hyperparameter optimization (default: take "
132 type=str, 137 "--opt-set-size from --pos)",
133 help="Negative sequences .fa file for hyperparameter " 138 )
134 "optimization (default: take --opt-set-size " 139 p_opt.add_argument(
135 "from --neg)") 140 "--opt-neg",
136 p_opt.add_argument("--min-train", 141 dest="opt_neg_fa",
137 dest="min_train", 142 type=str,
138 type=int, 143 help="Negative sequences .fa file for hyperparameter "
139 default=500, 144 "optimization (default: take --opt-set-size "
140 help="Minimum amount of training sites demanded " 145 "from --neg)",
141 "(default: 500)") 146 )
142 p_opt.add_argument("--disable-cv", 147 p_opt.add_argument(
143 dest="disable_cv", 148 "--min-train",
144 default=False, 149 dest="min_train",
145 action="store_true", 150 type=int,
146 help="Disable cross validation step (default: false)") 151 default=500,
147 p_opt.add_argument("--disable-motifs", 152 help="Minimum amount of training sites demanded " "(default: 500)",
148 dest="disable_motifs", 153 )
149 default=False, 154 p_opt.add_argument(
150 action="store_true", 155 "--disable-cv",
151 help="Disable motif generation step (default: false)") 156 dest="disable_cv",
152 p_opt.add_argument("--gp-output", 157 default=False,
153 dest="gp_output", 158 action="store_true",
154 default=False, 159 help="Disable cross validation step (default: false)",
155 action="store_true", 160 )
156 help="Print output produced by GraphProt " 161 p_opt.add_argument(
157 "(default: false)") 162 "--disable-motifs",
158 p_opt.add_argument("--str-model", 163 dest="disable_motifs",
159 dest="train_str_model", 164 default=False,
160 default=False, 165 action="store_true",
161 action="store_true", 166 help="Disable motif generation step (default: false)",
162 help="Train a structure model (default: train " 167 )
163 "a sequence model)") 168 p_opt.add_argument(
169 "--gp-output",
170 dest="gp_output",
171 default=False,
172 action="store_true",
173 help="Print output produced by GraphProt " "(default: false)",
174 )
175 p_opt.add_argument(
176 "--str-model",
177 dest="train_str_model",
178 default=False,
179 action="store_true",
180 help="Train a structure model (default: train " "a sequence model)",
181 )
164 return p 182 return p
165 183
166 184
167 ############################################################################### 185 #######################################################################
168 186
169 if __name__ == '__main__': 187 if __name__ == "__main__":
170 188
171 # Setup argparse. 189 # Setup argparse.
172 parser = setup_argument_parser() 190 parser = setup_argument_parser()
173 # Read in command line arguments. 191 # Read in command line arguments.
174 args = parser.parse_args() 192 args = parser.parse_args()
180 # Check for Linux. 198 # Check for Linux.
181 assert "linux" in sys.platform, "please use Linux" 199 assert "linux" in sys.platform, "please use Linux"
182 # Check tool availability. 200 # Check tool availability.
183 assert gplib.is_tool("GraphProt.pl"), "GraphProt.pl not in PATH" 201 assert gplib.is_tool("GraphProt.pl"), "GraphProt.pl not in PATH"
184 # Check file inputs. 202 # Check file inputs.
185 assert os.path.exists(args.in_pos_fa), \ 203 assert os.path.exists(args.in_pos_fa), 'positives .fa file "%s" not found' % (
186 "positives .fa file \"%s\" not found" % (args.in_pos_fa) 204 args.in_pos_fa
187 assert os.path.exists(args.in_neg_fa), \ 205 )
188 "negatives .fa file \"%s\" not found" % (args.in_neg_fa) 206 assert os.path.exists(args.in_neg_fa), 'negatives .fa file "%s" not found' % (
207 args.in_neg_fa
208 )
189 # Count .fa entries. 209 # Count .fa entries.
190 c_pos_fa = gplib.count_fasta_headers(args.in_pos_fa) 210 c_pos_fa = gplib.count_fasta_headers(args.in_pos_fa)
191 c_neg_fa = gplib.count_fasta_headers(args.in_neg_fa) 211 c_neg_fa = gplib.count_fasta_headers(args.in_neg_fa)
192 assert c_pos_fa, "positives .fa file \"%s\" no headers found" % \ 212 assert c_pos_fa, 'positives .fa file "%s" no headers found' % (args.in_pos_fa)
193 (args.in_pos_fa) 213 assert c_neg_fa, 'negatives .fa file "%s" no headers found' % (args.in_neg_fa)
194 assert c_neg_fa, "negatives .fa file \"%s\" no headers found" % \
195 (args.in_neg_fa)
196 print("# positive .fa sequences: %i" % (c_pos_fa)) 214 print("# positive .fa sequences: %i" % (c_pos_fa))
197 print("# negative .fa sequences: %i" % (c_neg_fa)) 215 print("# negative .fa sequences: %i" % (c_neg_fa))
198 # Check additional files. 216 # Check additional files.
199 if args.opt_pos_fa: 217 if args.opt_pos_fa:
200 assert args.opt_neg_fa, "--opt-pos but no --opt-neg given" 218 assert args.opt_neg_fa, "--opt-pos but no --opt-neg given"
201 if args.opt_neg_fa: 219 if args.opt_neg_fa:
202 assert args.opt_pos_fa, "--opt-neg but no --opt-pos given" 220 assert args.opt_pos_fa, "--opt-neg but no --opt-pos given"
203 # Check for lowercase only sequences, which cause GP to crash. 221 # Check for lowercase only sequences, which cause GP to crash.
204 error_mess = "input sequences encountered containing "\ 222 error_mess = (
205 "only lowercase characters or lowercase characters in between "\ 223 "input sequences encountered containing "
206 "uppercase characters. Please provide either all uppercase "\ 224 "only lowercase characters or lowercase characters in between "
207 "sequences or sequences containing uppercase regions surrounded "\ 225 "uppercase characters. Please provide either all uppercase "
208 "by lowercase context regions for structure calculation (see "\ 226 "sequences or sequences containing uppercase regions surrounded "
209 "viewpoint concept in original GraphProt publication "\ 227 "by lowercase context regions for structure calculation (see "
228 "viewpoint concept in original GraphProt publication "
210 "for more details)" 229 "for more details)"
230 )
211 seqs_dic = gplib.read_fasta_into_dic(args.in_pos_fa) 231 seqs_dic = gplib.read_fasta_into_dic(args.in_pos_fa)
212 bad_ids = gplib.check_seqs_dic_format(seqs_dic) 232 bad_ids = gplib.check_seqs_dic_format(seqs_dic)
213 assert not bad_ids, "%s" % (error_mess) 233 assert not bad_ids, "%s" % (error_mess)
214 seqs_dic = gplib.read_fasta_into_dic(args.in_neg_fa) 234 seqs_dic = gplib.read_fasta_into_dic(args.in_neg_fa)
215 bad_ids = gplib.check_seqs_dic_format(seqs_dic) 235 bad_ids = gplib.check_seqs_dic_format(seqs_dic)
225 245
226 # If parop .fa files given. 246 # If parop .fa files given.
227 if args.opt_pos_fa and args.opt_neg_fa: 247 if args.opt_pos_fa and args.opt_neg_fa:
228 c_parop_pos_fa = gplib.count_fasta_headers(args.opt_pos_fa) 248 c_parop_pos_fa = gplib.count_fasta_headers(args.opt_pos_fa)
229 c_parop_neg_fa = gplib.count_fasta_headers(args.opt_neg_fa) 249 c_parop_neg_fa = gplib.count_fasta_headers(args.opt_neg_fa)
230 assert c_parop_pos_fa, "--opt-pos .fa file \"%s\" no headers found" \ 250 assert c_parop_pos_fa, '--opt-pos .fa file "%s" no headers found' % (
231 % (args.opt_pos_fa) 251 args.opt_pos_fa
232 assert c_parop_neg_fa, "--opt-neg .fa file \"%s\" no headers found" \ 252 )
233 % (args.opt_neg_fa) 253 assert c_parop_neg_fa, '--opt-neg .fa file "%s" no headers found' % (
254 args.opt_neg_fa
255 )
234 # Less than 500 for training?? You gotta be kidding. 256 # Less than 500 for training?? You gotta be kidding.
235 assert c_pos_fa >= args.min_train, \ 257 assert c_pos_fa >= args.min_train, (
236 "--pos for training < %i, please provide more (try at least "\ 258 "--pos for training < %i, please provide more (try at least "
237 "> 1000, the more the better)" % (args.min_train) 259 "> 1000, the more the better)" % (args.min_train)
238 assert c_neg_fa >= args.min_train, \ 260 )
239 "--neg for training < %i, please provide more (try at least "\ 261 assert c_neg_fa >= args.min_train, (
262 "--neg for training < %i, please provide more (try at least "
240 "> 1000, the more the better)" % (args.min_train) 263 "> 1000, the more the better)" % (args.min_train)
264 )
241 # Looking closer at ratios. 265 # Looking closer at ratios.
242 pos_neg_ratio = c_parop_pos_fa / c_parop_neg_fa 266 pos_neg_ratio = c_parop_pos_fa / c_parop_neg_fa
243 if pos_neg_ratio < 0.8 or pos_neg_ratio > 1.25: 267 if pos_neg_ratio < 0.8 or pos_neg_ratio > 1.25:
244 assert 0, "ratio of --opt-pos to --opt-neg < 0.8 or > 1.25 "\ 268 assert 0, (
245 "(ratio = %f). Try to keep ratio closer to 1 or better use "\ 269 "ratio of --opt-pos to --opt-neg < 0.8 or > 1.25 "
246 "identical numbers (keep in mind that performance measures "\ 270 "(ratio = %f). Try to keep ratio closer to 1 or better use "
247 "such as accuracy or AUROC are not suitable for imbalanced "\ 271 "identical numbers (keep in mind that performance measures "
272 "such as accuracy or AUROC are not suitable for imbalanced "
248 " datasets!)" % (pos_neg_ratio) 273 " datasets!)" % (pos_neg_ratio)
274 )
249 else: 275 else:
250 # Define some minimum amount of training sites for the sake of sanity. 276 # Define some minimum amount of training sites for the sake of sanity.
251 c_pos_train = c_pos_fa - args.opt_set_size 277 c_pos_train = c_pos_fa - args.opt_set_size
252 c_neg_train = c_neg_fa - args.opt_set_size 278 c_neg_train = c_neg_fa - args.opt_set_size
253 # Start complaining. 279 # Start complaining.
254 assert c_pos_fa >= args.opt_set_size, \ 280 assert (
255 "# positives < --opt-set-size (%i < %i)" \ 281 c_pos_fa >= args.opt_set_size
256 % (c_pos_fa, args.opt_set_size) 282 ), "# positives < --opt-set-size (%i < %i)" % (c_pos_fa, args.opt_set_size)
257 assert c_neg_fa >= args.opt_set_size, \ 283 assert (
258 "# negatives < --opt-set-size (%i < %i)" \ 284 c_neg_fa >= args.opt_set_size
259 % (c_neg_fa, args.opt_set_size) 285 ), "# negatives < --opt-set-size (%i < %i)" % (c_neg_fa, args.opt_set_size)
260 assert c_pos_train >= args.opt_set_size, \ 286 assert (
261 "# positives remaining for training < --opt-set-size "\ 287 c_pos_train >= args.opt_set_size
262 "(%i < %i)" % (c_pos_train, args.opt_set_size) 288 ), "# positives remaining for training < --opt-set-size " "(%i < %i)" % (
263 assert c_neg_train >= args.opt_set_size, "# negatives remaining "\ 289 c_pos_train,
264 "for training < --opt-set-size (%i < %i)" \ 290 args.opt_set_size,
265 % (c_neg_train, args.opt_set_size) 291 )
292 assert (
293 c_neg_train >= args.opt_set_size
294 ), "# negatives remaining " "for training < --opt-set-size (%i < %i)" % (
295 c_neg_train,
296 args.opt_set_size,
297 )
266 # Less than 500?? You gotta be kidding. 298 # Less than 500?? You gotta be kidding.
267 assert c_pos_train >= args.min_train, \ 299 assert c_pos_train >= args.min_train, (
268 "# positives remaining for training < %i, please provide more "\ 300 "# positives remaining for training < %i, please provide more "
269 " (try at least > 1000, the more the better)" % (args.min_train) 301 " (try at least > 1000, the more the better)" % (args.min_train)
270 assert c_neg_train >= args.min_train, \ 302 )
271 "# negatives remaining for training < %i, please provide more "\ 303 assert c_neg_train >= args.min_train, (
304 "# negatives remaining for training < %i, please provide more "
272 "(try at least > 1000, the more the better)" % (args.min_train) 305 "(try at least > 1000, the more the better)" % (args.min_train)
306 )
273 # Looking closer at ratios. 307 # Looking closer at ratios.
274 pos_neg_ratio = c_pos_train / c_neg_train 308 pos_neg_ratio = c_pos_train / c_neg_train
275 if pos_neg_ratio < 0.8 or pos_neg_ratio > 1.25: 309 if pos_neg_ratio < 0.8 or pos_neg_ratio > 1.25:
276 assert 0, "ratio of --pos to --neg < 0.8 or > 1.25 "\ 310 assert 0, (
277 "(ratio = %f). Try to keep ratio closer to 1 or better use "\ 311 "ratio of --pos to --neg < 0.8 or > 1.25 "
278 "identical numbers (keep in mind that performance measures "\ 312 "(ratio = %f). Try to keep ratio closer to 1 or better use "
279 "such as accuracy or AUROC are not suitable for imbalanced "\ 313 "identical numbers (keep in mind that performance measures "
314 "such as accuracy or AUROC are not suitable for imbalanced "
280 "datasets!)" % (pos_neg_ratio) 315 "datasets!)" % (pos_neg_ratio)
316 )
281 317
282 """ 318 """
283 Generate parop + train .fa output files for hyperparameter 319 Generate parop + train .fa output files for hyperparameter
284 optimization + training. 320 optimization + training.
285 321
297 gplib.make_file_copy(args.opt_neg_fa, neg_parop_fa) 333 gplib.make_file_copy(args.opt_neg_fa, neg_parop_fa)
298 gplib.make_file_copy(args.in_pos_fa, pos_train_fa) 334 gplib.make_file_copy(args.in_pos_fa, pos_train_fa)
299 gplib.make_file_copy(args.in_neg_fa, neg_train_fa) 335 gplib.make_file_copy(args.in_neg_fa, neg_train_fa)
300 else: 336 else:
301 # Generate parop + train .fa files from input .fa files. 337 # Generate parop + train .fa files from input .fa files.
302 gplib.split_fasta_into_test_train_files(args.in_pos_fa, pos_parop_fa, 338 gplib.split_fasta_into_test_train_files(
303 pos_train_fa, 339 args.in_pos_fa, pos_parop_fa, pos_train_fa, test_size=args.opt_set_size
304 test_size=args.opt_set_size) 340 )
305 gplib.split_fasta_into_test_train_files(args.in_neg_fa, neg_parop_fa, 341 gplib.split_fasta_into_test_train_files(
306 neg_train_fa, 342 args.in_neg_fa, neg_parop_fa, neg_train_fa, test_size=args.opt_set_size
307 test_size=args.opt_set_size) 343 )
308 344
309 """ 345 """
310 Do the hyperparameter optimization. 346 Do the hyperparameter optimization.
311 347
312 """ 348 """
313 print("Starting hyperparameter optimization (-action ls) ... ") 349 print("Starting hyperparameter optimization (-action ls) ... ")
314 check_cmd = "GraphProt.pl -action ls -prefix " + args.data_id + \ 350 check_cmd = (
315 " -fasta " + pos_parop_fa + " -negfasta " + neg_parop_fa 351 "GraphProt.pl -action ls -prefix "
352 + args.data_id
353 + " -fasta "
354 + pos_parop_fa
355 + " -negfasta "
356 + neg_parop_fa
357 )
316 # If sequence model should be trained (default). 358 # If sequence model should be trained (default).
317 if not args.train_str_model: 359 if not args.train_str_model:
318 check_cmd += " -onlyseq" 360 check_cmd += " -onlyseq"
319 print(check_cmd) 361 print(check_cmd)
320 output = subprocess.getoutput(check_cmd) 362 output = subprocess.getoutput(check_cmd)
321 params_file = args.data_id + ".params" 363 params_file = args.data_id + ".params"
322 assert os.path.exists(params_file), "Hyperparameter optimization output "\ 364 assert os.path.exists(
323 " .params file \"%s\" not found" % (params_file) 365 params_file
366 ), "Hyperparameter optimization output " ' .params file "%s" not found' % (
367 params_file
368 )
324 # Add model type to params file. 369 # Add model type to params file.
325 if args.train_str_model: 370 if args.train_str_model:
326 gplib.echo_add_to_file("model_type: structure", params_file) 371 gplib.echo_add_to_file("model_type: structure", params_file)
327 else: 372 else:
328 gplib.echo_add_to_file("model_type: sequence", params_file) 373 gplib.echo_add_to_file("model_type: sequence", params_file)
332 """ 377 """
333 Do the model training. (Yowza!) 378 Do the model training. (Yowza!)
334 379
335 """ 380 """
336 print("Starting model training (-action train) ... ") 381 print("Starting model training (-action train) ... ")
337 check_cmd = "GraphProt.pl -action train -prefix " + args.data_id \ 382 check_cmd = (
338 + " -fasta " + pos_train_fa + " -negfasta " + neg_train_fa \ 383 "GraphProt.pl -action train -prefix "
339 + " " + param_string 384 + args.data_id
385 + " -fasta "
386 + pos_train_fa
387 + " -negfasta "
388 + neg_train_fa
389 + " "
390 + param_string
391 )
340 print(check_cmd) 392 print(check_cmd)
341 output = subprocess.getoutput(check_cmd) 393 output = subprocess.getoutput(check_cmd)
342 assert output, \ 394 assert output, "The following call of GraphProt.pl produced no output:\n%s" % (
343 "The following call of GraphProt.pl produced no output:\n%s" \ 395 check_cmd
344 % (check_cmd) 396 )
345 if args.gp_output: 397 if args.gp_output:
346 print(output) 398 print(output)
347 model_file = args.data_id + ".model" 399 model_file = args.data_id + ".model"
348 assert os.path.exists(model_file), \ 400 assert os.path.exists(model_file), 'Training output .model file "%s" not found' % (
349 "Training output .model file \"%s\" not found" % (model_file) 401 model_file
402 )
350 403
351 """ 404 """
352 Do the 10-fold cross validation. 405 Do the 10-fold cross validation.
353 406
354 """ 407 """
355 if not args.disable_cv: 408 if not args.disable_cv:
356 print("Starting 10-fold cross validation (-action cv) ... ") 409 print("Starting 10-fold cross validation (-action cv) ... ")
357 check_cmd = "GraphProt.pl -action cv -prefix " + args.data_id \ 410 check_cmd = (
358 + " -fasta " + pos_train_fa + " -negfasta " + neg_train_fa \ 411 "GraphProt.pl -action cv -prefix "
359 + " " + param_string + " -model " + model_file 412 + args.data_id
413 + " -fasta "
414 + pos_train_fa
415 + " -negfasta "
416 + neg_train_fa
417 + " "
418 + param_string
419 + " -model "
420 + model_file
421 )
360 print(check_cmd) 422 print(check_cmd)
361 output = subprocess.getoutput(check_cmd) 423 output = subprocess.getoutput(check_cmd)
362 assert output, \ 424 assert output, "The following call of GraphProt.pl produced no output:\n%s" % (
363 "The following call of GraphProt.pl produced no output:\n%s" \ 425 check_cmd
364 % (check_cmd) 426 )
365 if args.gp_output: 427 if args.gp_output:
366 print(output) 428 print(output)
367 cv_results_file = args.data_id + ".cv_results" 429 cv_results_file = args.data_id + ".cv_results"
368 assert os.path.exists(cv_results_file), \ 430 assert os.path.exists(
369 "CV output .cv_results file \"%s\" not found" % (cv_results_file) 431 cv_results_file
432 ), 'CV output .cv_results file "%s" not found' % (cv_results_file)
370 433
371 """ 434 """
372 Do the motif generation. 435 Do the motif generation.
373 436
374 """ 437 """
375 if not args.disable_motifs: 438 if not args.disable_motifs:
376 print("Starting motif generation (-action motif) ... ") 439 print("Starting motif generation (-action motif) ... ")
377 check_cmd = "GraphProt.pl -action motif -prefix " + args.data_id \ 440 check_cmd = (
378 + " -fasta " + pos_train_fa + " -negfasta " + neg_train_fa \ 441 "GraphProt.pl -action motif -prefix "
379 + " " + param_string + " -model " + model_file 442 + args.data_id
443 + " -fasta "
444 + pos_train_fa
445 + " -negfasta "
446 + neg_train_fa
447 + " "
448 + param_string
449 + " -model "
450 + model_file
451 )
380 print(check_cmd) 452 print(check_cmd)
381 output = subprocess.getoutput(check_cmd) 453 output = subprocess.getoutput(check_cmd)
382 assert output, \ 454 assert output, "The following call of GraphProt.pl produced no output:\n%s" % (
383 "The following call of GraphProt.pl produced no output:\n%s" \ 455 check_cmd
384 % (check_cmd) 456 )
385 if args.gp_output: 457 if args.gp_output:
386 print(output) 458 print(output)
387 seq_motif_file = args.data_id + ".sequence_motif" 459 seq_motif_file = args.data_id + ".sequence_motif"
388 seq_motif_png_file = args.data_id + ".sequence_motif.png" 460 seq_motif_png_file = args.data_id + ".sequence_motif.png"
389 assert os.path.exists(seq_motif_file), \ 461 assert os.path.exists(
390 "Motif output .sequence_motif file \"%s\" not found" \ 462 seq_motif_file
391 % (seq_motif_file) 463 ), 'Motif output .sequence_motif file "%s" not found' % (seq_motif_file)
392 assert os.path.exists(seq_motif_png_file), \ 464 assert os.path.exists(
393 "Motif output .sequence_motif.png file \"%s\" not found" \ 465 seq_motif_png_file
394 % (seq_motif_png_file) 466 ), 'Motif output .sequence_motif.png file "%s" not found' % (seq_motif_png_file)
395 if args.train_str_model: 467 if args.train_str_model:
396 str_motif_file = args.data_id + ".structure_motif" 468 str_motif_file = args.data_id + ".structure_motif"
397 str_motif_png_file = args.data_id + ".structure_motif.png" 469 str_motif_png_file = args.data_id + ".structure_motif.png"
398 assert os.path.exists(str_motif_file), \ 470 assert os.path.exists(
399 "Motif output .structure_motif file \"%s\" not found" \ 471 str_motif_file
400 % (str_motif_file) 472 ), 'Motif output .structure_motif file "%s" not found' % (str_motif_file)
401 assert os.path.exists(str_motif_png_file), \ 473 assert os.path.exists(
402 "Motif output .structure_motif.png file \"%s\" not found" \ 474 str_motif_png_file
403 % (str_motif_png_file) 475 ), 'Motif output .structure_motif.png file "%s" not found' % (
476 str_motif_png_file
477 )
404 478
405 """ 479 """
406 Do whole site predictions on positive training set. 480 Do whole site predictions on positive training set.
407 481
408 """ 482 """
409 print("Starting whole site predictions on positive training set " 483 print(
410 " (-action predict) ... ") 484 "Starting whole site predictions on positive training set "
411 check_cmd = "GraphProt.pl -action predict -prefix " + args.data_id \ 485 " (-action predict) ... "
412 + " -fasta " + pos_train_fa + " " + param_string \ 486 )
413 + " -model " + model_file 487 check_cmd = (
488 "GraphProt.pl -action predict -prefix "
489 + args.data_id
490 + " -fasta "
491 + pos_train_fa
492 + " "
493 + param_string
494 + " -model "
495 + model_file
496 )
414 print(check_cmd) 497 print(check_cmd)
415 output = subprocess.getoutput(check_cmd) 498 output = subprocess.getoutput(check_cmd)
416 assert output, \ 499 assert output, "The following call of GraphProt.pl produced no output:\n%s" % (
417 "The following call of GraphProt.pl produced no output:\n%s" \ 500 check_cmd
418 % (check_cmd) 501 )
419 if args.gp_output: 502 if args.gp_output:
420 print(output) 503 print(output)
421 ws_predictions_file = args.data_id + ".predictions" 504 ws_predictions_file = args.data_id + ".predictions"
422 assert os.path.exists(ws_predictions_file), \ 505 assert os.path.exists(
423 "Whole site prediction output .predictions file \"%s\" not found" \ 506 ws_predictions_file
424 % (ws_predictions_file) 507 ), 'Whole site prediction output .predictions file "%s" not found' % (
508 ws_predictions_file
509 )
425 510
426 """ 511 """
427 Do profile predictions on positive training set. 512 Do profile predictions on positive training set.
428 513
429 """ 514 """
430 print("Starting profile predictions on positive training set " 515 print(
431 "-action predict_profile) ... ") 516 "Starting profile predictions on positive training set "
432 check_cmd = "GraphProt.pl -action predict_profile -prefix " \ 517 "-action predict_profile) ... "
433 + args.data_id + " -fasta " + pos_train_fa + " " \ 518 )
434 + param_string + " -model " + model_file 519 check_cmd = (
520 "GraphProt.pl -action predict_profile -prefix "
521 + args.data_id
522 + " -fasta "
523 + pos_train_fa
524 + " "
525 + param_string
526 + " -model "
527 + model_file
528 )
435 print(check_cmd) 529 print(check_cmd)
436 output = subprocess.getoutput(check_cmd) 530 output = subprocess.getoutput(check_cmd)
437 assert output, \ 531 assert output, "The following call of GraphProt.pl produced no output:\n%s" % (
438 "The following call of GraphProt.pl produced no output:\n%s" \ 532 check_cmd
439 % (check_cmd) 533 )
440 if args.gp_output: 534 if args.gp_output:
441 print(output) 535 print(output)
442 profile_predictions_file = args.data_id + ".profile" 536 profile_predictions_file = args.data_id + ".profile"
443 assert os.path.exists(profile_predictions_file), \ 537 assert os.path.exists(
444 "Profile prediction output .profile file \"%s\" not found" \ 538 profile_predictions_file
445 % (profile_predictions_file) 539 ), 'Profile prediction output .profile file "%s" not found' % (
540 profile_predictions_file
541 )
446 542
447 """ 543 """
448 Get 50 % score (median) for .predictions and .profile file. 544 Get 50 % score (median) for .predictions and .profile file.
449 For .profile, first extract for each site the maximum score, and then 545 For .profile, first extract for each site the maximum score, and then
450 from the list of maximum site scores get the median. 546 from the list of maximum site scores get the median.
452 548
453 """ 549 """
454 print("Getting .profile and .predictions median scores ... ") 550 print("Getting .profile and .predictions median scores ... ")
455 551
456 # Whole site scores median. 552 # Whole site scores median.
457 ws_pred_median = \ 553 ws_pred_median = gplib.graphprot_predictions_get_median(ws_predictions_file)
458 gplib.graphprot_predictions_get_median(ws_predictions_file)
459 # Profile top site scores median. 554 # Profile top site scores median.
460 profile_median = \ 555 profile_median = gplib.graphprot_profile_get_tsm(
461 gplib.graphprot_profile_get_tsm(profile_predictions_file, 556 profile_predictions_file, profile_type="profile"
462 profile_type="profile") 557 )
463 ws_pred_string = "pos_train_ws_pred_median: %f" % (ws_pred_median) 558 ws_pred_string = "pos_train_ws_pred_median: %f" % (ws_pred_median)
464 profile_string = "pos_train_profile_median: %f" % (profile_median) 559 profile_string = "pos_train_profile_median: %f" % (profile_median)
465 gplib.echo_add_to_file(ws_pred_string, params_file) 560 gplib.echo_add_to_file(ws_pred_string, params_file)
466 gplib.echo_add_to_file(profile_string, params_file) 561 gplib.echo_add_to_file(profile_string, params_file)
467 # Average profile top site scores median for extlr 1 to 10. 562 # Average profile top site scores median for extlr 1 to 10.
468 for i in range(10): 563 for i in range(10):
469 i += 1 564 i += 1
470 avg_profile_median = \ 565 avg_profile_median = gplib.graphprot_profile_get_tsm(
471 gplib.graphprot_profile_get_tsm(profile_predictions_file, 566 profile_predictions_file, profile_type="avg_profile", avg_profile_extlr=i
472 profile_type="avg_profile", 567 )
473 avg_profile_extlr=i) 568
474 569 avg_profile_string = "pos_train_avg_profile_median_%i: %f" % (
475 avg_profile_string = "pos_train_avg_profile_median_%i: %f" \ 570 i,
476 % (i, avg_profile_median) 571 avg_profile_median,
572 )
477 gplib.echo_add_to_file(avg_profile_string, params_file) 573 gplib.echo_add_to_file(avg_profile_string, params_file)
478 574
479 print("Script: I'm done.") 575 print("Script: I'm done.")
480 print("Author: Good. Now go back to your file system directory.") 576 print("Author: Good. Now go back to your file system directory.")
481 print("Script: Ok.") 577 print("Script: Ok.")