comparison graphprot_train_wrapper.py @ 1:20429f4c1b95 draft

"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/rna_tools/graphprot commit f3fb925b83a4982e0cf9a0c11ff93ecbb8e4e6d5"
author bgruening
date Wed, 22 Jan 2020 10:14:41 -0500
parents
children ace92c9a4653
comparison
equal deleted inserted replaced
0:215925e588c4 1:20429f4c1b95
1 #!/usr/bin/env python3
2
3 import subprocess
4 import argparse
5 import shutil
6 import gplib
7 import gzip
8 import sys
9 import os
10
11
12 """
13
14 TOOL DEPENDENCIES
15 =================
16
17 GraphProt 1.1.7
18 Best install via:
19 https://anaconda.org/bioconda/graphprot
20 Tested with: miniconda3, conda 4.7.12
21
22
23 OUTPUT FILES
24 ============
25
26 data_id.model
27 data_id.params
28 if not --disable-cv:
29 data_id.cv_results
30 if not --disable-motifs:
31 data_id.sequence_motif
32 data_id.sequence_motif.png
33 if --str-model:
34 data_id.structure_motif
35 data_id.structure_motif.png
36 Temporary:
37 data_id.predictions
38 data_id.profile
39
40
41 --opt-set-size int Hyperparameter optimization set size (taken away from both --pos and --neg) (default: 500)
42 --opt-pos str Positive (= binding site) sequences .fa file for hyperparameter optimization (default: take
43 --opt-set-size from --pos)
44 --opt-neg str Negative sequences .fa file for hyperparameter optimization (default: take --opt-set-size
45 from --neg)
46 --min-train int Minimum amount of training sites demanded (default: 500)
47 --disable-cv Disable cross validation step (default: false)
48 --disable-motifs Disable motif generation step (default: false)
49 --gp-output Print output produced by GraphProt (default: false)
50 --str-model Train a structure model (default: train a sequence model)
51
52
53 EXAMPLE CALLS
54 =============
55
56 python graphprot_train_wrapper.py --pos gp_data/SERBP1_positives.train.fa --neg gp_data/SERBP1_negatives.train.fa --data-id test2 --disable-cv --gp-output --opt-set-size 200 --min-train 400
57
58 python graphprot_train_wrapper.py --pos gp_data/SERBP1_positives.train.fa --neg gp_data/SERBP1_negatives.train.fa --data-id test2 --disable-cv --opt-set-size 100 --min-train 200
59
60 python graphprot_train_wrapper.py --pos test-data/test_positives.train.fa --neg test-data/test_negatives.train.fa --data-id gptest2 --disable-cv --opt-pos test-data/test_positives.parop.fa --opt-neg test-data/test_negatives.parop.fa
61
62 python graphprot_train_wrapper.py --pos test-data/test_positives.train.fa --neg test-data/test_negatives.train.fa --data-id gptest2 --disable-cv --disable-motifs --opt-pos test-data/test_positives.parop.fa --opt-neg test-data/test_negatives.parop.fa
63
64
65 """
66
67 ################################################################################
68
def setup_argument_parser():
    """Set up and return the argparse parser for this wrapper script.

    Defines the required arguments (--pos, --neg, --data-id) and the
    optional arguments controlling hyperparameter optimization set sizes,
    cross validation, motif generation, GraphProt output printing, and
    model type (sequence vs. structure).

    Returns:
        argparse.ArgumentParser: fully configured parser.
    """
    help_description = """
Galaxy wrapper script for GraphProt to train a GraphProt model on
a given set of input sequences (positives and negatives .fa). By
default a sequence model is trained (due to structure models
being much slower to train). Also by default take a portion of
the input sequences for hyperparameter optimization (HPO) prior to
model training, and run a 10-fold cross validation and motif
generation after model training. Thus the following output
files are produced:
.model model file, .params model parameter file, .png motif files
(sequence, or sequence+structure), .cv_results CV results file.
After model training, predict on positives to get highest whole
site and profile scores found in binding sites. Take the median
score out of these to store in .params file, using it later
for outputting binding sites or peaks with higher confidence.

"""
    # Define argument parser. add_help=False so -h/--help can be placed
    # inside the OPTIONAL ARGUMENTS group below instead of the default group.
    p = argparse.ArgumentParser(add_help=False,
                                prog="graphprot_train_wrapper.py",
                                description=help_description,
                                formatter_class=argparse.MetavarTypeHelpFormatter)

    # Argument groups.
    p_man = p.add_argument_group("REQUIRED ARGUMENTS")
    p_opt = p.add_argument_group("OPTIONAL ARGUMENTS")

    # Required arguments.
    p_opt.add_argument("-h", "--help",
                       action="help",
                       help="Print help message")
    p_man.add_argument("--pos",
                       dest="in_pos_fa",
                       type=str,
                       required=True,
                       help="Positive (= binding site) sequences .fa file for model training (option -fasta)")
    p_man.add_argument("--neg",
                       dest="in_neg_fa",
                       type=str,
                       required=True,
                       help="Negative sequences .fa file for model training (option -negfasta)")
    p_man.add_argument("--data-id",
                       dest="data_id",
                       type=str,
                       required=True,
                       help="Data ID (option -prefix)")
    # Additional arguments.
    p_opt.add_argument("--opt-set-size",
                       dest="opt_set_size",
                       type=int,
                       default=500,
                       help="Hyperparameter optimization set size (taken away from both --pos and --neg) (default: 500)")
    p_opt.add_argument("--opt-pos",
                       dest="opt_pos_fa",
                       type=str,
                       help="Positive (= binding site) sequences .fa file for hyperparameter optimization (default: take --opt-set-size from --pos)")
    p_opt.add_argument("--opt-neg",
                       dest="opt_neg_fa",
                       type=str,
                       help="Negative sequences .fa file for hyperparameter optimization (default: take --opt-set-size from --neg)")
    p_opt.add_argument("--min-train",
                       dest="min_train",
                       type=int,
                       default=500,
                       help="Minimum amount of training sites demanded (default: 500)")
    p_opt.add_argument("--disable-cv",
                       dest="disable_cv",
                       default=False,
                       action="store_true",
                       help="Disable cross validation step (default: false)")
    p_opt.add_argument("--disable-motifs",
                       dest="disable_motifs",
                       default=False,
                       action="store_true",
                       help="Disable motif generation step (default: false)")
    p_opt.add_argument("--gp-output",
                       dest="gp_output",
                       default=False,
                       action="store_true",
                       help="Print output produced by GraphProt (default: false)")
    p_opt.add_argument("--str-model",
                       dest="train_str_model",
                       default=False,
                       action="store_true",
                       help="Train a structure model (default: train a sequence model)")
    return p
157
158
159 ################################################################################
160
if __name__ == '__main__':

    # Setup argparse.
    parser = setup_argument_parser()
    # Read in command line arguments.
    args = parser.parse_args()

    """
    Do all sorts of sanity checking.

    """
    # Check for Linux. GraphProt.pl is only expected to run on Linux.
    assert "linux" in sys.platform, "please use Linux"
    # Check tool availability.
    # NOTE(review): gplib.is_tool presumably performs a PATH lookup — confirm in gplib.
    assert gplib.is_tool("GraphProt.pl"), "GraphProt.pl not in PATH"
    # Check file inputs.
    assert os.path.exists(args.in_pos_fa), "positives .fa file \"%s\" not found" %(args.in_pos_fa)
    assert os.path.exists(args.in_neg_fa), "negatives .fa file \"%s\" not found" %(args.in_neg_fa)
    # Count .fa entries (number of FASTA headers = number of sequences).
    c_pos_fa = gplib.count_fasta_headers(args.in_pos_fa)
    c_neg_fa = gplib.count_fasta_headers(args.in_neg_fa)
    assert c_pos_fa, "positives .fa file \"%s\" no headers found" %(args.in_pos_fa)
    assert c_neg_fa, "negatives .fa file \"%s\" no headers found" %(args.in_neg_fa)
    print("# positive .fa sequences: %i" %(c_pos_fa))
    print("# negative .fa sequences: %i" %(c_neg_fa))
    # Check additional files: --opt-pos and --opt-neg must be given together.
    if args.opt_pos_fa:
        assert args.opt_neg_fa, "--opt-pos but no --opt-neg given"
    if args.opt_neg_fa:
        assert args.opt_pos_fa, "--opt-neg but no --opt-pos given"
    # If parop (= parameter optimization) .fa files given, the full --pos/--neg
    # sets are used for training; otherwise --opt-set-size sequences are split
    # off from each for HPO (see else branch).
    if args.opt_pos_fa and args.opt_neg_fa:
        c_parop_pos_fa = gplib.count_fasta_headers(args.opt_pos_fa)
        c_parop_neg_fa = gplib.count_fasta_headers(args.opt_neg_fa)
        assert c_parop_pos_fa, "--opt-pos .fa file \"%s\" no headers found" %(args.opt_pos_fa)
        assert c_parop_neg_fa, "--opt-neg .fa file \"%s\" no headers found" %(args.opt_neg_fa)
        # Less than 500 for training?? You gotta be kidding.
        assert c_pos_fa >= args.min_train, "--pos for training < %i, please provide more (try at least > 1000, the more the better)" %(args.min_train)
        assert c_neg_fa >= args.min_train, "--neg for training < %i, please provide more (try at least > 1000, the more the better)" %(args.min_train)
        # Looking closer at ratios: reject strongly imbalanced HPO sets.
        pos_neg_ratio = c_parop_pos_fa / c_parop_neg_fa
        if pos_neg_ratio < 0.8 or pos_neg_ratio > 1.25:
            assert 0, "ratio of --opt-pos to --opt-neg < 0.8 or > 1.25 (ratio = %f). Try to keep ratio closer to 1 or better use identical numbers (keep in mind that performance measures such as accuracy or AUROC are not suitable for imbalanced datasets!)" %(pos_neg_ratio)
    else:
        # Define some minimum amount of training sites for the sake of sanity.
        # Training counts = input counts minus the HPO set taken away from each.
        c_pos_train = c_pos_fa - args.opt_set_size
        c_neg_train = c_neg_fa - args.opt_set_size
        # Start complaining.
        assert c_pos_fa >= args.opt_set_size, "# positives < --opt-set-size (%i < %i)" %(c_pos_fa, args.opt_set_size)
        assert c_neg_fa >= args.opt_set_size, "# negatives < --opt-set-size (%i < %i)" %(c_neg_fa, args.opt_set_size)
        assert c_pos_train >= args.opt_set_size, "# positives remaining for training < --opt-set-size (%i < %i)" %(c_pos_train, args.opt_set_size)
        assert c_neg_train >= args.opt_set_size, "# negatives remaining for training < --opt-set-size (%i < %i)" %(c_neg_train, args.opt_set_size)
        # Less than 500?? You gotta be kidding.
        assert c_pos_train >= args.min_train, "# positives remaining for training < %i, please provide more (try at least > 1000, the more the better)" %(args.min_train)
        assert c_neg_train >= args.min_train, "# negatives remaining for training < %i, please provide more (try at least > 1000, the more the better)" %(args.min_train)
        # Looking closer at ratios: reject strongly imbalanced training sets.
        pos_neg_ratio = c_pos_train / c_neg_train
        if pos_neg_ratio < 0.8 or pos_neg_ratio > 1.25:
            assert 0, "ratio of --pos to --neg < 0.8 or > 1.25 (ratio = %f). Try to keep ratio closer to 1 or better use identical numbers (keep in mind that performance measures such as accuracy or AUROC are not suitable for imbalanced datasets!)" %(pos_neg_ratio)

    """
    Generate parop + train .fa output files for hyperparameter optimization + training.

    """
    # Output files for training (written into the current working directory).
    pos_parop_fa = args.data_id + ".positives.parop.fa"
    neg_parop_fa = args.data_id + ".negatives.parop.fa"
    pos_train_fa = args.data_id + ".positives.train.fa"
    neg_train_fa = args.data_id + ".negatives.train.fa"

    # If parop .fa files given.
    if args.opt_pos_fa and args.opt_neg_fa:
        # Just copy parop and train files.
        gplib.make_file_copy(args.opt_pos_fa, pos_parop_fa)
        gplib.make_file_copy(args.opt_neg_fa, neg_parop_fa)
        gplib.make_file_copy(args.in_pos_fa, pos_train_fa)
        gplib.make_file_copy(args.in_neg_fa, neg_train_fa)
    else:
        # Generate parop + train .fa files from input .fa files.
        # NOTE(review): presumably takes test_size sequences into the parop file
        # and the rest into the train file — confirm in gplib.
        gplib.split_fasta_into_test_train_files(args.in_pos_fa, pos_parop_fa, pos_train_fa,
                                                test_size=args.opt_set_size)
        gplib.split_fasta_into_test_train_files(args.in_neg_fa, neg_parop_fa, neg_train_fa,
                                                test_size=args.opt_set_size)

    """
    Do the hyperparameter optimization.

    """
    # NOTE(review): all GraphProt.pl commands below are built via string
    # concatenation and run through the shell (subprocess.getoutput). Fine for
    # trusted Galaxy-supplied paths, but would break / be unsafe with paths
    # containing spaces or shell metacharacters.
    print("Starting hyperparameter optimization (-action ls) ... ")
    check_cmd = "GraphProt.pl -action ls -prefix " + args.data_id + " -fasta " + pos_parop_fa + " -negfasta " + neg_parop_fa
    # If sequence model should be trained (default).
    if not args.train_str_model:
        check_cmd += " -onlyseq"
    print(check_cmd)
    output = subprocess.getoutput(check_cmd)
    # Output check deliberately disabled for this step (GraphProt -action ls
    # may legitimately produce no stdout); existence of the .params file below
    # serves as the success check instead.
    #assert output, "The following call of GraphProt.pl produced no output:\n%s" %(check_cmd)
    #if args.gp_output:
    #    print(output)
    params_file = args.data_id + ".params"
    assert os.path.exists(params_file), "Hyperparameter optimization output .params file \"%s\" not found" %(params_file)
    # Add model type to params file so prediction tools know what was trained.
    if args.train_str_model:
        gplib.echo_add_to_file("model_type: structure", params_file)
    else:
        gplib.echo_add_to_file("model_type: sequence", params_file)
    # Get parameter string (optimized hyperparameters as GraphProt.pl options).
    param_string = gplib.graphprot_get_param_string(params_file)

    """
    Do the model training. (Yowza!)

    """
    print("Starting model training (-action train) ... ")
    check_cmd = "GraphProt.pl -action train -prefix " + args.data_id + " -fasta " + pos_train_fa + " -negfasta " + neg_train_fa + " " + param_string
    print(check_cmd)
    output = subprocess.getoutput(check_cmd)
    assert output, "The following call of GraphProt.pl produced no output:\n%s" %(check_cmd)
    if args.gp_output:
        print(output)
    model_file = args.data_id + ".model"
    assert os.path.exists(model_file), "Training output .model file \"%s\" not found" %(model_file)

    """
    Do the 10-fold cross validation.

    """
    if not args.disable_cv:
        print("Starting 10-fold cross validation (-action cv) ... ")
        check_cmd = "GraphProt.pl -action cv -prefix " + args.data_id + " -fasta " + pos_train_fa + " -negfasta " + neg_train_fa + " " + param_string + " -model " + model_file
        print(check_cmd)
        output = subprocess.getoutput(check_cmd)
        assert output, "The following call of GraphProt.pl produced no output:\n%s" %(check_cmd)
        if args.gp_output:
            print(output)
        cv_results_file = args.data_id + ".cv_results"
        assert os.path.exists(cv_results_file), "CV output .cv_results file \"%s\" not found" %(cv_results_file)

    """
    Do the motif generation.

    """
    if not args.disable_motifs:
        print("Starting motif generation (-action motif) ... ")
        check_cmd = "GraphProt.pl -action motif -prefix " + args.data_id + " -fasta " + pos_train_fa + " -negfasta " + neg_train_fa + " " + param_string + " -model " + model_file
        print(check_cmd)
        output = subprocess.getoutput(check_cmd)
        assert output, "The following call of GraphProt.pl produced no output:\n%s" %(check_cmd)
        if args.gp_output:
            print(output)
        # Sequence motif outputs are always produced; structure motif outputs
        # only exist when a structure model was trained.
        seq_motif_file = args.data_id + ".sequence_motif"
        seq_motif_png_file = args.data_id + ".sequence_motif.png"
        assert os.path.exists(seq_motif_file), "Motif output .sequence_motif file \"%s\" not found" %(seq_motif_file)
        assert os.path.exists(seq_motif_png_file), "Motif output .sequence_motif.png file \"%s\" not found" %(seq_motif_png_file)
        if args.train_str_model:
            str_motif_file = args.data_id + ".structure_motif"
            str_motif_png_file = args.data_id + ".structure_motif.png"
            assert os.path.exists(str_motif_file), "Motif output .structure_motif file \"%s\" not found" %(str_motif_file)
            assert os.path.exists(str_motif_png_file), "Motif output .structure_motif.png file \"%s\" not found" %(str_motif_png_file)

    """
    Do whole site predictions on positive training set.

    """
    print("Starting whole site predictions on positive training set (-action predict) ... ")
    check_cmd = "GraphProt.pl -action predict -prefix " + args.data_id + " -fasta " + pos_train_fa + " " + param_string + " -model " + model_file
    print(check_cmd)
    output = subprocess.getoutput(check_cmd)
    assert output, "The following call of GraphProt.pl produced no output:\n%s" %(check_cmd)
    if args.gp_output:
        print(output)
    ws_predictions_file = args.data_id + ".predictions"
    assert os.path.exists(ws_predictions_file), "Whole site prediction output .predictions file \"%s\" not found" %(ws_predictions_file)

    """
    Do profile predictions on positive training set.

    """
    print("Starting profile predictions on positive training set (-action predict_profile) ... ")
    check_cmd = "GraphProt.pl -action predict_profile -prefix " + args.data_id + " -fasta " + pos_train_fa + " " + param_string + " -model " + model_file
    print(check_cmd)
    output = subprocess.getoutput(check_cmd)
    assert output, "The following call of GraphProt.pl produced no output:\n%s" %(check_cmd)
    if args.gp_output:
        print(output)
    profile_predictions_file = args.data_id + ".profile"
    assert os.path.exists(profile_predictions_file), "Profile prediction output .profile file \"%s\" not found" %(profile_predictions_file)

    """
    Get 50 % score (median) for .predictions and .profile file.
    For .profile, first extract for each site the maximum score, and then
    from the list of maximum site scores get the median.
    For whole site .predictions, get the median from the site scores list.

    """
    print("Getting .profile and .predictions median scores ... ")

    # Whole site scores median.
    ws_pred_median = gplib.graphprot_predictions_get_median(ws_predictions_file)
    # Profile top site scores median.
    profile_median = gplib.graphprot_profile_get_top_scores_median(profile_predictions_file,
                                                                   profile_type="profile")
    # Append the medians to the .params file for later use as confidence
    # thresholds by the prediction wrapper.
    ws_pred_string = "pos_train_ws_pred_median: %f" %(ws_pred_median)
    profile_string = "pos_train_profile_median: %f" %(profile_median)
    gplib.echo_add_to_file(ws_pred_string, params_file)
    gplib.echo_add_to_file(profile_string, params_file)
    # Average profile top site scores median for extlr 1 to 10.
    for i in range(10):
        # NOTE(review): the increment shifts i to 1..10; equivalent to
        # iterating range(1, 11) directly.
        i += 1
        avg_profile_median = gplib.graphprot_profile_get_top_scores_median(profile_predictions_file,
                                                                           profile_type="avg_profile",
                                                                           avg_profile_extlr=i)

        avg_profile_string = "pos_train_avg_profile_median_%i: %f" %(i, avg_profile_median)
        gplib.echo_add_to_file(avg_profile_string, params_file)

    print("Script: I'm done.")
    print("Author: Good. Now go back to your file system directory.")
    print("Script: Ok.")
379
380
381 """
382
383 OLD CODE ...
384
385 p.add_argument("--ap-extlr",
386 dest="ap_extlr",
387 type = int,
388 default = 5,
389 help = "Define average profile up- and downstream extension for averaging scores to produce the average profile. This is used to get the median average profile score, which will be stored in the .params file to later be used in a prediction setting as a second filter value to get more confident peak regions. NOTE that you have to use the same value in model training and prediction! (default: 5)")
390
391
392 p.add_argument("--disable-opt",
393 dest = "disable_opt",
394 default = False,
395 action = "store_true",
396 help = "Disable hyperparameter optimization (HPO) (default: optimize hyperparameters)")
397 p.add_argument("--R",
398 dest = "param_r",
399 type = int,
400 default = False,
401 help = "GraphProt model R parameter (default: determined by HPO)")
402 p.add_argument("--D",
403 dest = "param_d",
404 type = int,
405 default = False,
406 help = "GraphProt model D parameter (default: determined by HPO)")
407 p.add_argument("--epochs",
408 dest = "param_epochs",
409 type = int,
410 default = False,
411 help = "GraphProt model epochs parameter (default: determined by HPO)")
412 p.add_argument("--lambda",
413 dest = "param_lambda",
414 type = float,
415 default = False,
416 help = "GraphProt model lambda parameter (default: determined by HPO)")
417 p.add_argument("--bitsize",
418 dest = "param_bitsize",
419 type = int,
420 default = False,
421 help = "GraphProt model bitsize parameter (default: determined by HPO)")
422 p.add_argument("--abstraction",
423 dest = "param_abstraction",
424 type = int,
425 default = False,
426 help = "GraphProt model RNAshapes abstraction level parameter for training structure models (default: determined by HPO)")
427
428 """
429
430
431