comparison graphprot_train_wrapper.py @ 3:ace92c9a4653 draft

"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/rna_tools/graphprot commit efcac98677c3ea9039c1c61eaa9e58f78287ccb3"
author bgruening
date Wed, 27 Jan 2021 19:27:47 +0000
parents 20429f4c1b95
children ddcf35a868b8
comparison
equal deleted inserted replaced
2:7bbb7bf6304f 3:ace92c9a4653
1 #!/usr/bin/env python3 1 #!/usr/bin/env python3
2 2
3 import argparse as ap
4 import os
3 import subprocess 5 import subprocess
4 import argparse 6 import sys
5 import shutil 7
6 import gplib 8 import gplib
7 import gzip
8 import sys
9 import os
10 9
11 10
12 """ 11 """
13 12
14 TOOL DEPENDENCIES 13 TOOL DEPENDENCIES
36 Temporary: 35 Temporary:
37 data_id.predictions 36 data_id.predictions
38 data_id.profile 37 data_id.profile
39 38
40 39
41 --opt-set-size int Hyperparameter optimization set size (taken away from both --pos and --neg) (default: 500)
42 --opt-pos str Positive (= binding site) sequences .fa file for hyperparameter optimization (default: take
43 --opt-set-size from --pos)
44 --opt-neg str Negative sequences .fa file for hyperparameter optimization (default: take --opt-set-size
45 from --neg)
46 --min-train int Minimum amount of training sites demanded (default: 500)
47 --disable-cv Disable cross validation step (default: false)
48 --disable-motifs Disable motif generation step (default: false)
49 --gp-output Print output produced by GraphProt (default: false)
50 --str-model Train a structure model (default: train a sequence model)
51
52
53 EXAMPLE CALLS 40 EXAMPLE CALLS
54 ============= 41 =============
55 42
56 python graphprot_train_wrapper.py --pos gp_data/SERBP1_positives.train.fa --neg gp_data/SERBP1_negatives.train.fa --data-id test2 --disable-cv --gp-output --opt-set-size 200 --min-train 400 43 python graphprot_train_wrapper.py --pos gp_data/SERBP1_positives.train.fa
57 44 --neg gp_data/SERBP1_negatives.train.fa --data-id test2 --disable-cv
58 python graphprot_train_wrapper.py --pos gp_data/SERBP1_positives.train.fa --neg gp_data/SERBP1_negatives.train.fa --data-id test2 --disable-cv --opt-set-size 100 --min-train 200 45 --gp-output --opt-set-size 200 --min-train 400
59 46
60 python graphprot_train_wrapper.py --pos test-data/test_positives.train.fa --neg test-data/test_negatives.train.fa --data-id gptest2 --disable-cv --opt-pos test-data/test_positives.parop.fa --opt-neg test-data/test_negatives.parop.fa 47 python graphprot_train_wrapper.py --pos gp_data/SERBP1_positives.train.fa
61 48 --neg gp_data/SERBP1_negatives.train.fa --data-id test2 --disable-cv
62 python graphprot_train_wrapper.py --pos test-data/test_positives.train.fa --neg test-data/test_negatives.train.fa --data-id gptest2 --disable-cv --disable-motifs --opt-pos test-data/test_positives.parop.fa --opt-neg test-data/test_negatives.parop.fa 49 --opt-set-size 100 --min-train 200
50
51 python graphprot_train_wrapper.py --pos test-data/test_positives.train.fa
52 --neg test-data/test_negatives.train.fa --data-id gptest2 --disable-cv
53 --opt-pos test-data/test_positives.parop.fa
54 --opt-neg test-data/test_negatives.parop.fa
55
56 python graphprot_train_wrapper.py --pos test-data/test_positives.train.fa
57 --neg test-data/test_negatives.train.fa --data-id gptest2 --disable-cv
58 --disable-motifs --opt-pos test-data/test_positives.parop.fa --opt-neg
59 test-data/test_negatives.parop.fa
63 60
64 61
65 """ 62 """
66 63
67 ################################################################################ 64
65 ###############################################################################
68 66
69 def setup_argument_parser(): 67 def setup_argument_parser():
70 """Setup argparse parser.""" 68 """Setup argparse parser."""
71 help_description = """ 69 help_description = """
72 Galaxy wrapper script for GraphProt to train a GraphProt model on 70 Galaxy wrapper script for GraphProt to train a GraphProt model on
73 a given set of input sequences (positives and negatives .fa). By 71 a given set of input sequences (positives and negatives .fa). By
74 default a sequence model is trained (due to structure models 72 default a sequence model is trained (due to structure models
75 being much slower to train). Also by default take a portion of 73 being much slower to train). Also by default take a portion of
76 the input sequences for hyperparameter optimization (HPO) prior to 74 the input sequences for hyperparameter optimization (HPO) prior to
77 model training, and run a 10-fold cross validation and motif 75 model training, and run a 10-fold cross validation and motif
78 generation after model training. Thus the following output 76 generation after model training. Thus the following output
79 files are produced: 77 files are produced:
80 .model model file, .params model parameter file, .png motif files 78 .model model file, .params model parameter file, .png motif files
81 (sequence, or sequence+structure), .cv_results CV results file. 79 (sequence, or sequence+structure), .cv_results CV results file.
82 After model training, predict on positives to get highest whole 80 After model training, predict on positives to get highest whole
83 site and profile scores found in binding sites. Take the median 81 site and profile scores found in binding sites. Take the median
84 score out of these to store in .params file, using it later 82 score out of these to store in .params file, using it later
85 for outputting binding sites or peaks with higher confidence. 83 for outputting binding sites or peaks with higher confidence.
86 84
87 """ 85 """
88 # Define argument parser. 86 # Define argument parser.
89 p = argparse.ArgumentParser(add_help=False, 87 p = ap.ArgumentParser(add_help=False,
90 prog="graphprot_train_wrapper.py", 88 prog="graphprot_train_wrapper.py",
91 description=help_description, 89 description=help_description,
92 formatter_class=argparse.MetavarTypeHelpFormatter) 90 formatter_class=ap.MetavarTypeHelpFormatter)
93 91
94 # Argument groups. 92 # Argument groups.
95 p_man = p.add_argument_group("REQUIRED ARGUMENTS") 93 p_man = p.add_argument_group("REQUIRED ARGUMENTS")
96 p_opt = p.add_argument_group("OPTIONAL ARGUMENTS") 94 p_opt = p.add_argument_group("OPTIONAL ARGUMENTS")
97 95
98 # Required arguments. 96 # Required arguments.
99 p_opt.add_argument("-h", "--help", 97 p_opt.add_argument("-h", "--help",
100 action="help", 98 action="help",
101 help="Print help message") 99 help="Print help message")
102 p_man.add_argument("--pos", 100 p_man.add_argument("--pos",
103 dest="in_pos_fa", 101 dest="in_pos_fa",
104 type=str, 102 type=str,
105 required = True, 103 required=True,
106 help = "Positive (= binding site) sequences .fa file for model training (option -fasta)") 104 help="Positive (= binding site) sequences .fa file "
105 "for model training (option -fasta)")
107 p_man.add_argument("--neg", 106 p_man.add_argument("--neg",
108 dest="in_neg_fa", 107 dest="in_neg_fa",
109 type=str, 108 type=str,
110 required = True, 109 required=True,
111 help = "Negative sequences .fa file for model training (option -negfasta)") 110 help="Negative sequences .fa file for model "
111 "training (option -negfasta)")
112 p_man.add_argument("--data-id", 112 p_man.add_argument("--data-id",
113 dest="data_id", 113 dest="data_id",
114 type=str, 114 type=str,
115 required = True, 115 required=True,
116 help = "Data ID (option -prefix)") 116 help="Data ID (option -prefix)")
117 # Additional arguments. 117 # Additional arguments.
118 p_opt.add_argument("--opt-set-size", 118 p_opt.add_argument("--opt-set-size",
119 dest="opt_set_size", 119 dest="opt_set_size",
120 type = int, 120 type=int,
121 default = 500, 121 default=500,
122 help = "Hyperparameter optimization set size (taken away from both --pos and --neg) (default: 500)") 122 help="Hyperparameter optimization set size (taken "
123 "away from both --pos and --neg) (default: 500)")
123 p_opt.add_argument("--opt-pos", 124 p_opt.add_argument("--opt-pos",
124 dest="opt_pos_fa", 125 dest="opt_pos_fa",
125 type=str, 126 type=str,
126 help = "Positive (= binding site) sequences .fa file for hyperparameter optimization (default: take --opt-set-size from --pos)") 127 help="Positive (= binding site) sequences .fa file "
128 "for hyperparameter optimization (default: take "
129 "--opt-set-size from --pos)")
127 p_opt.add_argument("--opt-neg", 130 p_opt.add_argument("--opt-neg",
128 dest="opt_neg_fa", 131 dest="opt_neg_fa",
129 type=str, 132 type=str,
130 help = "Negative sequences .fa file for hyperparameter optimization (default: take --opt-set-size from --neg)") 133 help="Negative sequences .fa file for hyperparameter "
134 "optimization (default: take --opt-set-size "
135 "from --neg)")
131 p_opt.add_argument("--min-train", 136 p_opt.add_argument("--min-train",
132 dest="min_train", 137 dest="min_train",
133 type = int, 138 type=int,
134 default = 500, 139 default=500,
135 help = "Minimum amount of training sites demanded (default: 500)") 140 help="Minimum amount of training sites demanded "
141 "(default: 500)")
136 p_opt.add_argument("--disable-cv", 142 p_opt.add_argument("--disable-cv",
137 dest = "disable_cv", 143 dest="disable_cv",
138 default = False, 144 default=False,
139 action = "store_true", 145 action="store_true",
140 help = "Disable cross validation step (default: false)") 146 help="Disable cross validation step (default: false)")
141 p_opt.add_argument("--disable-motifs", 147 p_opt.add_argument("--disable-motifs",
142 dest = "disable_motifs", 148 dest="disable_motifs",
143 default = False, 149 default=False,
144 action = "store_true", 150 action="store_true",
145 help = "Disable motif generation step (default: false)") 151 help="Disable motif generation step (default: false)")
146 p_opt.add_argument("--gp-output", 152 p_opt.add_argument("--gp-output",
147 dest = "gp_output", 153 dest="gp_output",
148 default = False, 154 default=False,
149 action = "store_true", 155 action="store_true",
150 help = "Print output produced by GraphProt (default: false)") 156 help="Print output produced by GraphProt "
157 "(default: false)")
151 p_opt.add_argument("--str-model", 158 p_opt.add_argument("--str-model",
152 dest = "train_str_model", 159 dest="train_str_model",
153 default = False, 160 default=False,
154 action = "store_true", 161 action="store_true",
155 help = "Train a structure model (default: train a sequence model)") 162 help="Train a structure model (default: train "
163 "a sequence model)")
156 return p 164 return p
157 165
158 166
159 ################################################################################ 167 ###############################################################################
160 168
161 if __name__ == '__main__': 169 if __name__ == '__main__':
162 170
163 # Setup argparse. 171 # Setup argparse.
164 parser = setup_argument_parser() 172 parser = setup_argument_parser()
165 # Read in command line arguments. 173 # Read in command line arguments.
166 args = parser.parse_args() 174 args = parser.parse_args()
167 175
168 """ 176 """
169 Do all sorts of sanity checking. 177 Do all sorts of sanity checking.
170 178
171 """ 179 """
172 # Check for Linux. 180 # Check for Linux.
173 assert "linux" in sys.platform, "please use Linux" 181 assert "linux" in sys.platform, "please use Linux"
174 # Check tool availability. 182 # Check tool availability.
175 assert gplib.is_tool("GraphProt.pl"), "GraphProt.pl not in PATH" 183 assert gplib.is_tool("GraphProt.pl"), "GraphProt.pl not in PATH"
176 # Check file inputs. 184 # Check file inputs.
177 assert os.path.exists(args.in_pos_fa), "positives .fa file \"%s\" not found" %(args.in_pos_fa) 185 assert os.path.exists(args.in_pos_fa), \
178 assert os.path.exists(args.in_neg_fa), "negatives .fa file \"%s\" not found" %(args.in_neg_fa) 186 "positives .fa file \"%s\" not found" % (args.in_pos_fa)
187 assert os.path.exists(args.in_neg_fa), \
188 "negatives .fa file \"%s\" not found" % (args.in_neg_fa)
179 # Count .fa entries. 189 # Count .fa entries.
180 c_pos_fa = gplib.count_fasta_headers(args.in_pos_fa) 190 c_pos_fa = gplib.count_fasta_headers(args.in_pos_fa)
181 c_neg_fa = gplib.count_fasta_headers(args.in_neg_fa) 191 c_neg_fa = gplib.count_fasta_headers(args.in_neg_fa)
182 assert c_pos_fa, "positives .fa file \"%s\" no headers found" %(args.in_pos_fa) 192 assert c_pos_fa, "positives .fa file \"%s\" no headers found" % \
183 assert c_neg_fa, "negatives .fa file \"%s\" no headers found" %(args.in_neg_fa) 193 (args.in_pos_fa)
184 print("# positive .fa sequences: %i" %(c_pos_fa)) 194 assert c_neg_fa, "negatives .fa file \"%s\" no headers found" % \
185 print("# negative .fa sequences: %i" %(c_neg_fa)) 195 (args.in_neg_fa)
196 print("# positive .fa sequences: %i" % (c_pos_fa))
197 print("# negative .fa sequences: %i" % (c_neg_fa))
186 # Check additional files. 198 # Check additional files.
187 if args.opt_pos_fa: 199 if args.opt_pos_fa:
188 assert args.opt_neg_fa, "--opt-pos but no --opt-neg given" 200 assert args.opt_neg_fa, "--opt-pos but no --opt-neg given"
189 if args.opt_neg_fa: 201 if args.opt_neg_fa:
190 assert args.opt_pos_fa, "--opt-neg but no --opt-pos given" 202 assert args.opt_pos_fa, "--opt-neg but no --opt-pos given"
203 # Check for lowercase only sequences, which cause GP to crash.
204 error_mess = "input sequences encountered containing "\
205 "only lowercase characters or lowercase characters in between "\
206 "uppercase characters. Please provide either all uppercase "\
207 "sequences or sequences containing uppercase regions surrounded "\
208 "by lowercase context regions for structure calculation (see "\
209 "viewpoint concept in original GraphProt publication "\
210 "for more details)"
211 seqs_dic = gplib.read_fasta_into_dic(args.in_pos_fa)
212 bad_ids = gplib.check_seqs_dic_format(seqs_dic)
213 assert not bad_ids, "%s" % (error_mess)
214 seqs_dic = gplib.read_fasta_into_dic(args.in_neg_fa)
215 bad_ids = gplib.check_seqs_dic_format(seqs_dic)
216 assert not bad_ids, "%s" % (error_mess)
217 if args.opt_pos_fa:
218 seqs_dic = gplib.read_fasta_into_dic(args.opt_pos_fa)
219 bad_ids = gplib.check_seqs_dic_format(seqs_dic)
220 assert not bad_ids, "%s" % (error_mess)
221 if args.opt_neg_fa:
222 seqs_dic = gplib.read_fasta_into_dic(args.opt_neg_fa)
223 bad_ids = gplib.check_seqs_dic_format(seqs_dic)
224 assert not bad_ids, "%s" % (error_mess)
225
191 # If parop .fa files given. 226 # If parop .fa files given.
192 if args.opt_pos_fa and args.opt_neg_fa: 227 if args.opt_pos_fa and args.opt_neg_fa:
193 c_parop_pos_fa = gplib.count_fasta_headers(args.opt_pos_fa) 228 c_parop_pos_fa = gplib.count_fasta_headers(args.opt_pos_fa)
194 c_parop_neg_fa = gplib.count_fasta_headers(args.opt_neg_fa) 229 c_parop_neg_fa = gplib.count_fasta_headers(args.opt_neg_fa)
195 assert c_parop_pos_fa, "--opt-pos .fa file \"%s\" no headers found" %(args.opt_pos_fa) 230 assert c_parop_pos_fa, "--opt-pos .fa file \"%s\" no headers found" \
196 assert c_parop_neg_fa, "--opt-neg .fa file \"%s\" no headers found" %(args.opt_neg_fa) 231 % (args.opt_pos_fa)
232 assert c_parop_neg_fa, "--opt-neg .fa file \"%s\" no headers found" \
233 % (args.opt_neg_fa)
197 # Less than 500 for training?? You gotta be kidding. 234 # Less than 500 for training?? You gotta be kidding.
198 assert c_pos_fa >= args.min_train, "--pos for training < %i, please provide more (try at least > 1000, the more the better)" %(args.min_train) 235 assert c_pos_fa >= args.min_train, \
199 assert c_neg_fa >= args.min_train, "--neg for training < %i, please provide more (try at least > 1000, the more the better)" %(args.min_train) 236 "--pos for training < %i, please provide more (try at least "\
237 "> 1000, the more the better)" % (args.min_train)
238 assert c_neg_fa >= args.min_train, \
239 "--neg for training < %i, please provide more (try at least "\
240 "> 1000, the more the better)" % (args.min_train)
200 # Looking closer at ratios. 241 # Looking closer at ratios.
201 pos_neg_ratio = c_parop_pos_fa / c_parop_neg_fa 242 pos_neg_ratio = c_parop_pos_fa / c_parop_neg_fa
202 if pos_neg_ratio < 0.8 or pos_neg_ratio > 1.25: 243 if pos_neg_ratio < 0.8 or pos_neg_ratio > 1.25:
203 assert 0, "ratio of --opt-pos to --opt-neg < 0.8 or > 1.25 (ratio = %f). Try to keep ratio closer to 1 or better use identical numbers (keep in mind that performance measures such as accuracy or AUROC are not suitable for imbalanced datasets!)" %(pos_neg_ratio) 244 assert 0, "ratio of --opt-pos to --opt-neg < 0.8 or > 1.25 "\
245 "(ratio = %f). Try to keep ratio closer to 1 or better use "\
246 "identical numbers (keep in mind that performance measures "\
247 "such as accuracy or AUROC are not suitable for imbalanced "\
248 "datasets!)" % (pos_neg_ratio)
204 else: 249 else:
205 # Define some minimum amount of training sites for the sake of sanity. 250 # Define some minimum amount of training sites for the sake of sanity.
206 c_pos_train = c_pos_fa - args.opt_set_size 251 c_pos_train = c_pos_fa - args.opt_set_size
207 c_neg_train = c_neg_fa - args.opt_set_size 252 c_neg_train = c_neg_fa - args.opt_set_size
208 # Start complaining. 253 # Start complaining.
209 assert c_pos_fa >= args.opt_set_size, "# positives < --opt-set-size (%i < %i)" %(c_pos_fa, args.opt_set_size) 254 assert c_pos_fa >= args.opt_set_size, \
210 assert c_neg_fa >= args.opt_set_size, "# negatives < --opt-set-size (%i < %i)" %(c_neg_fa, args.opt_set_size) 255 "# positives < --opt-set-size (%i < %i)" \
211 assert c_pos_train >= args.opt_set_size, "# positives remaining for training < --opt-set-size (%i < %i)" %(c_pos_train, args.opt_set_size) 256 % (c_pos_fa, args.opt_set_size)
212 assert c_neg_train >= args.opt_set_size, "# negatives remaining for training < --opt-set-size (%i < %i)" %(c_neg_train, args.opt_set_size) 257 assert c_neg_fa >= args.opt_set_size, \
258 "# negatives < --opt-set-size (%i < %i)" \
259 % (c_neg_fa, args.opt_set_size)
260 assert c_pos_train >= args.opt_set_size, \
261 "# positives remaining for training < --opt-set-size "\
262 "(%i < %i)" % (c_pos_train, args.opt_set_size)
263 assert c_neg_train >= args.opt_set_size, "# negatives remaining "\
264 "for training < --opt-set-size (%i < %i)" \
265 % (c_neg_train, args.opt_set_size)
213 # Less than 500?? You gotta be kidding. 266 # Less than 500?? You gotta be kidding.
214 assert c_pos_train >= args.min_train, "# positives remaining for training < %i, please provide more (try at least > 1000, the more the better)" %(args.min_train) 267 assert c_pos_train >= args.min_train, \
215 assert c_neg_train >= args.min_train, "# negatives remaining for training < %i, please provide more (try at least > 1000, the more the better)" %(args.min_train) 268 "# positives remaining for training < %i, please provide more "\
269 "(try at least > 1000, the more the better)" % (args.min_train)
270 assert c_neg_train >= args.min_train, \
271 "# negatives remaining for training < %i, please provide more "\
272 "(try at least > 1000, the more the better)" % (args.min_train)
216 # Looking closer at ratios. 273 # Looking closer at ratios.
217 pos_neg_ratio = c_pos_train / c_neg_train 274 pos_neg_ratio = c_pos_train / c_neg_train
218 if pos_neg_ratio < 0.8 or pos_neg_ratio > 1.25: 275 if pos_neg_ratio < 0.8 or pos_neg_ratio > 1.25:
219 assert 0, "ratio of --pos to --neg < 0.8 or > 1.25 (ratio = %f). Try to keep ratio closer to 1 or better use identical numbers (keep in mind that performance measures such as accuracy or AUROC are not suitable for imbalanced datasets!)" %(pos_neg_ratio) 276 assert 0, "ratio of --pos to --neg < 0.8 or > 1.25 "\
220 277 "(ratio = %f). Try to keep ratio closer to 1 or better use "\
221 """ 278 "identical numbers (keep in mind that performance measures "\
222 Generate parop + train .fa output files for hyperparameter optimization + training. 279 "such as accuracy or AUROC are not suitable for imbalanced "\
223 280 "datasets!)" % (pos_neg_ratio)
281
282 """
283 Generate parop + train .fa output files for hyperparameter
284 optimization + training.
285
224 """ 286 """
225 # Output files for training. 287 # Output files for training.
226 pos_parop_fa = args.data_id + ".positives.parop.fa" 288 pos_parop_fa = args.data_id + ".positives.parop.fa"
227 neg_parop_fa = args.data_id + ".negatives.parop.fa" 289 neg_parop_fa = args.data_id + ".negatives.parop.fa"
228 pos_train_fa = args.data_id + ".positives.train.fa" 290 pos_train_fa = args.data_id + ".positives.train.fa"
235 gplib.make_file_copy(args.opt_neg_fa, neg_parop_fa) 297 gplib.make_file_copy(args.opt_neg_fa, neg_parop_fa)
236 gplib.make_file_copy(args.in_pos_fa, pos_train_fa) 298 gplib.make_file_copy(args.in_pos_fa, pos_train_fa)
237 gplib.make_file_copy(args.in_neg_fa, neg_train_fa) 299 gplib.make_file_copy(args.in_neg_fa, neg_train_fa)
238 else: 300 else:
239 # Generate parop + train .fa files from input .fa files. 301 # Generate parop + train .fa files from input .fa files.
240 gplib.split_fasta_into_test_train_files(args.in_pos_fa, pos_parop_fa, pos_train_fa, 302 gplib.split_fasta_into_test_train_files(args.in_pos_fa, pos_parop_fa,
241 test_size=args.opt_set_size) 303 pos_train_fa,
242 gplib.split_fasta_into_test_train_files(args.in_neg_fa, neg_parop_fa, neg_train_fa, 304 test_size=args.opt_set_size)
243 test_size=args.opt_set_size) 305 gplib.split_fasta_into_test_train_files(args.in_neg_fa, neg_parop_fa,
306 neg_train_fa,
307 test_size=args.opt_set_size)
244 308
245 """ 309 """
246 Do the hyperparameter optimization. 310 Do the hyperparameter optimization.
247 311
248 """ 312 """
249 print("Starting hyperparameter optimization (-action ls) ... ") 313 print("Starting hyperparameter optimization (-action ls) ... ")
250 check_cmd = "GraphProt.pl -action ls -prefix " + args.data_id + " -fasta " + pos_parop_fa + " -negfasta " + neg_parop_fa 314 check_cmd = "GraphProt.pl -action ls -prefix " + args.data_id + \
315 " -fasta " + pos_parop_fa + " -negfasta " + neg_parop_fa
251 # If sequence model should be trained (default). 316 # If sequence model should be trained (default).
252 if not args.train_str_model: 317 if not args.train_str_model:
253 check_cmd += " -onlyseq" 318 check_cmd += " -onlyseq"
254 print(check_cmd) 319 print(check_cmd)
255 output = subprocess.getoutput(check_cmd) 320 output = subprocess.getoutput(check_cmd)
256 #assert output, "The following call of GraphProt.pl produced no output:\n%s" %(check_cmd)
257 #if args.gp_output:
258 # print(output)
259 params_file = args.data_id + ".params" 321 params_file = args.data_id + ".params"
260 assert os.path.exists(params_file), "Hyperparameter optimization output .params file \"%s\" not found" %(params_file) 322 assert os.path.exists(params_file), "Hyperparameter optimization output "\
323 ".params file \"%s\" not found" % (params_file)
261 # Add model type to params file. 324 # Add model type to params file.
262 if args.train_str_model: 325 if args.train_str_model:
263 gplib.echo_add_to_file("model_type: structure", params_file) 326 gplib.echo_add_to_file("model_type: structure", params_file)
264 else: 327 else:
265 gplib.echo_add_to_file("model_type: sequence", params_file) 328 gplib.echo_add_to_file("model_type: sequence", params_file)
266 # Get parameter string. 329 # Get parameter string.
267 param_string = gplib.graphprot_get_param_string(params_file) 330 param_string = gplib.graphprot_get_param_string(params_file)
268 331
269 """ 332 """
270 Do the model training. (Yowza!) 333 Do the model training. (Yowza!)
271 334
272 """ 335 """
273 print("Starting model training (-action train) ... ") 336 print("Starting model training (-action train) ... ")
274 check_cmd = "GraphProt.pl -action train -prefix " + args.data_id + " -fasta " + pos_train_fa + " -negfasta " + neg_train_fa + " " + param_string 337 check_cmd = "GraphProt.pl -action train -prefix " + args.data_id \
338 + " -fasta " + pos_train_fa + " -negfasta " + neg_train_fa \
339 + " " + param_string
275 print(check_cmd) 340 print(check_cmd)
276 output = subprocess.getoutput(check_cmd) 341 output = subprocess.getoutput(check_cmd)
277 assert output, "The following call of GraphProt.pl produced no output:\n%s" %(check_cmd) 342 assert output, \
343 "The following call of GraphProt.pl produced no output:\n%s" \
344 % (check_cmd)
278 if args.gp_output: 345 if args.gp_output:
279 print(output) 346 print(output)
280 model_file = args.data_id + ".model" 347 model_file = args.data_id + ".model"
281 assert os.path.exists(model_file), "Training output .model file \"%s\" not found" %(model_file) 348 assert os.path.exists(model_file), \
349 "Training output .model file \"%s\" not found" % (model_file)
282 350
283 """ 351 """
284 Do the 10-fold cross validation. 352 Do the 10-fold cross validation.
285 353
286 """ 354 """
287 if not args.disable_cv: 355 if not args.disable_cv:
288 print("Starting 10-fold cross validation (-action cv) ... ") 356 print("Starting 10-fold cross validation (-action cv) ... ")
289 check_cmd = "GraphProt.pl -action cv -prefix " + args.data_id + " -fasta " + pos_train_fa + " -negfasta " + neg_train_fa + " " + param_string + " -model " + model_file 357 check_cmd = "GraphProt.pl -action cv -prefix " + args.data_id \
358 + " -fasta " + pos_train_fa + " -negfasta " + neg_train_fa \
359 + " " + param_string + " -model " + model_file
290 print(check_cmd) 360 print(check_cmd)
291 output = subprocess.getoutput(check_cmd) 361 output = subprocess.getoutput(check_cmd)
292 assert output, "The following call of GraphProt.pl produced no output:\n%s" %(check_cmd) 362 assert output, \
363 "The following call of GraphProt.pl produced no output:\n%s" \
364 % (check_cmd)
293 if args.gp_output: 365 if args.gp_output:
294 print(output) 366 print(output)
295 cv_results_file = args.data_id + ".cv_results" 367 cv_results_file = args.data_id + ".cv_results"
296 assert os.path.exists(cv_results_file), "CV output .cv_results file \"%s\" not found" %(cv_results_file) 368 assert os.path.exists(cv_results_file), \
369 "CV output .cv_results file \"%s\" not found" % (cv_results_file)
297 370
298 """ 371 """
299 Do the motif generation. 372 Do the motif generation.
300 373
301 """ 374 """
302 if not args.disable_motifs: 375 if not args.disable_motifs:
303 print("Starting motif generation (-action motif) ... ") 376 print("Starting motif generation (-action motif) ... ")
304 check_cmd = "GraphProt.pl -action motif -prefix " + args.data_id + " -fasta " + pos_train_fa + " -negfasta " + neg_train_fa + " " + param_string + " -model " + model_file 377 check_cmd = "GraphProt.pl -action motif -prefix " + args.data_id \
378 + " -fasta " + pos_train_fa + " -negfasta " + neg_train_fa \
379 + " " + param_string + " -model " + model_file
305 print(check_cmd) 380 print(check_cmd)
306 output = subprocess.getoutput(check_cmd) 381 output = subprocess.getoutput(check_cmd)
307 assert output, "The following call of GraphProt.pl produced no output:\n%s" %(check_cmd) 382 assert output, \
383 "The following call of GraphProt.pl produced no output:\n%s" \
384 % (check_cmd)
308 if args.gp_output: 385 if args.gp_output:
309 print(output) 386 print(output)
310 seq_motif_file = args.data_id + ".sequence_motif" 387 seq_motif_file = args.data_id + ".sequence_motif"
311 seq_motif_png_file = args.data_id + ".sequence_motif.png" 388 seq_motif_png_file = args.data_id + ".sequence_motif.png"
312 assert os.path.exists(seq_motif_file), "Motif output .sequence_motif file \"%s\" not found" %(seq_motif_file) 389 assert os.path.exists(seq_motif_file), \
313 assert os.path.exists(seq_motif_png_file), "Motif output .sequence_motif.png file \"%s\" not found" %(seq_motif_png_file) 390 "Motif output .sequence_motif file \"%s\" not found" \
391 % (seq_motif_file)
392 assert os.path.exists(seq_motif_png_file), \
393 "Motif output .sequence_motif.png file \"%s\" not found" \
394 % (seq_motif_png_file)
314 if args.train_str_model: 395 if args.train_str_model:
315 str_motif_file = args.data_id + ".structure_motif" 396 str_motif_file = args.data_id + ".structure_motif"
316 str_motif_png_file = args.data_id + ".structure_motif.png" 397 str_motif_png_file = args.data_id + ".structure_motif.png"
317 assert os.path.exists(str_motif_file), "Motif output .structure_motif file \"%s\" not found" %(str_motif_file) 398 assert os.path.exists(str_motif_file), \
318 assert os.path.exists(str_motif_png_file), "Motif output .structure_motif.png file \"%s\" not found" %(str_motif_png_file) 399 "Motif output .structure_motif file \"%s\" not found" \
400 % (str_motif_file)
401 assert os.path.exists(str_motif_png_file), \
402 "Motif output .structure_motif.png file \"%s\" not found" \
403 % (str_motif_png_file)
319 404
320 """ 405 """
321 Do whole site predictions on positive training set. 406 Do whole site predictions on positive training set.
322 407
323 """ 408 """
324 print("Starting whole site predictions on positive training set (-action predict) ... ") 409 print("Starting whole site predictions on positive training set "
325 check_cmd = "GraphProt.pl -action predict -prefix " + args.data_id + " -fasta " + pos_train_fa + " " + param_string + " -model " + model_file 410 "(-action predict) ... ")
411 check_cmd = "GraphProt.pl -action predict -prefix " + args.data_id \
412 + " -fasta " + pos_train_fa + " " + param_string \
413 + " -model " + model_file
326 print(check_cmd) 414 print(check_cmd)
327 output = subprocess.getoutput(check_cmd) 415 output = subprocess.getoutput(check_cmd)
328 assert output, "The following call of GraphProt.pl produced no output:\n%s" %(check_cmd) 416 assert output, \
417 "The following call of GraphProt.pl produced no output:\n%s" \
418 % (check_cmd)
329 if args.gp_output: 419 if args.gp_output:
330 print(output) 420 print(output)
331 ws_predictions_file = args.data_id + ".predictions" 421 ws_predictions_file = args.data_id + ".predictions"
332 assert os.path.exists(ws_predictions_file), "Whole site prediction output .predictions file \"%s\" not found" %(ws_predictions_file) 422 assert os.path.exists(ws_predictions_file), \
423 "Whole site prediction output .predictions file \"%s\" not found" \
424 % (ws_predictions_file)
333 425
334 """ 426 """
335 Do profile predictions on positive training set. 427 Do profile predictions on positive training set.
336 428
337 """ 429 """
338 print("Starting profile predictions on positive training set (-action predict_profile) ... ") 430 print("Starting profile predictions on positive training set "
339 check_cmd = "GraphProt.pl -action predict_profile -prefix " + args.data_id + " -fasta " + pos_train_fa + " " + param_string + " -model " + model_file 431 "(-action predict_profile) ... ")
432 check_cmd = "GraphProt.pl -action predict_profile -prefix " \
433 + args.data_id + " -fasta " + pos_train_fa + " " \
434 + param_string + " -model " + model_file
340 print(check_cmd) 435 print(check_cmd)
341 output = subprocess.getoutput(check_cmd) 436 output = subprocess.getoutput(check_cmd)
342 assert output, "The following call of GraphProt.pl produced no output:\n%s" %(check_cmd) 437 assert output, \
438 "The following call of GraphProt.pl produced no output:\n%s" \
439 % (check_cmd)
343 if args.gp_output: 440 if args.gp_output:
344 print(output) 441 print(output)
345 profile_predictions_file = args.data_id + ".profile" 442 profile_predictions_file = args.data_id + ".profile"
346 assert os.path.exists(profile_predictions_file), "Profile prediction output .profile file \"%s\" not found" %(profile_predictions_file) 443 assert os.path.exists(profile_predictions_file), \
444 "Profile prediction output .profile file \"%s\" not found" \
445 % (profile_predictions_file)
347 446
348 """ 447 """
349 Get 50 % score (median) for .predictions and .profile file. 448 Get 50 % score (median) for .predictions and .profile file.
350 For .profile, first extract for each site the maximum score, and then 449 For .profile, first extract for each site the maximum score, and then
351 from the list of maximum site scores get the median. 450 from the list of maximum site scores get the median.
352 For whole site .predictions, get the median from the site scores list. 451 For whole site .predictions, get the median from the site scores list.
353 452
354 """ 453 """
355 print("Getting .profile and .predictions median scores ... ") 454 print("Getting .profile and .predictions median scores ... ")
356 455
357 # Whole site scores median. 456 # Whole site scores median.
358 ws_pred_median = gplib.graphprot_predictions_get_median(ws_predictions_file) 457 ws_pred_median = \
458 gplib.graphprot_predictions_get_median(ws_predictions_file)
359 # Profile top site scores median. 459 # Profile top site scores median.
360 profile_median = gplib.graphprot_profile_get_top_scores_median(profile_predictions_file, 460 profile_median = \
361 profile_type="profile") 461 gplib.graphprot_profile_get_tsm(profile_predictions_file,
362 ws_pred_string = "pos_train_ws_pred_median: %f" %(ws_pred_median) 462 profile_type="profile")
363 profile_string = "pos_train_profile_median: %f" %(profile_median) 463 ws_pred_string = "pos_train_ws_pred_median: %f" % (ws_pred_median)
464 profile_string = "pos_train_profile_median: %f" % (profile_median)
364 gplib.echo_add_to_file(ws_pred_string, params_file) 465 gplib.echo_add_to_file(ws_pred_string, params_file)
365 gplib.echo_add_to_file(profile_string, params_file) 466 gplib.echo_add_to_file(profile_string, params_file)
366 # Average profile top site scores median for extlr 1 to 10. 467 # Average profile top site scores median for extlr 1 to 10.
367 for i in range(10): 468 for i in range(10):
368 i += 1 469 i += 1
369 avg_profile_median = gplib.graphprot_profile_get_top_scores_median(profile_predictions_file, 470 avg_profile_median = \
370 profile_type="avg_profile", 471 gplib.graphprot_profile_get_tsm(profile_predictions_file,
371 avg_profile_extlr=i) 472 profile_type="avg_profile",
372 473 avg_profile_extlr=i)
373 avg_profile_string = "pos_train_avg_profile_median_%i: %f" %(i, avg_profile_median) 474
475 avg_profile_string = "pos_train_avg_profile_median_%i: %f" \
476 % (i, avg_profile_median)
374 gplib.echo_add_to_file(avg_profile_string, params_file) 477 gplib.echo_add_to_file(avg_profile_string, params_file)
375 478
376 print("Script: I'm done.") 479 print("Script: I'm done.")
377 print("Author: Good. Now go back to your file system directory.") 480 print("Author: Good. Now go back to your file system directory.")
378 print("Script: Ok.") 481 print("Script: Ok.")
379
380
381 """
382
383 OLD CODE ...
384
385 p.add_argument("--ap-extlr",
386 dest="ap_extlr",
387 type = int,
388 default = 5,
389 help = "Define average profile up- and downstream extension for averaging scores to produce the average profile. This is used to get the median average profile score, which will be stored in the .params file to later be used in a prediction setting as a second filter value to get more confident peak regions. NOTE that you have to use the same value in model training and prediction! (default: 5)")
390
391
392 p.add_argument("--disable-opt",
393 dest = "disable_opt",
394 default = False,
395 action = "store_true",
396 help = "Disable hyperparameter optimization (HPO) (default: optimize hyperparameters)")
397 p.add_argument("--R",
398 dest = "param_r",
399 type = int,
400 default = False,
401 help = "GraphProt model R parameter (default: determined by HPO)")
402 p.add_argument("--D",
403 dest = "param_d",
404 type = int,
405 default = False,
406 help = "GraphProt model D parameter (default: determined by HPO)")
407 p.add_argument("--epochs",
408 dest = "param_epochs",
409 type = int,
410 default = False,
411 help = "GraphProt model epochs parameter (default: determined by HPO)")
412 p.add_argument("--lambda",
413 dest = "param_lambda",
414 type = float,
415 default = False,
416 help = "GraphProt model lambda parameter (default: determined by HPO)")
417 p.add_argument("--bitsize",
418 dest = "param_bitsize",
419 type = int,
420 default = False,
421 help = "GraphProt model bitsize parameter (default: determined by HPO)")
422 p.add_argument("--abstraction",
423 dest = "param_abstraction",
424 type = int,
425 default = False,
426 help = "GraphProt model RNAshapes abstraction level parameter for training structure models (default: determined by HPO)")
427
428 """
429
430
431