comparison main.py @ 2:76251d1ccdcc draft

"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 6fa2a0294d615c9f267b766337dca0b2d3637219"
author bgruening
date Fri, 11 Oct 2019 18:24:54 -0400
parents 12764915e1c5
children 5b3c08710e47
comparison
equal deleted inserted replaced
1:12764915e1c5 2:76251d1ccdcc
6 import numpy as np 6 import numpy as np
7 import argparse 7 import argparse
8 import time 8 import time
9 9
10 # machine learning library 10 # machine learning library
11 import tensorflow as tf
12 from keras import backend as K
11 import keras.callbacks as callbacks 13 import keras.callbacks as callbacks
12 14
13 import extract_workflow_connections 15 import extract_workflow_connections
14 import prepare_data 16 import prepare_data
15 import optimise_hyperparameters 17 import optimise_hyperparameters
17 19
18 20
19 class PredictTool: 21 class PredictTool:
20 22
21 @classmethod 23 @classmethod
22 def __init__(self): 24 def __init__(self, num_cpus):
23 """ Init method. """ 25 """ Init method. """
26 # set the number of cpus
27 cpu_config = tf.ConfigProto(
28 device_count={"CPU": num_cpus},
29 intra_op_parallelism_threads=num_cpus,
30 inter_op_parallelism_threads=num_cpus,
31 allow_soft_placement=True
32 )
33 K.set_session(tf.Session(config=cpu_config))
24 34
25 @classmethod 35 @classmethod
26 def find_train_best_network(self, network_config, reverse_dictionary, train_data, train_labels, test_data, test_labels, n_epochs, class_weights, usage_pred, compatible_next_tools): 36 def find_train_best_network(self, network_config, reverse_dictionary, train_data, train_labels, test_data, test_labels, n_epochs, class_weights, usage_pred, compatible_next_tools):
27 """ 37 """
28 Define recurrent neural network and train sequential data 38 Define recurrent neural network and train sequential data
29 """ 39 """
30 print("Start hyperparameter optimisation...") 40 print("Start hyperparameter optimisation...")
31 hyper_opt = optimise_hyperparameters.HyperparameterOptimisation() 41 hyper_opt = optimise_hyperparameters.HyperparameterOptimisation()
32 best_params = hyper_opt.train_model(network_config, reverse_dictionary, train_data, train_labels, test_data, test_labels, class_weights) 42 best_params, best_model = hyper_opt.train_model(network_config, reverse_dictionary, train_data, train_labels, class_weights)
33
34 # retrieve the model and train on complete dataset without validation set
35 model, best_params = utils.set_recurrent_network(best_params, reverse_dictionary, class_weights)
36 43
37 # define callbacks 44 # define callbacks
45 early_stopping = callbacks.EarlyStopping(monitor='loss', mode='min', verbose=1, min_delta=1e-4, restore_best_weights=True)
38 predict_callback_test = PredictCallback(test_data, test_labels, reverse_dictionary, n_epochs, compatible_next_tools, usage_pred) 46 predict_callback_test = PredictCallback(test_data, test_labels, reverse_dictionary, n_epochs, compatible_next_tools, usage_pred)
39 # tensor_board = callbacks.TensorBoard(log_dir=log_directory, histogram_freq=0, write_graph=True, write_images=True) 47
40 callbacks_list = [predict_callback_test] 48 callbacks_list = [predict_callback_test, early_stopping]
41 49
42 print("Start training on the best model...") 50 print("Start training on the best model...")
43 model_fit = model.fit( 51 train_performance = dict()
44 train_data,
45 train_labels,
46 batch_size=int(best_params["batch_size"]),
47 epochs=n_epochs,
48 verbose=2,
49 callbacks=callbacks_list,
50 shuffle="batch",
51 validation_data=(test_data, test_labels)
52 )
53
54 train_performance = {
55 "train_loss": np.array(model_fit.history["loss"]),
56 "model": model,
57 "best_parameters": best_params
58 }
59
60 # if there is test data, add more information
61 if len(test_data) > 0: 52 if len(test_data) > 0:
62 train_performance["validation_loss"] = np.array(model_fit.history["val_loss"]) 53 trained_model = best_model.fit(
54 train_data,
55 train_labels,
56 batch_size=int(best_params["batch_size"]),
57 epochs=n_epochs,
58 verbose=2,
59 callbacks=callbacks_list,
60 shuffle="batch",
61 validation_data=(test_data, test_labels)
62 )
63 train_performance["validation_loss"] = np.array(trained_model.history["val_loss"])
63 train_performance["precision"] = predict_callback_test.precision 64 train_performance["precision"] = predict_callback_test.precision
64 train_performance["usage_weights"] = predict_callback_test.usage_weights 65 train_performance["usage_weights"] = predict_callback_test.usage_weights
66 else:
67 trained_model = best_model.fit(
68 train_data,
69 train_labels,
70 batch_size=int(best_params["batch_size"]),
71 epochs=n_epochs,
72 verbose=2,
73 callbacks=callbacks_list,
74 shuffle="batch"
75 )
76 train_performance["train_loss"] = np.array(trained_model.history["loss"])
77 train_performance["model"] = best_model
78 train_performance["best_parameters"] = best_params
65 return train_performance 79 return train_performance
66 80
67 81
68 class PredictCallback(callbacks.Callback): 82 class PredictCallback(callbacks.Callback):
69 def __init__(self, test_data, test_labels, reverse_data_dictionary, n_epochs, next_compatible_tools, usg_scores): 83 def __init__(self, test_data, test_labels, reverse_data_dictionary, n_epochs, next_compatible_tools, usg_scores):
88 print("Epoch %d usage weights: %s" % (epoch + 1, usage_weights)) 102 print("Epoch %d usage weights: %s" % (epoch + 1, usage_weights))
89 103
90 104
91 if __name__ == "__main__": 105 if __name__ == "__main__":
92 start_time = time.time() 106 start_time = time.time()
107
93 arg_parser = argparse.ArgumentParser() 108 arg_parser = argparse.ArgumentParser()
94 arg_parser.add_argument("-wf", "--workflow_file", required=True, help="workflows tabular file") 109 arg_parser.add_argument("-wf", "--workflow_file", required=True, help="workflows tabular file")
95 arg_parser.add_argument("-tu", "--tool_usage_file", required=True, help="tool usage file") 110 arg_parser.add_argument("-tu", "--tool_usage_file", required=True, help="tool usage file")
96 arg_parser.add_argument("-om", "--output_model", required=True, help="trained model file") 111 arg_parser.add_argument("-om", "--output_model", required=True, help="trained model file")
97 # data parameters 112 # data parameters
110 arg_parser.add_argument("-sd", "--spatial_dropout", required=True, help="1d dropout used for embedding layer") 125 arg_parser.add_argument("-sd", "--spatial_dropout", required=True, help="1d dropout used for embedding layer")
111 arg_parser.add_argument("-rd", "--recurrent_dropout", required=True, help="dropout for the recurrent layers") 126 arg_parser.add_argument("-rd", "--recurrent_dropout", required=True, help="dropout for the recurrent layers")
112 arg_parser.add_argument("-lr", "--learning_rate", required=True, help="learning rate") 127 arg_parser.add_argument("-lr", "--learning_rate", required=True, help="learning rate")
113 arg_parser.add_argument("-ar", "--activation_recurrent", required=True, help="activation function for recurrent layers") 128 arg_parser.add_argument("-ar", "--activation_recurrent", required=True, help="activation function for recurrent layers")
114 arg_parser.add_argument("-ao", "--activation_output", required=True, help="activation function for output layers") 129 arg_parser.add_argument("-ao", "--activation_output", required=True, help="activation function for output layers")
130
115 # get argument values 131 # get argument values
116 args = vars(arg_parser.parse_args()) 132 args = vars(arg_parser.parse_args())
117 tool_usage_path = args["tool_usage_file"] 133 tool_usage_path = args["tool_usage_file"]
118 workflows_path = args["workflow_file"] 134 workflows_path = args["workflow_file"]
119 cutoff_date = args["cutoff_date"] 135 cutoff_date = args["cutoff_date"]
131 spatial_dropout = args["spatial_dropout"] 147 spatial_dropout = args["spatial_dropout"]
132 recurrent_dropout = args["recurrent_dropout"] 148 recurrent_dropout = args["recurrent_dropout"]
133 learning_rate = args["learning_rate"] 149 learning_rate = args["learning_rate"]
134 activation_recurrent = args["activation_recurrent"] 150 activation_recurrent = args["activation_recurrent"]
135 activation_output = args["activation_output"] 151 activation_output = args["activation_output"]
152 num_cpus = 16
136 153
137 config = { 154 config = {
138 'cutoff_date': cutoff_date, 155 'cutoff_date': cutoff_date,
139 'maximum_path_length': maximum_path_length, 156 'maximum_path_length': maximum_path_length,
140 'n_epochs': n_epochs, 157 'n_epochs': n_epochs,
159 # Process the paths from workflows 176 # Process the paths from workflows
160 print("Dividing data...") 177 print("Dividing data...")
161 data = prepare_data.PrepareData(maximum_path_length, test_share) 178 data = prepare_data.PrepareData(maximum_path_length, test_share)
162 train_data, train_labels, test_data, test_labels, data_dictionary, reverse_dictionary, class_weights, usage_pred = data.get_data_labels_matrices(workflow_paths, tool_usage_path, cutoff_date, compatible_next_tools) 179 train_data, train_labels, test_data, test_labels, data_dictionary, reverse_dictionary, class_weights, usage_pred = data.get_data_labels_matrices(workflow_paths, tool_usage_path, cutoff_date, compatible_next_tools)
163 # find the best model and start training 180 # find the best model and start training
164 predict_tool = PredictTool() 181 predict_tool = PredictTool(num_cpus)
165 # start training with weighted classes 182 # start training with weighted classes
166 print("Training with weighted classes and samples ...") 183 print("Training with weighted classes and samples ...")
167 results_weighted = predict_tool.find_train_best_network(config, reverse_dictionary, train_data, train_labels, test_data, test_labels, n_epochs, class_weights, usage_pred, compatible_next_tools) 184 results_weighted = predict_tool.find_train_best_network(config, reverse_dictionary, train_data, train_labels, test_data, test_labels, n_epochs, class_weights, usage_pred, compatible_next_tools)
168 print() 185 print()
169 print("Best parameters \n") 186 print("Best parameters \n")