create_tool_recommendation_model: main.py comparison

comparison main.py @ 2:76251d1ccdcc draft

"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 6fa2a0294d615c9f267b766337dca0b2d3637219"

author	bgruening
date	Fri, 11 Oct 2019 18:24:54 -0400
parents	12764915e1c5
children	5b3c08710e47

comparison

equal deleted inserted replaced

-:12764915e1c5
+:76251d1ccdcc
 import numpy as np
 import argparse
 import time
 # machine learning library
+import tensorflow as tf
+from keras import backend as K
 import keras.callbacks as callbacks
 import extract_workflow_connections
 import prepare_data
 import optimise_hyperparameters
 class PredictTool:
 @classmethod
-def __init__(self):
+def __init__(self, num_cpus):
 """ Init method. """
+# set the number of cpus
+cpu_config = tf.ConfigProto(
+device_count={"CPU": num_cpus},
+intra_op_parallelism_threads=num_cpus,
+inter_op_parallelism_threads=num_cpus,
+allow_soft_placement=True
+)
+K.set_session(tf.Session(config=cpu_config))
 @classmethod
 def find_train_best_network(self, network_config, reverse_dictionary, train_data, train_labels, test_data, test_labels, n_epochs, class_weights, usage_pred, compatible_next_tools):
 """
 Define recurrent neural network and train sequential data
 """
 print("Start hyperparameter optimisation...")
 hyper_opt = optimise_hyperparameters.HyperparameterOptimisation()
-best_params = hyper_opt.train_model(network_config, reverse_dictionary, train_data, train_labels, test_data, test_labels, class_weights)
+best_params, best_model = hyper_opt.train_model(network_config, reverse_dictionary, train_data, train_labels, class_weights)
-# retrieve the model and train on complete dataset without validation set
-model, best_params = utils.set_recurrent_network(best_params, reverse_dictionary, class_weights)
 # define callbacks
+early_stopping = callbacks.EarlyStopping(monitor='loss', mode='min', verbose=1, min_delta=1e-4, restore_best_weights=True)
 predict_callback_test = PredictCallback(test_data, test_labels, reverse_dictionary, n_epochs, compatible_next_tools, usage_pred)
-# tensor_board = callbacks.TensorBoard(log_dir=log_directory, histogram_freq=0, write_graph=True, write_images=True)
-callbacks_list = [predict_callback_test]
+callbacks_list = [predict_callback_test, early_stopping]
 print("Start training on the best model...")
-model_fit = model.fit(
+train_performance = dict()
-train_data,
-train_labels,
-batch_size=int(best_params["batch_size"]),
-epochs=n_epochs,
-verbose=2,
-callbacks=callbacks_list,
-shuffle="batch",
-validation_data=(test_data, test_labels)
-)
-train_performance = {
-"train_loss": np.array(model_fit.history["loss"]),
-"model": model,
-"best_parameters": best_params
-}
-# if there is test data, add more information
 if len(test_data) > 0:
-train_performance["validation_loss"] = np.array(model_fit.history["val_loss"])
+trained_model = best_model.fit(
+train_data,
+train_labels,
+batch_size=int(best_params["batch_size"]),
+epochs=n_epochs,
+verbose=2,
+callbacks=callbacks_list,
+shuffle="batch",
+validation_data=(test_data, test_labels)
+)
+train_performance["validation_loss"] = np.array(trained_model.history["val_loss"])
 train_performance["precision"] = predict_callback_test.precision
 train_performance["usage_weights"] = predict_callback_test.usage_weights
+else:
+trained_model = best_model.fit(
+train_data,
+train_labels,
+batch_size=int(best_params["batch_size"]),
+epochs=n_epochs,
+verbose=2,
+callbacks=callbacks_list,
+shuffle="batch"
+)
+train_performance["train_loss"] = np.array(trained_model.history["loss"])
+train_performance["model"] = best_model
+train_performance["best_parameters"] = best_params
 return train_performance
 class PredictCallback(callbacks.Callback):
 def __init__(self, test_data, test_labels, reverse_data_dictionary, n_epochs, next_compatible_tools, usg_scores):
 print("Epoch %d usage weights: %s" % (epoch + 1, usage_weights))
 if __name__ == "__main__":
 start_time = time.time()
 arg_parser = argparse.ArgumentParser()
 arg_parser.add_argument("-wf", "--workflow_file", required=True, help="workflows tabular file")
 arg_parser.add_argument("-tu", "--tool_usage_file", required=True, help="tool usage file")
 arg_parser.add_argument("-om", "--output_model", required=True, help="trained model file")
 # data parameters
 arg_parser.add_argument("-sd", "--spatial_dropout", required=True, help="1d dropout used for embedding layer")
 arg_parser.add_argument("-rd", "--recurrent_dropout", required=True, help="dropout for the recurrent layers")
 arg_parser.add_argument("-lr", "--learning_rate", required=True, help="learning rate")
 arg_parser.add_argument("-ar", "--activation_recurrent", required=True, help="activation function for recurrent layers")
 arg_parser.add_argument("-ao", "--activation_output", required=True, help="activation function for output layers")
 # get argument values
 args = vars(arg_parser.parse_args())
 tool_usage_path = args["tool_usage_file"]
 workflows_path = args["workflow_file"]
 cutoff_date = args["cutoff_date"]
 spatial_dropout = args["spatial_dropout"]
 recurrent_dropout = args["recurrent_dropout"]
 learning_rate = args["learning_rate"]
 activation_recurrent = args["activation_recurrent"]
 activation_output = args["activation_output"]
+num_cpus = 16
 config = {
 'cutoff_date': cutoff_date,
 'maximum_path_length': maximum_path_length,
 'n_epochs': n_epochs,
 # Process the paths from workflows
 print("Dividing data...")
 data = prepare_data.PrepareData(maximum_path_length, test_share)
 train_data, train_labels, test_data, test_labels, data_dictionary, reverse_dictionary, class_weights, usage_pred = data.get_data_labels_matrices(workflow_paths, tool_usage_path, cutoff_date, compatible_next_tools)
 # find the best model and start training
-predict_tool = PredictTool()
+predict_tool = PredictTool(num_cpus)
 # start training with weighted classes
 print("Training with weighted classes and samples ...")
 results_weighted = predict_tool.find_train_best_network(config, reverse_dictionary, train_data, train_labels, test_data, test_labels, n_epochs, class_weights, usage_pred, compatible_next_tools)
 print()
 print("Best parameters \n")

Mercurial > repos > bgruening > create_tool_recommendation_model

comparison main.py @ 2:76251d1ccdcc draft