diff main.py @ 5:4f7e6612906b draft
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 5eebc0cb44e71f581d548b7e842002705dd155eb"
author    bgruening
date      Fri, 06 May 2022 09:05:18 +0000
parents   afec8c595124
children  e94dc7945639
--- a/main.py	Tue Jul 07 03:25:49 2020 -0400
+++ b/main.py	Fri May 06 09:05:18 2022 +0000
@@ -3,35 +3,36 @@
 using machine learning (recurrent neural network)
 """
 
-import numpy as np
 import argparse
 import time
 
-# machine learning library
-import tensorflow as tf
-from keras import backend as K
+import extract_workflow_connections
 import keras.callbacks as callbacks
-
-import extract_workflow_connections
+import numpy as np
+import optimise_hyperparameters
 import prepare_data
-import optimise_hyperparameters
 import utils
 
 
 class PredictTool:
-
     def __init__(self, num_cpus):
         """
        Init method.
         """
-        # set the number of cpus
-        cpu_config = tf.ConfigProto(
-            device_count={"CPU": num_cpus},
-            intra_op_parallelism_threads=num_cpus,
-            inter_op_parallelism_threads=num_cpus,
-            allow_soft_placement=True
-        )
-        K.set_session(tf.Session(config=cpu_config))
 
-    def find_train_best_network(self, network_config, reverse_dictionary, train_data, train_labels, test_data, test_labels, n_epochs, class_weights, usage_pred, standard_connections, tool_freq, tool_tr_samples):
+    def find_train_best_network(
+        self,
+        network_config,
+        reverse_dictionary,
+        train_data,
+        train_labels,
+        test_data,
+        test_labels,
+        n_epochs,
+        class_weights,
+        usage_pred,
+        standard_connections,
+        tool_freq,
+        tool_tr_samples,
+    ):
         """
         Define recurrent neural network and train sequential data
         """
@@ -40,11 +41,34 @@
         print("Start hyperparameter optimisation...")
         hyper_opt = optimise_hyperparameters.HyperparameterOptimisation()
-        best_params, best_model = hyper_opt.train_model(network_config, reverse_dictionary, train_data, train_labels, test_data, test_labels, tool_tr_samples, class_weights)
+        best_params, best_model = hyper_opt.train_model(
+            network_config,
+            reverse_dictionary,
+            train_data,
+            train_labels,
+            test_data,
+            test_labels,
+            tool_tr_samples,
+            class_weights,
+        )
 
         # define callbacks
-        early_stopping = callbacks.EarlyStopping(monitor='loss', mode='min', verbose=1, min_delta=1e-1, restore_best_weights=True)
-        predict_callback_test = PredictCallback(test_data, test_labels, reverse_dictionary, n_epochs, usage_pred, standard_connections, lowest_tool_ids)
+        early_stopping = callbacks.EarlyStopping(
+            monitor="loss",
+            mode="min",
+            verbose=1,
+            min_delta=1e-1,
+            restore_best_weights=True,
+        )
+        predict_callback_test = PredictCallback(
+            test_data,
+            test_labels,
+            reverse_dictionary,
+            n_epochs,
+            usage_pred,
+            standard_connections,
+            lowest_tool_ids,
+        )
 
         callbacks_list = [predict_callback_test, early_stopping]
         batch_size = int(best_params["batch_size"])
@@ -57,21 +81,29 @@
                 train_labels,
                 batch_size,
                 tool_tr_samples,
-                reverse_dictionary
+                reverse_dictionary,
             ),
             steps_per_epoch=len(train_data) // batch_size,
             epochs=n_epochs,
             callbacks=callbacks_list,
             validation_data=(test_data, test_labels),
             verbose=2,
-            shuffle=True
+            shuffle=True,
         )
-        train_performance["validation_loss"] = np.array(trained_model.history["val_loss"])
+        train_performance["validation_loss"] = np.array(
+            trained_model.history["val_loss"]
+        )
         train_performance["precision"] = predict_callback_test.precision
         train_performance["usage_weights"] = predict_callback_test.usage_weights
-        train_performance["published_precision"] = predict_callback_test.published_precision
-        train_performance["lowest_pub_precision"] = predict_callback_test.lowest_pub_precision
-        train_performance["lowest_norm_precision"] = predict_callback_test.lowest_norm_precision
+        train_performance[
+            "published_precision"
+        ] = predict_callback_test.published_precision
+        train_performance[
+            "lowest_pub_precision"
+        ] = predict_callback_test.lowest_pub_precision
+        train_performance[
+            "lowest_norm_precision"
+        ] = predict_callback_test.lowest_norm_precision
         train_performance["train_loss"] = np.array(trained_model.history["loss"])
         train_performance["model"] = best_model
         train_performance["best_parameters"] = best_params
@@ -79,7 +111,16 @@
 
 
 class PredictCallback(callbacks.Callback):
-    def __init__(self, test_data, test_labels, reverse_data_dictionary, n_epochs, usg_scores, standard_connections, lowest_tool_ids):
+    def __init__(
+        self,
+        test_data,
+        test_labels,
+        reverse_data_dictionary,
+        n_epochs,
+        usg_scores,
+        standard_connections,
+        lowest_tool_ids,
+    ):
         self.test_data = test_data
         self.test_labels = test_labels
         self.reverse_data_dictionary = reverse_data_dictionary
@@ -98,7 +139,22 @@
         Compute absolute and compatible precision for test data
         """
         if len(self.test_data) > 0:
-            usage_weights, precision, precision_pub, low_pub_prec, low_norm_prec, low_num = utils.verify_model(self.model, self.test_data, self.test_labels, self.reverse_data_dictionary, self.pred_usage_scores, self.standard_connections, self.lowest_tool_ids)
+            (
+                usage_weights,
+                precision,
+                precision_pub,
+                low_pub_prec,
+                low_norm_prec,
+                low_num,
+            ) = utils.verify_model(
+                self.model,
+                self.test_data,
+                self.test_labels,
+                self.reverse_data_dictionary,
+                self.pred_usage_scores,
+                self.standard_connections,
+                self.lowest_tool_ids,
+            )
             self.precision.append(precision)
             self.usage_weights.append(usage_weights)
             self.published_precision.append(precision_pub)
@@ -109,31 +165,96 @@
             print("Epoch %d published precision: %s" % (epoch + 1, precision_pub))
             print("Epoch %d lowest published precision: %s" % (epoch + 1, low_pub_prec))
             print("Epoch %d lowest normal precision: %s" % (epoch + 1, low_norm_prec))
-            print("Epoch %d number of test samples with lowest tool ids: %s" % (epoch + 1, low_num))
+            print(
+                "Epoch %d number of test samples with lowest tool ids: %s"
+                % (epoch + 1, low_num)
+            )
 
 
 if __name__ == "__main__":
     start_time = time.time()
     arg_parser = argparse.ArgumentParser()
-    arg_parser.add_argument("-wf", "--workflow_file", required=True, help="workflows tabular file")
-    arg_parser.add_argument("-tu", "--tool_usage_file", required=True, help="tool usage file")
-    arg_parser.add_argument("-om", "--output_model", required=True, help="trained model file")
+    arg_parser.add_argument(
+        "-wf", "--workflow_file", required=True, help="workflows tabular file"
+    )
+    arg_parser.add_argument(
+        "-tu", "--tool_usage_file", required=True, help="tool usage file"
+    )
+    arg_parser.add_argument(
+        "-om", "--output_model", required=True, help="trained model file"
+    )
     # data parameters
-    arg_parser.add_argument("-cd", "--cutoff_date", required=True, help="earliest date for taking tool usage")
-    arg_parser.add_argument("-pl", "--maximum_path_length", required=True, help="maximum length of tool path")
-    arg_parser.add_argument("-ep", "--n_epochs", required=True, help="number of iterations to run to create model")
-    arg_parser.add_argument("-oe", "--optimize_n_epochs", required=True, help="number of iterations to run to find best model parameters")
-    arg_parser.add_argument("-me", "--max_evals", required=True, help="maximum number of configuration evaluations")
-    arg_parser.add_argument("-ts", "--test_share", required=True, help="share of data to be used for testing")
+    arg_parser.add_argument(
+        "-cd",
+        "--cutoff_date",
+        required=True,
+        help="earliest date for taking tool usage",
+    )
+    arg_parser.add_argument(
+        "-pl",
+        "--maximum_path_length",
+        required=True,
+        help="maximum length of tool path",
+    )
+    arg_parser.add_argument(
+        "-ep",
+        "--n_epochs",
+        required=True,
+        help="number of iterations to run to create model",
+    )
+    arg_parser.add_argument(
+        "-oe",
+        "--optimize_n_epochs",
+        required=True,
+        help="number of iterations to run to find best model parameters",
+    )
+    arg_parser.add_argument(
+        "-me",
+        "--max_evals",
+        required=True,
+        help="maximum number of configuration evaluations",
+    )
+    arg_parser.add_argument(
+        "-ts",
+        "--test_share",
+        required=True,
+        help="share of data to be used for testing",
+    )
     # neural network parameters
-    arg_parser.add_argument("-bs", "--batch_size", required=True, help="size of the tranining batch i.e. the number of samples per batch")
-    arg_parser.add_argument("-ut", "--units", required=True, help="number of hidden recurrent units")
-    arg_parser.add_argument("-es", "--embedding_size", required=True, help="size of the fixed vector learned for each tool")
-    arg_parser.add_argument("-dt", "--dropout", required=True, help="percentage of neurons to be dropped")
-    arg_parser.add_argument("-sd", "--spatial_dropout", required=True, help="1d dropout used for embedding layer")
-    arg_parser.add_argument("-rd", "--recurrent_dropout", required=True, help="dropout for the recurrent layers")
-    arg_parser.add_argument("-lr", "--learning_rate", required=True, help="learning rate")
+    arg_parser.add_argument(
+        "-bs",
+        "--batch_size",
+        required=True,
+        help="size of the tranining batch i.e. the number of samples per batch",
+    )
+    arg_parser.add_argument(
+        "-ut", "--units", required=True, help="number of hidden recurrent units"
+    )
+    arg_parser.add_argument(
+        "-es",
+        "--embedding_size",
+        required=True,
+        help="size of the fixed vector learned for each tool",
+    )
+    arg_parser.add_argument(
+        "-dt", "--dropout", required=True, help="percentage of neurons to be dropped"
+    )
+    arg_parser.add_argument(
+        "-sd",
+        "--spatial_dropout",
+        required=True,
+        help="1d dropout used for embedding layer",
+    )
+    arg_parser.add_argument(
+        "-rd",
+        "--recurrent_dropout",
+        required=True,
+        help="dropout for the recurrent layers",
+    )
+    arg_parser.add_argument(
+        "-lr", "--learning_rate", required=True, help="learning rate"
+    )
 
     # get argument values
     args = vars(arg_parser.parse_args())
@@ -156,33 +277,74 @@
     num_cpus = 16
 
     config = {
-        'cutoff_date': cutoff_date,
-        'maximum_path_length': maximum_path_length,
-        'n_epochs': n_epochs,
-        'optimize_n_epochs': optimize_n_epochs,
-        'max_evals': max_evals,
-        'test_share': test_share,
-        'batch_size': batch_size,
-        'units': units,
-        'embedding_size': embedding_size,
-        'dropout': dropout,
-        'spatial_dropout': spatial_dropout,
-        'recurrent_dropout': recurrent_dropout,
-        'learning_rate': learning_rate
+        "cutoff_date": cutoff_date,
+        "maximum_path_length": maximum_path_length,
+        "n_epochs": n_epochs,
+        "optimize_n_epochs": optimize_n_epochs,
+        "max_evals": max_evals,
+        "test_share": test_share,
+        "batch_size": batch_size,
+        "units": units,
+        "embedding_size": embedding_size,
+        "dropout": dropout,
+        "spatial_dropout": spatial_dropout,
+        "recurrent_dropout": recurrent_dropout,
+        "learning_rate": learning_rate,
     }
 
     # Extract and process workflows
     connections = extract_workflow_connections.ExtractWorkflowConnections()
-    workflow_paths, compatible_next_tools, standard_connections = connections.read_tabular_file(workflows_path)
+    (
+        workflow_paths,
+        compatible_next_tools,
+        standard_connections,
+    ) = connections.read_tabular_file(workflows_path)
     # Process the paths from workflows
     print("Dividing data...")
     data = prepare_data.PrepareData(maximum_path_length, test_share)
-    train_data, train_labels, test_data, test_labels, data_dictionary, reverse_dictionary, class_weights, usage_pred, train_tool_freq, tool_tr_samples = data.get_data_labels_matrices(workflow_paths, tool_usage_path, cutoff_date, compatible_next_tools, standard_connections)
+    (
+        train_data,
+        train_labels,
+        test_data,
+        test_labels,
+        data_dictionary,
+        reverse_dictionary,
+        class_weights,
+        usage_pred,
+        train_tool_freq,
+        tool_tr_samples,
+    ) = data.get_data_labels_matrices(
+        workflow_paths,
+        tool_usage_path,
+        cutoff_date,
+        compatible_next_tools,
+        standard_connections,
+    )
    # find the best model and start training
     predict_tool = PredictTool(num_cpus)
     # start training with weighted classes
     print("Training with weighted classes and samples ...")
-    results_weighted = predict_tool.find_train_best_network(config, reverse_dictionary, train_data, train_labels, test_data, test_labels, n_epochs, class_weights, usage_pred, standard_connections, train_tool_freq, tool_tr_samples)
-    utils.save_model(results_weighted, data_dictionary, compatible_next_tools, trained_model_path, class_weights, standard_connections)
+    results_weighted = predict_tool.find_train_best_network(
+        config,
+        reverse_dictionary,
+        train_data,
+        train_labels,
+        test_data,
+        test_labels,
+        n_epochs,
+        class_weights,
+        usage_pred,
+        standard_connections,
+        train_tool_freq,
+        tool_tr_samples,
+    )
+    utils.save_model(
+        results_weighted,
+        data_dictionary,
+        compatible_next_tools,
+        trained_model_path,
+        class_weights,
+        standard_connections,
+    )
     end_time = time.time()
     print("Program finished in %s seconds" % str(end_time - start_time))
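
Note that the diff deletes the TF1-era CPU pinning (tf.ConfigProto plus K.set_session) without adding a replacement, so the num_cpus value that PredictTool still receives no longer has any effect on threading. If thread limiting were still wanted after the Keras migration, a minimal sketch (an assumption, not part of this commit) could use TensorFlow 2's tf.config.threading API:

import tensorflow as tf


def limit_cpu_threads(num_cpus):
    # Hypothetical helper, not in the repository: a TF2 replacement for the
    # removed tf.ConfigProto / K.set_session block. Must run before
    # TensorFlow initializes its runtime context.
    tf.config.threading.set_intra_op_parallelism_threads(num_cpus)
    tf.config.threading.set_inter_op_parallelism_threads(num_cpus)
    # mirrors allow_soft_placement=True from the deleted config
    tf.config.set_soft_device_placement(True)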
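
For reference, every flag registered in the argparse block above is required. A hypothetical invocation might look like the sketch below; all file names and hyperparameter values are illustrative placeholders, not values from this repository:

import subprocess

# Placeholder paths and hyperparameter values for illustration only.
subprocess.run(
    [
        "python", "main.py",
        "-wf", "workflow-connections.tsv",  # workflows tabular file
        "-tu", "tool-usage.tsv",  # tool usage file
        "-om", "trained_model.hdf5",  # output model file
        "-cd", "2017-12-01",  # cutoff date for tool usage
        "-pl", "25",  # maximum path length
        "-ep", "10",  # training epochs
        "-oe", "5",  # epochs per hyperparameter evaluation
        "-me", "20",  # maximum configuration evaluations
        "-ts", "0.2",  # test share
        "-bs", "128",  # batch size
        "-ut", "128",  # recurrent units
        "-es", "128",  # embedding size
        "-dt", "0.2",  # dropout
        "-sd", "0.2",  # spatial dropout
        "-rd", "0.2",  # recurrent dropout
        "-lr", "0.001",  # learning rate
    ],
    check=True,
)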