# HG changeset patch
# User bgruening
# Date 1594106749 14400
# Node ID afec8c5951248a90495a79028fcff6f0efb61875
# Parent 5b3c08710e47183546a9311e13302c2b1d744e2f
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 65d36f271296a38deeceb0d0e8d471b2898ee8f4"

diff -r 5b3c08710e47 -r afec8c595124 create_tool_recommendation_model.xml
--- a/create_tool_recommendation_model.xml Sat May 09 05:38:23 2020 -0400
+++ b/create_tool_recommendation_model.xml Tue Jul 07 03:25:49 2020 -0400
@@ -1,4 +1,4 @@
-
+
 using deep learning
 python
diff -r 5b3c08710e47 -r afec8c595124 main.py
--- a/main.py Sat May 09 05:38:23 2020 -0400
+++ b/main.py Tue Jul 07 03:25:49 2020 -0400
@@ -31,23 +31,22 @@
         )
         K.set_session(tf.Session(config=cpu_config))

-    def find_train_best_network(self, network_config, reverse_dictionary, train_data, train_labels, test_data, test_labels, n_epochs, class_weights, usage_pred, standard_connections, l_tool_freq, l_tool_tr_samples):
+    def find_train_best_network(self, network_config, reverse_dictionary, train_data, train_labels, test_data, test_labels, n_epochs, class_weights, usage_pred, standard_connections, tool_freq, tool_tr_samples):
         """
         Define recurrent neural network and train sequential data
         """
         # get tools with lowest representation
-        lowest_tool_ids = utils.get_lowest_tools(l_tool_freq)
+        lowest_tool_ids = utils.get_lowest_tools(tool_freq)

         print("Start hyperparameter optimisation...")
         hyper_opt = optimise_hyperparameters.HyperparameterOptimisation()
-        best_params, best_model = hyper_opt.train_model(network_config, reverse_dictionary, train_data, train_labels, test_data, test_labels, l_tool_tr_samples, class_weights)
+        best_params, best_model = hyper_opt.train_model(network_config, reverse_dictionary, train_data, train_labels, test_data, test_labels, tool_tr_samples, class_weights)

         # define callbacks
         early_stopping = callbacks.EarlyStopping(monitor='loss', mode='min', verbose=1, min_delta=1e-1, restore_best_weights=True)
         predict_callback_test = PredictCallback(test_data, test_labels, reverse_dictionary, n_epochs, usage_pred, standard_connections, lowest_tool_ids)

         callbacks_list = [predict_callback_test, early_stopping]
-
         batch_size = int(best_params["batch_size"])

         print("Start training on the best model...")
@@ -57,7 +56,8 @@
                 train_data,
                 train_labels,
                 batch_size,
-                l_tool_tr_samples
+                tool_tr_samples,
+                reverse_dictionary
             ),
             steps_per_epoch=len(train_data) // batch_size,
             epochs=n_epochs,
@@ -177,13 +177,12 @@
     # Process the paths from workflows
     print("Dividing data...")
     data = prepare_data.PrepareData(maximum_path_length, test_share)
-    train_data, train_labels, test_data, test_labels, data_dictionary, reverse_dictionary, class_weights, usage_pred, l_tool_freq, l_tool_tr_samples = data.get_data_labels_matrices(workflow_paths, tool_usage_path, cutoff_date, compatible_next_tools, standard_connections)
+    train_data, train_labels, test_data, test_labels, data_dictionary, reverse_dictionary, class_weights, usage_pred, train_tool_freq, tool_tr_samples = data.get_data_labels_matrices(workflow_paths, tool_usage_path, cutoff_date, compatible_next_tools, standard_connections)
     # find the best model and start training
     predict_tool = PredictTool(num_cpus)
     # start training with weighted classes
     print("Training with weighted classes and samples ...")
-    results_weighted = predict_tool.find_train_best_network(config, reverse_dictionary, train_data, train_labels, test_data, test_labels, n_epochs, class_weights, usage_pred, standard_connections, l_tool_freq, l_tool_tr_samples)
+    results_weighted = predict_tool.find_train_best_network(config, reverse_dictionary, train_data, train_labels, test_data, test_labels, n_epochs, class_weights, usage_pred, standard_connections, train_tool_freq, tool_tr_samples)
     utils.save_model(results_weighted, data_dictionary, compatible_next_tools, trained_model_path, class_weights, standard_connections)
     end_time = time.time()
-    print()
     print("Program finished in %s seconds" % str(end_time - start_time))
diff -r 5b3c08710e47 -r afec8c595124 optimise_hyperparameters.py
--- a/optimise_hyperparameters.py Sat May 09 05:38:23 2020 -0400
+++ b/optimise_hyperparameters.py Tue Jul 07 03:25:49 2020 -0400
@@ -20,7 +20,7 @@
     def __init__(self):
         """ Init method. """

-    def train_model(self, config, reverse_dictionary, train_data, train_labels, test_data, test_labels, l_tool_tr_samples, class_weights):
+    def train_model(self, config, reverse_dictionary, train_data, train_labels, test_data, test_labels, tool_tr_samples, class_weights):
         """
         Train a model and report accuracy
         """
@@ -71,7 +71,8 @@
                     train_data,
                     train_labels,
                     batch_size,
-                    l_tool_tr_samples
+                    tool_tr_samples,
+                    reverse_dictionary
                 ),
                 steps_per_epoch=len(train_data) // batch_size,
                 epochs=optimize_n_epochs,
diff -r 5b3c08710e47 -r afec8c595124 prepare_data.py
--- a/prepare_data.py Sat May 09 05:38:23 2020 -0400
+++ b/prepare_data.py Tue Jul 07 03:25:49 2020 -0400
@@ -10,7 +10,6 @@
 import random

 import predict_tool_usage
-import utils

 main_path = os.getcwd()

@@ -211,16 +210,15 @@
         to estimate the frequency of tool sequences
         """
         last_tool_freq = dict()
-        inv_freq = dict()
+        freq_dict_names = dict()
         for path in train_paths:
             last_tool = path.split(",")[-1]
             if last_tool not in last_tool_freq:
                 last_tool_freq[last_tool] = 0
+                freq_dict_names[reverse_dictionary[int(last_tool)]] = 0
             last_tool_freq[last_tool] += 1
-        max_freq = max(last_tool_freq.values())
-        for t in last_tool_freq:
-            inv_freq[t] = int(np.round(max_freq / float(last_tool_freq[t]), 0))
-        return last_tool_freq, inv_freq
+            freq_dict_names[reverse_dictionary[int(last_tool)]] += 1
+        return last_tool_freq

     def get_toolid_samples(self, train_data, l_tool_freq):
         l_tool_tr_samples = dict()
@@ -254,9 +252,6 @@
         print("Complete data: %d" % len(multilabels_paths))
         train_paths_dict, test_paths_dict = self.split_test_train_data(multilabels_paths)

-        # get sample frequency
-        l_tool_freq, inv_last_tool_freq = self.get_train_last_tool_freq(train_paths_dict, rev_dict)
-
         print("Train data: %d" % len(train_paths_dict))
         print("Test data: %d" % len(test_paths_dict))

@@ -265,6 +260,8 @@
         test_data, test_labels = self.pad_paths(test_paths_dict, num_classes, standard_connections, rev_dict)
         train_data, train_labels = self.pad_paths(train_paths_dict, num_classes, standard_connections, rev_dict)

+        print("Estimating sample frequency...")
+        l_tool_freq = self.get_train_last_tool_freq(train_paths_dict, rev_dict)
         l_tool_tr_samples = self.get_toolid_samples(train_data, l_tool_freq)

         # Predict tools usage
diff -r 5b3c08710e47 -r afec8c595124 utils.py
--- a/utils.py Sat May 09 05:38:23 2020 -0400
+++ b/utils.py Tue Jul 07 03:25:49 2020 -0400
@@ -1,8 +1,8 @@
-import os
 import numpy as np
 import json
 import h5py
 import random
+from numpy.random import choice

 from keras import backend as K
@@ -54,7 +54,6 @@
     """
     weight_values = list(class_weights.values())
     weight_values.extend(weight_values)
-
     def weighted_binary_crossentropy(y_true, y_pred):
         # add another dimension to compute dot product
         expanded_weights = K.expand_dims(weight_values, axis=-1)
@@ -62,16 +61,17 @@
     return weighted_binary_crossentropy


-def balanced_sample_generator(train_data, train_labels, batch_size, l_tool_tr_samples):
+def balanced_sample_generator(train_data, train_labels, batch_size, l_tool_tr_samples, reverse_dictionary):
     while True:
         dimension = train_data.shape[1]
         n_classes = train_labels.shape[1]
         tool_ids = list(l_tool_tr_samples.keys())
+        random.shuffle(tool_ids)
         generator_batch_data = np.zeros([batch_size, dimension])
         generator_batch_labels = np.zeros([batch_size, n_classes])
+        generated_tool_ids = choice(tool_ids, batch_size)
         for i in range(batch_size):
-            random_toolid_index = random.sample(range(0, len(tool_ids)), 1)[0]
-            random_toolid = tool_ids[random_toolid_index]
+            random_toolid = generated_tool_ids[i]
             sample_indices = l_tool_tr_samples[str(random_toolid)]
             random_index = random.sample(range(0, len(sample_indices)), 1)[0]
             random_tr_index = sample_indices[random_index]
@@ -129,12 +129,20 @@
         pred_t_name = reverse_data_dictionary[int(standard_topk_prediction_pos)]
         if last_tool_name in standard_conn:
             pub_tools = standard_conn[last_tool_name]
-            if pred_t_name in pub_tools:
-                pub_precision = 1.0
-                if last_tool_id in lowest_tool_ids:
-                    lowest_pub_prec = 1.0
-                if standard_topk_prediction_pos in usage_scores:
-                    usage_wt_score.append(np.log(usage_scores[standard_topk_prediction_pos] + 1.0))
+            if pred_t_name in pub_tools:
+                pub_precision = 1.0
+                # count precision only when there is actually true published tools
+                if last_tool_id in lowest_tool_ids:
+                    lowest_pub_prec = 1.0
+                else:
+                    lowest_pub_prec = np.nan
+                if standard_topk_prediction_pos in usage_scores:
+                    usage_wt_score.append(np.log(usage_scores[standard_topk_prediction_pos] + 1.0))
+        else:
+            # count precision only when there is actually true published tools
+            # else set to np.nan. Set to 0 only when there is wrong prediction
+            pub_precision = np.nan
+            lowest_pub_prec = np.nan
         # compute scores for normal recommendations
         if normal_topk_prediction_pos in reverse_data_dictionary:
             pred_t_name = reverse_data_dictionary[int(normal_topk_prediction_pos)]
@@ -144,6 +152,8 @@
                 top_precision = 1.0
                 if last_tool_id in lowest_tool_ids:
                     lowest_norm_prec = 1.0
+                else:
+                    lowest_norm_prec = np.nan
     if len(usage_wt_score) > 0:
         mean_usage = np.mean(usage_wt_score)
     return mean_usage, top_precision, pub_precision, lowest_pub_prec, lowest_norm_prec
@@ -168,7 +178,7 @@
     epo_pub_prec = np.zeros([len(y), len(topk_list)])
     epo_lowest_tools_pub_prec = list()
     epo_lowest_tools_norm_prec = list()
-
+    lowest_counter = 0
     # loop over all the test samples and find prediction precision
     for i in range(size):
         lowest_pub_topk = list()
@@ -181,18 +191,18 @@
             precision[i][index] = absolute_precision
             usage_weights[i][index] = usg_wt_score
             epo_pub_prec[i][index] = pub_prec
-            if last_tool_id in lowest_tool_ids:
-                lowest_pub_topk.append(lowest_p_prec)
-                lowest_norm_topk.append(lowest_n_prec)
+            lowest_pub_topk.append(lowest_p_prec)
+            lowest_norm_topk.append(lowest_n_prec)
+        epo_lowest_tools_pub_prec.append(lowest_pub_topk)
+        epo_lowest_tools_norm_prec.append(lowest_norm_topk)
         if last_tool_id in lowest_tool_ids:
-            epo_lowest_tools_pub_prec.append(lowest_pub_topk)
-            epo_lowest_tools_norm_prec.append(lowest_norm_topk)
+            lowest_counter += 1
     mean_precision = np.mean(precision, axis=0)
     mean_usage = np.mean(usage_weights, axis=0)
-    mean_pub_prec = np.mean(epo_pub_prec, axis=0)
-    mean_lowest_pub_prec = np.mean(epo_lowest_tools_pub_prec, axis=0)
-    mean_lowest_norm_prec = np.mean(epo_lowest_tools_norm_prec, axis=0)
-    return mean_usage, mean_precision, mean_pub_prec, mean_lowest_pub_prec, mean_lowest_norm_prec, len(epo_lowest_tools_pub_prec)
+    mean_pub_prec = np.nanmean(epo_pub_prec, axis=0)
+    mean_lowest_pub_prec = np.nanmean(epo_lowest_tools_pub_prec, axis=0)
+    mean_lowest_norm_prec = np.nanmean(epo_lowest_tools_norm_prec, axis=0)
+    return mean_usage, mean_precision, mean_pub_prec, mean_lowest_pub_prec, mean_lowest_norm_prec, lowest_counter


 def save_model(results, data_dictionary, compatible_next_tools, trained_model_path, class_weights, standard_connections):