changeset 4:afec8c595124 draft

"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 65d36f271296a38deeceb0d0e8d471b2898ee8f4"
author bgruening
date Tue, 07 Jul 2020 03:25:49 -0400
parents 5b3c08710e47
children 4f7e6612906b
files create_tool_recommendation_model.xml main.py optimise_hyperparameters.py prepare_data.py utils.py
diffstat 5 files changed, 48 insertions(+), 41 deletions(-) [+]
line wrap: on
line diff
--- a/create_tool_recommendation_model.xml	Sat May 09 05:38:23 2020 -0400
+++ b/create_tool_recommendation_model.xml	Tue Jul 07 03:25:49 2020 -0400
@@ -1,4 +1,4 @@
-<tool id="create_tool_recommendation_model" name="Create a model to recommend tools" version="0.0.2">
+<tool id="create_tool_recommendation_model" name="Create a model to recommend tools" version="0.0.3">
     <description>using deep learning</description>
     <requirements>
         <requirement type="package" version="3.6">python</requirement>
--- a/main.py	Sat May 09 05:38:23 2020 -0400
+++ b/main.py	Tue Jul 07 03:25:49 2020 -0400
@@ -31,23 +31,22 @@
         )
         K.set_session(tf.Session(config=cpu_config))
 
-    def find_train_best_network(self, network_config, reverse_dictionary, train_data, train_labels, test_data, test_labels, n_epochs, class_weights, usage_pred, standard_connections, l_tool_freq, l_tool_tr_samples):
+    def find_train_best_network(self, network_config, reverse_dictionary, train_data, train_labels, test_data, test_labels, n_epochs, class_weights, usage_pred, standard_connections, tool_freq, tool_tr_samples):
         """
         Define recurrent neural network and train sequential data
         """
         # get tools with lowest representation
-        lowest_tool_ids = utils.get_lowest_tools(l_tool_freq)
+        lowest_tool_ids = utils.get_lowest_tools(tool_freq)
 
         print("Start hyperparameter optimisation...")
         hyper_opt = optimise_hyperparameters.HyperparameterOptimisation()
-        best_params, best_model = hyper_opt.train_model(network_config, reverse_dictionary, train_data, train_labels, test_data, test_labels, l_tool_tr_samples, class_weights)
+        best_params, best_model = hyper_opt.train_model(network_config, reverse_dictionary, train_data, train_labels, test_data, test_labels, tool_tr_samples, class_weights)
 
         # define callbacks
         early_stopping = callbacks.EarlyStopping(monitor='loss', mode='min', verbose=1, min_delta=1e-1, restore_best_weights=True)
         predict_callback_test = PredictCallback(test_data, test_labels, reverse_dictionary, n_epochs, usage_pred, standard_connections, lowest_tool_ids)
 
         callbacks_list = [predict_callback_test, early_stopping]
-
         batch_size = int(best_params["batch_size"])
 
         print("Start training on the best model...")
@@ -57,7 +56,8 @@
                 train_data,
                 train_labels,
                 batch_size,
-                l_tool_tr_samples
+                tool_tr_samples,
+                reverse_dictionary
             ),
             steps_per_epoch=len(train_data) // batch_size,
             epochs=n_epochs,
@@ -177,13 +177,12 @@
     # Process the paths from workflows
     print("Dividing data...")
     data = prepare_data.PrepareData(maximum_path_length, test_share)
-    train_data, train_labels, test_data, test_labels, data_dictionary, reverse_dictionary, class_weights, usage_pred, l_tool_freq, l_tool_tr_samples = data.get_data_labels_matrices(workflow_paths, tool_usage_path, cutoff_date, compatible_next_tools, standard_connections)
+    train_data, train_labels, test_data, test_labels, data_dictionary, reverse_dictionary, class_weights, usage_pred, train_tool_freq, tool_tr_samples = data.get_data_labels_matrices(workflow_paths, tool_usage_path, cutoff_date, compatible_next_tools, standard_connections)
     # find the best model and start training
     predict_tool = PredictTool(num_cpus)
     # start training with weighted classes
     print("Training with weighted classes and samples ...")
-    results_weighted = predict_tool.find_train_best_network(config, reverse_dictionary, train_data, train_labels, test_data, test_labels, n_epochs, class_weights, usage_pred, standard_connections, l_tool_freq, l_tool_tr_samples)
+    results_weighted = predict_tool.find_train_best_network(config, reverse_dictionary, train_data, train_labels, test_data, test_labels, n_epochs, class_weights, usage_pred, standard_connections, train_tool_freq, tool_tr_samples)
     utils.save_model(results_weighted, data_dictionary, compatible_next_tools, trained_model_path, class_weights, standard_connections)
     end_time = time.time()
-    print()
     print("Program finished in %s seconds" % str(end_time - start_time))
--- a/optimise_hyperparameters.py	Sat May 09 05:38:23 2020 -0400
+++ b/optimise_hyperparameters.py	Tue Jul 07 03:25:49 2020 -0400
@@ -20,7 +20,7 @@
     def __init__(self):
         """ Init method. """
 
-    def train_model(self, config, reverse_dictionary, train_data, train_labels, test_data, test_labels, l_tool_tr_samples, class_weights):
+    def train_model(self, config, reverse_dictionary, train_data, train_labels, test_data, test_labels, tool_tr_samples, class_weights):
         """
         Train a model and report accuracy
         """
@@ -71,7 +71,8 @@
                     train_data,
                     train_labels,
                     batch_size,
-                    l_tool_tr_samples
+                    tool_tr_samples,
+                    reverse_dictionary
                 ),
                 steps_per_epoch=len(train_data) // batch_size,
                 epochs=optimize_n_epochs,
--- a/prepare_data.py	Sat May 09 05:38:23 2020 -0400
+++ b/prepare_data.py	Tue Jul 07 03:25:49 2020 -0400
@@ -10,7 +10,6 @@
 import random
 
 import predict_tool_usage
-import utils
 
 main_path = os.getcwd()
 
@@ -211,16 +210,15 @@
         to estimate the frequency of tool sequences
         """
         last_tool_freq = dict()
-        inv_freq = dict()
+        freq_dict_names = dict()
         for path in train_paths:
             last_tool = path.split(",")[-1]
             if last_tool not in last_tool_freq:
                 last_tool_freq[last_tool] = 0
+                freq_dict_names[reverse_dictionary[int(last_tool)]] = 0
             last_tool_freq[last_tool] += 1
-        max_freq = max(last_tool_freq.values())
-        for t in last_tool_freq:
-            inv_freq[t] = int(np.round(max_freq / float(last_tool_freq[t]), 0))
-        return last_tool_freq, inv_freq
+            freq_dict_names[reverse_dictionary[int(last_tool)]] += 1
+        return last_tool_freq
 
     def get_toolid_samples(self, train_data, l_tool_freq):
         l_tool_tr_samples = dict()
@@ -254,9 +252,6 @@
         print("Complete data: %d" % len(multilabels_paths))
         train_paths_dict, test_paths_dict = self.split_test_train_data(multilabels_paths)
 
-        # get sample frequency
-        l_tool_freq, inv_last_tool_freq = self.get_train_last_tool_freq(train_paths_dict, rev_dict)
-
         print("Train data: %d" % len(train_paths_dict))
         print("Test data: %d" % len(test_paths_dict))
 
@@ -265,6 +260,8 @@
         test_data, test_labels = self.pad_paths(test_paths_dict, num_classes, standard_connections, rev_dict)
         train_data, train_labels = self.pad_paths(train_paths_dict, num_classes, standard_connections, rev_dict)
 
+        print("Estimating sample frequency...")
+        l_tool_freq = self.get_train_last_tool_freq(train_paths_dict, rev_dict)
         l_tool_tr_samples = self.get_toolid_samples(train_data, l_tool_freq)
 
         # Predict tools usage
--- a/utils.py	Sat May 09 05:38:23 2020 -0400
+++ b/utils.py	Tue Jul 07 03:25:49 2020 -0400
@@ -1,8 +1,8 @@
-import os
 import numpy as np
 import json
 import h5py
 import random
+from numpy.random import choice
 
 from keras import backend as K
 
@@ -54,7 +54,6 @@
     """
     weight_values = list(class_weights.values())
     weight_values.extend(weight_values)
-
     def weighted_binary_crossentropy(y_true, y_pred):
         # add another dimension to compute dot product
         expanded_weights = K.expand_dims(weight_values, axis=-1)
@@ -62,16 +61,17 @@
     return weighted_binary_crossentropy
 
 
-def balanced_sample_generator(train_data, train_labels, batch_size, l_tool_tr_samples):
+def balanced_sample_generator(train_data, train_labels, batch_size, l_tool_tr_samples, reverse_dictionary):
     while True:
         dimension = train_data.shape[1]
         n_classes = train_labels.shape[1]
         tool_ids = list(l_tool_tr_samples.keys())
+        random.shuffle(tool_ids)
         generator_batch_data = np.zeros([batch_size, dimension])
         generator_batch_labels = np.zeros([batch_size, n_classes])
+        generated_tool_ids = choice(tool_ids, batch_size)
         for i in range(batch_size):
-            random_toolid_index = random.sample(range(0, len(tool_ids)), 1)[0]
-            random_toolid = tool_ids[random_toolid_index]
+            random_toolid = generated_tool_ids[i]
             sample_indices = l_tool_tr_samples[str(random_toolid)]
             random_index = random.sample(range(0, len(sample_indices)), 1)[0]
             random_tr_index = sample_indices[random_index]
@@ -129,12 +129,20 @@
         pred_t_name = reverse_data_dictionary[int(standard_topk_prediction_pos)]
         if last_tool_name in standard_conn:
             pub_tools = standard_conn[last_tool_name]
-        if pred_t_name in pub_tools:
-            pub_precision = 1.0
-            if last_tool_id in lowest_tool_ids:
-                lowest_pub_prec = 1.0
-            if standard_topk_prediction_pos in usage_scores:
-                usage_wt_score.append(np.log(usage_scores[standard_topk_prediction_pos] + 1.0))
+            if pred_t_name in pub_tools:
+                pub_precision = 1.0
+                # count precision only when true published tools actually exist
+                if last_tool_id in lowest_tool_ids:
+                    lowest_pub_prec = 1.0
+                else:
+                    lowest_pub_prec = np.nan
+                if standard_topk_prediction_pos in usage_scores:
+                    usage_wt_score.append(np.log(usage_scores[standard_topk_prediction_pos] + 1.0))
+        else:
+            # count precision only when true published tools actually exist;
+            # otherwise set it to np.nan. Set it to 0 only when the prediction is wrong
+            pub_precision = np.nan
+            lowest_pub_prec = np.nan
     # compute scores for normal recommendations
     if normal_topk_prediction_pos in reverse_data_dictionary:
         pred_t_name = reverse_data_dictionary[int(normal_topk_prediction_pos)]
@@ -144,6 +152,8 @@
             top_precision = 1.0
             if last_tool_id in lowest_tool_ids:
                 lowest_norm_prec = 1.0
+            else:
+                lowest_norm_prec = np.nan
     if len(usage_wt_score) > 0:
         mean_usage = np.mean(usage_wt_score)
     return mean_usage, top_precision, pub_precision, lowest_pub_prec, lowest_norm_prec
@@ -168,7 +178,7 @@
     epo_pub_prec = np.zeros([len(y), len(topk_list)])
     epo_lowest_tools_pub_prec = list()
     epo_lowest_tools_norm_prec = list()
-
+    lowest_counter = 0
     # loop over all the test samples and find prediction precision
     for i in range(size):
         lowest_pub_topk = list()
@@ -181,18 +191,18 @@
             precision[i][index] = absolute_precision
             usage_weights[i][index] = usg_wt_score
             epo_pub_prec[i][index] = pub_prec
-            if last_tool_id in lowest_tool_ids:
-                lowest_pub_topk.append(lowest_p_prec)
-                lowest_norm_topk.append(lowest_n_prec)
+            lowest_pub_topk.append(lowest_p_prec)
+            lowest_norm_topk.append(lowest_n_prec)
+        epo_lowest_tools_pub_prec.append(lowest_pub_topk)
+        epo_lowest_tools_norm_prec.append(lowest_norm_topk)
         if last_tool_id in lowest_tool_ids:
-            epo_lowest_tools_pub_prec.append(lowest_pub_topk)
-            epo_lowest_tools_norm_prec.append(lowest_norm_topk)
+            lowest_counter += 1
     mean_precision = np.mean(precision, axis=0)
     mean_usage = np.mean(usage_weights, axis=0)
-    mean_pub_prec = np.mean(epo_pub_prec, axis=0)
-    mean_lowest_pub_prec = np.mean(epo_lowest_tools_pub_prec, axis=0)
-    mean_lowest_norm_prec = np.mean(epo_lowest_tools_norm_prec, axis=0)
-    return mean_usage, mean_precision, mean_pub_prec, mean_lowest_pub_prec, mean_lowest_norm_prec, len(epo_lowest_tools_pub_prec)
+    mean_pub_prec = np.nanmean(epo_pub_prec, axis=0)
+    mean_lowest_pub_prec = np.nanmean(epo_lowest_tools_pub_prec, axis=0)
+    mean_lowest_norm_prec = np.nanmean(epo_lowest_tools_norm_prec, axis=0)
+    return mean_usage, mean_precision, mean_pub_prec, mean_lowest_pub_prec, mean_lowest_norm_prec, lowest_counter
 
 
 def save_model(results, data_dictionary, compatible_next_tools, trained_model_path, class_weights, standard_connections):