Repository 'create_tool_recommendation_model'
hg clone https://toolshed.g2.bx.psu.edu/repos/bgruening/create_tool_recommendation_model

Changeset 0:9bf25dbe00ad (2019-08-28)
Next changeset 1:12764915e1c5 (2019-09-25)
Commit message:
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 7fac577189d01cedd01118a77fc2baaefe7d5cad"
added:
create_tool_recommendation_model.xml
extract_workflow_connections.py
main.py
optimise_hyperparameters.py
predict_tool_usage.py
prepare_data.py
test-data/test_tool_usage
test-data/test_workflows
utils.py
diff -r 000000000000 -r 9bf25dbe00ad create_tool_recommendation_model.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/create_tool_recommendation_model.xml Wed Aug 28 07:19:38 2019 -0400
@@ -0,0 +1,164 @@
+<tool id="create_tool_recommendation_model" name="Create a model to recommend tools" version="0.0.1">
+    <description>using deep learning</description>
+    <requirements>
+        <requirement type="package" version="3.6">python</requirement>
+        <requirement type="package" version="1.14.0">tensorflow</requirement>
+        <requirement type="package" version="2.2.4">keras</requirement>
+        <requirement type="package" version="0.20.1">scikit-learn</requirement>
+        <requirement type="package" version="2.9.0">h5py</requirement>
+        <requirement type="package" version="1.0">csvkit</requirement>
+        <requirement type="package" version="0.1.2">hyperopt</requirement>
+    </requirements>
+    <version_command>echo "@VERSION@"</version_command>
+    <command detect_errors="aggressive">
+<![CDATA[
+        python '$__tool_directory__/main.py'
+            --workflow_file '$input_tabular_workflows'
+            --tool_usage_file '$input_tabular_tool_usage'
+            --cutoff_date '$data_parameters.input_cutoff_date'
+            --maximum_path_length '$data_parameters.input_maximum_path_length'
+            --n_epochs '$training_parameters.n_epochs'
+            --optimize_n_epochs '$training_parameters.optimize_n_epochs'
+            --max_evals '$training_parameters.max_evals'
+            --test_share '$training_parameters.test_share'
+            --validation_share '$training_parameters.validation_share'
+            --batch_size '$nn_parameters.batch_size'
+            --units '$nn_parameters.units'
+            --embedding_size '$nn_parameters.embedding_size'
+            --dropout '$nn_parameters.dropout'
+            --spatial_dropout '$nn_parameters.spatial_dropout'
+            --recurrent_dropout '$nn_parameters.recurrent_dropout'
+            --learning_rate '$nn_parameters.learning_rate'
+            --activation_recurrent '$nn_parameters.activation_recurrent'
+            --activation_output '$nn_parameters.activation_output'
+            --loss_type '$nn_parameters.loss_type'
+            --output_model '$outfile_model'
+]]>
+    </command>
+    <inputs>
+        <param name="input_tabular_workflows" type="data" format="tabular" label="Dataset containing workflows" help="Please provide Galaxy workflows as a tabular file."/>
+        <param name="input_tabular_tool_usage" type="data" format="tabular" label="Dataset containing usage frequencies of tools" help="Please provide tools usage frequencies as a tabular file."/>
+        <section name="data_parameters" title="Data parameters" expanded="False">
+            <param name="input_cutoff_date" type="text" value="2017-12-01" label="Cutoff date" help="Provide a date (in the past) in yyyy-mm-dd format. The earliest date from which usage of tools will be extracted. For example, 2017-12-01 specifies that the usage of tools from this date until the data extraction date is extracted. The usage of tools before this date is not considered."/>
+            <param name="input_maximum_path_length" type="integer" value="25" label="Maximum number of tools in a tool sequence" help="Provide an integer between 1 and 25. A workflow is divided into unique paths and this number specifies the maximum number of tools a path can have. Paths longer than this number are ignored and are not included in the deep learning training."/>
+        </section>
+        <section name="training_parameters" title="Training parameters" expanded="False">
+            <param name="max_evals" type="integer" value="2" label="Maximum number of evaluations of different configurations of parameters" help="Provide an integer. Different combinations of parameters are sampled and optimized to find the best one. This number specifies the number of different configurations sampled and tested."/>
+            <param name="optimize_n_epochs" type="integer" value="2" label="Number of training iterations to optimize the neural network parameters" help="Provide an integer.
[...]
+n an unseen set.
+    - "validation_share": It specifies the size of the validation set. For example, if it is 0.5, then the validation set is half of the entire data available. It should not be set to more than 0.5. This set is used for computing error while training on the best configuration.
+
+3. Neural network parameters:
+    - "batch_size": The training of the neural network is done using batch learning in this work. The training data is divided into equal batches and for each epoch (a training iteration), all batches of data are trained one after another. A higher or lower value can unsettle the training. Therefore, this parameter should be optimised.
+    - "units": This number is the number of hidden recurrent units. A higher number means stronger learning (may lead to overfitting) and a lower number means weaker learning (may lead to underfitting). Therefore, this number should be optimised.
+    - "embedding_size": For each tool, a fixed-size vector is learned and this fixed-size is known as the embedding size. This size remains same for all the tools. A lower number may underfit and a higher number may overfit. This parameter should be optimised as well.
+    - "dropout": A neural network tends to overfit (especially when it is stronger). Therefore, to avoid or minimize overfitting, dropout is used. The fraction specified by dropout is the fraction of units "deleted" randomly from the network to impose randomness which helps in avoiding overfitting. This parameter should be optimised as well.
+    - "spatial_dropout": Similar to dropout, this is used to reduce overfitting in the embedding layer. This parameter should be optimised as well.
+    - "recurrent_dropout": Similar to dropout and spatial dropout, this is used to reduce overfitting in the recurrent layers (hidden). This parameter should be optimised as well.
+    - "learning_rate": The learning rate specifies the speed of learning. A higher value ensures fast learning (the optimiser may diverge) and a lower value causes slow learning (may not reach the optimum). This parameter should be optimised as well.
+    - "activation_recurrent": Activations are mathematical functions to transform input into output. This takes the name of an activation function from the list of Keras activations (https://keras.io/activations/) for recurrent layers.
+    - "activation_output": This takes the activation for transforming the input of the last layer to the output of the neural network. It is also taken from Keras activations (https://keras.io/activations/).
+    - "loss_type": This is also a mathematical function which computes the error between true and predicted outputs. An optimizer uses this loss function to compute error and minimize it. It is taken from the list of Keras optimisers (https://keras.io/optimizers/).
+
+-----
+
+**Output file**
+
+The output file (model) is an HDF5 file (http://docs.h5py.org/en/latest/high/file.html) containing multiple attributes like a dictionary of tools, neural network configuration and weights for each layer, weights of all tools and so on. After the tool has finished executing, it can be downloaded and placed at "/galaxy/database/" inside a Galaxy instance codebase. To see the recommended tools (enable the UI integrations) in Galaxy, the following changes should be made to "galaxy.yml" file:
+
+    - Enable and then set the property "enable_tool_recommendation" to "true".
+    - Enable and then set the property "model_path" to "database/<<model_file_name>>".
+
+        ]]>
+    </help>
+    <citations>
+        <citation type="bibtex">
+            @ARTICLE{anuprulez_galaxytools,
+                Author = {Anup Kumar and Björn Grüning},
+                keywords = {bioinformatics, recommendation system, deep learning},
+                title = {{Tool recommendation system for Galaxy workflows}},
+                url = {https://github.com/anuprulez/galaxytools}
+            }
+        </citation>
+    </citations>
+</tool>
diff -r 000000000000 -r 9bf25dbe00ad extract_workflow_connections.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/extract_workflow_connections.py Wed Aug 28 07:19:38 2019 -0400
@@ -0,0 +1,140 @@
+"""
+Extract workflow paths from the tabular file containing
+input and output tools
+"""
+
+import csv
+import random
+
+import utils
+
+
+class ExtractWorkflowConnections:
+
+    @classmethod
+    def __init__(self):
+        """ Init method. """
+
+    @classmethod
+    def read_tabular_file(self, raw_file_path):
+        """
+        Read tabular file and extract workflow connections
+        """
+        print("Reading workflows...")
+        workflows = {}
+        workflow_paths_dup = ""
+        workflow_parents = dict()
+        workflow_paths = list()
+        unique_paths = list()
+        with open(raw_file_path, 'rt') as workflow_connections_file:
+            workflow_connections = csv.reader(workflow_connections_file, delimiter='\t')
+            for index, row in enumerate(workflow_connections):
+                wf_id = str(row[0])
+                in_tool = row[3]
+                out_tool = row[6]
+                if wf_id not in workflows:
+                    workflows[wf_id] = list()
+                if out_tool and in_tool and out_tool != in_tool:
+                    workflows[wf_id].append((in_tool, out_tool))
+        print("Processing workflows...")
+        wf_ctr = 0
+        for wf_id in workflows:
+            wf_ctr += 1
+            workflow_parents[wf_id] = self.read_workflow(wf_id, workflows[wf_id])
+
+        for wf_id in workflow_parents:
+            flow_paths = list()
+            parents_graph = workflow_parents[wf_id]
+            roots, leaves = self.get_roots_leaves(parents_graph)
+            for root in roots:
+                for leaf in leaves:
+                    paths = self.find_tool_paths_workflow(parents_graph, root, leaf)
+                    # reverse the paths as they are computed from the leaf back to the root
+                    paths = [tool_path[::-1] for tool_path in paths]
+                    if len(paths) > 0:
+                        flow_paths.extend(paths)
+            workflow_paths.extend(flow_paths)
+
+        print("Workflows processed: %d" % wf_ctr)
+
+        # remove slashes from the tool ids
+        wf_paths_no_slash = list()
+        for path in workflow_paths:
+            path_no_slash = [utils.format_tool_id(tool_id) for tool_id in path]
+            wf_paths_no_slash.append(path_no_slash)
+
+        # collect duplicate paths
+        for path in wf_paths_no_slash:
+            workflow_paths_dup += ",".join(path) + "\n"
+
+        # collect unique paths
+        unique_paths = list(workflow_paths_dup.split("\n"))
+        unique_paths = list(filter(None, unique_paths))
+        random.shuffle(unique_paths)
+        no_dup_paths = list(set(unique_paths))
+
+        print("Finding compatible next tools...")
+        compatible_next_tools = self.set_compatible_next_tools(no_dup_paths)
+        return unique_paths, compatible_next_tools
+
+    @classmethod
+    def set_compatible_next_tools(self, workflow_paths):
+        """
+        Find next tools for each tool
+        """
+        next_tools = dict()
+        for path in workflow_paths:
+            path_split = path.split(",")
+            for window in range(0, len(path_split) - 1):
+                current_next_tools = path_split[window: window + 2]
+                current_tool = current_next_tools[0]
+                next_tool = current_next_tools[1]
+                try:
+                    next_tools[current_tool] += "," + next_tool
+                except Exception:
+                    next_tools[current_tool] = next_tool
+        for tool in next_tools:
+            next_tools[tool] = ",".join(list(set(next_tools[tool].split(","))))
+        return next_tools
+
+    @classmethod
+    def read_workflow(self, wf_id, workflow_rows):
+        """
+        Read all connections for a workflow
+        """
+        tool_parents = dict()
+        for connection in workflow_rows:
+            in_tool = connection[0]
+            out_tool = connection[1]
+            if out_tool not in tool_parents:
+                tool_parents[out_tool] = list()
+            if in_tool not in tool_parents[out_tool]:
+                tool_parents[out_tool].append(in_tool)
+        return tool_parents
+
+    @classmethod
+    def get_roots_leaves(self, graph):
+        roots = list()
+        leaves = list()
+        all_parents = list()
+        for item in graph:
+            all_parents.extend(graph[item])
+        all_parents = list(set(all_parents))
+        children = graph.keys()
+        roots = list(set(all_parents).difference(set(children)))
+        leaves = list(set(children).difference(set(all_parents)))
+        return roots, leaves
+
+    @classmethod
+    def find_tool_paths_workflow(self, graph, start, end, path=[]):
+        path = path + [end]
+        if start == end:
+            return [path]
+        path_list = list()
+        if end in graph:
+            for node in graph[end]:
+                if node not in path:
+                    new_tools_paths = self.find_tool_paths_workflow(graph, start, node, path)
+                    for tool_path in new_tools_paths:
+                        path_list.append(tool_path)
+        return path_list
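For illustration, a minimal sketch of what set_compatible_next_tools() returns for two hand-written paths; the tool ids are invented and the order inside each comma-joined value depends on set iteration.

    # Sketch: compatible next tools for two toy paths (tool ids are invented).
    from extract_workflow_connections import ExtractWorkflowConnections

    toy_paths = [
        "fastqc,trimmomatic,bowtie2",
        "fastqc,cutadapt,bowtie2",
    ]
    next_tools = ExtractWorkflowConnections().set_compatible_next_tools(toy_paths)
    # e.g. {'fastqc': 'trimmomatic,cutadapt', 'trimmomatic': 'bowtie2', 'cutadapt': 'bowtie2'}
    print(next_tools)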
diff -r 000000000000 -r 9bf25dbe00ad main.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/main.py Wed Aug 28 07:19:38 2019 -0400
@@ -0,0 +1,178 @@
+"""
+Predict next tools in the Galaxy workflows
+using machine learning (recurrent neural network)
+"""
+
+import numpy as np
+import argparse
+import time
+
+# machine learning library
+import keras.callbacks as callbacks
+
+import extract_workflow_connections
+import prepare_data
+import optimise_hyperparameters
+import utils
+
+
+class PredictTool:
+
+    @classmethod
+    def __init__(self):
+        """ Init method. """
+
+    @classmethod
+    def find_train_best_network(self, network_config, reverse_dictionary, train_data, train_labels, test_data, test_labels, n_epochs, class_weights, usage_pred, compatible_next_tools):
+        """
+        Define recurrent neural network and train sequential data
+        """
+        print("Start hyperparameter optimisation...")
+        hyper_opt = optimise_hyperparameters.HyperparameterOptimisation()
+        best_params = hyper_opt.train_model(network_config, reverse_dictionary, train_data, train_labels, test_data, test_labels, class_weights)
+
+        # retrieve the model and train on complete dataset without validation set
+        model, best_params = utils.set_recurrent_network(best_params, reverse_dictionary, class_weights)
+
+        # define callbacks
+        predict_callback_test = PredictCallback(test_data, test_labels, reverse_dictionary, n_epochs, compatible_next_tools, usage_pred)
+        # tensor_board = callbacks.TensorBoard(log_dir=log_directory, histogram_freq=0, write_graph=True, write_images=True)
+        callbacks_list = [predict_callback_test]
+
+        print("Start training on the best model...")
+        model_fit = model.fit(
+            train_data,
+            train_labels,
+            batch_size=int(best_params["batch_size"]),
+            epochs=n_epochs,
+            verbose=2,
+            callbacks=callbacks_list,
+            shuffle="batch",
+            validation_data=(test_data, test_labels)
+        )
+
+        train_performance = {
+            "train_loss": np.array(model_fit.history["loss"]),
+            "model": model,
+            "best_parameters": best_params
+        }
+
+        # if there is test data, add more information
+        if len(test_data) > 0:
+            train_performance["validation_loss"] = np.array(model_fit.history["val_loss"])
+            train_performance["precision"] = predict_callback_test.precision
+            train_performance["usage_weights"] = predict_callback_test.usage_weights
+        return train_performance
+
+
+class PredictCallback(callbacks.Callback):
+    def __init__(self, test_data, test_labels, reverse_data_dictionary, n_epochs, next_compatible_tools, usg_scores):
+        self.test_data = test_data
+        self.test_labels = test_labels
+        self.reverse_data_dictionary = reverse_data_dictionary
+        self.precision = list()
+        self.usage_weights = list()
+        self.n_epochs = n_epochs
+        self.next_compatible_tools = next_compatible_tools
+        self.pred_usage_scores = usg_scores
+
+    def on_epoch_end(self, epoch, logs={}):
+        """
+        Compute absolute and compatible precision for test data
+        """
+        if len(self.test_data) > 0:
+            precision, usage_weights = utils.verify_model(self.model, self.test_data, self.test_labels, self.reverse_data_dictionary, self.next_compatible_tools, self.pred_usage_scores)
+            self.precision.append(precision)
+            self.usage_weights.append(usage_weights)
+            print("Epoch %d precision: %s" % (epoch + 1, precision))
+            print("Epoch %d usage weights: %s" % (epoch + 1, usage_weights))
+
+
+if __name__ == "__main__":
+    start_time = time.time()
+    arg_parser = argparse.ArgumentParser()
+    arg_parser.add_argument("-wf", "--workflow_file", required=True, help="workflows tabular file")
+    arg_parser.add_argument("-tu", "--tool_usage_file", required=True, help="tool usage file")
+    arg_parser.add_argument("-om", "--output_model", required=True,
[...]
+f samples per batch")
+    arg_parser.add_argument("-ut", "--units", required=True, help="number of hidden recurrent units")
+    arg_parser.add_argument("-es", "--embedding_size", required=True, help="size of the fixed vector learned for each tool")
+    arg_parser.add_argument("-dt", "--dropout", required=True, help="percentage of neurons to be dropped")
+    arg_parser.add_argument("-sd", "--spatial_dropout", required=True, help="1d dropout used for embedding layer")
+    arg_parser.add_argument("-rd", "--recurrent_dropout", required=True, help="dropout for the recurrent layers")
+    arg_parser.add_argument("-lr", "--learning_rate", required=True, help="learning rate")
+    arg_parser.add_argument("-ar", "--activation_recurrent", required=True, help="activation function for recurrent layers")
+    arg_parser.add_argument("-ao", "--activation_output", required=True, help="activation function for output layers")
+    arg_parser.add_argument("-lt", "--loss_type", required=True, help="type of the loss/error function")
+    # get argument values
+    args = vars(arg_parser.parse_args())
+    tool_usage_path = args["tool_usage_file"]
+    workflows_path = args["workflow_file"]
+    cutoff_date = args["cutoff_date"]
+    maximum_path_length = int(args["maximum_path_length"])
+    trained_model_path = args["output_model"]
+    n_epochs = int(args["n_epochs"])
+    optimize_n_epochs = int(args["optimize_n_epochs"])
+    max_evals = int(args["max_evals"])
+    test_share = float(args["test_share"])
+    validation_share = float(args["validation_share"])
+    batch_size = args["batch_size"]
+    units = args["units"]
+    embedding_size = args["embedding_size"]
+    dropout = args["dropout"]
+    spatial_dropout = args["spatial_dropout"]
+    recurrent_dropout = args["recurrent_dropout"]
+    learning_rate = args["learning_rate"]
+    activation_recurrent = args["activation_recurrent"]
+    activation_output = args["activation_output"]
+    loss_type = args["loss_type"]
+
+    config = {
+        'cutoff_date': cutoff_date,
+        'maximum_path_length': maximum_path_length,
+        'n_epochs': n_epochs,
+        'optimize_n_epochs': optimize_n_epochs,
+        'max_evals': max_evals,
+        'test_share': test_share,
+        'validation_share': validation_share,
+        'batch_size': batch_size,
+        'units': units,
+        'embedding_size': embedding_size,
+        'dropout': dropout,
+        'spatial_dropout': spatial_dropout,
+        'recurrent_dropout': recurrent_dropout,
+        'learning_rate': learning_rate,
+        'activation_recurrent': activation_recurrent,
+        'activation_output': activation_output,
+        'loss_type': loss_type
+    }
+
+    # Extract and process workflows
+    connections = extract_workflow_connections.ExtractWorkflowConnections()
+    workflow_paths, compatible_next_tools = connections.read_tabular_file(workflows_path)
+    # Process the paths from workflows
+    print("Dividing data...")
+    data = prepare_data.PrepareData(maximum_path_length, test_share)
+    train_data, train_labels, test_data, test_labels, data_dictionary, reverse_dictionary, class_weights, usage_pred = data.get_data_labels_matrices(workflow_paths, tool_usage_path, cutoff_date, compatible_next_tools)
+    # find the best model and start training
+    predict_tool = PredictTool()
+    # start training with weighted classes
+    print("Training with weighted classes and samples ...")
+    results_weighted = predict_tool.find_train_best_network(config, reverse_dictionary, train_data, train_labels, test_data, test_labels, n_epochs, class_weights, usage_pred, compatible_next_tools)
+    print()
+    print("Best parameters \n")
+    print(results_weighted["best_parameters"])
+    print()
+    utils.save_model(results_weighted, data_dictionary, compatible_next_tools, trained_model_path, class_weights)
+    end_time = time.time()
+    print()
+    print("Program finished in %s seconds" % str(end_time - start_time))
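utils.verify_model(), which PredictCallback calls on each epoch, is not visible in this changeset (the utils.py diff below is truncated), so the sketch that follows stands in with a plain top-k accuracy to show the callback pattern; it is not the precision metric used by the tool.

    # Sketch: the on_epoch_end hook used by PredictCallback, with a simple
    # top-k accuracy in place of utils.verify_model.
    import numpy as np
    import keras.callbacks as callbacks


    class TopKAccuracyCallback(callbacks.Callback):
        def __init__(self, test_data, test_labels, k=5):
            self.test_data = test_data
            self.test_labels = test_labels
            self.k = k
            self.scores = list()

        def on_epoch_end(self, epoch, logs={}):
            if len(self.test_data) == 0:
                return
            predictions = self.model.predict(self.test_data)
            # a sample counts as a hit if any of its true labels is in the top-k predictions
            top_k = np.argsort(-predictions, axis=1)[:, :self.k]
            hits = [np.any(self.test_labels[i, top_k[i]] > 0) for i in range(len(self.test_data))]
            score = float(np.mean(hits))
            self.scores.append(score)
            print("Epoch %d top-%d accuracy: %.4f" % (epoch + 1, self.k, score))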
diff -r 000000000000 -r 9bf25dbe00ad optimise_hyperparameters.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/optimise_hyperparameters.py Wed Aug 28 07:19:38 2019 -0400
@@ -0,0 +1,99 @@
+"""
+Find the optimal combination of hyperparameters
+"""
+
+import numpy as np
+from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
+
+from keras.models import Sequential
+from keras.layers import Dense, GRU, Dropout
+from keras.layers.embeddings import Embedding
+from keras.layers.core import SpatialDropout1D
+from keras.optimizers import RMSprop
+from keras.callbacks import EarlyStopping
+
+import utils
+
+
+class HyperparameterOptimisation:
+
+    @classmethod
+    def __init__(self):
+        """ Init method. """
+
+    @classmethod
+    def train_model(self, config, reverse_dictionary, train_data, train_labels, test_data, test_labels, class_weights):
+        """
+        Train a model and report accuracy
+        """
+        l_recurrent_activations = config["activation_recurrent"].split(",")
+        l_output_activations = config["activation_output"].split(",")
+
+        # convert items to integer
+        l_batch_size = list(map(int, config["batch_size"].split(",")))
+        l_embedding_size = list(map(int, config["embedding_size"].split(",")))
+        l_units = list(map(int, config["units"].split(",")))
+
+        # convert items to float
+        l_learning_rate = list(map(float, config["learning_rate"].split(",")))
+        l_dropout = list(map(float, config["dropout"].split(",")))
+        l_spatial_dropout = list(map(float, config["spatial_dropout"].split(",")))
+        l_recurrent_dropout = list(map(float, config["recurrent_dropout"].split(",")))
+
+        optimize_n_epochs = int(config["optimize_n_epochs"])
+        validation_split = float(config["validation_share"])
+
+        # get dimensions
+        dimensions = len(reverse_dictionary) + 1
+        best_model_params = dict()
+        early_stopping = EarlyStopping(monitor='val_loss', mode='min', min_delta=1e-4, verbose=1, patience=1)
+
+        # specify the search space for finding the best combination of parameters using Bayesian optimisation
+        params = {
+            "embedding_size": hp.quniform("embedding_size", l_embedding_size[0], l_embedding_size[1], 1),
+            "units": hp.quniform("units", l_units[0], l_units[1], 1),
+            "batch_size": hp.quniform("batch_size", l_batch_size[0], l_batch_size[1], 1),
+            "activation_recurrent": hp.choice("activation_recurrent", l_recurrent_activations),
+            "activation_output": hp.choice("activation_output", l_output_activations),
+            "learning_rate": hp.loguniform("learning_rate", np.log(l_learning_rate[0]), np.log(l_learning_rate[1])),
+            "dropout": hp.uniform("dropout", l_dropout[0], l_dropout[1]),
+            "spatial_dropout": hp.uniform("spatial_dropout", l_spatial_dropout[0], l_spatial_dropout[1]),
+            "recurrent_dropout": hp.uniform("recurrent_dropout", l_recurrent_dropout[0], l_recurrent_dropout[1])
+        }
+
+        def create_model(params):
+            model = Sequential()
+            model.add(Embedding(dimensions, int(params["embedding_size"]), mask_zero=True))
+            model.add(SpatialDropout1D(params["spatial_dropout"]))
+            model.add(GRU(int(params["units"]), dropout=params["dropout"], recurrent_dropout=params["recurrent_dropout"], return_sequences=True, activation=params["activation_recurrent"]))
+            model.add(Dropout(params["dropout"]))
+            model.add(GRU(int(params["units"]), dropout=params["dropout"], recurrent_dropout=params["recurrent_dropout"], return_sequences=False, activation=params["activation_recurrent"]))
+            model.add(Dropout(params["dropout"]))
+            model.add(Dense(dimensions, activation=params["activation_output"]))
+            optimizer_rms = RMSprop(lr=params["learning_rate"])
+            model.compile(loss=utils.weighted_loss(class_weights), optimizer=optimizer_rms)
+            model_fit = model.fit(
+                train_data,
+                train_labels,
+                batch_size=int(params["batch_size"]),
+                epochs=optimize_n_epochs,
+                shuffle="batch",
+                verbose=2,
+                validation_split=validation_split,
+                callbacks=[early_stopping]
+            )
+            return {'loss': model_fit.history["val_loss"][-1], 'status': STATUS_OK}
+        # minimize the objective function using the set of parameters above
+        trials = Trials()
+        learned_params = fmin(create_model, params, trials=trials, algo=tpe.suggest, max_evals=int(config["max_evals"]))
+        print(learned_params)
+        # set the best params with respective values
+        for item in learned_params:
+            item_val = learned_params[item]
+            if item == 'activation_output':
+                best_model_params[item] = l_output_activations[item_val]
+            elif item == 'activation_recurrent':
+                best_model_params[item] = l_recurrent_activations[item_val]
+            else:
+                best_model_params[item] = item_val
+        return best_model_params
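One detail worth noting: for hp.choice dimensions, fmin() returns the index of the winning option rather than the option itself, which is why the loop above maps activation_output and activation_recurrent back to their names. A self-contained sketch with an invented toy objective:

    # Sketch: fmin() returns indices for hp.choice dimensions and plain values otherwise.
    from hyperopt import STATUS_OK, Trials, fmin, hp, tpe

    activations = ["elu", "relu", "tanh"]
    space = {
        "units": hp.quniform("units", 16, 256, 1),
        "activation": hp.choice("activation", activations),
    }

    def objective(params):
        # toy loss: prefer fewer units and the "tanh" activation
        loss = params["units"] / 256.0 + (0.0 if params["activation"] == "tanh" else 1.0)
        return {"loss": loss, "status": STATUS_OK}

    best = fmin(objective, space, algo=tpe.suggest, max_evals=10, trials=Trials())
    print(best)                              # e.g. {'activation': 2, 'units': 32.0}
    print(activations[best["activation"]])   # map the index back to the name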
diff -r 000000000000 -r 9bf25dbe00ad predict_tool_usage.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/predict_tool_usage.py Wed Aug 28 07:19:38 2019 -0400
@@ -0,0 +1,113 @@
+"""
+Predict tool usage to weigh the predicted tools
+"""
+
+import os
+import numpy as np
+import warnings
+import csv
+import collections
+
+from sklearn.svm import SVR
+from sklearn.model_selection import GridSearchCV
+from sklearn.pipeline import Pipeline
+
+import utils
+
+warnings.filterwarnings("ignore")
+
+main_path = os.getcwd()
+
+
+class ToolPopularity:
+
+    @classmethod
+    def __init__(self):
+        """ Init method. """
+
+    @classmethod
+    def extract_tool_usage(self, tool_usage_file, cutoff_date, dictionary):
+        """
+        Extract the tool usage over time for each tool
+        """
+        tool_usage_dict = dict()
+        all_dates = list()
+        all_tool_list = list(dictionary.keys())
+        with open(tool_usage_file, 'rt') as usage_file:
+            tool_usage = csv.reader(usage_file, delimiter='\t')
+            for index, row in enumerate(tool_usage):
+                if str(row[1]) > cutoff_date:
+                    tool_id = utils.format_tool_id(row[0])
+                    if tool_id in all_tool_list:
+                        all_dates.append(row[1])
+                        if tool_id not in tool_usage_dict:
+                            tool_usage_dict[tool_id] = dict()
+                            tool_usage_dict[tool_id][row[1]] = int(row[2])
+                        else:
+                            curr_date = row[1]
+                            # merge the usage of different versions of a tool into one
+                            if curr_date in tool_usage_dict[tool_id]:
+                                tool_usage_dict[tool_id][curr_date] += int(row[2])
+                            else:
+                                tool_usage_dict[tool_id][curr_date] = int(row[2])
+        # get unique dates
+        unique_dates = list(set(all_dates))
+        for tool in tool_usage_dict:
+            usage = tool_usage_dict[tool]
+            # extract those dates for which tool's usage is not present in raw data
+            dates_not_present = list(set(unique_dates) ^ set(usage.keys()))
+            # impute the missing values by 0
+            for dt in dates_not_present:
+                tool_usage_dict[tool][dt] = 0
+            # sort the usage list by date
+            tool_usage_dict[tool] = collections.OrderedDict(sorted(usage.items()))
+        return tool_usage_dict
+
+    @classmethod
+    def learn_tool_popularity(self, x_reshaped, y_reshaped):
+        """
+        Fit a curve for the tool usage over time to predict future tool usage
+        """
+        epsilon = 0.0
+        cv = 5
+        s_typ = 'neg_mean_absolute_error'
+        n_jobs = 4
+        s_error = 1
+        iid = True
+        tr_score = False
+        try:
+            pipe = Pipeline(steps=[('regressor', SVR(gamma='scale'))])
+            param_grid = {
+                'regressor__kernel': ['rbf', 'poly', 'linear'],
+                'regressor__degree': [2, 3]
+            }
+            search = GridSearchCV(pipe, param_grid, iid=iid, cv=cv, scoring=s_typ, n_jobs=n_jobs, error_score=s_error, return_train_score=tr_score)
+            search.fit(x_reshaped, y_reshaped.ravel())
+            model = search.best_estimator_
+            # set the next time point to get prediction for
+            prediction_point = np.reshape([x_reshaped[-1][0] + 1], (1, 1))
+            prediction = model.predict(prediction_point)
+            if prediction < epsilon:
+                prediction = [epsilon]
+            return prediction[0]
+        except Exception:
+            return epsilon
+
+    @classmethod
+    def get_pupularity_prediction(self, tools_usage):
+        """
+        Get the popularity prediction for each tool
+        """
+        usage_prediction = dict()
+        for tool_name, usage in tools_usage.items():
+            y_val = list()
+            x_val = list()
+            for x, y in usage.items():
+                x_val.append(x)
+                y_val.append(y)
+            x_pos = np.arange(len(x_val))
+            x_reshaped = x_pos.reshape(len(x_pos), 1)
+            y_reshaped = np.reshape(y_val, (len(x_pos), 1))
+            prediction = np.round(self.learn_tool_popularity(x_reshaped, y_reshaped), 8)
+            usage_prediction[tool_name] = prediction
+        return usage_prediction
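A minimal sketch of the per-tool regression performed by learn_tool_popularity(), on invented monthly counts. The iid= argument used above exists in scikit-learn 0.20 (the pinned version) and is omitted here so the snippet also runs on newer releases.

    # Sketch: fit usage counts over time and predict the next time point.
    import numpy as np
    from sklearn.model_selection import GridSearchCV
    from sklearn.pipeline import Pipeline
    from sklearn.svm import SVR

    monthly_counts = [12, 15, 19, 26, 30, 41]          # invented usage per month, oldest first
    x = np.arange(len(monthly_counts)).reshape(-1, 1)  # months encoded as 0, 1, 2, ...
    y = np.array(monthly_counts)

    pipe = Pipeline(steps=[("regressor", SVR(gamma="scale"))])
    param_grid = {"regressor__kernel": ["rbf", "poly", "linear"], "regressor__degree": [2, 3]}
    search = GridSearchCV(pipe, param_grid, cv=3, scoring="neg_mean_absolute_error")
    search.fit(x, y)

    next_month = np.array([[len(monthly_counts)]])     # the time point after the last observed one
    print(search.best_estimator_.predict(next_month))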
diff -r 000000000000 -r 9bf25dbe00ad prepare_data.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/prepare_data.py Wed Aug 28 07:19:38 2019 -0400
@@ -0,0 +1,251 @@
+"""
+Prepare the workflow paths to be used by downstream
+machine learning algorithm. The paths are divided
+into the test and training sets
+"""
+
+import os
+import collections
+import numpy as np
+import random
+
+import predict_tool_usage
+
+main_path = os.getcwd()
+
+
+class PrepareData:
+
+    @classmethod
+    def __init__(self, max_seq_length, test_data_share):
+        """ Init method. """
+        self.max_tool_sequence_len = max_seq_length
+        self.test_share = test_data_share
+
+    @classmethod
+    def process_workflow_paths(self, workflow_paths):
+        """
+        Get all the tools and complete set of individual paths for each workflow
+        """
+        tokens = list()
+        raw_paths = workflow_paths
+        raw_paths = [x.replace("\n", '') for x in raw_paths]
+        for item in raw_paths:
+            split_items = item.split(",")
+            for token in split_items:
+                if token is not "":
+                    tokens.append(token)
+        tokens = list(set(tokens))
+        tokens = np.array(tokens)
+        tokens = np.reshape(tokens, [-1, ])
+        return tokens, raw_paths
+
+    @classmethod
+    def create_new_dict(self, new_data_dict):
+        """
+        Create new data dictionary
+        """
+        reverse_dict = dict((v, k) for k, v in new_data_dict.items())
+        return new_data_dict, reverse_dict
+
+    @classmethod
+    def assemble_dictionary(self, new_data_dict, old_data_dictionary={}):
+        """
+        Create/update tools indices in the forward and backward dictionary
+        """
+        new_data_dict, reverse_dict = self.create_new_dict(new_data_dict)
+        return new_data_dict, reverse_dict
+
+    @classmethod
+    def create_data_dictionary(self, words, old_data_dictionary={}):
+        """
+        Create two dictionaries having tools names and their indexes
+        """
+        count = collections.Counter(words).most_common()
+        dictionary = dict()
+        for word, _ in count:
+            dictionary[word] = len(dictionary) + 1
+        dictionary, reverse_dictionary = self.assemble_dictionary(dictionary, old_data_dictionary)
+        return dictionary, reverse_dictionary
+
+    @classmethod
+    def decompose_paths(self, paths, dictionary):
+        """
+        Decompose the paths to variable length sub-paths keeping the first tool fixed
+        """
+        sub_paths_pos = list()
+        for index, item in enumerate(paths):
+            tools = item.split(",")
+            len_tools = len(tools)
+            if len_tools <= self.max_tool_sequence_len:
+                for window in range(1, len_tools):
+                    sequence = tools[0: window + 1]
+                    tools_pos = [str(dictionary[str(tool_item)]) for tool_item in sequence]
+                    if len(tools_pos) > 1:
+                        sub_paths_pos.append(",".join(tools_pos))
+        sub_paths_pos = list(set(sub_paths_pos))
+        return sub_paths_pos
+
+    @classmethod
+    def prepare_paths_labels_dictionary(self, dictionary, reverse_dictionary, paths, compatible_next_tools):
+        """
+        Create a dictionary of sequences with their labels for training and test paths
+        """
+        paths_labels = dict()
+        random.shuffle(paths)
+        for item in paths:
+            if item and item not in "":
+                tools = item.split(",")
+                label = tools[-1]
+                train_tools = tools[:len(tools) - 1]
+                last_but_one_name = reverse_dictionary[int(train_tools[-1])]
+                try:
+                    compatible_tools = compatible_next_tools[last_but_one_name].split(",")
+                except Exception:
+                    continue
+                if len(compatible_tools) > 0:
+                    compatible_tools_ids = [str(dictionary[x]) for x in compatible_tools]
+                    compatible_tools_ids.append(label)
+                    composite_la
[...]
+rify the overlapping of samples in train and test data
+        """
+        intersection = list(set(train_paths).intersection(set(test_paths)))
+        print("Overlap in train and test: %d" % len(intersection))
+
+    @classmethod
+    def get_predicted_usage(self, data_dictionary, predicted_usage):
+        """
+        Get predicted usage for tools
+        """
+        usage = dict()
+        epsilon = 0.0
+        # index 0 does not belong to any tool
+        usage[0] = epsilon
+        for k, v in data_dictionary.items():
+            try:
+                usg = predicted_usage[k]
+                if usg < epsilon:
+                    usg = epsilon
+                usage[v] = usg
+            except Exception:
+                usage[v] = epsilon
+                continue
+        return usage
+
+    @classmethod
+    def assign_class_weights(self, n_classes, predicted_usage):
+        """
+        Compute class weights using usage
+        """
+        class_weights = dict()
+        class_weights[str(0)] = 0.0
+        for key in range(1, n_classes):
+            u_score = predicted_usage[key]
+            if u_score < 1.0:
+                u_score += 1.0
+            class_weights[key] = np.log(u_score)
+        return class_weights
+
+    @classmethod
+    def get_sample_weights(self, train_data, reverse_dictionary, paths_frequency):
+        """
+        Compute the frequency of paths in training data
+        """
+        path_weights = np.zeros(len(train_data))
+        for path_index, path in enumerate(train_data):
+            sample_pos = np.where(path > 0)[0]
+            sample_tool_pos = path[sample_pos[0]:]
+            path_name = ",".join([reverse_dictionary[int(tool_pos)] for tool_pos in sample_tool_pos])
+            try:
+                path_weights[path_index] = int(paths_frequency[path_name])
+            except Exception:
+                path_weights[path_index] = 1
+        return path_weights
+
+    @classmethod
+    def get_data_labels_matrices(self, workflow_paths, tool_usage_path, cutoff_date, compatible_next_tools, old_data_dictionary={}):
+        """
+        Convert the training and test paths into corresponding numpy matrices
+        """
+        processed_data, raw_paths = self.process_workflow_paths(workflow_paths)
+        dictionary, reverse_dictionary = self.create_data_dictionary(processed_data, old_data_dictionary)
+        num_classes = len(dictionary)
+
+        print("Raw paths: %d" % len(raw_paths))
+        random.shuffle(raw_paths)
+
+        print("Decomposing paths...")
+        all_unique_paths = self.decompose_paths(raw_paths, dictionary)
+        random.shuffle(all_unique_paths)
+
+        print("Creating dictionaries...")
+        multilabels_paths = self.prepare_paths_labels_dictionary(dictionary, reverse_dictionary, all_unique_paths, compatible_next_tools)
+
+        print("Complete data: %d" % len(multilabels_paths))
+        train_paths_dict, test_paths_dict = self.split_test_train_data(multilabels_paths)
+
+        print("Train data: %d" % len(train_paths_dict))
+        print("Test data: %d" % len(test_paths_dict))
+
+        test_data, test_labels = self.pad_paths(test_paths_dict, num_classes)
+        train_data, train_labels = self.pad_paths(train_paths_dict, num_classes)
+
+        # Predict tools usage
+        print("Predicting tools' usage...")
+        usage_pred = predict_tool_usage.ToolPopularity()
+        usage = usage_pred.extract_tool_usage(tool_usage_path, cutoff_date, dictionary)
+        tool_usage_prediction = usage_pred.get_pupularity_prediction(usage)
+        tool_predicted_usage = self.get_predicted_usage(dictionary, tool_usage_prediction)
+
+        # get class weights using the predicted usage for each tool
+        class_weights = self.assign_class_weights(train_labels.shape[1], tool_predicted_usage)
+
+        return train_data, train_labels, test_data, test_labels, dictionary, reverse_dictionary, class_weights, tool_predicted_usage
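A minimal sketch of the prefix decomposition performed by decompose_paths(): every path is expanded into all of its prefixes of length two or more, encoded with the tool-index dictionary (the dictionary and tool ids below are invented).

    # Sketch: prefix sub-paths for one toy path.
    toy_dictionary = {"fastqc": 1, "trimmomatic": 2, "bowtie2": 3, "multiqc": 4}
    path = "fastqc,trimmomatic,bowtie2,multiqc"

    tools = path.split(",")
    sub_paths = [
        ",".join(str(toy_dictionary[tool]) for tool in tools[:window + 1])
        for window in range(1, len(tools))
    ]
    print(sub_paths)   # ['1,2', '1,2,3', '1,2,3,4']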
diff -r 000000000000 -r 9bf25dbe00ad test-data/test_tool_usage
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/test_tool_usage Wed Aug 28 07:19:38 2019 -0400
@@ -0,0 +1,6216 @@
+upload1	2019-03-01	176
+toolshed.g2.bx.psu.edu/repos/devteam/fastqc/fastqc/0.72	2019-03-01	97
+toolshed.g2.bx.psu.edu/repos/bgruening/deeptools_bam_coverage/deeptools_bam_coverage/3.0.2.0	2019-03-01	67
+toolshed.g2.bx.psu.edu/repos/iuc/featurecounts/featurecounts/1.6.3+galaxy2	2019-03-01	53
+toolshed.g2.bx.psu.edu/repos/iuc/sra_tools/fastq_dump/2.9.1.3	2019-03-01	51
+toolshed.g2.bx.psu.edu/repos/devteam/samtools_flagstat/samtools_flagstat/2.0.2	2019-03-01	38
+toolshed.g2.bx.psu.edu/repos/bgruening/text_processing/tp_tail_tool/1.1.0	2019-03-01	38
+toolshed.g2.bx.psu.edu/repos/iuc/multiqc/multiqc/1.7	2019-03-01	31
+toolshed.g2.bx.psu.edu/repos/devteam/fastqc/fastqc/0.52	2019-03-01	30
+toolshed.g2.bx.psu.edu/repos/iuc/rgrnastar/rna_star/2.6.0b-1	2019-03-01	28
[... 6,200+ further rows of tab-separated tool id, month and usage count, truncated in the page source ...]
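The file is tab-separated with one row per tool id, month and usage count. A minimal sketch of reading it the same way extract_tool_usage() does, here just summing counts per tool:

    # Sketch: total usage per tool id from the bundled test data.
    import collections
    import csv

    totals = collections.Counter()
    with open("test-data/test_tool_usage", "rt") as usage_file:
        for row in csv.reader(usage_file, delimiter="\t"):
            totals[row[0]] += int(row[2])

    print(totals.most_common(5))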
diff -r 000000000000 -r 9bf25dbe00ad test-data/test_workflows
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/test_workflows Wed Aug 28 07:19:38 2019 -0400
@@ -0,0 +1,3126 @@
+wf_id	wf_updated	in_id	in_tool	in_tool_v	out_id	out_tool	out_tool_v
+3	2013-02-07 16:48:00	7	Remove beginning1	1.0.0	5	Grep1	1.0.1
+4	2013-02-07 16:48:00	16	wc_gnu	1.0.0	14	bedtools_intersectBed	
+4	2013-02-07 16:48:00	18	addValue	1.0.0	16	wc_gnu	1.0.0
+4	2013-02-07 16:48:00	13	cat1	1.0.0	18	addValue	1.0.0
+4	2013-02-07 16:48:00	21	cshl_uniq_tool	1.0.0	19	cshl_awk_tool	
+4	2013-02-07 16:48:00	13	cat1	1.0.0	20	Count1	1.0.0
+4	2013-02-07 16:48:00	20	Count1	1.0.0	21	cshl_uniq_tool	1.0.0
+5	2013-02-07 16:49:00	26	cat1	1.0.0	25	addValue	1.0.0
[... 3,100+ further rows of tab-separated workflow connections, truncated in the page source ...]
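The file starts with a header row and stores one tool-to-tool connection per line. A minimal sketch of collecting the columns that read_tabular_file() uses (workflow id, input tool, output tool):

    # Sketch: connections per workflow from the bundled test data.
    import csv

    edges = {}
    with open("test-data/test_workflows", "rt") as connections_file:
        for row in csv.DictReader(connections_file, delimiter="\t"):
            if row["in_tool"] and row["out_tool"] and row["in_tool"] != row["out_tool"]:
                edges.setdefault(row["wf_id"], []).append((row["in_tool"], row["out_tool"]))

    print(len(edges), "workflows with at least one connection")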
b
diff -r 000000000000 -r 9bf25dbe00ad utils.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/utils.py Wed Aug 28 07:19:38 2019 -0400
[
b'@@ -0,0 +1,251 @@\n+import os\n+import numpy as np\n+import json\n+import h5py\n+\n+from keras.models import model_from_json, Sequential\n+from keras.layers import Dense, GRU, Dropout\n+from keras.layers.embeddings import Embedding\n+from keras.layers.core import SpatialDropout1D\n+from keras.optimizers import RMSprop\n+from keras import backend as K\n+\n+\n+def read_file(file_path):\n+    """\n+    Read a file\n+    """\n+    with open(file_path, "r") as json_file:\n+        file_content = json.loads(json_file.read())\n+    return file_content\n+\n+\n+def write_file(file_path, content):\n+    """\n+    Write a file\n+    """\n+    remove_file(file_path)\n+    with open(file_path, "w") as json_file:\n+        json_file.write(json.dumps(content))\n+\n+\n+def save_processed_workflows(file_path, unique_paths):\n+    workflow_paths_unique = ""\n+    for path in unique_paths:\n+        workflow_paths_unique += path + "\\n"\n+    with open(file_path, "w") as workflows_file:\n+        workflows_file.write(workflow_paths_unique)\n+\n+\n+def load_saved_model(model_config, model_weights):\n+    """\n+    Load the saved trained model using the saved network and its weights\n+    """\n+    # load the network\n+    loaded_model = model_from_json(model_config)\n+    # load the saved weights into the model\n+    loaded_model.set_weights(model_weights)\n+    return loaded_model\n+\n+\n+def format_tool_id(tool_link):\n+    """\n+    Extract tool id from tool link\n+    """\n+    tool_id_split = tool_link.split("/")\n+    tool_id = tool_id_split[-2] if len(tool_id_split) > 1 else tool_link\n+    return tool_id\n+\n+\n+def get_HDF5(hf, d_key):\n+    """\n+    Read h5 file to get train and test data\n+    """\n+    return hf.get(d_key).value\n+\n+\n+def save_HDF5(hf_file, d_key, data, d_type=""):\n+    """\n+    Save datasets as h5 file\n+    """\n+    if (d_type == \'json\'):\n+        data = json.dumps(data)\n+    hf_file.create_dataset(d_key, data=data)\n+\n+\n+def set_trained_model(dump_file, model_values):\n+    """\n+    Create an h5 file with the trained weights and associated dicts\n+    """\n+    hf_file = h5py.File(dump_file, \'w\')\n+    for key in model_values:\n+        value = model_values[key]\n+        if key == \'model_weights\':\n+            for idx, item in enumerate(value):\n+                w_key = "weight_" + str(idx)\n+                if w_key in hf_file:\n+                    hf_file.modify(w_key, item)\n+                else:\n+                    hf_file.create_dataset(w_key, data=item)\n+        else:\n+            if key in hf_file:\n+                hf_file.modify(key, json.dumps(value))\n+            else:\n+                hf_file.create_dataset(key, data=json.dumps(value))\n+    hf_file.close()\n+\n+\n+def remove_file(file_path):\n+    if os.path.exists(file_path):\n+        os.remove(file_path)\n+\n+\n+def extract_configuration(config_object):\n+    config_loss = dict()\n+    for index, item in enumerate(config_object):\n+        config_loss[index] = list()\n+        d_config = dict()\n+        d_config[\'loss\'] = item[\'result\'][\'loss\']\n+        d_config[\'params_config\'] = item[\'misc\'][\'vals\']\n+        config_loss[index].append(d_config)\n+    return config_loss\n+\n+\n+def get_best_parameters(mdl_dict):\n+    """\n+    Get param values (defaults as well)\n+    """\n+    lr = float(mdl_dict.get("learning_rate", "0.001"))\n+    embedding_size = int(mdl_dict.get("embedding_size", "512"))\n+    dropout = float(mdl_dict.get("dropout", "0.2"))\n+    
recurrent_dropout = float(mdl_dict.get("recurrent_dropout", "0.2"))\n+    spatial_dropout = float(mdl_dict.get("spatial_dropout", "0.2"))\n+    units = int(mdl_dict.get("units", "512"))\n+    batch_size = int(mdl_dict.get("batch_size", "512"))\n+    activation_recurrent = mdl_dict.get("activation_recurrent", "elu")\n+    activation_output = mdl_dict.get("activation_output", "sigmoid")\n+\n+    return {\n+        "lr": lr,\n+        "embedding_size": embedding_size,\n+        "dropout": dropout,\n+        "recurrent_dropout": recurrent_dropout,\n+        "spatial_dropout": spatial_dropout,\n+        "units": u'..b'turn_sequences=True))\n+    model.add(Dropout(model_params["dropout"]))\n+    model.add(GRU(model_params["units"], dropout=model_params["spatial_dropout"], recurrent_dropout=model_params["recurrent_dropout"], activation=model_params["activation_recurrent"], return_sequences=False))\n+    model.add(Dropout(model_params["dropout"]))\n+    model.add(Dense(dimensions, activation=model_params["activation_output"]))\n+    optimizer = RMSprop(lr=model_params["lr"])\n+    model.compile(loss=weighted_loss(class_weights), optimizer=optimizer)\n+    return model, model_params\n+\n+\n+def compute_precision(model, x, y, reverse_data_dictionary, next_compatible_tools, usage_scores, actual_classes_pos, topk):\n+    """\n+    Compute absolute and compatible precision\n+    """\n+    absolute_precision = 0.0\n+    test_sample = np.reshape(x, (1, len(x)))\n+\n+    # predict next tools for a test path\n+    prediction = model.predict(test_sample, verbose=0)\n+\n+    nw_dimension = prediction.shape[1]\n+\n+    # remove the 0th position as there is no tool at this index\n+    prediction = np.reshape(prediction, (nw_dimension,))\n+\n+    prediction_pos = np.argsort(prediction, axis=-1)\n+    topk_prediction_pos = prediction_pos[-topk:]\n+\n+    # remove the wrong tool position from the predicted list of tool positions\n+    topk_prediction_pos = [x for x in topk_prediction_pos if x > 0]\n+\n+    # read tool names using reverse dictionary\n+    actual_next_tool_names = [reverse_data_dictionary[int(tool_pos)] for tool_pos in actual_classes_pos]\n+    top_predicted_next_tool_names = [reverse_data_dictionary[int(tool_pos)] for tool_pos in topk_prediction_pos]\n+\n+    # compute the class weights of predicted tools\n+    mean_usg_score = 0\n+    usg_wt_scores = list()\n+    for t_id in topk_prediction_pos:\n+        t_name = reverse_data_dictionary[int(t_id)]\n+        if t_id in usage_scores and t_name in actual_next_tool_names:\n+            usg_wt_scores.append(np.log(usage_scores[t_id] + 1.0))\n+    if len(usg_wt_scores) > 0:\n+            mean_usg_score = np.sum(usg_wt_scores) / float(topk)\n+    false_positives = [tool_name for tool_name in top_predicted_next_tool_names if tool_name not in actual_next_tool_names]\n+    absolute_precision = 1 - (len(false_positives) / float(topk))\n+    return mean_usg_score, absolute_precision\n+\n+\n+def verify_model(model, x, y, reverse_data_dictionary, next_compatible_tools, usage_scores, topk_list=[1, 2, 3]):\n+    """\n+    Verify the model on test data\n+    """\n+    print("Evaluating performance on test data...")\n+    print("Test data size: %d" % len(y))\n+    size = y.shape[0]\n+    precision = np.zeros([len(y), len(topk_list)])\n+    usage_weights = np.zeros([len(y), len(topk_list)])\n+    # loop over all the test samples and find prediction precision\n+    for i in range(size):\n+        actual_classes_pos = np.where(y[i] > 0)[0]\n+        for index, 
abs_topk in enumerate(topk_list):\n+            abs_mean_usg_score, absolute_precision = compute_precision(model, x[i, :], y, reverse_data_dictionary, next_compatible_tools, usage_scores, actual_classes_pos, abs_topk)\n+            precision[i][index] = absolute_precision\n+            usage_weights[i][index] = abs_mean_usg_score\n+    mean_precision = np.mean(precision, axis=0)\n+    mean_usage = np.mean(usage_weights, axis=0)\n+    return mean_precision, mean_usage\n+\n+\n+def save_model(results, data_dictionary, compatible_next_tools, trained_model_path, class_weights):\n+    # save files\n+    trained_model = results["model"]\n+    best_model_parameters = results["best_parameters"]\n+    model_config = trained_model.to_json()\n+    model_weights = trained_model.get_weights()\n+\n+    model_values = {\n+        \'data_dictionary\': data_dictionary,\n+        \'model_config\': model_config,\n+        \'best_parameters\': best_model_parameters,\n+        \'model_weights\': model_weights,\n+        "compatible_tools": compatible_next_tools,\n+        "class_weights": class_weights\n+    }\n+    set_trained_model(trained_model_path, model_values)\n'
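set_trained_model() above writes the trained network into a single HDF5 file: the weight matrices go into weight_0, weight_1, ... datasets (in layer order) and every other value is json.dumps()-ed into its own dataset. The following sketch shows how such a file could be read back and the network rebuilt via load_saved_model(); restore_trained_model is a hypothetical helper name, it assumes utils.py from this changeset is importable, and the bytes-vs-str handling is hedged because string datasets come back differently depending on the h5py version (the tool pins h5py 2.9.0).

    import json

    import h5py
    import numpy as np

    from utils import load_saved_model  # utils.py from this changeset

    def restore_trained_model(h5_path):
        """Rebuild the Keras model from a file written by utils.set_trained_model()."""
        with h5py.File(h5_path, "r") as hf:
            raw_config = hf["model_config"][()]
            if isinstance(raw_config, bytes):
                raw_config = raw_config.decode("utf-8")
            # set_trained_model() stores json.dumps(model.to_json()), so one json.loads()
            # recovers the to_json() string expected by model_from_json().
            model_config = json.loads(raw_config)
            # weights were written as weight_0, weight_1, ... in layer order
            weight_keys = sorted(
                (key for key in hf.keys() if key.startswith("weight_")),
                key=lambda key: int(key.split("_")[1]),
            )
            model_weights = [np.asarray(hf[key][()]) for key in weight_keys]
        return load_saved_model(model_config, model_weights)

The same read-then-json.loads() pattern applies to the other datasets (data_dictionary, best_parameters, compatible_tools, class_weights), each of which comes back as a dict. As a side note on the helpers above, format_tool_id() keeps only the second-to-last path segment, so a link such as toolshed.g2.bx.psu.edu/repos/bjoern-gruening/trim_galore/trim_galore/0.2.4.1 is reduced to trim_galore.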