changeset 2:76251d1ccdcc (2019-10-11)
Previous changeset: 1:12764915e1c5 (2019-09-25) · Next changeset: 3:5b3c08710e47 (2020-05-09)
Commit message:
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 6fa2a0294d615c9f267b766337dca0b2d3637219" |
modified:
create_tool_recommendation_model.xml main.py optimise_hyperparameters.py test-data/test_tool_usage test-data/test_workflows utils.py
diff -r 12764915e1c5 -r 76251d1ccdcc create_tool_recommendation_model.xml
--- a/create_tool_recommendation_model.xml	Wed Sep 25 06:42:40 2019 -0400
+++ b/create_tool_recommendation_model.xml	Fri Oct 11 18:24:54 2019 -0400
@@ -2,11 +2,11 @@
 <description>using deep learning</description>
 <requirements>
 <requirement type="package" version="3.6">python</requirement>
- <requirement type="package" version="1.14.0">tensorflow</requirement>
- <requirement type="package" version="2.2.4">keras</requirement>
- <requirement type="package" version="0.20.1">scikit-learn</requirement>
+ <requirement type="package" version="1.13.1">tensorflow</requirement>
+ <requirement type="package" version="2.3.0">keras</requirement>
+ <requirement type="package" version="0.21.3">scikit-learn</requirement>
 <requirement type="package" version="2.9.0">h5py</requirement>
- <requirement type="package" version="1.0">csvkit</requirement>
+ <requirement type="package" version="1.0.4">csvkit</requirement>
 <requirement type="package" version="0.1.2">hyperopt</requirement>
 </requirements>
 <version_command>echo "@VERSION@"</version_command>
@@ -38,25 +38,40 @@
 <param name="input_tabular_workflows" type="data" format="tabular" label="Dataset containing workflows" help="Please provide Galaxy workflows as a tabular file."/>
 <param name="input_tabular_tool_usage" type="data" format="tabular" label="Dataset containing usage frequencies of tools" help="Please provide tools usage frequencies as a tabular file."/>
 <section name="data_parameters" title="Data parameters" expanded="False">
+
 <param name="input_cutoff_date" type="text" value="2017-12-01" label="Cutoff date" help="Provide a date (in the past) in yyyy-mm-dd format. The earliest date from which usage of tools will be extracted. For example, 2017-12-01 specifies that the usage of tools from this date until the data extraction date is extracted. The usage of tools before this date is not considered."/>
+
 <param name="input_maximum_path_length" type="integer" value="25" label="Maximum number of tools in a tool sequence" help="Provide an integer between 1 and 25. A workflow is divided into unique paths and this number specifies the maximum number of tools a path can have. Paths longer than this number are ignored and are not included in the deep learning training."/>
+
 </section>
 <section name="training_parameters" title="Training parameters" expanded="False">
- <param name="max_evals" type="integer" value="2" label="Maximum number of evaluations of different configurations of parameters" help="Provide an integer. Different combinations of parameters are sampled and optimized to find the best one. This number specifies the number of different configurations sampled and tested."/>
- <param name="optimize_n_epochs" type="integer" value="2" label="Number of training iterations to optimize the neural network parameters" help="Provide an integer. This number specifies the number of training iterations done for each sampled configuration while optimising the parameters."/>
- <param name="n_epochs" type="integer" value="2" label="Number of training iterations" help="Provide an integer. This specifies the number of deep learning training iterations done after finding the best/optimised configuration of neural network parameters."/>
- <param name="test_share" type="float" value="0.2" label="Share of the test data" help="Provide a real number between 0.0 and 1.0. This set of data is used to look through the prediction accuracy on unseen data after neural network training on an optimised configuration of parameters. It should be set to 0.0 while training for a model to be deployed to production. The minimum value can be 0.0 and maximum value should not be more than 0.5."/>
+ <param name="max_evals" type="integer" value="50" label="Maximum number of evaluations of different configurations of parameters" help="Provide an integer. Different combinations of parameters are sampled and optimized to find the best one. This numb
[... diff truncated in this view ...]
 <param name="test_share" value="0.1"/>
+ <output name="outfile_model">
+ <assert_contents>
+ <has_h5_keys keys="best_parameters,class_weights,compatible_tools,data_dictionary,model_config,weight_0,weight_1,weight_2,weight_3,weight_4,weight_5,weight_6,weight_7,weight_8"/>
+ </assert_contents>
+ </output>
+ </test>
+ <test>
+ <param name="input_tabular_workflows" value="test_workflows" ftype="tabular"/>
 <param name="input_tabular_tool_usage" value="test_tool_usage" ftype="tabular"/>
+ <param name="max_evals" value="1"/>
+ <param name="optimize_n_epochs" value="1"/>
+ <param name="n_epochs" value="1"/>
+ <param name="test_share" value="0.0"/>
 <output name="outfile_model">
 <assert_contents>
 <has_h5_keys keys="best_parameters,class_weights,compatible_tools,data_dictionary,model_config,weight_0,weight_1,weight_2,weight_3,weight_4,weight_5,weight_6,weight_7,weight_8"/>
 </assert_contents>
 </output>
 </test>
+ <test expect_failure="true">
+ <param name="input_tabular_workflows" value="test_workflows" ftype="tabular"/>
+ <param name="input_tabular_tool_usage" value="test_tool_usage" ftype="tabular"/>
+ <param name="validation_share" value="0.0"/>
+ </test>
+ <test expect_failure="true">
+ <param name="input_tabular_workflows" value="test_workflows" ftype="tabular"/>
+ <param name="input_tabular_tool_usage" value="test_tool_usage" ftype="tabular"/>
+ <param name="batch_size" value="1"/>
+ </test>
+ <test expect_failure="true">
+ <param name="input_tabular_workflows" value="test_workflows" ftype="tabular"/>
+ <param name="input_tabular_tool_usage" value="test_tool_usage" ftype="tabular"/>
+ <param name="units" value="1"/>
+ </test>
+ <test expect_failure="true">
+ <param name="input_tabular_workflows" value="test_workflows" ftype="tabular"/>
+ <param name="input_tabular_tool_usage" value="test_tool_usage" ftype="tabular"/>
+ <param name="embedding_size" value="1"/>
+ </test>
+ <test expect_failure="true">
+ <param name="input_tabular_workflows" value="test_workflows" ftype="tabular"/>
+ <param name="input_tabular_tool_usage" value="test_tool_usage" ftype="tabular"/>
+ <param name="dropout" value="0.1"/>
+ </test>
+ <test expect_failure="true">
+ <param name="input_tabular_workflows" value="test_workflows" ftype="tabular"/>
+ <param name="input_tabular_tool_usage" value="test_tool_usage" ftype="tabular"/>
+ <param name="spatial_dropout" value="0.1"/>
+ </test>
+ <test expect_failure="true">
+ <param name="input_tabular_workflows" value="test_workflows" ftype="tabular"/>
+ <param name="input_tabular_tool_usage" value="test_tool_usage" ftype="tabular"/>
+ <param name="recurrent_dropout" value="0.1"/>
+ </test>
+ <test expect_failure="true">
+ <param name="input_tabular_workflows" value="test_workflows" ftype="tabular"/>
+ <param name="input_tabular_tool_usage" value="test_tool_usage" ftype="tabular"/>
+ <param name="learning_rate" value="0.0001"/>
+ </test>
 </tests>
 <help>
 <![CDATA[
@@ -154,7 +226,7 @@
 Author = {Anup Kumar and Björn Grüning},
 keywords = {bioinformatics, recommendation system, deep learning},
 title = {{Tool recommendation system for Galaxy workflows}},
- url = {https://github.com/anuprulez/galaxytools}
+ url = {https://github.com/bgruening/galaxytools}
 }
 </citation>
 </citations>
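Note: the new expect_failure tests assert that out-of-range training parameters (e.g. batch_size=1, dropout=0.1, learning_rate=0.0001, validation_share=0.0) abort the run instead of training a degenerate model. The validation code itself is not part of this diff; the Python sketch below only illustrates the kind of range check these tests imply. The helper name validate_range and all concrete bounds are hypothetical, not taken from the tool:

def validate_range(name, value, lower, upper):
    # Abort early when a training parameter falls outside its allowed range,
    # so Galaxy reports a failed job (which expect_failure tests rely on).
    if not (lower <= value <= upper):
        raise ValueError("%s=%s is outside the allowed range [%s, %s]" % (name, value, lower, upper))

# The expect_failure tests pass values such as batch_size=1; a check of this
# shape would reject them (the bounds below are illustrative only):
validate_range("batch_size", 1, 32, 512)  # raises ValueError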
diff -r 12764915e1c5 -r 76251d1ccdcc main.py
--- a/main.py	Wed Sep 25 06:42:40 2019 -0400
+++ b/main.py	Fri Oct 11 18:24:54 2019 -0400
@@ -8,6 +8,8 @@
 import time

 # machine learning library
+import tensorflow as tf
+from keras import backend as K
 import keras.callbacks as callbacks

 import extract_workflow_connections
@@ -19,8 +21,16 @@
 class PredictTool:

     @classmethod
-    def __init__(self):
+    def __init__(self, num_cpus):
         """ Init method. """
+        # set the number of cpus
+        cpu_config = tf.ConfigProto(
+            device_count={"CPU": num_cpus},
+            intra_op_parallelism_threads=num_cpus,
+            inter_op_parallelism_threads=num_cpus,
+            allow_soft_placement=True
+        )
+        K.set_session(tf.Session(config=cpu_config))

     @classmethod
     def find_train_best_network(self, network_config, reverse_dictionary, train_data, train_labels, test_data, test_labels, n_epochs, class_weights, usage_pred, compatible_next_tools):
@@ -29,39 +39,43 @@
         """
         print("Start hyperparameter optimisation...")
         hyper_opt = optimise_hyperparameters.HyperparameterOptimisation()
-        best_params = hyper_opt.train_model(network_config, reverse_dictionary, train_data, train_labels, test_data, test_labels, class_weights)
-
-        # retrieve the model and train on complete dataset without validation set
-        model, best_params = utils.set_recurrent_network(best_params, reverse_dictionary, class_weights)
+        best_params, best_model = hyper_opt.train_model(network_config, reverse_dictionary, train_data, train_labels, class_weights)

         # define callbacks
+        early_stopping = callbacks.EarlyStopping(monitor='loss', mode='min', verbose=1, min_delta=1e-4, restore_best_weights=True)
         predict_callback_test = PredictCallback(test_data, test_labels, reverse_dictionary, n_epochs, compatible_next_tools, usage_pred)
-        # tensor_board = callbacks.TensorBoard(log_dir=log_directory, histogram_freq=0, write_graph=True, write_images=True)
-        callbacks_list = [predict_callback_test]
+
+        callbacks_list = [predict_callback_test, early_stopping]

         print("Start training on the best model...")
-        model_fit = model.fit(
-            train_data,
-            train_labels,
-            batch_size=int(best_params["batch_size"]),
-            epochs=n_epochs,
-            verbose=2,
-            callbacks=callbacks_list,
-            shuffle="batch",
-            validation_data=(test_data, test_labels)
-        )
-
-        train_performance = {
-            "train_loss": np.array(model_fit.history["loss"]),
-            "model": model,
-            "best_parameters": best_params
-        }
-
-        # if there is test data, add more information
+        train_performance = dict()
         if len(test_data) > 0:
-            train_performance["validation_loss"] = np.array(model_fit.history["val_loss"])
+            trained_model = best_model.fit(
+                train_data,
+                train_labels,
+                batch_size=int(best_params["batch_size"]),
+                epochs=n_epochs,
+                verbose=2,
+                callbacks=callbacks_list,
+                shuffle="batch",
+                validation_data=(test_data, test_labels)
+            )
+            train_performance["validation_loss"] = np.array(trained_model.history["val_loss"])
             train_performance["precision"] = predict_callback_test.precision
             train_performance["usage_weights"] = predict_callback_test.usage_weights
+        else:
+            trained_model = best_model.fit(
+                train_data,
+                train_labels,
+                batch_size=int(best_params["batch_size"]),
+                epochs=n_epochs,
+                verbose=2,
+                callbacks=callbacks_list,
+                shuffle="batch"
+            )
+        train_performance["train_loss"] = np.array(trained_model.history["loss"])
+        train_performance["model"] = best_model
+        train_performance["best_parameters"] = best_params
         return train_performance
@@ -90,6 +104,7 @@
 if __name__ == "__main__":
     start_time = time.time()
+
     arg_parser = argparse.ArgumentParser()
     arg_parser.add_argument("-wf", "--workflow_file", required=True, help="workflows tabular file")
     arg_parser.add_argument("-tu", "--tool_usage_file", required=True, help="tool usage file")
@@ -112,6 +127,7 @@
     arg_parser.add_argument("-lr", "--learning_rate", required=True, help="learning rate")
     arg_parser.add_argument("-ar", "--activation_recurrent", required=True, help="activation function for recurrent layers")
     arg_parser.add_argument("-ao", "--activation_output", required=True, help="activation function for output layers")
+
     # get argument values
     args = vars(arg_parser.parse_args())
     tool_usage_path = args["tool_usage_file"]
@@ -133,6 +149,7 @@
     learning_rate = args["learning_rate"]
     activation_recurrent = args["activation_recurrent"]
     activation_output = args["activation_output"]
+    num_cpus = 16

     config = {
         'cutoff_date': cutoff_date,
@@ -161,7 +178,7 @@
     data = prepare_data.PrepareData(maximum_path_length, test_share)
     train_data, train_labels, test_data, test_labels, data_dictionary, reverse_dictionary, class_weights, usage_pred = data.get_data_labels_matrices(workflow_paths, tool_usage_path, cutoff_date, compatible_next_tools)
     # find the best model and start training
-    predict_tool = PredictTool()
+    predict_tool = PredictTool(num_cpus)
     # start training with weighted classes
     print("Training with weighted classes and samples ...")
     results_weighted = predict_tool.find_train_best_network(config, reverse_dictionary, train_data, train_labels, test_data, test_labels, n_epochs, class_weights, usage_pred, compatible_next_tools)
diff -r 12764915e1c5 -r 76251d1ccdcc optimise_hyperparameters.py
--- a/optimise_hyperparameters.py	Wed Sep 25 06:42:40 2019 -0400
+++ b/optimise_hyperparameters.py	Fri Oct 11 18:24:54 2019 -0400
@@ -22,7 +22,7 @@
         """ Init method. """

     @classmethod
-    def train_model(self, config, reverse_dictionary, train_data, train_labels, test_data, test_labels, class_weights):
+    def train_model(self, config, reverse_dictionary, train_data, train_labels, class_weights):
         """
         Train a model and report accuracy
         """
@@ -46,7 +46,7 @@
         # get dimensions
         dimensions = len(reverse_dictionary) + 1
         best_model_params = dict()
-        early_stopping = EarlyStopping(monitor='val_loss', mode='min', min_delta=1e-4, verbose=1, patience=1)
+        early_stopping = EarlyStopping(monitor='val_loss', mode='min', verbose=1, min_delta=1e-4)

         # specify the search space for finding the best combination of parameters using Bayesian optimisation
         params = {
@@ -82,11 +82,12 @@
                 validation_split=validation_split,
                 callbacks=[early_stopping]
             )
-            return {'loss': model_fit.history["val_loss"][-1], 'status': STATUS_OK}
-        # minimize the objective function using the set of parameters above4
+            return {'loss': model_fit.history["val_loss"][-1], 'status': STATUS_OK, 'model': model}
+        # minimize the objective function using the set of parameters above
         trials = Trials()
         learned_params = fmin(create_model, params, trials=trials, algo=tpe.suggest, max_evals=int(config["max_evals"]))
-        print(learned_params)
+        best_model = trials.results[np.argmin([r['loss'] for r in trials.results])]['model']
+
         # set the best params with respective values
         for item in learned_params:
             item_val = learned_params[item]
@@ -96,4 +97,4 @@
                 best_model_params[item] = l_recurrent_activations[item_val]
             else:
                 best_model_params[item] = item_val
-        return best_model_params
+        return best_model_params, best_model
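The key change in optimise_hyperparameters.py: each hyperopt trial now stores its fitted model in the trial's result dict, and the best model (smallest validation loss) is recovered directly from trials.results, instead of rebuilding a fresh network from the best parameters afterwards. A condensed sketch of this pattern, where search_space and the train_once helper are placeholders for the tool's actual search space and Keras training code:

import numpy as np
from hyperopt import STATUS_OK, Trials, fmin, tpe

def objective(params):
    # placeholder: fit a Keras model with these params, return it and its validation loss
    model, val_loss = train_once(params)
    # extra keys in the result dict are kept by hyperopt and reachable via trials.results
    return {"loss": val_loss, "status": STATUS_OK, "model": model}

trials = Trials()
best_params = fmin(objective, search_space, trials=trials, algo=tpe.suggest, max_evals=50)
# pick the already-fitted model from the trial with the smallest loss
best_model = trials.results[np.argmin([r["loss"] for r in trials.results])]["model"]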
diff -r 12764915e1c5 -r 76251d1ccdcc test-data/test_tool_usage
--- a/test-data/test_tool_usage	Wed Sep 25 06:42:40 2019 -0400
+++ b/test-data/test_tool_usage	Fri Oct 11 18:24:54 2019 -0400
@@ -998,5219 +998,3 @@
 toolshed.g2.bx.psu.edu/repos/galaxyp/openms_xtandemadapter/XTandemAdapter/2.2.0.1	2019-02-01	2
 toolshed.g2.bx.psu.edu/repos/devteam/kraken2tax/Kraken2Tax/1.2	2019-02-01	2
 toolshed.g2.bx.psu.edu/repos/devteam/vcffilter/vcffilter2/0.0.3	2019-02-01	2
-toolshed.g2.bx.psu.edu/repos/galaxyp/msconvert/msconvert_win/3.0.0	2019-02-01	2
-__DATA_FETCH__	2019-02-01	2
-toolshed.g2.bx.psu.edu/repos/iuc/bedtools/bedtools_annotatebed/2.27.0.0	2019-02-01	2
-toolshed.g2.bx.psu.edu/repos/iuc/goseq/goseq/1.32.0	2019-02-01	2
-toolshed.g2.bx.psu.edu/repos/bgruening/hicexplorer_hicplottads/hicexplorer_hicplottads/2.1.2.0	2019-02-01	2
-toolshed.g2.bx.psu.edu/repos/iuc/goslimmer/goslimmer/1.0.1	2019-02-01	2
-toolshed.g2.bx.psu.edu/repos/devteam/picard/picard_MarkDuplicates/1.136.0	2019-02-01	2
-toolshed.g2.bx.psu.edu/repos/devteam/picard/picard_FilterSamReads/2.18.2.1	2019-02-01	2
-toolshed.g2.bx.psu.edu/repos/crs4/edena/edena_ovl_wrapper/0.3	2019-02-01	2
-toolshed.g2.bx.psu.edu/repos/bonsai/sortmerna/sortmerna_wrapper/1.0	2019-02-01	2
-toolshed.g2.bx.psu.edu/repos/blankenberg/naive_variant_caller/naive_variant_caller/0.0.3	2019-02-01	2
-toolshed.g2.bx.psu.edu/repos/nilesh/rseqc/rseqc_bam_stat/2.6.4	2019-02-01	2
-toolshed.g2.bx.psu.edu/repos/blankenberg/naive_variant_caller/naive_variant_caller/0.0.1	2019-02-01	2
-toolshed.g2.bx.psu.edu/repos/bgruening/wtdbg/wtdbg/1.2.8.1	2019-02-01	2
-toolshed.g2.bx.psu.edu/repos/devteam/vcf2tsv/vcf2tsv/1.0.0_rc1+galaxy0	2019-02-01	2
-toolshed.g2.bx.psu.edu/repos/devteam/ncbi_blast_plus/blastxml_to_tabular/0.3.1	2019-02-01	2
-toolshed.g2.bx.psu.edu/repos/iuc/humann2_renorm_table/humann2_renorm_table/0.11.1.0	2019-02-01	2
-toolshed.g2.bx.psu.edu/repos/iuc/idba_ud/idba_ud/1.1.3	2019-02-01	2
-nn_classifier	2019-02-01	2
-toolshed.g2.bx.psu.edu/repos/galaxyp/openms_featurefindermrm/FeatureFinderMRM/2.3.0	2019-02-01	2
-toolshed.g2.bx.psu.edu/repos/bgruening/openbabel/ctb_ob_genProp/1.0	2019-02-01	2
-toolshed.g2.bx.psu.edu/repos/iuc/trinity_define_clusters_by_cutting_tree/trinity_define_clusters_by_cutting_tree/2.8.4	2019-02-01	2
-toolshed.g2.bx.psu.edu/repos/bgruening/openbabel_genprop/openbabel_genProp/2.4.1.0	2019-02-01	2
-toolshed.g2.bx.psu.edu/repos/iuc/trinity_filter_low_expr_transcripts/trinity_filter_low_expr_transcripts/2.8.4	2019-02-01	2
-toolshed.g2.bx.psu.edu/repos/iuc/mothur_get_groups/mothur_get_groups/1.39.5.0	2019-02-01	2
-__EXTRACT_DATASET__	2019-02-01	2
-Extract_features1	2019-02-01	2
-toolshed.g2.bx.psu.edu/repos/imgteam/wsi_extract_top_view/ip_wsi_extract_top_view/0.1	2019-02-01	2
-toolshed.g2.bx.psu.edu/repos/imgteam/split_labelmap/ip_split_labelmap/0.1	2019-02-01	2
-toolshed.g2.bx.psu.edu/repos/iuc/trinity_run_de_analysis/trinity_run_de_analysis/2.8.4	2019-02-01	2
-toolshed.g2.bx.psu.edu/repos/galaxyp/msi_filtering/mass_spectrometry_imaging_filtering/1.7.0.1	2019-02-01	2
-toolshed.g2.bx.psu.edu/repos/iuc/trinity_samples_qccheck/trinity_samples_qccheck/2.8.4	2019-02-01	2
-toolshed.g2.bx.psu.edu/repos/imgteam/mergeneighboursinlabelimage/ip_merge_neighbours_in_label/0.2	2019-02-01	2
-toolshed.g2.bx.psu.edu/repos/devteam/picard/picard_CleanSam/1.136.0	2019-02-01	2
-toolshed.g2.bx.psu.edu/repos/galaxyp/quantp/quantp/1.1.2	2019-02-01	2
-toolshed.g2.bx.psu.edu/repos/iuc/macs2/macs2_predictd/2.1.1.20160309.0	2019-02-01	2
-toolshed.g2.bx.psu.edu/repos/imgteam/binary2labelimage/ip_binary_to_labelimage/0.2	2019-02-01	2
-toolshed.g2.bx.psu.edu/repos/imgteam/2d_simple_filter/ip_filter_standard/0.0.2	2019-02-01	2
-toolshed.g2.bx.psu.edu/repos/imgteam/2d_auto_threshold/ip_threshold/0.0.3	2019-02-01	2
-toolshed.g2.bx.psu.edu/repos/iuc/mageck_gsea/mageck_gsea/0.5.8	2019-02-01	2
-toolshed.g2.bx.psu.edu/repos/galaxyp/openms_falsediscoveryrate/FalseDiscoveryRate/2.2.0	2019-02-01	2
-MAF_To_Fasta1	2019-02-01	2
-toolshed.g2.bx.psu.edu/repos/guru-ananda/karyotype_plot/karyotype_Plot_1/1.0.0	2019-02-01	2
-toolshed.g2.bx.psu.edu/repos/bgruening/text_processing/tp_replace_in_column/1.1.2	2019-02-01	2
[... diff truncated in this view: several thousand removed rows elided ...]
-[…]n/nanopolish_methylation/0.1.0	2018-09-01	1
-random_lines1	2018-09-01	1
-toolshed.g2.bx.psu.edu/repos/iuc/mothur_lefse/mothur_lefse/1.36.1.0	2018-09-01	1
-toolshed.g2.bx.psu.edu/repos/iuc/mothur_make_contigs/mothur_make_contigs/1.36.1.0	2018-09-01	1
-__RELABEL_FROM_FILE__	2018-09-01	1
-toolshed.g2.bx.psu.edu/repos/iuc/sra_tools/sra_pileup/2.9.1.2	2018-09-01	1
-toolshed.g2.bx.psu.edu/repos/iuc/sra_tools/sam_dump/2.9.1.3	2018-09-01	1
-toolshed.g2.bx.psu.edu/repos/iuc/mothur_metastats/mothur_metastats/1.36.1.0	2018-09-01	1
-secure_hash_message_digest	2018-09-01	1
-toolshed.g2.bx.psu.edu/repos/devteam/picard/PicardInsertSize/1.56.0	2018-09-01	1
-toolshed.g2.bx.psu.edu/repos/iuc/mothur_otu_hierarchy/mothur_otu_hierarchy/1.39.5.0	2018-09-01	1
-toolshed.g2.bx.psu.edu/repos/iuc/mothur_parse_list/mothur_parse_list/1.39.5.0	2018-09-01	1
-toolshed.g2.bx.psu.edu/repos/iuc/mothur_pre_cluster/mothur_pre_cluster/1.36.1.0	2018-09-01	1
-toolshed.g2.bx.psu.edu/repos/galaxyp/openms_filefilter/FileFilter/2.2.0	2018-09-01	1
-toolshed.g2.bx.psu.edu/repos/iuc/sqlite_to_tabular/sqlite_to_tabular/2.0.0	2018-09-01	1
-toolshed.g2.bx.psu.edu/repos/iuc/snpsift/snpsift_vartype/4.3.1	2018-09-01	1
-toolshed.g2.bx.psu.edu/repos/iuc/mothur_remove_groups/mothur_remove_groups/1.36.1.0	2018-09-01	1
-toolshed.g2.bx.psu.edu/repos/iuc/mothur_remove_lineage/mothur_remove_lineage/1.36.1.0	2018-09-01	1
-__APPLY_RULES__	2018-09-01	1
-toolshed.g2.bx.psu.edu/repos/iuc/mothur_seq_error/mothur_seq_error/1.36.1.0	2018-09-01	1
-toolshed.g2.bx.psu.edu/repos/iuc/snpsift_dbnsfp_generic/snpSift_dbnsfp_generic/4.1.1	2018-09-01	1
-toolshed.g2.bx.psu.edu/repos/iuc/mothur_sub_sample/mothur_sub_sample/1.36.1.0	2018-09-01	1
-toolshed.g2.bx.psu.edu/repos/bgruening/hicexplorer_hiccorrectmatrix/hicexplorer_hiccorrectmatrix/1.8.0	2018-09-01	1
-toolshed.g2.bx.psu.edu/repos/devteam/picard/picard_MergeSamFiles/2.7.1.0	2018-09-01	1
-toolshed.g2.bx.psu.edu/repos/bgruening/hicexplorer_hiccomparematrices/hicexplorer_hiccomparematrices/2.1.4.0	2018-09-01	1
-toolshed.g2.bx.psu.edu/repos/galaxyp/openms_qcimporter/QCImporter/2.3.0	2018-09-01	1
-toolshed.g2.bx.psu.edu/repos/iuc/snpeff/snpEff_databases/4.3r.1	2018-09-01	1
-toolshed.g2.bx.psu.edu/repos/iuc/mothur_summary_single/mothur_summary_single/1.36.1.0	2018-09-01	1
-toolshed.g2.bx.psu.edu/repos/iuc/snpeff/snpEff_databases/4.0.0	2018-09-01	1
-toolshed.g2.bx.psu.edu/repos/galaxyp/openms_qcembedder/QCEmbedder/2.3.0	2018-09-01	1
-toolshed.g2.bx.psu.edu/repos/iuc/mothur_summary_single/mothur_summary_single/1.39.5.0	2018-09-01	1
-toolshed.g2.bx.psu.edu/repos/galaxyp/openms_highresprecursormasscorrector/HighResPrecursorMassCorrector/2.2.0	2018-09-01	1
-toolshed.g2.bx.psu.edu/repos/bgruening/graphmap_align/graphmap_align/0.5.2	2018-09-01	1
-toolshed.g2.bx.psu.edu/repos/devteam/picard/picard_ReorderSam/2.18.2.1	2018-09-01	1
-toolshed.g2.bx.psu.edu/repos/bebatut/cdhit/cd_hit_est/1.3	2018-09-01	1
-toolshed.g2.bx.psu.edu/repos/iuc/mothur_venn/mothur_venn/1.36.1.0	2018-09-01	1
-toolshed.g2.bx.psu.edu/repos/galaxyp/openms_idfileconverter/IDFileConverter/2.1.0	2018-09-01	1
-toolshed.g2.bx.psu.edu/repos/bgruening/diamond/bg_diamond_makedb/0.8.24	2018-09-01	1
-toolshed.g2.bx.psu.edu/repos/galaxyp/openms_idfileconverter/IDFileConverter/2.2.0	2018-09-01	1
-toolshed.g2.bx.psu.edu/repos/iuc/snpeff/snpEff/3.4	2018-09-01	1
-toolshed.g2.bx.psu.edu/repos/bgruening/augustus/augustus/3.2.3	2018-09-01	1
-toolshed.g2.bx.psu.edu/repos/iuc/ncbi_eutils_einfo/ncbi_eutils_einfo/1.1	2018-09-01	1
-toolshed.g2.bx.psu.edu/repos/bgruening/bismark/bismark_deduplicate/0.16.3	2018-09-01	1
-toolshed.g2.bx.psu.edu/repos/devteam/picard/rgPicFixMate/1.56.0	2018-09-01	1
-toolshed.g2.bx.psu.edu/repos/iuc/seqtk/seqtk_mergepe/1.2.0	2018-09-01	1
-toolshed.g2.bx.psu.edu/repos/iuc/seqtk/seqtk_fqchk/1.2.0	2018-09-01	1
-toolshed.g2.bx.psu.edu/repos/iuc/raceid_diffgene/raceid_diffgene/1.0.0	2018-09-01	1
-toolshed.g2.bx.psu.edu/repos/iuc/qiime_assign_taxonomy/qiime_assign_taxonomy/1.9.1.0	2018-09-01	1
diff -r 12764915e1c5 -r 76251d1ccdcc test-data/test_workflows
--- a/test-data/test_workflows	Wed Sep 25 06:42:40 2019 -0400
+++ b/test-data/test_workflows	Fri Oct 11 18:24:54 2019 -0400
@@ -527,2600 +527,3 @@
 165	2013-05-23 17:21:00	1986	toolshed.g2.bx.psu.edu/repos/bjoern-gruening/trim_galore/trim_galore/0.2.4.1		1987		
 165	2013-05-23 17:21:00	1992	toolshed.g2.bx.psu.edu/repos/bjoern-gruening/trim_galore/trim_galore/0.2.4.1		1991		
 165	2013-05-23 17:21:00	1992	toolshed.g2.bx.psu.edu/repos/bjoern-gruening/trim_galore/trim_galore/0.2.4.1		1993		
-171	2013-05-31 10:17:00	2174	Cut1		2173		
-171	2013-05-31 10:17:00	2176	Add_a_column1		2175	Filter1	
-171	2013-05-31 10:17:00	2182	cshl_find_and_replace		2181	cshl_find_and_replace	
-171	2013-05-31 10:17:00	2183	cshl_find_and_replace		2182	cshl_find_and_replace	
-171	2013-05-31 10:17:00	2184	cshl_find_and_replace		2183	cshl_find_and_replace	
-171	2013-05-31 10:17:00	2185	Add_a_column1		2184	cshl_find_and_replace	
-171	2013-05-31 10:17:00	2187	Cut1		2186	Add_a_column1	
-171	2013-05-31 10:17:00	2189	mergeCols1		2188	addValue	
-171	2013-05-31 10:17:00	2199	samtools_rmdup		2191	sam_merge2	
-171	2013-05-31 10:17:00	2201	Filter1		2200	methtools_calling	
-171	2013-05-31 10:17:00	2202	Add_a_column1		2201	Filter1	
-171	2013-05-31 10:17:00	2206	cshl_sort_header		2205	gops_subtract_1	
-171	2013-05-31 10:17:00	2207	methtools_destrand		2206	cshl_sort_header	
-174	2013-06-09 19:14:00	2338	tophat2	0.5	2321	toolshed.g2.bx.psu.edu/repos/bjoern-gruening/trim_galore/trim_galore/0.2.4.1	0.2.4.1
-174	2013-06-09 19:14:00	2338	tophat2	0.5	2321	toolshed.g2.bx.psu.edu/repos/bjoern-gruening/trim_galore/trim_galore/0.2.4.1	0.2.4.1
-174	2013-06-09 19:14:00	2336	tophat2	0.5	2322	toolshed.g2.bx.psu.edu/repos/bjoern-gruening/trim_galore/trim_galore/0.2.4.1	0.2.4.1
-174	2013-06-09 19:14:00	2336	tophat2	0.5	2322	toolshed.g2.bx.psu.edu/repos/bjoern-gruening/trim_galore/trim_galore/0.2.4.1	0.2.4.1
-174	2013-06-09 19:14:00	2343	tophat2	0.5	2323	toolshed.g2.bx.psu.edu/repos/bjoern-gruening/trim_galore/trim_galore/0.2.4.1	0.2.4.1
-174	2013-06-09 19:14:00	2343	tophat2	0.5	2323	toolshed.g2.bx.psu.edu/repos/bjoern-gruening/trim_galore/trim_galore/0.2.4.1	0.2.4.1
-174	2013-06-09 19:14:00	2342	tophat2	0.5	2324	toolshed.g2.bx.psu.edu/repos/bjoern-gruening/trim_galore/trim_galore/0.2.4.1	0.2.4.1
-174	2013-06-09 19:14:00	2342	tophat2	0.5	2324	toolshed.g2.bx.psu.edu/repos/bjoern-gruening/trim_galore/trim_galore/0.2.4.1	0.2.4.1
-174	2013-06-09 19:14:00	2317	cuffdiff	0.0.5	2331	tophat2	0.5
-174	2013-06-09 19:14:00	2344	cufflinks	0.0.5	2331	tophat2	0.5
-174	2013-06-09 19:14:00	2316	cuffdiff	0.0.5	2333	cuffmerge	0.0.5
-174	2013-06-09 19:14:00	2317	cuffdiff	0.0.5	2333	cuffmerge	0.0.5
-174	2013-06-09 19:14:00	2317	cuffdiff	0.0.5	2334	tophat2	0.5
-174	2013-06-09 19:14:00	2317	cuffdiff	0.0.5	2335	tophat2	0.5
-174	2013-06-09 19:14:00	2316	cuffdiff	0.0.5	2336	tophat2	0.5
-174	2013-06-09 19:14:00	2332	cufflinks	0.0.5	2336	tophat2	0.5
-174	2013-06-09 19:14:00	2331	tophat2	0.5	2337	toolshed.g2.bx.psu.edu/repos/bjoern-gruening/trim_galore/trim_galore/0.2.4.1	0.2.4.1
-174	2013-06-09 19:14:00	2331	tophat2	0.5	2337	toolshed.g2.bx.psu.edu/repos/bjoern-gruening/trim_galore/trim_galore/0.2.4.1	0.2.4.1
-174	2013-06-09 19:14:00	2317	cuffdiff	0.0.5	2338	tophat2	0.5
-174	2013-06-09 19:14:00	2326	cufflinks	0.0.5	2338	tophat2	0.5
-174	2013-06-09 19:14:00	2316	cuffdiff	0.0.5	2339	tophat2	0.5
-174	2013-06-09 19:14:00	2316	cuffdiff	0.0.5	2340	tophat2	0.5
-174	2013-06-09 19:14:00	2328	cufflinks	0.0.5	2340	tophat2	0.5
-174	2013-06-09 19:14:00	2316	cuffdiff	0.0.5	2341	tophat2	0.5
-174	2013-06-09 19:14:00	2316	cuffdiff	0.0.5	2342	tophat2	0.5
-174	2013-06-09 19:14:00	2316	cuffdiff	0.0.5	2343	tophat2	0.5
-174	2013-06-09 19:14:00	2316	cuffdiff	0.0.5	2345		
-174	2013-06-09 19:14:00	2317	cuffdiff	0.0.5	2345		
-174	2013-06-09 19:14:00	2326	cufflinks	0.0.5	2345		
-174	2013-06-09 19:14:00	2328	cufflinks	0.0.5	2345		
-174	2013-06-09 19:14:00	2331	tophat2	0.5	2345		
-174	2013-06-09 19:14:00	2332	cufflinks	0.0.5	2345		
-174	2013-06-09 19:14:00	2336	tophat2	0.5	2345		
-174	2013-06-09 19:14:00	2338	tophat2	0.5	2345		
-174	2013-06-09 19:14:00	2342	tophat2	[…]
[... diff truncated in this view: several thousand removed rows elided ...]
-[…]galore/trim_galore/0.2.4.1	0.2.4.1	4040		
-371	2013-07-24 20:44:00	4179	wig_to_bigWig	1.1.0	4186	Filter1	1.1.0
-371	2013-07-24 20:44:00	4174	wig_to_bigWig	1.1.0	4187	Filter1	1.1.0
-371	2013-07-24 20:44:00	4180	wig_to_bigWig	1.1.0	4189	Filter1	1.1.0
-371	2013-07-24 20:44:00	4181	wig_to_bigWig	1.1.0	4193	Filter1	1.1.0
-371	2013-07-24 20:44:00	4202	wig_to_bigWig	1.1.0	4195	Filter1	1.1.0
-371	2013-07-24 20:44:00	4207	wig_to_bigWig	1.1.0	4196	Filter1	1.1.0
-371	2013-07-24 20:44:00	4206	wig_to_bigWig	1.1.0	4198	Filter1	1.1.0
-371	2013-07-24 20:44:00	4182	wig_to_bigWig	1.1.0	4199	Filter1	1.1.0
-371	2013-07-24 20:44:00	4204	wig_to_bigWig	1.1.0	4200	Filter1	1.1.0
-371	2013-07-24 20:44:00	4205	wig_to_bigWig	1.1.0	4201	Filter1	1.1.0
-371	2013-07-24 20:44:00	4176	Filter1	1.1.0	4210		
-371	2013-07-24 20:44:00	4183	Filter1	1.1.0	4210		
-371	2013-07-24 20:44:00	4185	Filter1	1.1.0	4210		
-371	2013-07-24 20:44:00	4186	Filter1	1.1.0	4210		
-371	2013-07-24 20:44:00	4187	Filter1	1.1.0	4210		
-371	2013-07-24 20:44:00	4189	Filter1	1.1.0	4210		
-371	2013-07-24 20:44:00	4190	Filter1	1.1.0	4210		
-371	2013-07-24 20:44:00	4192	Filter1	1.1.0	4210		
-371	2013-07-24 20:44:00	4193	Filter1	1.1.0	4210		
-371	2013-07-24 20:44:00	4194	Filter1	1.1.0	4210		
-371	2013-07-24 20:44:00	4195	Filter1	1.1.0	4210		
-371	2013-07-24 20:44:00	4196	Filter1	1.1.0	4210		
-371	2013-07-24 20:44:00	4197	Filter1	1.1.0	4210		
-371	2013-07-24 20:44:00	4198	Filter1	1.1.0	4210		
-371	2013-07-24 20:44:00	4199	Filter1	1.1.0	4210		
-371	2013-07-24 20:44:00	4201	Filter1	1.1.0	4210		
-375	2013-07-26 14:02:00	4288	heatmapper	1.3	4300		
-375	2013-07-26 14:02:00	4292	heatmapper	1.3	4300		
-375	2013-07-26 14:02:00	4298	heatmapper	1.3	4300		
-375	2013-07-26 14:02:00	4288	heatmapper	1.3	4301		
-375	2013-07-26 14:02:00	4292	heatmapper	1.3	4305		
-375	2013-07-26 14:02:00	4298	heatmapper	1.3	4311		
-380	2013-07-31 09:32:00	4346	bowtie2	0.1	4347	toolshed.g2.bx.psu.edu/repos/bjoern-gruening/trim_galore/trim_galore/0.2.4.1	0.2.4.1
-380	2013-07-31 09:32:00	4347	toolshed.g2.bx.psu.edu/repos/bjoern-gruening/trim_galore/trim_galore/0.2.4.1	0.2.4.1	4350		
-380	2013-07-31 09:32:00	4347	toolshed.g2.bx.psu.edu/repos/bjoern-gruening/trim_galore/trim_galore/0.2.4.1	0.2.4.1	4351		
-385	2013-07-31 19:25:00	4371	toolshed.g2.bx.psu.edu/repos/ryo-tas/macs14/peakcalling_macs14/1.4.1	1.4.1	4372		
-385	2013-07-31 19:25:00	4371	toolshed.g2.bx.psu.edu/repos/ryo-tas/macs14/peakcalling_macs14/1.4.1	1.4.1	4373		
-392	2013-07-31 20:28:00	4425	Summary_Statistics1	1.1.0	4427	Filter1	1.1.0
-392	2013-07-31 20:28:00	4424	Summary_Statistics1	1.1.0	4428	Filter1	1.1.0
-392	2013-07-31 20:28:00	4429	addValue	1.0.0	4428	Filter1	1.1.0
-392	2013-07-31 20:28:00	4431	Cut1	1.0.2	4430	mergeCols1	1.0.1
-392	2013-07-31 20:28:00	4433	mergeCols1	1.0.1	4432	addValue	1.0.0
-392	2013-07-31 20:28:00	4422	Cut1	1.0.2	4433	mergeCols1	1.0.1
-392	2013-07-31 20:28:00	4426	addValue	1.0.0	4435		
-392	2013-07-31 20:28:00	4427	Filter1	1.1.0	4435		
-392	2013-07-31 20:28:00	4428	Filter1	1.1.0	4435		
-394	2013-08-01 09:49:00	4446	toolshed.g2.bx.psu.edu/repos/aaronquinlan/bedtools/bedtools_intersectbed_bam/0.1.0	0.1.0	4449		
-394	2013-08-01 09:49:00	4446	toolshed.g2.bx.psu.edu/repos/aaronquinlan/bedtools/bedtools_intersectbed_bam/0.1.0	0.1.0	4451		
-397	2013-08-01 21:11:00	4472	methtools_filter	0.1.1	4469	toolshed.g2.bx.psu.edu/repos/aaronquinlan/bedtools/bedtools_intersectbed_bam/0.1.0	0.1.0
-397	2013-08-01 21:11:00	4472	methtools_filter	0.1.1	4470	toolshed.g2.bx.psu.edu/repos/aaronquinlan/bedtools/bedtools_intersectbed_bam/0.1.0	0.1.0
-397	2013-08-01 21:11:00	4471	methtools_dmr	0.1.1	4472	methtools_filter	0.1.1
-397	2013-08-01 21:11:00	4471	methtools_dmr	0.1.1	4472	methtools_filter	0.1.1
-397	2013-08-01 21:11:00	4468	toolshed.g2.bx.psu.edu/repos/aaronquinlan/bedtools/bedtools_intersectbed_bam/0.1.0	0.1.0	4474	smooth_running_window	0.1
-397	2013-08-01 21:11:00	4468	toolshed.g2.bx.psu.edu/repos/aaronquinlan/bedtools/bedtools_intersectbed_bam/0.1.0	0.1.0	4475		
diff -r 12764915e1c5 -r 76251d1ccdcc utils.py
--- a/utils.py	Wed Sep 25 06:42:40 2019 -0400
+++ b/utils.py	Fri Oct 11 18:24:54 2019 -0400
@@ -3,11 +3,6 @@
 import json
 import h5py

-from keras.models import model_from_json, Sequential
-from keras.layers import Dense, GRU, Dropout
-from keras.layers.embeddings import Embedding
-from keras.layers.core import SpatialDropout1D
-from keras.optimizers import RMSprop
 from keras import backend as K
@@ -37,17 +32,6 @@
         workflows_file.write(workflow_paths_unique)

-def load_saved_model(model_config, model_weights):
-    """
-    Load the saved trained model using the saved network and its weights
-    """
-    # load the network
-    loaded_model = model_from_json(model_config)
-    # load the saved weights into the model
-    loaded_model.set_weights(model_weights)
-    return loaded_model
-
-
 def format_tool_id(tool_link):
     """
     Extract tool id from tool link
@@ -57,22 +41,6 @@
     return tool_id

-def get_HDF5(hf, d_key):
-    """
-    Read h5 file to get train and test data
-    """
-    return hf.get(d_key).value
-
-
-def save_HDF5(hf_file, d_key, data, d_type=""):
-    """
-    Save datasets as h5 file
-    """
-    if (d_type == 'json'):
-        data = json.dumps(data)
-    hf_file.create_dataset(d_key, data=data)
-
-
 def set_trained_model(dump_file, model_values):
     """
     Create an h5 file with the trained weights and associated dicts
@@ -100,44 +68,6 @@
     os.remove(file_path)

-def extract_configuration(config_object):
-    config_loss = dict()
-    for index, item in enumerate(config_object):
-        config_loss[index] = list()
-        d_config = dict()
-        d_config['loss'] = item['result']['loss']
-        d_config['params_config'] = item['misc']['vals']
-        config_loss[index].append(d_config)
-    return config_loss
-
-
-def get_best_parameters(mdl_dict):
-    """
-    Get param values (defaults as well)
-    """
-    lr = float(mdl_dict.get("learning_rate", "0.001"))
-    embedding_size = int(mdl_dict.get("embedding_size", "512"))
-    dropout = float(mdl_dict.get("dropout", "0.2"))
-    recurrent_dropout = float(mdl_dict.get("recurrent_dropout", "0.2"))
-    spatial_dropout = float(mdl_dict.get("spatial_dropout", "0.2"))
-    units = int(mdl_dict.get("units", "512"))
-    batch_size = int(mdl_dict.get("batch_size", "512"))
-    activation_recurrent = mdl_dict.get("activation_recurrent", "elu")
-    activation_output = mdl_dict.get("activation_output", "sigmoid")
-
-    return {
-        "lr": lr,
-        "embedding_size": embedding_size,
-        "dropout": dropout,
-        "recurrent_dropout": recurrent_dropout,
-        "spatial_dropout": spatial_dropout,
-        "units": units,
-        "batch_size": batch_size,
-        "activation_recurrent": activation_recurrent,
-        "activation_output": activation_output,
-    }
-
-
 def weighted_loss(class_weights):
     """
     Create a weighted loss function. Penalise the misclassification
@@ -152,27 +82,6 @@
     return weighted_binary_crossentropy

-def set_recurrent_network(mdl_dict, reverse_dictionary, class_weights):
-    """
-    Create a RNN network and set its parameters
-    """
-    dimensions = len(reverse_dictionary) + 1
-    model_params = get_best_parameters(mdl_dict)
-
-    # define the architecture of the neural network
-    model = Sequential()
-    model.add(Embedding(dimensions, model_params["embedding_size"], mask_zero=True))
-    model.add(SpatialDropout1D(model_params["spatial_dropout"]))
-    model.add(GRU(model_params["units"], dropout=model_params["spatial_dropout"], recurrent_dropout=model_params["recurrent_dropout"], activation=model_params["activation_recurrent"], return_sequences=True))
-    model.add(Dropout(model_params["dropout"]))
-    model.add(GRU(model_params["units"], dropout=model_params["spatial_dropout"], recurrent_dropout=model_params["recurrent_dropout"], activation=model_params["activation_recurrent"], return_sequences=False))
-    model.add(Dropout(model_params["dropout"]))
-    model.add(Dense(dimensions, activation=model_params["activation_output"]))
-    optimizer = RMSprop(lr=model_params["lr"])
-    model.compile(loss=weighted_loss(class_weights), optimizer=optimizer)
-    return model, model_params
-
-
 def compute_precision(model, x, y, reverse_data_dictionary, next_compatible_tools, usage_scores, actual_classes_pos, topk):
     """
     Compute absolute and compatible precision
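utils.py keeps weighted_loss (used to compile the model) while dropping the now-unused model-building and HDF5 helpers. The body of weighted_loss is elided from this hunk; the following is only a plausible minimal sketch of such a class-weighted binary crossentropy, assuming class_weights maps each output-class index to a float weight (the tool's exact implementation may differ):

from keras import backend as K

def weighted_loss(class_weights):
    # One weight per output class, ordered by class index; scaling each
    # class's crossentropy term keeps rarely used tools from being drowned
    # out by very frequent ones.
    weights = [class_weights[k] for k in sorted(class_weights)]

    def weighted_binary_crossentropy(y_true, y_pred):
        # elementwise crossentropy scaled by the per-class weights,
        # averaged over the class dimension
        return K.mean(K.binary_crossentropy(y_true, y_pred) * weights, axis=-1)

    return weighted_binary_crossentropy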
Penalise the misclassification @@ -152,27 +82,6 @@ return weighted_binary_crossentropy -def set_recurrent_network(mdl_dict, reverse_dictionary, class_weights): - """ - Create a RNN network and set its parameters - """ - dimensions = len(reverse_dictionary) + 1 - model_params = get_best_parameters(mdl_dict) - - # define the architecture of the neural network - model = Sequential() - model.add(Embedding(dimensions, model_params["embedding_size"], mask_zero=True)) - model.add(SpatialDropout1D(model_params["spatial_dropout"])) - model.add(GRU(model_params["units"], dropout=model_params["spatial_dropout"], recurrent_dropout=model_params["recurrent_dropout"], activation=model_params["activation_recurrent"], return_sequences=True)) - model.add(Dropout(model_params["dropout"])) - model.add(GRU(model_params["units"], dropout=model_params["spatial_dropout"], recurrent_dropout=model_params["recurrent_dropout"], activation=model_params["activation_recurrent"], return_sequences=False)) - model.add(Dropout(model_params["dropout"])) - model.add(Dense(dimensions, activation=model_params["activation_output"])) - optimizer = RMSprop(lr=model_params["lr"]) - model.compile(loss=weighted_loss(class_weights), optimizer=optimizer) - return model, model_params - - def compute_precision(model, x, y, reverse_data_dictionary, next_compatible_tools, usage_scores, actual_classes_pos, topk): """ Compute absolute and compatible precision |