Repository 'sklearn_feature_selection'
hg clone https://toolshed.g2.bx.psu.edu/repos/bgruening/sklearn_feature_selection

Changeset 31:5773e98921fc (2021-05-01)
Previous changeset 30:1d20e0dce176 (2021-04-13) Next changeset 32:a7c667ff83fe (2021-08-27)
Commit message:
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit ea12f973df4b97a2691d9e4ce6bf6fae59d57717"
modified:
fitted_model_eval.py
keras_deep_learning.py
keras_train_and_eval.py
ml_visualization_ex.py
model_prediction.py
search_model_validation.py
simple_model_fit.py
stacking_ensembles.py
to_categorical.py
train_test_eval.py
train_test_split.py
added:
association_rules.py
label_encoder.py
test-data/le_input_w_header.tabular
test-data/le_input_wo_header.tabular
test-data/le_output.tabular
test-data/mba_input_int_w.tabular
test-data/mba_input_int_wo.tabular
test-data/mba_input_str_w.tabular
test-data/mba_input_str_wo.tabular
test-data/mba_out_str.tabular
test-data/mba_output_int.tabular
test-data/mba_output_str.tabular
b
diff -r 1d20e0dce176 -r 5773e98921fc association_rules.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/association_rules.py Sat May 01 01:20:14 2021 +0000
[
@@ -0,0 +1,116 @@
+import argparse
+import json
+import warnings
+
+import pandas as pd
+from mlxtend.frequent_patterns import association_rules, fpgrowth
+from mlxtend.preprocessing import TransactionEncoder
+
+
+def main(inputs, infile, outfile, min_support=0.5, min_confidence=0.5, min_lift=1.0, min_conviction=1.0, max_length=None):
+    """
+    Parameters
+    ----------
+    inputs : str
+        File path to the Galaxy tool parameters (JSON)
+
+    infile : str
+        File path to the tab-separated input transactions
+
+    outfile : str
+        File path to the tab-separated output association rules
+
+    min_support: float
+        Minimum support
+
+    min_confidence: float
+        Minimum confidence
+
+    min_lift: float
+        Minimum lift
+
+    min_conviction: float
+        Minimum conviction
+
+    max_length: int or None
+        Maximum itemset length (passed to fpgrowth as max_len)
+
+    """
+    warnings.simplefilter('ignore')
+
+    with open(inputs, 'r') as param_handler:
+        params = json.load(param_handler)
+
+    input_header = params['header0']
+    header = 'infer' if input_header else None
+
+    with open(infile) as fp:
+        lines = fp.read().splitlines()
+
+    if header is not None:
+        lines = lines[1:]
+
+    dataset = []
+    for line in lines:
+        line_items = line.split("\t")
+        dataset.append(line_items)
+
+    # TransactionEncoder learns the unique labels in the dataset and transforms the
+    # input dataset (a Python list of lists) into a one-hot encoded NumPy boolean array
+    te = TransactionEncoder()
+    te_ary = te.fit_transform(dataset)
+
+    # Turn the encoded NumPy array into a DataFrame
+    df = pd.DataFrame(te_ary, columns=te.columns_)
+
+    # Extract frequent itemsets for association rule mining
+    # use_colnames: Use DataFrames' column names in the returned DataFrame instead of column indices
+    frequent_itemsets = fpgrowth(df, min_support=min_support, use_colnames=True, max_len=max_length)
+
+    # Get association rules, with confidence larger than min_confidence
+    rules = association_rules(frequent_itemsets, metric="confidence", min_threshold=min_confidence)
+
+    # Filter association rules, keeping rules with lift and conviction of at least min_lift and min_conviction
+    rules = rules[(rules['lift'] >= min_lift) & (rules['conviction'] >= min_conviction)]
+
+    # Convert columns from frozenset to list (more readable)
+    rules['antecedents'] = rules['antecedents'].apply(list)
+    rules['consequents'] = rules['consequents'].apply(list)
+
+    # The next 3 steps fix the order of the generated association
+    # rules, so that tests which diff the actual output against an
+    # expected output can pass
+
+    # 1) Sort the items within each row of columns 'antecedents' and 'consequents'
+    rules['antecedents'] = rules['antecedents'].apply(lambda row: sorted(row))
+    rules['consequents'] = rules['consequents'].apply(lambda row: sorted(row))
+
+    # 2) Create two temporary string columns to sort on
+    rules['ant_str'] = rules['antecedents'].apply(lambda row: " ".join(row))
+    rules['con_str'] = rules['consequents'].apply(lambda row: " ".join(row))
+
+    # 3) Sort results so they are reproducible
+    rules.sort_values(by=['ant_str', 'con_str'], inplace=True)
+    del rules['ant_str']
+    del rules['con_str']
+    rules.reset_index(drop=True, inplace=True)
+
+    # Write association rules and metrics to file
+    rules.to_csv(outfile, sep="\t", index=False)
+
+
+if __name__ == '__main__':
+    aparser = argparse.ArgumentParser()
+    aparser.add_argument("-i", "--inputs", dest="inputs", required=True)
+    aparser.add_argument("-y", "--infile", dest="infile", required=True)
+    aparser.add_argument("-o", "--outfile", dest="outfile", required=True)
+    aparser.add_argument("-s", "--support", dest="support", default=0.5)
+    aparser.add_argument("-c", "--confidence", dest="confidence", default=0.5)
+    aparser.add_argument("-l", "--lift", dest="lift", default=1.0)
+    aparser.add_argument("-v", "--conviction", dest="conviction", default=1.0)
+    aparser.add_argument("-t", "--length", dest="length", default=5)
+    args = aparser.parse_args()
+
+    main(args.inputs, args.infile, args.outfile,
+         min_support=float(args.support), min_confidence=float(args.confidence),
+         min_lift=float(args.lift), min_conviction=float(args.conviction), max_length=int(args.length))
b
diff -r 1d20e0dce176 -r 5773e98921fc fitted_model_eval.py
--- a/fitted_model_eval.py Tue Apr 13 22:00:10 2021 +0000
+++ b/fitted_model_eval.py Sat May 01 01:20:14 2021 +0000
[
@@ -30,7 +30,9 @@
     # tabular input
     if input_type == "tabular":
         header = "infer" if params["input_options"]["header1"] else None
-        column_option = params["input_options"]["column_selector_options_1"]["selected_column_selector_option"]
+        column_option = params["input_options"]["column_selector_options_1"][
+            "selected_column_selector_option"
+        ]
         if column_option in [
             "by_index_number",
             "all_but_by_index_number",
@@ -52,7 +54,9 @@
 
     # Get target y
     header = "infer" if params["input_options"]["header2"] else None
-    column_option = params["input_options"]["column_selector_options_2"]["selected_column_selector_option2"]
+    column_option = params["input_options"]["column_selector_options_2"][
+        "selected_column_selector_option2"
+    ]
     if column_option in [
         "by_index_number",
         "all_but_by_index_number",
@@ -70,7 +74,9 @@
         infile2 = pd.read_csv(infile2, sep="\t", header=header, parse_dates=True)
         loaded_df[df_key] = infile2
 
-    y = read_columns(infile2, c=c, c_option=column_option, sep="\t", header=header, parse_dates=True)
+    y = read_columns(
+        infile2, c=c, c_option=column_option, sep="\t", header=header, parse_dates=True
+    )
     if len(y.shape) == 2 and y.shape[1] == 1:
         y = y.ravel()
 
@@ -123,7 +129,8 @@
     if hasattr(main_est, "config") and hasattr(main_est, "load_weights"):
         if not infile_weights or infile_weights == "None":
             raise ValueError(
-                "The selected model skeleton asks for weights, " "but no dataset for weights was provided!"
+                "The selected model skeleton asks for weights, "
+                "but no dataset for weights was provided!"
             )
         main_est.load_weights(infile_weights)
 
@@ -142,7 +149,9 @@
     scorer, _ = _check_multimetric_scoring(estimator, scoring=scorer)
 
     if hasattr(estimator, "evaluate"):
-        scores = estimator.evaluate(X_test, y_test=y_test, scorer=scorer, is_multimetric=True)
+        scores = estimator.evaluate(
+            X_test, y_test=y_test, scorer=scorer, is_multimetric=True
+        )
     else:
         scores = _score(estimator, X_test, y_test, scorer, is_multimetric=True)
 
b
diff -r 1d20e0dce176 -r 5773e98921fc keras_deep_learning.py
--- a/keras_deep_learning.py Tue Apr 13 22:00:10 2021 +0000
+++ b/keras_deep_learning.py Sat May 01 01:20:14 2021 +0000
[
b'@@ -10,12 +10,12 @@\n from galaxy_ml.utils import get_search_params, SafeEval, try_get_attr\n from keras.models import Model, Sequential\n \n-\n safe_eval = SafeEval()\n \n \n def _handle_shape(literal):\n-    """Eval integer or list/tuple of integers from string\n+    """\n+    Eval integer or list/tuple of integers from string\n \n     Parameters:\n     -----------\n@@ -32,7 +32,8 @@\n \n \n def _handle_regularizer(literal):\n-    """Construct regularizer from string literal\n+    """\n+    Construct regularizer from string literal\n \n     Parameters\n     ----------\n@@ -48,15 +49,16 @@\n         return None\n \n     if l1 is None:\n-        l1 = 0.\n+        l1 = 0.0\n     if l2 is None:\n-        l2 = 0.\n+        l2 = 0.0\n \n     return keras.regularizers.l1_l2(l1=l1, l2=l2)\n \n \n def _handle_constraint(config):\n-    """Construct constraint from galaxy tool parameters.\n+    """\n+    Construct constraint from galaxy tool parameters.\n     Suppose correct dictionary format\n \n     Parameters\n@@ -72,14 +74,14 @@\n                 "MinMaxNorm"\n             }\n     """\n-    constraint_type = config[\'constraint_type\']\n-    if constraint_type in (\'None\', \'\'):\n+    constraint_type = config["constraint_type"]\n+    if constraint_type in ("None", ""):\n         return None\n \n     klass = getattr(keras.constraints, constraint_type)\n-    options = config.get(\'constraint_options\', {})\n-    if \'axis\' in options:\n-        options[\'axis\'] = literal_eval(options[\'axis\'])\n+    options = config.get("constraint_options", {})\n+    if "axis" in options:\n+        options["axis"] = literal_eval(options["axis"])\n \n     return klass(**options)\n \n@@ -89,62 +91,82 @@\n \n \n def _handle_layer_parameters(params):\n-    """Access to handle all kinds of parameters\n+    """\n+    Access to handle all kinds of parameters\n     """\n     for key, value in six.iteritems(params):\n-        if value in (\'None\', \'\'):\n+        if value in ("None", 
""):\n             params[key] = None\n             continue\n \n-        if type(value) in [int, float, bool]\\\n-                or (type(value) is str and value.isalpha()):\n+        if type(value) in [int, float, bool] or (\n+            type(value) is str and value.isalpha()\n+        ):\n             continue\n \n-        if key in [\'input_shape\', \'noise_shape\', \'shape\', \'batch_shape\',\n-                   \'target_shape\', \'dims\', \'kernel_size\', \'strides\',\n-                   \'dilation_rate\', \'output_padding\', \'cropping\', \'size\',\n-                   \'padding\', \'pool_size\', \'axis\', \'shared_axes\'] \\\n-                and isinstance(value, str):\n+        if (\n+            key\n+            in [\n+                "input_shape",\n+                "noise_shape",\n+                "shape",\n+                "batch_shape",\n+                "target_shape",\n+                "dims",\n+                "kernel_size",\n+                "strides",\n+                "dilation_rate",\n+                "output_padding",\n+                "cropping",\n+                "size",\n+                "padding",\n+                "pool_size",\n+                "axis",\n+                "shared_axes",\n+            ]\n+            and isinstance(value, str)\n+        ):\n             params[key] = _handle_shape(value)\n \n-        elif key.endswith(\'_regularizer\') and isinstance(value, dict):\n+        elif key.endswith("_regularizer") and isinstance(value, dict):\n             params[key] = _handle_regularizer(value)\n \n-        elif key.endswith(\'_constraint\') and isinstance(value, dict):\n+        elif key.endswith("_constraint") and isinstance(value, dict):\n             params[key] = _handle_constraint(value)\n \n-        elif key == \'function\':  # No support for lambda/function eval\n+        elif key == "function":  # No support for lambda/function eval\n             params.pop(key)\n \n     return params\n \n \n def 
get_sequential_model(config):\n-    """Construct keras Sequential model from Galaxy tool parameters\n+    """\n+    Construct keras Sequential model from Galaxy tool parameters\n \n     Parameters:\n     -----------\n  '..b'mizer_selection"][\n+                    "optimizer_options"\n+                ]\n+            )\n+        )\n \n-        train_metrics = inputs[\'mode_selection\'][\'compile_params\'][\'metrics\']\n-        if train_metrics[-1] == \'none\':\n+        train_metrics = inputs["mode_selection"]["compile_params"]["metrics"]\n+        if train_metrics[-1] == "none":\n             train_metrics = train_metrics[:-1]\n-        options[\'metrics\'] = train_metrics\n+        options["metrics"] = train_metrics\n \n-        options.update(inputs[\'mode_selection\'][\'fit_params\'])\n-        options[\'seed\'] = inputs[\'mode_selection\'][\'random_seed\']\n+        options.update(inputs["mode_selection"]["fit_params"])\n+        options["seed"] = inputs["mode_selection"]["random_seed"]\n \n         if batch_mode:\n-            generator = get_batch_generator(inputs[\'mode_selection\']\n-                                            [\'generator_selection\'])\n-            options[\'data_batch_generator\'] = generator\n-            options[\'prediction_steps\'] = \\\n-                inputs[\'mode_selection\'][\'prediction_steps\']\n-            options[\'class_positive_factor\'] = \\\n-                inputs[\'mode_selection\'][\'class_positive_factor\']\n+            generator = get_batch_generator(\n+                inputs["mode_selection"]["generator_selection"]\n+            )\n+            options["data_batch_generator"] = generator\n+            options["prediction_steps"] = inputs["mode_selection"]["prediction_steps"]\n+            options["class_positive_factor"] = inputs["mode_selection"][\n+                "class_positive_factor"\n+            ]\n         estimator = klass(config, **options)\n         if outfile_params:\n             hyper_params = 
get_search_params(estimator)\n             # TODO: remove this after making `verbose` tunable\n             for h_param in hyper_params:\n-                if h_param[1].endswith(\'verbose\'):\n-                    h_param[0] = \'@\'\n-            df = pd.DataFrame(hyper_params, columns=[\'\', \'Parameter\', \'Value\'])\n-            df.to_csv(outfile_params, sep=\'\\t\', index=False)\n+                if h_param[1].endswith("verbose"):\n+                    h_param[0] = "@"\n+            df = pd.DataFrame(hyper_params, columns=["", "Parameter", "Value"])\n+            df.to_csv(outfile_params, sep="\\t", index=False)\n \n     print(repr(estimator))\n     # save model by pickle\n-    with open(outfile, \'wb\') as f:\n+    with open(outfile, "wb") as f:\n         pickle.dump(estimator, f, pickle.HIGHEST_PROTOCOL)\n \n \n-if __name__ == \'__main__\':\n-    warnings.simplefilter(\'ignore\')\n+if __name__ == "__main__":\n+    warnings.simplefilter("ignore")\n \n     aparser = argparse.ArgumentParser()\n     aparser.add_argument("-i", "--inputs", dest="inputs", required=True)\n@@ -345,7 +383,7 @@\n     args = aparser.parse_args()\n \n     input_json_path = args.inputs\n-    with open(input_json_path, \'r\') as param_handler:\n+    with open(input_json_path, "r") as param_handler:\n         inputs = json.load(param_handler)\n \n     tool_id = args.tool_id\n@@ -355,18 +393,20 @@\n     infile_weights = args.infile_weights\n \n     # for keras_model_config tool\n-    if tool_id == \'keras_model_config\':\n+    if tool_id == "keras_model_config":\n         config_keras_model(inputs, outfile)\n \n     # for keras_model_builder tool\n     else:\n         batch_mode = False\n-        if tool_id == \'keras_batch_models\':\n+        if tool_id == "keras_batch_models":\n             batch_mode = True\n \n-        build_keras_model(inputs=inputs,\n-                          model_json=model_json,\n-                          infile_weights=infile_weights,\n-                          
batch_mode=batch_mode,\n-                          outfile=outfile,\n-                          outfile_params=outfile_params)\n+        build_keras_model(\n+            inputs=inputs,\n+            model_json=model_json,\n+            infile_weights=infile_weights,\n+            batch_mode=batch_mode,\n+            outfile=outfile,\n+            outfile_params=outfile_params,\n+        )\n'
b
diff -r 1d20e0dce176 -r 5773e98921fc keras_train_and_eval.py
--- a/keras_train_and_eval.py Tue Apr 13 22:00:10 2021 +0000
+++ b/keras_train_and_eval.py Sat May 01 01:20:14 2021 +0000
[
b'@@ -11,16 +11,9 @@\n from galaxy_ml.externals.selene_sdk.utils import compute_score\n from galaxy_ml.keras_galaxy_models import _predict_generator\n from galaxy_ml.model_validations import train_test_split\n-from galaxy_ml.utils import (\n-    clean_params,\n-    get_main_estimator,\n-    get_module,\n-    get_scoring,\n-    load_model,\n-    read_columns,\n-    SafeEval,\n-    try_get_attr,\n-)\n+from galaxy_ml.utils import (clean_params, get_main_estimator,\n+                             get_module, get_scoring, load_model, read_columns,\n+                             SafeEval, try_get_attr)\n from scipy.io import mmread\n from sklearn.metrics.scorer import _check_multimetric_scoring\n from sklearn.model_selection import _search, _validation\n@@ -28,7 +21,6 @@\n from sklearn.pipeline import Pipeline\n from sklearn.utils import indexable, safe_indexing\n \n-\n _fit_and_score = try_get_attr("galaxy_ml.model_validations", "_fit_and_score")\n setattr(_search, "_fit_and_score", _fit_and_score)\n setattr(_validation, "_fit_and_score", _fit_and_score)\n@@ -56,7 +48,10 @@\n \n         param_name = p["sp_name"]\n         if param_name.lower().endswith(NON_SEARCHABLE):\n-            warnings.warn("Warning: `%s` is not eligible for search and was " "omitted!" % param_name)\n+            warnings.warn(\n+                "Warning: `%s` is not eligible for search and was "\n+                "omitted!" 
% param_name\n+            )\n             continue\n \n         if not swap_value.startswith(":"):\n@@ -99,7 +94,11 @@\n         index_arr = np.arange(n_samples)\n         test = index_arr[np.isin(groups, group_names)]\n         train = index_arr[~np.isin(groups, group_names)]\n-        rval = list(chain.from_iterable((safe_indexing(a, train), safe_indexing(a, test)) for a in new_arrays))\n+        rval = list(\n+            chain.from_iterable(\n+                (safe_indexing(a, train), safe_indexing(a, test)) for a in new_arrays\n+            )\n+        )\n     else:\n         rval = train_test_split(*new_arrays, **kwargs)\n \n@@ -127,14 +126,22 @@\n         pred_labels = (pred_probas > 0.5).astype("int32")\n         targets = y_true.ravel().astype("int32")\n         if not is_multimetric:\n-            preds = pred_labels if scorer.__class__.__name__ == "_PredictScorer" else pred_probas\n+            preds = (\n+                pred_labels\n+                if scorer.__class__.__name__ == "_PredictScorer"\n+                else pred_probas\n+            )\n             score = scorer._score_func(targets, preds, **scorer._kwargs)\n \n             return score\n         else:\n             scores = {}\n             for name, one_scorer in scorer.items():\n-                preds = pred_labels if one_scorer.__class__.__name__ == "_PredictScorer" else pred_probas\n+                preds = (\n+                    pred_labels\n+                    if one_scorer.__class__.__name__ == "_PredictScorer"\n+                    else pred_probas\n+                )\n                 score = one_scorer._score_func(targets, preds, **one_scorer._kwargs)\n                 scores[name] = score\n \n@@ -144,13 +151,21 @@\n         pred_labels = (pred_probas > 0.5).astype("int32")\n         targets = y_true.astype("int32")\n         if not is_multimetric:\n-            preds = pred_labels if scorer.__class__.__name__ == "_PredictScorer" else pred_probas\n+            preds = (\n+   
             pred_labels\n+                if scorer.__class__.__name__ == "_PredictScorer"\n+                else pred_probas\n+            )\n             score, _ = compute_score(preds, targets, scorer._score_func)\n             return score\n         else:\n             scores = {}\n             for name, one_scorer in scorer.items():\n-                preds = pred_labels if one_scorer.__class__.__name__ == "_PredictScorer" else pred_probas\n+                preds = (\n+                    pred_labels\n+                    if one_scorer.__class__.__name__ == "_PredictScorer"\n+                    else pred_probas\n+                )\n                 score, _ = compute_sco'..b'put_options"]["column_selector_options_2"][\n+        "selected_column_selector_option2"\n+    ]\n     if column_option in [\n         "by_index_number",\n         "all_but_by_index_number",\n@@ -313,12 +332,9 @@\n         infile2 = pd.read_csv(infile2, sep="\\t", header=header, parse_dates=True)\n         loaded_df[df_key] = infile2\n \n-    y = read_columns(infile2,\n-                     c=c,\n-                     c_option=column_option,\n-                     sep=\'\\t\',\n-                     header=header,\n-                     parse_dates=True)\n+    y = read_columns(\n+        infile2, c=c, c_option=column_option, sep="\\t", header=header, parse_dates=True\n+    )\n     if len(y.shape) == 2 and y.shape[1] == 1:\n         y = y.ravel()\n     if input_type == "refseq_and_interval":\n@@ -328,10 +344,14 @@\n \n     # load groups\n     if groups:\n-        groups_selector = (params["experiment_schemes"]["test_split"]["split_algos"]).pop("groups_selector")\n+        groups_selector = (\n+            params["experiment_schemes"]["test_split"]["split_algos"]\n+        ).pop("groups_selector")\n \n         header = "infer" if groups_selector["header_g"] else None\n-        column_option = groups_selector["column_selector_options_g"]["selected_column_selector_option_g"]\n+        
column_option = groups_selector["column_selector_options_g"][\n+            "selected_column_selector_option_g"\n+        ]\n         if column_option in [\n             "by_index_number",\n             "all_but_by_index_number",\n@@ -346,12 +366,14 @@\n         if df_key in loaded_df:\n             groups = loaded_df[df_key]\n \n-        groups = read_columns(groups,\n-                              c=c,\n-                              c_option=column_option,\n-                              sep=\'\\t\',\n-                              header=header,\n-                              parse_dates=True)\n+        groups = read_columns(\n+            groups,\n+            c=c,\n+            c_option=column_option,\n+            sep="\\t",\n+            header=header,\n+            parse_dates=True,\n+        )\n         groups = groups.ravel()\n \n     # del loaded_df\n@@ -364,7 +386,7 @@\n         main_est.set_params(memory=memory)\n \n     # handle scorer, convert to scorer dict\n-    scoring = params[\'experiment_schemes\'][\'metrics\'][\'scoring\']\n+    scoring = params["experiment_schemes"]["metrics"]["scoring"]\n     if scoring is not None:\n         # get_scoring() expects secondary_scoring to be a comma separated string (not a list)\n         # Check if secondary_scoring is specified\n@@ -385,7 +407,9 @@\n         if y is not None:\n             test_split_options["labels"] = y\n         else:\n-            raise ValueError("Stratified shuffle split is not " "applicable on empty target values!")\n+            raise ValueError(\n+                "Stratified shuffle split is not " "applicable on empty target values!"\n+            )\n \n     (\n         X_train,\n@@ -408,7 +432,10 @@\n             if y_train is not None:\n                 val_split_options["labels"] = y_train\n             else:\n-                raise ValueError("Stratified shuffle split is not " "applicable on empty target values!")\n+                raise ValueError(\n+                    
"Stratified shuffle split is not "\n+                    "applicable on empty target values!"\n+                )\n \n         (\n             X_train,\n@@ -431,8 +458,12 @@\n     if hasattr(estimator, "evaluate"):\n         steps = estimator.prediction_steps\n         batch_size = estimator.batch_size\n-        generator = estimator.data_generator_.flow(X_test, y=y_test, batch_size=batch_size)\n-        predictions, y_true = _predict_generator(estimator.model_, generator, steps=steps)\n+        generator = estimator.data_generator_.flow(\n+            X_test, y=y_test, batch_size=batch_size\n+        )\n+        predictions, y_true = _predict_generator(\n+            estimator.model_, generator, steps=steps\n+        )\n         scores = _evaluate(y_true, predictions, scorer, is_multimetric=True)\n \n     else:\n'
b
diff -r 1d20e0dce176 -r 5773e98921fc label_encoder.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/label_encoder.py Sat May 01 01:20:14 2021 +0000
[
@@ -0,0 +1,48 @@
+import argparse
+import json
+import warnings
+
+import numpy as np
+import pandas as pd
+from sklearn.preprocessing import LabelEncoder
+
+
+def main(inputs, infile, outfile):
+    """
+    Parameters
+    ----------
+    inputs : str
+        File path to the Galaxy tool parameters (JSON)
+
+    infile : str
+        File path to the tab-separated input vector
+
+    outfile : str
+        File path to the output vector
+
+    """
+    warnings.simplefilter('ignore')
+
+    with open(inputs, 'r') as param_handler:
+        params = json.load(param_handler)
+
+    input_header = params['header0']
+    header = 'infer' if input_header else None
+
+    input_vector = pd.read_csv(infile, sep='\t', header=header)
+
+    le = LabelEncoder()
+
+    output_vector = le.fit_transform(input_vector)
+
+    np.savetxt(outfile, output_vector, fmt="%d", delimiter='\t')
+
+
+if __name__ == '__main__':
+    aparser = argparse.ArgumentParser()
+    aparser.add_argument("-i", "--inputs", dest="inputs", required=True)
+    aparser.add_argument("-y", "--infile", dest="infile")
+    aparser.add_argument("-o", "--outfile", dest="outfile")
+    args = aparser.parse_args()
+
+    main(args.inputs, args.infile, args.outfile)
b
diff -r 1d20e0dce176 -r 5773e98921fc ml_visualization_ex.py
--- a/ml_visualization_ex.py Tue Apr 13 22:00:10 2021 +0000
+++ b/ml_visualization_ex.py Sat May 01 01:20:14 2021 +0000
[
@@ -13,10 +13,10 @@
 from keras.models import model_from_json
 from keras.utils import plot_model
 from sklearn.feature_selection.base import SelectorMixin
-from sklearn.metrics import auc, average_precision_score, confusion_matrix, precision_recall_curve, roc_curve
+from sklearn.metrics import (auc, average_precision_score, confusion_matrix,
+                             precision_recall_curve, roc_curve)
 from sklearn.pipeline import Pipeline
 
-
 safe_eval = SafeEval()
 
 # plotly default colors
@@ -51,7 +51,9 @@
         y_true = df1.iloc[:, idx].values
         y_score = df2.iloc[:, idx].values
 
-        precision, recall, _ = precision_recall_curve(y_true, y_score, pos_label=pos_label)
+        precision, recall, _ = precision_recall_curve(
+            y_true, y_score, pos_label=pos_label
+        )
         ap = average_precision_score(y_true, y_score, pos_label=pos_label or 1)
 
         trace = go.Scatter(
@@ -111,7 +113,9 @@
         y_true = df1.iloc[:, idx].values
         y_score = df2.iloc[:, idx].values
 
-        precision, recall, _ = precision_recall_curve(y_true, y_score, pos_label=pos_label)
+        precision, recall, _ = precision_recall_curve(
+            y_true, y_score, pos_label=pos_label
+        )
         ap = average_precision_score(y_true, y_score, pos_label=pos_label or 1)
 
         plt.step(
@@ -155,7 +159,9 @@
         y_true = df1.iloc[:, idx].values
         y_score = df2.iloc[:, idx].values
 
-        fpr, tpr, _ = roc_curve(y_true, y_score, pos_label=pos_label, drop_intermediate=drop_intermediate)
+        fpr, tpr, _ = roc_curve(
+            y_true, y_score, pos_label=pos_label, drop_intermediate=drop_intermediate
+        )
         roc_auc = auc(fpr, tpr)
 
         trace = go.Scatter(
@@ -168,7 +174,9 @@
         data.append(trace)
 
     layout = go.Layout(
-        xaxis=dict(title="False Positive Rate", linecolor="lightslategray", linewidth=1),
+        xaxis=dict(
+            title="False Positive Rate", linecolor="lightslategray", linewidth=1
+        ),
         yaxis=dict(title="True Positive Rate", linecolor="lightslategray", linewidth=1),
         title=dict(
             text=title or "Receiver Operating Characteristic (ROC) Curve",
@@ -204,7 +212,9 @@
     os.rename("output.html", "output")
 
 
-def visualize_roc_curve_matplotlib(df1, df2, pos_label, drop_intermediate=True, title=None):
+def visualize_roc_curve_matplotlib(
+    df1, df2, pos_label, drop_intermediate=True, title=None
+):
     """visualize roc-curve using matplotlib and output svg image"""
     backend = matplotlib.get_backend()
     if "inline" not in backend:
@@ -216,7 +226,9 @@
         y_true = df1.iloc[:, idx].values
         y_score = df2.iloc[:, idx].values
 
-        fpr, tpr, _ = roc_curve(y_true, y_score, pos_label=pos_label, drop_intermediate=drop_intermediate)
+        fpr, tpr, _ = roc_curve(
+            y_true, y_score, pos_label=pos_label, drop_intermediate=drop_intermediate
+        )
         roc_auc = auc(fpr, tpr)
 
         plt.step(
@@ -253,11 +265,15 @@
         col = plot_selection[column_name]["col1"]
     else:
         col = None
-    _, input_df = read_columns(file_path, c=col,
-                               c_option=column_option,
-                               return_df=True,
-                               sep='\t', header=header,
-                               parse_dates=True)
+    _, input_df = read_columns(
+        file_path,
+        c=col,
+        c_option=column_option,
+        return_df=True,
+        sep="\t",
+        header=header,
+        parse_dates=True,
+    )
     return input_df
 
 
@@ -344,7 +360,9 @@
         with open(infile_estimator, "rb") as estimator_handler:
             estimator = load_model(estimator_handler)
 
-        column_option = params["plotting_selection"]["column_selector_options"]["selected_column_selector_option"]
+        column_option = params["plotting_selection"]["column_selector_options"][
+            "selected_column_selector_option"
+        ]
         if column_option in [
             "by_index_number",
             "all_but_by_index_number",
@@ -379,7 +397,11 @@
         else:
             coefs = getattr(estimator, "feature_importances_", None)
         if coefs is None:
-            raise RuntimeError("The classifier does not expose " '"coef_" or "feature_importances_" ' "attributes")
+            raise RuntimeError(
+                "The classifier does not expose "
+                '"coef_" or "feature_importances_" '
+                "attributes"
+            )
 
         threshold = params["plotting_selection"]["threshold"]
         if threshold is not None:
@@ -454,7 +476,9 @@
         layout = go.Layout(
             xaxis=dict(title="Number of features selected"),
             yaxis=dict(title="Cross validation score"),
-            title=dict(text=title or None, x=0.5, y=0.92, xanchor="center", yanchor="top"),
+            title=dict(
+                text=title or None, x=0.5, y=0.92, xanchor="center", yanchor="top"
+            ),
             font=dict(family="sans-serif", size=11),
             # control backgroud colors
             plot_bgcolor="rgba(255,255,255,0)",
@@ -548,9 +572,13 @@
 
     elif plot_type == "classification_confusion_matrix":
         plot_selection = params["plotting_selection"]
-        input_true = get_dataframe(true_labels, plot_selection, "header_true", "column_selector_options_true")
+        input_true = get_dataframe(
+            true_labels, plot_selection, "header_true", "column_selector_options_true"
+        )
         header_predicted = "infer" if plot_selection["header_predicted"] else None
-        input_predicted = pd.read_csv(predicted_labels, sep="\t", parse_dates=True, header=header_predicted)
+        input_predicted = pd.read_csv(
+            predicted_labels, sep="\t", parse_dates=True, header=header_predicted
+        )
         true_classes = input_true.iloc[:, -1].copy()
         predicted_classes = input_predicted.iloc[:, -1].copy()
         axis_labels = list(set(true_classes))
b
diff -r 1d20e0dce176 -r 5773e98921fc model_prediction.py
--- a/model_prediction.py Tue Apr 13 22:00:10 2021 +0000
+++ b/model_prediction.py Sat May 01 01:20:14 2021 +0000
[
@@ -63,7 +63,8 @@
     if hasattr(main_est, "config") and hasattr(main_est, "load_weights"):
         if not infile_weights or infile_weights == "None":
             raise ValueError(
-                "The selected model skeleton asks for weights, " "but dataset for weights wan not selected!"
+                "The selected model skeleton asks for weights, "
+                "but dataset for weights wan not selected!"
             )
         main_est.load_weights(infile_weights)
 
@@ -72,7 +73,9 @@
     # tabular input
     if input_type == "tabular":
         header = "infer" if params["input_options"]["header1"] else None
-        column_option = params["input_options"]["column_selector_options_1"]["selected_column_selector_option"]
+        column_option = params["input_options"]["column_selector_options_1"][
+            "selected_column_selector_option"
+        ]
         if column_option in [
             "by_index_number",
             "all_but_by_index_number",
@@ -122,9 +125,13 @@
         pred_data_generator = klass(fasta_path, seq_length=seq_length)
 
         if params["method"] == "predict":
-            preds = estimator.predict(X, data_generator=pred_data_generator, steps=steps)
+            preds = estimator.predict(
+                X, data_generator=pred_data_generator, steps=steps
+            )
         else:
-            preds = estimator.predict_proba(X, data_generator=pred_data_generator, steps=steps)
+            preds = estimator.predict_proba(
+                X, data_generator=pred_data_generator, steps=steps
+            )
 
     # vcf input
     elif input_type == "variant_effect":
@@ -135,7 +142,9 @@
         if options["blacklist_regions"] == "none":
             options["blacklist_regions"] = None
 
-        pred_data_generator = klass(ref_genome_path=ref_seq, vcf_path=vcf_path, **options)
+        pred_data_generator = klass(
+            ref_genome_path=ref_seq, vcf_path=vcf_path, **options
+        )
 
         pred_data_generator.set_processing_attrs()
 
b
diff -r 1d20e0dce176 -r 5773e98921fc search_model_validation.py
--- a/search_model_validation.py Tue Apr 13 22:00:10 2021 +0000
+++ b/search_model_validation.py Sat May 01 01:20:14 2021 +0000
[
b'@@ -11,31 +11,16 @@\n import numpy as np\n import pandas as pd\n import skrebate\n-from galaxy_ml.utils import (\n-    clean_params,\n-    get_cv,\n-    get_main_estimator,\n-    get_module,\n-    get_scoring,\n-    load_model,\n-    read_columns,\n-    SafeEval,\n-    try_get_attr\n-)\n+from galaxy_ml.utils import (clean_params, get_cv,\n+                             get_main_estimator, get_module, get_scoring,\n+                             load_model, read_columns, SafeEval, try_get_attr)\n from scipy.io import mmread\n-from sklearn import (\n-    cluster,\n-    decomposition,\n-    feature_selection,\n-    kernel_approximation,\n-    model_selection,\n-    preprocessing,\n-)\n+from sklearn import (cluster, decomposition, feature_selection,\n+                     kernel_approximation, model_selection, preprocessing)\n from sklearn.exceptions import FitFailedWarning\n from sklearn.model_selection import _search, _validation\n from sklearn.model_selection._validation import _score, cross_validate\n \n-\n _fit_and_score = try_get_attr("galaxy_ml.model_validations", "_fit_and_score")\n setattr(_search, "_fit_and_score", _fit_and_score)\n setattr(_validation, "_fit_and_score", _fit_and_score)\n@@ -57,7 +42,10 @@\n \n         param_name = p["sp_name"]\n         if param_name.lower().endswith(NON_SEARCHABLE):\n-            print("Warning: `%s` is not eligible for search and was " "omitted!" % param_name)\n+            print(\n+                "Warning: `%s` is not eligible for search and was "\n+                "omitted!" 
% param_name\n+            )\n             continue\n \n         if not search_list.startswith(":"):\n@@ -90,7 +78,9 @@\n                 decomposition.IncrementalPCA(),\n                 decomposition.KernelPCA(random_state=0, n_jobs=N_JOBS),\n                 decomposition.LatentDirichletAllocation(random_state=0, n_jobs=N_JOBS),\n-                decomposition.MiniBatchDictionaryLearning(random_state=0, n_jobs=N_JOBS),\n+                decomposition.MiniBatchDictionaryLearning(\n+                    random_state=0, n_jobs=N_JOBS\n+                ),\n                 decomposition.MiniBatchSparsePCA(random_state=0, n_jobs=N_JOBS),\n                 decomposition.NMF(random_state=0),\n                 decomposition.PCA(random_state=0),\n@@ -107,14 +97,26 @@\n                 skrebate.MultiSURF(n_jobs=N_JOBS),\n                 skrebate.MultiSURFstar(n_jobs=N_JOBS),\n                 imblearn.under_sampling.ClusterCentroids(random_state=0, n_jobs=N_JOBS),\n-                imblearn.under_sampling.CondensedNearestNeighbour(random_state=0, n_jobs=N_JOBS),\n-                imblearn.under_sampling.EditedNearestNeighbours(random_state=0, n_jobs=N_JOBS),\n-                imblearn.under_sampling.RepeatedEditedNearestNeighbours(random_state=0, n_jobs=N_JOBS),\n+                imblearn.under_sampling.CondensedNearestNeighbour(\n+                    random_state=0, n_jobs=N_JOBS\n+                ),\n+                imblearn.under_sampling.EditedNearestNeighbours(\n+                    random_state=0, n_jobs=N_JOBS\n+                ),\n+                imblearn.under_sampling.RepeatedEditedNearestNeighbours(\n+                    random_state=0, n_jobs=N_JOBS\n+                ),\n                 imblearn.under_sampling.AllKNN(random_state=0, n_jobs=N_JOBS),\n-                imblearn.under_sampling.InstanceHardnessThreshold(random_state=0, n_jobs=N_JOBS),\n+                imblearn.under_sampling.InstanceHardnessThreshold(\n+                    random_state=0, 
n_jobs=N_JOBS\n+                ),\n                 imblearn.under_sampling.NearMiss(random_state=0, n_jobs=N_JOBS),\n-                imblearn.under_sampling.NeighbourhoodCleaningRule(random_state=0, n_jobs=N_JOBS),\n-                imblearn.under_sampling.OneSidedSelection(random_state=0, n_jobs=N_JOBS),\n+                imblearn.under_sampling.NeighbourhoodCleaningRule(\n+                    random_state=0, n_jobs=N_JOBS\n+                ),\n+                imblearn.under_sampling.OneSidedSelection(\n+                    random_state=0'..b'        split_options["shuffle"] = None\n@@ -411,9 +425,13 @@\n \n     # TODO Solve deep learning models in pipeline\n     if best_estimator_.__class__.__name__ == "KerasGBatchClassifier":\n-        test_score = best_estimator_.evaluate(X_test, scorer=scorer_, is_multimetric=is_multimetric)\n+        test_score = best_estimator_.evaluate(\n+            X_test, scorer=scorer_, is_multimetric=is_multimetric\n+        )\n     else:\n-        test_score = _score(best_estimator_, X_test, y_test, scorer_, is_multimetric=is_multimetric)\n+        test_score = _score(\n+            best_estimator_, X_test, y_test, scorer_, is_multimetric=is_multimetric\n+        )\n \n     if not is_multimetric:\n         test_score = {primary_scoring: test_score}\n@@ -487,7 +505,9 @@\n         params = json.load(param_handler)\n \n     # Override the refit parameter\n-    params["search_schemes"]["options"]["refit"] = True if params["save"] != "nope" else False\n+    params["search_schemes"]["options"]["refit"] = (\n+        True if params["save"] != "nope" else False\n+    )\n \n     with open(infile_estimator, "rb") as estimator_handler:\n         estimator = load_model(estimator_handler)\n@@ -499,17 +519,21 @@\n     options = params["search_schemes"]["options"]\n \n     if groups:\n-        header = "infer" if (options["cv_selector"]["groups_selector"]["header_g"]) else None\n-        column_option = 
options["cv_selector"]["groups_selector"]["column_selector_options_g"][\n-            "selected_column_selector_option_g"\n-        ]\n+        header = (\n+            "infer" if (options["cv_selector"]["groups_selector"]["header_g"]) else None\n+        )\n+        column_option = options["cv_selector"]["groups_selector"][\n+            "column_selector_options_g"\n+        ]["selected_column_selector_option_g"]\n         if column_option in [\n             "by_index_number",\n             "all_but_by_index_number",\n             "by_header_name",\n             "all_but_by_header_name",\n         ]:\n-            c = options["cv_selector"]["groups_selector"]["column_selector_options_g"]["col_g"]\n+            c = options["cv_selector"]["groups_selector"]["column_selector_options_g"][\n+                "col_g"\n+            ]\n         else:\n             c = None\n \n@@ -537,12 +561,14 @@\n     secondary_scoring = options["scoring"].get("secondary_scoring", None)\n     if secondary_scoring is not None:\n         # If secondary_scoring is specified, convert the list into comman separated string\n-        options["scoring"]["secondary_scoring"] = ",".join(options["scoring"]["secondary_scoring"])\n+        options["scoring"]["secondary_scoring"] = ",".join(\n+            options["scoring"]["secondary_scoring"]\n+        )\n     options["scoring"] = get_scoring(options["scoring"])\n     if options["error_score"]:\n         options["error_score"] = "raise"\n     else:\n-        options["error_score"] = np.NaN\n+        options["error_score"] = np.nan\n     if options["refit"] and isinstance(options["scoring"], dict):\n         options["refit"] = primary_scoring\n     if "pre_dispatch" in options and options["pre_dispatch"] == "":\n@@ -588,7 +614,9 @@\n         # make sure refit is choosen\n         # this could be True for sklearn models, but not the case for\n         # deep learning models\n-        if not options["refit"] and not all(hasattr(estimator, attr) for 
attr in ("config", "model_type")):\n+        if not options["refit"] and not all(\n+            hasattr(estimator, attr) for attr in ("config", "model_type")\n+        ):\n             warnings.warn("Refit is change to `True` for nested validation!")\n             setattr(searcher, "refit", True)\n \n@@ -687,7 +715,9 @@\n \n         cv_results = pd.DataFrame(searcher.cv_results_)\n         cv_results = cv_results[sorted(cv_results.columns)]\n-        cv_results.to_csv(path_or_buf=outfile_result, sep="\\t", header=True, index=False)\n+        cv_results.to_csv(\n+            path_or_buf=outfile_result, sep="\\t", header=True, index=False\n+        )\n \n     memory.clear(warn=False)\n \n'
b
diff -r 1d20e0dce176 -r 5773e98921fc simple_model_fit.py
--- a/simple_model_fit.py Tue Apr 13 22:00:10 2021 +0000
+++ b/simple_model_fit.py Sat May 01 01:20:14 2021 +0000
[
@@ -7,7 +7,6 @@
 from scipy.io import mmread
 from sklearn.pipeline import Pipeline
 
-
 N_JOBS = int(__import__("os").environ.get("GALAXY_SLOTS", 1))
 
 
@@ -36,7 +35,7 @@
         if name == "memory" or name.endswith("__memory") or name.endswith("_path"):
             new_p = {name: None}
             estimator.set_params(**new_p)
-        elif n_jobs is not None and (name == 'n_jobs' or name.endswith('__n_jobs')):
+        elif n_jobs is not None and (name == "n_jobs" or name.endswith("__n_jobs")):
             new_p = {name: n_jobs}
             estimator.set_params(**new_p)
         elif name.endswith("callbacks"):
@@ -68,7 +67,9 @@
     # tabular input
     if input_type == "tabular":
         header = "infer" if params["input_options"]["header1"] else None
-        column_option = params["input_options"]["column_selector_options_1"]["selected_column_selector_option"]
+        column_option = params["input_options"]["column_selector_options_1"][
+            "selected_column_selector_option"
+        ]
         if column_option in [
             "by_index_number",
             "all_but_by_index_number",
@@ -90,7 +91,9 @@
 
     # Get target y
     header = "infer" if params["input_options"]["header2"] else None
-    column_option = params["input_options"]["column_selector_options_2"]["selected_column_selector_option2"]
+    column_option = params["input_options"]["column_selector_options_2"][
+        "selected_column_selector_option2"
+    ]
     if column_option in [
         "by_index_number",
         "all_but_by_index_number",
@@ -108,12 +111,9 @@
         infile2 = pd.read_csv(infile2, sep="\t", header=header, parse_dates=True)
         loaded_df[df_key] = infile2
 
-    y = read_columns(infile2,
-                     c=c,
-                     c_option=column_option,
-                     sep='\t',
-                     header=header,
-                     parse_dates=True)
+    y = read_columns(
+        infile2, c=c, c_option=column_option, sep="\t", header=header, parse_dates=True
+    )
     if len(y.shape) == 2 and y.shape[1] == 1:
         y = y.ravel()
 
b
diff -r 1d20e0dce176 -r 5773e98921fc stacking_ensembles.py
--- a/stacking_ensembles.py Tue Apr 13 22:00:10 2021 +0000
+++ b/stacking_ensembles.py Sat May 01 01:20:14 2021 +0000
[
@@ -8,8 +8,8 @@
 import mlxtend.classifier
 import mlxtend.regressor
 import pandas as pd
-from galaxy_ml.utils import get_cv, get_estimator, get_search_params, load_model
-
+from galaxy_ml.utils import (get_cv, get_estimator, get_search_params,
+                             load_model)
 
 warnings.filterwarnings("ignore")
 
@@ -62,7 +62,9 @@
             with open(meta_path, "rb") as f:
                 meta_estimator = load_model(f)
         else:
-            estimator_json = params["algo_selection"]["meta_estimator"]["estimator_selector"]
+            estimator_json = params["algo_selection"]["meta_estimator"][
+                "estimator_selector"
+            ]
             meta_estimator = get_estimator(estimator_json)
 
     options = params["algo_selection"]["options"]
@@ -89,10 +91,14 @@
         ensemble_estimator = klass(base_estimators, **options)
 
     elif mod == mlxtend.classifier:
-        ensemble_estimator = klass(classifiers=base_estimators, meta_classifier=meta_estimator, **options)
+        ensemble_estimator = klass(
+            classifiers=base_estimators, meta_classifier=meta_estimator, **options
+        )
 
     else:
-        ensemble_estimator = klass(regressors=base_estimators, meta_regressor=meta_estimator, **options)
+        ensemble_estimator = klass(
+            regressors=base_estimators, meta_regressor=meta_estimator, **options
+        )
 
     print(ensemble_estimator)
     for base_est in base_estimators:
b
diff -r 1d20e0dce176 -r 5773e98921fc test-data/le_input_w_header.tabular
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/le_input_w_header.tabular Sat May 01 01:20:14 2021 +0000
b
@@ -0,0 +1,5 @@
+Class
+Liverpool
+Real Madrid
+Bayern Munich
+A.C. Milan
b
diff -r 1d20e0dce176 -r 5773e98921fc test-data/le_input_wo_header.tabular
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/le_input_wo_header.tabular Sat May 01 01:20:14 2021 +0000
b
@@ -0,0 +1,4 @@
+Liverpool
+Real Madrid
+Bayern Munich
+A.C. Milan
b
diff -r 1d20e0dce176 -r 5773e98921fc test-data/le_output.tabular
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/le_output.tabular Sat May 01 01:20:14 2021 +0000
b
@@ -0,0 +1,4 @@
+2
+3
+1
+0
b
diff -r 1d20e0dce176 -r 5773e98921fc test-data/mba_input_int_w.tabular
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/mba_input_int_w.tabular Sat May 01 01:20:14 2021 +0000
b
@@ -0,0 +1,6 @@
+Transactions
+10 11 12 13 14 15
+16 11 12 13 14 15
+10 17 13 14
+10 18 19 13 15
+19 11 11 13 20 14
b
diff -r 1d20e0dce176 -r 5773e98921fc test-data/mba_input_int_wo.tabular
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/mba_input_int_wo.tabular Sat May 01 01:20:14 2021 +0000
b
@@ -0,0 +1,5 @@
+10 11 12 13 14 15
+16 11 12 13 14 15
+10 17 13 14
+10 18 19 13 15
+19 11 11 13 20 14
b
diff -r 1d20e0dce176 -r 5773e98921fc test-data/mba_input_str_w.tabular
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/mba_input_str_w.tabular Sat May 01 01:20:14 2021 +0000
b
@@ -0,0 +1,6 @@
+Transactions
+Milk Onion Nutmeg Kidney Beans Eggs Yogurt
+Dill Onion Nutmeg Kidney Beans Eggs Yogurt
+Milk Apple Kidney Beans Eggs
+Milk Unicorn Corn Kidney Beans Yogurt
+Corn Onion Onion Kidney Beans Ice cream Eggs
b
diff -r 1d20e0dce176 -r 5773e98921fc test-data/mba_input_str_wo.tabular
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/mba_input_str_wo.tabular Sat May 01 01:20:14 2021 +0000
b
@@ -0,0 +1,5 @@
+Milk Onion Nutmeg Kidney Beans Eggs Yogurt
+Dill Onion Nutmeg Kidney Beans Eggs Yogurt
+Milk Apple Kidney Beans Eggs
+Milk Unicorn Corn Kidney Beans Yogurt
+Corn Onion Onion Kidney Beans Ice cream Eggs
b
diff -r 1d20e0dce176 -r 5773e98921fc test-data/mba_out_str.tabular
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/mba_out_str.tabular Sat May 01 01:20:14 2021 +0000
[
@@ -0,0 +1,7 @@
+antecedents consequents antecedent support consequent support support confidence lift leverage conviction
+['Eggs'] ['Kidney Beans', 'Onion'] 0.8 0.6 0.6 0.7499999999999999 1.2499999999999998 0.12 1.5999999999999994
+['Eggs'] ['Onion'] 0.8 0.6 0.6 0.7499999999999999 1.2499999999999998 0.12 1.5999999999999994
+['Eggs', 'Kidney Beans'] ['Onion'] 0.8 0.6 0.6 0.7499999999999999 1.2499999999999998 0.12 1.5999999999999994
+['Kidney Beans', 'Onion'] ['Eggs'] 0.6 0.8 0.6 1.0 1.25 0.12 inf
+['Onion'] ['Eggs'] 0.6 0.8 0.6 1.0 1.25 0.12 inf
+['Onion'] ['Eggs', 'Kidney Beans'] 0.6 0.8 0.6 1.0 1.25 0.12 inf
b
diff -r 1d20e0dce176 -r 5773e98921fc test-data/mba_output_int.tabular
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/mba_output_int.tabular Sat May 01 01:20:14 2021 +0000
[
@@ -0,0 +1,7 @@
+antecedents consequents antecedent support consequent support support confidence lift leverage conviction
+['11'] ['13', '14'] 0.6 0.8 0.6 1.0 1.25 0.12 inf
+['11'] ['14'] 0.6 0.8 0.6 1.0 1.25 0.12 inf
+['11', '13'] ['14'] 0.6 0.8 0.6 1.0 1.25 0.12 inf
+['13', '14'] ['11'] 0.8 0.6 0.6 0.7499999999999999 1.2499999999999998 0.12 1.5999999999999994
+['14'] ['11'] 0.8 0.6 0.6 0.7499999999999999 1.2499999999999998 0.12 1.5999999999999994
+['14'] ['11', '13'] 0.8 0.6 0.6 0.7499999999999999 1.2499999999999998 0.12 1.5999999999999994
b
diff -r 1d20e0dce176 -r 5773e98921fc test-data/mba_output_str.tabular
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/mba_output_str.tabular Sat May 01 01:20:14 2021 +0000
[
@@ -0,0 +1,7 @@
+antecedents consequents antecedent support consequent support support confidence lift leverage conviction
+['Eggs'] ['Kidney Beans', 'Onion'] 0.8 0.6 0.6 0.7499999999999999 1.2499999999999998 0.12 1.5999999999999994
+['Eggs'] ['Onion'] 0.8 0.6 0.6 0.7499999999999999 1.2499999999999998 0.12 1.5999999999999994
+['Eggs', 'Kidney Beans'] ['Onion'] 0.8 0.6 0.6 0.7499999999999999 1.2499999999999998 0.12 1.5999999999999994
+['Kidney Beans', 'Onion'] ['Eggs'] 0.6 0.8 0.6 1.0 1.25 0.12 inf
+['Onion'] ['Eggs'] 0.6 0.8 0.6 1.0 1.25 0.12 inf
+['Onion'] ['Eggs', 'Kidney Beans'] 0.6 0.8 0.6 1.0 1.25 0.12 inf
b
diff -r 1d20e0dce176 -r 5773e98921fc to_categorical.py
--- a/to_categorical.py Tue Apr 13 22:00:10 2021 +0000
+++ b/to_categorical.py Sat May 01 01:20:14 2021 +0000
b
@@ -43,7 +43,9 @@
     aparser = argparse.ArgumentParser()
     aparser.add_argument("-i", "--inputs", dest="inputs", required=True)
     aparser.add_argument("-y", "--infile", dest="infile")
-    aparser.add_argument("-n", "--num_classes", dest="num_classes", type=int, default=None)
+    aparser.add_argument(
+        "-n", "--num_classes", dest="num_classes", type=int, default=None
+    )
     aparser.add_argument("-o", "--outfile", dest="outfile")
     args = aparser.parse_args()
 
b
diff -r 1d20e0dce176 -r 5773e98921fc train_test_eval.py
--- a/train_test_eval.py Tue Apr 13 22:00:10 2021 +0000
+++ b/train_test_eval.py Sat May 01 01:20:14 2021 +0000
[
@@ -9,14 +9,8 @@
 import numpy as np
 import pandas as pd
 from galaxy_ml.model_validations import train_test_split
-from galaxy_ml.utils import (
-    get_module,
-    get_scoring,
-    load_model,
-    read_columns,
-    SafeEval,
-    try_get_attr,
-)
+from galaxy_ml.utils import (get_module, get_scoring, load_model,
+                             read_columns, SafeEval, try_get_attr)
 from scipy.io import mmread
 from sklearn import pipeline
 from sklearn.metrics.scorer import _check_multimetric_scoring
@@ -24,7 +18,6 @@
 from sklearn.model_selection._validation import _score
 from sklearn.utils import indexable, safe_indexing
 
-
 _fit_and_score = try_get_attr("galaxy_ml.model_validations", "_fit_and_score")
 setattr(_search, "_fit_and_score", _fit_and_score)
 setattr(_validation, "_fit_and_score", _fit_and_score)
@@ -262,12 +255,9 @@
         infile2 = pd.read_csv(infile2, sep="\t", header=header, parse_dates=True)
         loaded_df[df_key] = infile2
 
-    y = read_columns(infile2,
-                     c=c,
-                     c_option=column_option,
-                     sep='\t',
-                     header=header,
-                     parse_dates=True)
+    y = read_columns(
+        infile2, c=c, c_option=column_option, sep="\t", header=header, parse_dates=True
+    )
     if len(y.shape) == 2 and y.shape[1] == 1:
         y = y.ravel()
     if input_type == "refseq_and_interval":
@@ -299,12 +289,14 @@
         if df_key in loaded_df:
             groups = loaded_df[df_key]
 
-        groups = read_columns(groups,
-                              c=c,
-                              c_option=column_option,
-                              sep='\t',
-                              header=header,
-                              parse_dates=True)
+        groups = read_columns(
+            groups,
+            c=c,
+            c_option=column_option,
+            sep="\t",
+            header=header,
+            parse_dates=True,
+        )
         groups = groups.ravel()
 
     # del loaded_df
@@ -371,9 +363,14 @@
                 "Stratified shuffle split is not " "applicable on empty target values!"
             )
 
-    X_train, X_test, y_train, y_test, groups_train, _groups_test = train_test_split_none(
-        X, y, groups, **test_split_options
-    )
+    (
+        X_train,
+        X_test,
+        y_train,
+        y_test,
+        groups_train,
+        _groups_test,
+    ) = train_test_split_none(X, y, groups, **test_split_options)
 
     exp_scheme = params["experiment_schemes"]["selected_exp_scheme"]
 
b
diff -r 1d20e0dce176 -r 5773e98921fc train_test_split.py
--- a/train_test_split.py Tue Apr 13 22:00:10 2021 +0000
+++ b/train_test_split.py Sat May 01 01:20:14 2021 +0000
[
@@ -28,17 +28,23 @@
 
     # read groups
     if infile_groups:
-        header = "infer" if (params["mode_selection"]["cv_selector"]["groups_selector"]["header_g"]) else None
-        column_option = params["mode_selection"]["cv_selector"]["groups_selector"]["column_selector_options_g"][
-            "selected_column_selector_option_g"
-        ]
+        header = (
+            "infer"
+            if (params["mode_selection"]["cv_selector"]["groups_selector"]["header_g"])
+            else None
+        )
+        column_option = params["mode_selection"]["cv_selector"]["groups_selector"][
+            "column_selector_options_g"
+        ]["selected_column_selector_option_g"]
         if column_option in [
             "by_index_number",
             "all_but_by_index_number",
             "by_header_name",
             "all_but_by_header_name",
         ]:
-            c = params["mode_selection"]["cv_selector"]["groups_selector"]["column_selector_options_g"]["col_g"]
+            c = params["mode_selection"]["cv_selector"]["groups_selector"][
+                "column_selector_options_g"
+            ]["col_g"]
         else:
             c = None
 
@@ -67,7 +73,10 @@
 
     total_n_splits = splitter.get_n_splits(array.values, y=y, groups=groups)
     if nth_split > total_n_splits:
-        raise ValueError("Total number of splits is {}, but got `nth_split` " "= {}".format(total_n_splits, nth_split))
+        raise ValueError(
+            "Total number of splits is {}, but got `nth_split` "
+            "= {}".format(total_n_splits, nth_split)
+        )
 
     i = 1
     for train_index, test_index in splitter.split(array.values, y=y, groups=groups):
@@ -137,7 +146,9 @@
 
     # cv splitter
     else:
-        train, test = _get_single_cv_split(params, array, infile_labels=infile_labels, infile_groups=infile_groups)
+        train, test = _get_single_cv_split(
+            params, array, infile_labels=infile_labels, infile_groups=infile_groups
+        )
 
     print("Input shape: %s" % repr(array.shape))
     print("Train shape: %s" % repr(train.shape))