Repository 'sklearn_train_test_split'
hg clone https://toolshed.g2.bx.psu.edu/repos/bgruening/sklearn_train_test_split

Changeset 6:13b9ac5d277c (2021-04-13)
Previous changeset: 5:ce2fd1edbc6e (2020-10-02) | Next changeset: 7:3312fb686ffb (2021-05-01)
Commit message:
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit 208a8d348e7c7a182cfbe1b6f17868146428a7e2"
modified:
fitted_model_eval.py
keras_deep_learning.py
keras_train_and_eval.py
main_macros.xml
ml_visualization_ex.py
model_prediction.py
pca.py
search_model_validation.py
simple_model_fit.py
stacking_ensembles.py
test-data/keras_batch_params01.tabular
test-data/keras_batch_params04.tabular
test-data/keras_model01
test-data/keras_model02
test-data/keras_model04
test-data/keras_params04.tabular
test-data/pipeline_params05.tabular
test-data/pipeline_params18
test-data/train_test_eval_model01
test-data/train_test_eval_weights01.h5
test-data/train_test_eval_weights02.h5
train_test_eval.py
train_test_split.py
added:
test-data/ohe_in_w_header.tabular
test-data/ohe_in_wo_header.tabular
test-data/ohe_out_4.tabular
test-data/ohe_out_5.tabular
to_categorical.py
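
The new to_categorical.py script itself is not rendered in this view, so the following is only a hypothetical sketch of the label-to-one-hot conversion that the accompanying ohe_in_*/ohe_out_* fixtures appear to exercise; the input file name, column layout, and output name are assumptions, and keras.utils.to_categorical stands in for whatever the script actually calls.

    # Hypothetical sketch only -- not the tool's real CLI or logic.
    import pandas as pd
    from keras.utils import to_categorical

    labels = pd.read_csv("test-data/ohe_in_w_header.tabular", sep="\t")  # assumed: one integer label column
    one_hot = to_categorical(labels.iloc[:, 0].astype(int).values)       # integer labels -> one-hot matrix
    pd.DataFrame(one_hot.astype(int)).to_csv("ohe_out.tabular", sep="\t", index=False, header=False)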
diff -r ce2fd1edbc6e -r 13b9ac5d277c fitted_model_eval.py
--- a/fitted_model_eval.py Fri Oct 02 08:59:31 2020 +0000
+++ b/fitted_model_eval.py Tue Apr 13 22:24:07 2021 +0000
@@ -1,17 +1,17 @@
 import argparse
 import json
-import pandas as pd
 import warnings
 
+import pandas as pd
+from galaxy_ml.utils import get_scoring, load_model, read_columns
 from scipy.io import mmread
-from sklearn.pipeline import Pipeline
 from sklearn.metrics.scorer import _check_multimetric_scoring
 from sklearn.model_selection._validation import _score
-from galaxy_ml.utils import get_scoring, load_model, read_columns
+from sklearn.pipeline import Pipeline
 
 
 def _get_X_y(params, infile1, infile2):
-    """ read from inputs and output X and y
+    """read from inputs and output X and y
 
     Parameters
     ----------
@@ -26,35 +26,40 @@
     # store read dataframe object
     loaded_df = {}
 
-    input_type = params['input_options']['selected_input']
+    input_type = params["input_options"]["selected_input"]
     # tabular input
-    if input_type == 'tabular':
-        header = 'infer' if params['input_options']['header1'] else None
-        column_option = (params['input_options']['column_selector_options_1']
-                         ['selected_column_selector_option'])
-        if column_option in ['by_index_number', 'all_but_by_index_number',
-                             'by_header_name', 'all_but_by_header_name']:
-            c = params['input_options']['column_selector_options_1']['col1']
+    if input_type == "tabular":
+        header = "infer" if params["input_options"]["header1"] else None
+        column_option = params["input_options"]["column_selector_options_1"]["selected_column_selector_option"]
+        if column_option in [
+            "by_index_number",
+            "all_but_by_index_number",
+            "by_header_name",
+            "all_but_by_header_name",
+        ]:
+            c = params["input_options"]["column_selector_options_1"]["col1"]
         else:
             c = None
 
         df_key = infile1 + repr(header)
-        df = pd.read_csv(infile1, sep='\t', header=header,
-                         parse_dates=True)
+        df = pd.read_csv(infile1, sep="\t", header=header, parse_dates=True)
         loaded_df[df_key] = df
 
         X = read_columns(df, c=c, c_option=column_option).astype(float)
     # sparse input
-    elif input_type == 'sparse':
-        X = mmread(open(infile1, 'r'))
+    elif input_type == "sparse":
+        X = mmread(open(infile1, "r"))
 
     # Get target y
-    header = 'infer' if params['input_options']['header2'] else None
-    column_option = (params['input_options']['column_selector_options_2']
-                     ['selected_column_selector_option2'])
-    if column_option in ['by_index_number', 'all_but_by_index_number',
-                         'by_header_name', 'all_but_by_header_name']:
-        c = params['input_options']['column_selector_options_2']['col2']
+    header = "infer" if params["input_options"]["header2"] else None
+    column_option = params["input_options"]["column_selector_options_2"]["selected_column_selector_option2"]
+    if column_option in [
+        "by_index_number",
+        "all_but_by_index_number",
+        "by_header_name",
+        "all_but_by_header_name",
+    ]:
+        c = params["input_options"]["column_selector_options_2"]["col2"]
     else:
         c = None
 
@@ -62,26 +67,24 @@
     if df_key in loaded_df:
         infile2 = loaded_df[df_key]
     else:
-        infile2 = pd.read_csv(infile2, sep='\t',
-                              header=header, parse_dates=True)
+        infile2 = pd.read_csv(infile2, sep="\t", header=header, parse_dates=True)
         loaded_df[df_key] = infile2
 
-    y = read_columns(
-            infile2,
-            c=c,
-            c_option=column_option,
-            sep='\t',
-            header=header,
-            parse_dates=True)
+    y = read_columns(infile2, c=c, c_option=column_option, sep="\t", header=header, parse_dates=True)
     if len(y.shape) == 2 and y.shape[1] == 1:
         y = y.ravel()
 
     return X, y
 
 
-def main(inputs, infile_estimator, outfile_eval,
-         infile_weights=None, infile1=None,
-         infile2=None):
+def main(
+    inputs,
+    infile_estimator,
+    outfile_eval,
+    infile_weights=None,
+    infile1=None,
+    infile2=None,
+):
     """
     Parameter
     ---------
@@ -103,49 +106,55 @@
     infile2 : str
         File path to dataset containing target values
     """
-    warnings.filterwarnings('ignore')
+    warnings.filterwarnings("ignore")
 
-    with open(inputs, 'r') as param_handler:
+    with open(inputs, "r") as param_handler:
         params = json.load(param_handler)
 
     X_test, y_test = _get_X_y(params, infile1, infile2)
 
     # load model
-    with open(infile_estimator, 'rb') as est_handler:
+    with open(infile_estimator, "rb") as est_handler:
         estimator = load_model(est_handler)
 
     main_est = estimator
     if isinstance(estimator, Pipeline):
         main_est = estimator.steps[-1][-1]
-    if hasattr(main_est, 'config') and hasattr(main_est, 'load_weights'):
-        if not infile_weights or infile_weights == 'None':
-            raise ValueError("The selected model skeleton asks for weights, "
-                             "but no dataset for weights was provided!")
+    if hasattr(main_est, "config") and hasattr(main_est, "load_weights"):
+        if not infile_weights or infile_weights == "None":
+            raise ValueError(
+                "The selected model skeleton asks for weights, " "but no dataset for weights was provided!"
+            )
         main_est.load_weights(infile_weights)
 
     # handle scorer, convert to scorer dict
-    scoring = params['scoring']
+    # Check if scoring is specified
+    scoring = params["scoring"]
+    if scoring is not None:
+        # get_scoring() expects secondary_scoring to be a comma separated string (not a list)
+        # Check if secondary_scoring is specified
+        secondary_scoring = scoring.get("secondary_scoring", None)
+        if secondary_scoring is not None:
+            # If secondary_scoring is specified, convert the list into comman separated string
+            scoring["secondary_scoring"] = ",".join(scoring["secondary_scoring"])
+
     scorer = get_scoring(scoring)
     scorer, _ = _check_multimetric_scoring(estimator, scoring=scorer)
 
-    if hasattr(estimator, 'evaluate'):
-        scores = estimator.evaluate(X_test, y_test=y_test,
-                                    scorer=scorer,
-                                    is_multimetric=True)
+    if hasattr(estimator, "evaluate"):
+        scores = estimator.evaluate(X_test, y_test=y_test, scorer=scorer, is_multimetric=True)
     else:
-        scores = _score(estimator, X_test, y_test, scorer,
-                        is_multimetric=True)
+        scores = _score(estimator, X_test, y_test, scorer, is_multimetric=True)
 
     # handle output
     for name, score in scores.items():
         scores[name] = [score]
     df = pd.DataFrame(scores)
     df = df[sorted(df.columns)]
-    df.to_csv(path_or_buf=outfile_eval, sep='\t',
-              header=True, index=False)
+    df.to_csv(path_or_buf=outfile_eval, sep="\t", header=True, index=False)
 
 
-if __name__ == '__main__':
+if __name__ == "__main__":
     aparser = argparse.ArgumentParser()
     aparser.add_argument("-i", "--inputs", dest="inputs", required=True)
     aparser.add_argument("-e", "--infile_estimator", dest="infile_estimator")
@@ -155,6 +164,11 @@
     aparser.add_argument("-O", "--outfile_eval", dest="outfile_eval")
     args = aparser.parse_args()
 
-    main(args.inputs, args.infile_estimator, args.outfile_eval,
-         infile_weights=args.infile_weights, infile1=args.infile1,
-         infile2=args.infile2)
+    main(
+        args.inputs,
+        args.infile_estimator,
+        args.outfile_eval,
+        infile_weights=args.infile_weights,
+        infile1=args.infile1,
+        infile2=args.infile2,
+    )
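
The new scoring block above exists because galaxy_ml's get_scoring() expects secondary_scoring as a comma-separated string, while the Galaxy tool form now supplies a list. A minimal sketch of that normalization in isolation (the dict keys and values are illustrative, not taken from a real job):

    scoring = {"primary_scoring": "accuracy", "secondary_scoring": ["precision", "recall"]}  # illustrative
    if scoring.get("secondary_scoring") is not None:
        # join the list into the comma-separated form get_scoring() expects
        scoring["secondary_scoring"] = ",".join(scoring["secondary_scoring"])
    # scoring["secondary_scoring"] == "precision,recall"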
diff -r ce2fd1edbc6e -r 13b9ac5d277c keras_deep_learning.py
--- a/keras_deep_learning.py Fri Oct 02 08:59:31 2020 +0000
+++ b/keras_deep_learning.py Tue Apr 13 22:24:07 2021 +0000
@@ -1,14 +1,14 @@
 import argparse
 import json
+import pickle
+import warnings
+from ast import literal_eval
+
 import keras
 import pandas as pd
-import pickle
 import six
-import warnings
-
-from ast import literal_eval
-from keras.models import Sequential, Model
-from galaxy_ml.utils import try_get_attr, get_search_params, SafeEval
+from galaxy_ml.utils import get_search_params, SafeEval, try_get_attr
+from keras.models import Model, Sequential
 
 
 safe_eval = SafeEval()
@@ -177,11 +177,11 @@
         # merge layers
         if 'merging_layers' in options:
             idxs = literal_eval(options.pop('merging_layers'))
-            merging_layers = [all_layers[i-1] for i in idxs]
+            merging_layers = [all_layers[i - 1] for i in idxs]
             new_layer = klass(**options)(merging_layers)
         # non-input layers
         elif inbound_nodes is not None:
-            new_layer = klass(**options)(all_layers[inbound_nodes-1])
+            new_layer = klass(**options)(all_layers[inbound_nodes - 1])
         # input layers
         else:
             new_layer = klass(**options)
@@ -189,10 +189,10 @@
         all_layers.append(new_layer)
 
     input_indexes = _handle_shape(config['input_layers'])
-    input_layers = [all_layers[i-1] for i in input_indexes]
+    input_layers = [all_layers[i - 1] for i in input_indexes]
 
     output_indexes = _handle_shape(config['output_layers'])
-    output_layers = [all_layers[i-1] for i in output_indexes]
+    output_layers = [all_layers[i - 1] for i in output_indexes]
 
     return Model(inputs=input_layers, outputs=output_layers)
 
@@ -300,8 +300,7 @@
         options.update((inputs['mode_selection']['compile_params']
                         ['optimizer_selection']['optimizer_options']))
 
-        train_metrics = (inputs['mode_selection']['compile_params']
-                         ['metrics']).split(',')
+        train_metrics = inputs['mode_selection']['compile_params']['metrics']
         if train_metrics[-1] == 'none':
             train_metrics = train_metrics[:-1]
         options['metrics'] = train_metrics
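
The last hunk changes compile_params['metrics'] from a comma-separated string (previously split with .split(',')) to a list supplied directly by the tool form, while still dropping the trailing 'none' placeholder. A small sketch of the resulting behavior (the list value is illustrative):

    train_metrics = ["acc", "mse", "none"]  # illustrative value from the Galaxy form
    if train_metrics[-1] == "none":
        train_metrics = train_metrics[:-1]  # drop the 'none' placeholder
    # train_metrics == ["acc", "mse"]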
diff -r ce2fd1edbc6e -r 13b9ac5d277c keras_train_and_eval.py
--- a/keras_train_and_eval.py Fri Oct 02 08:59:31 2020 +0000
+++ b/keras_train_and_eval.py Tue Apr 13 22:24:07 2021 +0000
[Diff truncated in this view. The recoverable hunks show: imports regrouped and sorted, with the galaxy_ml helpers (clean_params, get_main_estimator, get_module, get_scoring, load_model, read_columns, SafeEval, try_get_attr) imported in one parenthesized block; the _fit_and_score patch of sklearn.model_selection._search/_validation and the N_JOBS, CACHE_DIR, NON_SEARCHABLE and ALLOWED_CALLBACKS constants reformatted; string quoting normalized to double quotes and long calls, signatures and DataFrame.to_csv/pickle.dump calls reflowed (black-style); del main_est.validation_data now guarded by a getattr check before the estimator is pickled; and the final main(...) invocation reflowed to one argument per line.]
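
One behavioral change recoverable from the truncated hunks above is that the attribute cleanup before pickling the fitted estimator now guards the deletion of validation_data instead of assuming it exists; roughly (variable names as in the script):

    if getattr(main_est, "validation_data", None):
        del main_est.validation_data
    if getattr(main_est, "data_generator_", None):
        del main_est.data_generator_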
diff -r ce2fd1edbc6e -r 13b9ac5d277c main_macros.xml
--- a/main_macros.xml Fri Oct 02 08:59:31 2020 +0000
+++ b/main_macros.xml Tue Apr 13 22:24:07 2021 +0000
[Diff truncated in this view. The recoverable hunks show: the @VERSION@ token bumped from 1.0.8.2 to 1.0.8.3; the python_requirements macro dropping the explicit python 3.6 requirement and raising Galaxy-ML from 0.8.1 to 0.8.3; indentation switched from two to four spaces throughout the macros; and the scikit-learn, scipy, skrebate, xgboost, imblearn and selene citation macros re-indented with their BibTeX entries reflowed.]
diff -r ce2fd1edbc6e -r 13b9ac5d277c ml_visualization_ex.py
--- a/ml_visualization_ex.py Fri Oct 02 08:59:31 2020 +0000
+++ b/ml_visualization_ex.py Tue Apr 13 22:24:07 2021 +0000
[Diff truncated in this view. The recoverable hunks show formatting-only changes: imports regrouped and sorted, string quoting normalized to double quotes, the plotly default color list and the precision-recall plotting layout reflowed, and the keras_plot_model and classification_confusion_matrix branches plus the final main(...) invocation reformatted.]
diff -r ce2fd1edbc6e -r 13b9ac5d277c model_prediction.py
--- a/model_prediction.py Fri Oct 02 08:59:31 2020 +0000
+++ b/model_prediction.py Tue Apr 13 22:24:07 2021 +0000
[Diff truncated in this view. The recoverable hunks show formatting-only changes: imports regrouped with N_JOBS defined directly after them, the main(...) signature and the batch-prediction writer for the variant-effect path reflowed, string quoting normalized to double quotes, and the final main(...) invocation reformatted to one argument per line.]
diff -r ce2fd1edbc6e -r 13b9ac5d277c pca.py
--- a/pca.py Fri Oct 02 08:59:31 2020 +0000
+++ b/pca.py Tue Apr 13 22:24:07 2021 +0000
[Diff truncated in this view. The recoverable hunks show: the argparse option definitions reflowed to one keyword argument per line, the unused cols = [] assignment removed, string quoting normalized to double quotes, and the classical/incremental/kernel PCA parameter dictionaries and the final np.savetxt(...) call reformatted.]
diff -r ce2fd1edbc6e -r 13b9ac5d277c search_model_validation.py
--- a/search_model_validation.py Fri Oct 02 08:59:31 2020 +0000
+++ b/search_model_validation.py Tue Apr 13 22:24:07 2021 +0000
[
@@ -1,55 +1,66 @@
 import argparse
 import collections
-import imblearn
-import joblib
 import json
-import numpy as np
 import os
-import pandas as pd
 import pickle
-import skrebate
 import sys
 import warnings
+
+import imblearn
+import joblib
+import numpy as np
+import pandas as pd
+import skrebate
+from galaxy_ml.utils import (
+    clean_params,
+    get_cv,
+    get_main_estimator,
+    get_module,
+    get_scoring,
+    load_model,
+    read_columns,
+    SafeEval,
+    try_get_attr
+)
 from scipy.io import mmread
-from sklearn import (cluster, decomposition, feature_selection,
-                     kernel_approximation, model_selection, preprocessing)
+from sklearn import (
+    cluster,
+    decomposition,
+    feature_selection,
+    kernel_approximation,
+    model_selection,
+    preprocessing,
+)
 from sklearn.exceptions import FitFailedWarning
-from sklearn.model_selection._validation import _score, cross_validate
 from sklearn.model_selection import _search, _validation
-from sklearn.pipeline import Pipeline
-
-from galaxy_ml.utils import (SafeEval, get_cv, get_scoring, load_model,
-                             read_columns, try_get_attr, get_module,
-                             clean_params, get_main_estimator)
+from sklearn.model_selection._validation import _score, cross_validate
 
 
-_fit_and_score = try_get_attr('galaxy_ml.model_validations', '_fit_and_score')
-setattr(_search, '_fit_and_score', _fit_and_score)
-setattr(_validation, '_fit_and_score', _fit_and_score)
+_fit_and_score = try_get_attr("galaxy_ml.model_validations", "_fit_and_score")
+setattr(_search, "_fit_and_score", _fit_and_score)
+setattr(_validation, "_fit_and_score", _fit_and_score)
 
-N_JOBS = int(os.environ.get('GALAXY_SLOTS', 1))
+N_JOBS = int(os.environ.get("GALAXY_SLOTS", 1))
 # handle  disk cache
-CACHE_DIR = os.path.join(os.getcwd(), 'cached')
+CACHE_DIR = os.path.join(os.getcwd(), "cached")
 del os
-NON_SEARCHABLE = ('n_jobs', 'pre_dispatch', 'memory', '_path',
-                  'nthread', 'callbacks')
+NON_SEARCHABLE = ("n_jobs", "pre_dispatch", "memory", "_path", "nthread", "callbacks")
 
 
 def _eval_search_params(params_builder):
     search_params = {}
 
-    for p in params_builder['param_set']:
-        search_list = p['sp_list'].strip()
-        if search_list == '':
+    for p in params_builder["param_set"]:
+        search_list = p["sp_list"].strip()
+        if search_list == "":
             continue
 
-        param_name = p['sp_name']
+        param_name = p["sp_name"]
         if param_name.lower().endswith(NON_SEARCHABLE):
-            print("Warning: `%s` is not eligible for search and was "
-                  "omitted!" % param_name)
+            print("Warning: `%s` is not eligible for search and was " "omitted!" % param_name)
             continue
 
-        if not search_list.startswith(':'):
+        if not search_list.startswith(":"):
             safe_eval = SafeEval(load_scipy=True, load_numpy=True)
             ev = safe_eval(search_list)
             search_params[param_name] = ev
@@ -60,26 +71,27 @@
             # TODO maybe add regular express check
             ev = safe_eval_es(search_list)
             preprocessings = (
-                preprocessing.StandardScaler(), preprocessing.Binarizer(),
+                preprocessing.StandardScaler(),
+                preprocessing.Binarizer(),
                 preprocessing.MaxAbsScaler(),
-                preprocessing.Normalizer(), preprocessing.MinMaxScaler(),
+                preprocessing.Normalizer(),
+                preprocessing.MinMaxScaler(),
                 preprocessing.PolynomialFeatures(),
-                preprocessing.RobustScaler(), feature_selection.SelectKBest(),
+                preprocessing.RobustScaler(),
+                feature_selection.SelectKBest(),
                 feature_selection.GenericUnivariateSelect(),
                 feature_selection.SelectPercentile(),
-                feature_selection.SelectFpr(), feature_sel
[...]
        """searcher = _do_train_test_split_val(
             searcher, X, y, params,
@@ -630,14 +668,15 @@
             error_score=options['error_score'],
             groups=groups,
             outfile=outfile_result)"""
+        return 0
 
     # no outer split
     else:
         searcher.set_params(n_jobs=N_JOBS)
-        if options['error_score'] == 'raise':
+        if options["error_score"] == "raise":
             searcher.fit(X, y, groups=groups)
         else:
-            warnings.simplefilter('always', FitFailedWarning)
+            warnings.simplefilter("always", FitFailedWarning)
             with warnings.catch_warnings(record=True) as w:
                 try:
                     searcher.fit(X, y, groups=groups)
@@ -648,18 +687,19 @@
 
         cv_results = pd.DataFrame(searcher.cv_results_)
         cv_results = cv_results[sorted(cv_results.columns)]
-        cv_results.to_csv(path_or_buf=outfile_result, sep='\t',
-                          header=True, index=False)
+        cv_results.to_csv(path_or_buf=outfile_result, sep="\t", header=True, index=False)
 
     memory.clear(warn=False)
 
     # output best estimator, and weights if applicable
     if outfile_object:
-        best_estimator_ = getattr(searcher, 'best_estimator_', None)
+        best_estimator_ = getattr(searcher, "best_estimator_", None)
         if not best_estimator_:
-            warnings.warn("GridSearchCV object has no attribute "
-                          "'best_estimator_', because either it's "
-                          "nested gridsearch or `refit` is False!")
+            warnings.warn(
+                "GridSearchCV object has no attribute "
+                "'best_estimator_', because either it's "
+                "nested gridsearch or `refit` is False!"
+            )
             return
 
         # clean prams
@@ -667,24 +707,22 @@
 
         main_est = get_main_estimator(best_estimator_)
 
-        if hasattr(main_est, 'model_') \
-                and hasattr(main_est, 'save_weights'):
+        if hasattr(main_est, "model_") and hasattr(main_est, "save_weights"):
             if outfile_weights:
                 main_est.save_weights(outfile_weights)
             del main_est.model_
             del main_est.fit_params
             del main_est.model_class_
             del main_est.validation_data
-            if getattr(main_est, 'data_generator_', None):
+            if getattr(main_est, "data_generator_", None):
                 del main_est.data_generator_
 
-        with open(outfile_object, 'wb') as output_handler:
+        with open(outfile_object, "wb") as output_handler:
             print("Best estimator is saved: %s " % repr(best_estimator_))
-            pickle.dump(best_estimator_, output_handler,
-                        pickle.HIGHEST_PROTOCOL)
+            pickle.dump(best_estimator_, output_handler, pickle.HIGHEST_PROTOCOL)
 
 
-if __name__ == '__main__':
+if __name__ == "__main__":
     aparser = argparse.ArgumentParser()
     aparser.add_argument("-i", "--inputs", dest="inputs", required=True)
     aparser.add_argument("-e", "--estimator", dest="infile_estimator")
@@ -700,8 +738,17 @@
     aparser.add_argument("-f", "--fasta_path", dest="fasta_path")
     args = aparser.parse_args()
 
-    main(args.inputs, args.infile_estimator, args.infile1, args.infile2,
-         args.outfile_result, outfile_object=args.outfile_object,
-         outfile_weights=args.outfile_weights, groups=args.groups,
-         ref_seq=args.ref_seq, intervals=args.intervals,
-         targets=args.targets, fasta_path=args.fasta_path)
+    main(
+        args.inputs,
+        args.infile_estimator,
+        args.infile1,
+        args.infile2,
+        args.outfile_result,
+        outfile_object=args.outfile_object,
+        outfile_weights=args.outfile_weights,
+        groups=args.groups,
+        ref_seq=args.ref_seq,
+        intervals=args.intervals,
+        targets=args.targets,
+        fasta_path=args.fasta_path,
+    )
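The NON_SEARCHABLE filter above works because str.endswith accepts a tuple of suffixes, so one call covers every blocked parameter name. A small sketch of that check with a hypothetical parameter name (not one taken from a real tool run):

    NON_SEARCHABLE = ("n_jobs", "pre_dispatch", "memory", "_path", "nthread", "callbacks")

    param_name = "estimator__n_jobs"   # hypothetical searchcv parameter name
    # endswith() with a tuple returns True if any suffix matches
    if param_name.lower().endswith(NON_SEARCHABLE):
        print("Warning: `%s` is not eligible for search and was omitted!" % param_name)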
b
diff -r ce2fd1edbc6e -r 13b9ac5d277c simple_model_fit.py
--- a/simple_model_fit.py Fri Oct 02 08:59:31 2020 +0000
+++ b/simple_model_fit.py Tue Apr 13 22:24:07 2021 +0000
[
@@ -1,13 +1,14 @@
 import argparse
 import json
-import pandas as pd
 import pickle
 
+import pandas as pd
 from galaxy_ml.utils import load_model, read_columns
+from scipy.io import mmread
 from sklearn.pipeline import Pipeline
 
 
-N_JOBS = int(__import__('os').environ.get('GALAXY_SLOTS', 1))
+N_JOBS = int(__import__("os").environ.get("GALAXY_SLOTS", 1))
 
 
 # TODO import from galaxy_ml.utils in future versions
@@ -20,33 +21,35 @@
     ------
     Cleaned estimator object
     """
-    ALLOWED_CALLBACKS = ('EarlyStopping', 'TerminateOnNaN',
-                         'ReduceLROnPlateau', 'CSVLogger', 'None')
+    ALLOWED_CALLBACKS = (
+        "EarlyStopping",
+        "TerminateOnNaN",
+        "ReduceLROnPlateau",
+        "CSVLogger",
+        "None",
+    )
 
     estimator_params = estimator.get_params()
 
     for name, p in estimator_params.items():
         # all potential unauthorized file write
-        if name == 'memory' or name.endswith('__memory') \
-                or name.endswith('_path'):
+        if name == "memory" or name.endswith("__memory") or name.endswith("_path"):
             new_p = {name: None}
             estimator.set_params(**new_p)
-        elif n_jobs is not None and (name == 'n_jobs' or
-                                     name.endswith('__n_jobs')):
+        elif n_jobs is not None and (name == 'n_jobs' or name.endswith('__n_jobs')):
             new_p = {name: n_jobs}
             estimator.set_params(**new_p)
-        elif name.endswith('callbacks'):
+        elif name.endswith("callbacks"):
             for cb in p:
-                cb_type = cb['callback_selection']['callback_type']
+                cb_type = cb["callback_selection"]["callback_type"]
                 if cb_type not in ALLOWED_CALLBACKS:
-                    raise ValueError(
-                        "Prohibited callback type: %s!" % cb_type)
+                    raise ValueError("Prohibited callback type: %s!" % cb_type)
 
     return estimator
 
 
 def _get_X_y(params, infile1, infile2):
-    """ read from inputs and output X and y
+    """read from inputs and output X and y
 
     Parameters
     ----------
@@ -61,35 +64,40 @@
     # store read dataframe object
     loaded_df = {}
 
-    input_type = params['input_options']['selected_input']
+    input_type = params["input_options"]["selected_input"]
     # tabular input
-    if input_type == 'tabular':
-        header = 'infer' if params['input_options']['header1'] else None
-        column_option = (params['input_options']['column_selector_options_1']
-                         ['selected_column_selector_option'])
-        if column_option in ['by_index_number', 'all_but_by_index_number',
-                             'by_header_name', 'all_but_by_header_name']:
-            c = params['input_options']['column_selector_options_1']['col1']
+    if input_type == "tabular":
+        header = "infer" if params["input_options"]["header1"] else None
+        column_option = params["input_options"]["column_selector_options_1"]["selected_column_selector_option"]
+        if column_option in [
+            "by_index_number",
+            "all_but_by_index_number",
+            "by_header_name",
+            "all_but_by_header_name",
+        ]:
+            c = params["input_options"]["column_selector_options_1"]["col1"]
         else:
             c = None
 
         df_key = infile1 + repr(header)
-        df = pd.read_csv(infile1, sep='\t', header=header,
-                         parse_dates=True)
+        df = pd.read_csv(infile1, sep="\t", header=header, parse_dates=True)
         loaded_df[df_key] = df
 
         X = read_columns(df, c=c, c_option=column_option).astype(float)
     # sparse input
-    elif input_type == 'sparse':
-        X = mmread(open(infile1, 'r'))
+    elif input_type == "sparse":
+        X = mmread(open(infile1, "r"))
 
     # Get target y
-    header = 'infer' if params['input_options']['header2'] else None
-    column_option = (params['inpu
[...]
           ['selected_column_selector_option2'])
-    if column_option in ['by_index_number', 'all_but_by_index_number',
-                         'by_header_name', 'all_but_by_header_name']:
-        c = params['input_options']['column_selector_options_2']['col2']
+    header = "infer" if params["input_options"]["header2"] else None
+    column_option = params["input_options"]["column_selector_options_2"]["selected_column_selector_option2"]
+    if column_option in [
+        "by_index_number",
+        "all_but_by_index_number",
+        "by_header_name",
+        "all_but_by_header_name",
+    ]:
+        c = params["input_options"]["column_selector_options_2"]["col2"]
     else:
         c = None
 
@@ -97,26 +105,23 @@
     if df_key in loaded_df:
         infile2 = loaded_df[df_key]
     else:
-        infile2 = pd.read_csv(infile2, sep='\t',
-                              header=header, parse_dates=True)
+        infile2 = pd.read_csv(infile2, sep="\t", header=header, parse_dates=True)
         loaded_df[df_key] = infile2
 
-    y = read_columns(
-            infile2,
-            c=c,
-            c_option=column_option,
-            sep='\t',
-            header=header,
-            parse_dates=True)
+    y = read_columns(infile2,
+                     c=c,
+                     c_option=column_option,
+                     sep='\t',
+                     header=header,
+                     parse_dates=True)
     if len(y.shape) == 2 and y.shape[1] == 1:
         y = y.ravel()
 
     return X, y
 
 
-def main(inputs, infile_estimator, infile1, infile2, out_object,
-         out_weights=None):
-    """ main
+def main(inputs, infile_estimator, infile1, infile2, out_object, out_weights=None):
+    """main
 
     Parameters
     ----------
@@ -139,38 +144,37 @@
         File path for output of weights
 
     """
-    with open(inputs, 'r') as param_handler:
+    with open(inputs, "r") as param_handler:
         params = json.load(param_handler)
 
     # load model
-    with open(infile_estimator, 'rb') as est_handler:
+    with open(infile_estimator, "rb") as est_handler:
         estimator = load_model(est_handler)
     estimator = clean_params(estimator, n_jobs=N_JOBS)
 
     X_train, y_train = _get_X_y(params, infile1, infile2)
 
     estimator.fit(X_train, y_train)
-    
+
     main_est = estimator
     if isinstance(main_est, Pipeline):
         main_est = main_est.steps[-1][-1]
-    if hasattr(main_est, 'model_') \
-            and hasattr(main_est, 'save_weights'):
+    if hasattr(main_est, "model_") and hasattr(main_est, "save_weights"):
         if out_weights:
             main_est.save_weights(out_weights)
         del main_est.model_
         del main_est.fit_params
         del main_est.model_class_
-        del main_est.validation_data
-        if getattr(main_est, 'data_generator_', None):
+        if getattr(main_est, "validation_data", None):
+            del main_est.validation_data
+        if getattr(main_est, "data_generator_", None):
             del main_est.data_generator_
 
-    with open(out_object, 'wb') as output_handler:
-        pickle.dump(estimator, output_handler,
-                    pickle.HIGHEST_PROTOCOL)
+    with open(out_object, "wb") as output_handler:
+        pickle.dump(estimator, output_handler, pickle.HIGHEST_PROTOCOL)
 
 
-if __name__ == '__main__':
+if __name__ == "__main__":
     aparser = argparse.ArgumentParser()
     aparser.add_argument("-i", "--inputs", dest="inputs", required=True)
     aparser.add_argument("-X", "--infile_estimator", dest="infile_estimator")
@@ -180,5 +184,11 @@
     aparser.add_argument("-t", "--out_weights", dest="out_weights")
     args = aparser.parse_args()
 
-    main(args.inputs, args.infile_estimator, args.infile1,
-         args.infile2, args.out_object, args.out_weights)
+    main(
+        args.inputs,
+        args.infile_estimator,
+        args.infile1,
+        args.infile2,
+        args.out_object,
+        args.out_weights,
+    )
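The local clean_params helper above walks estimator.get_params() and nulls out cache and file-path parameters while pinning n_jobs. A minimal sketch of the same idea on a toy scikit-learn pipeline (the pipeline and the chosen values are illustrative assumptions, not the galaxy_ml implementation):

    from sklearn.ensemble import RandomForestClassifier
    from sklearn.pipeline import Pipeline
    from sklearn.preprocessing import StandardScaler

    pipe = Pipeline([("scale", StandardScaler()), ("clf", RandomForestClassifier())])

    # null out cache/file-path params and pin n_jobs, as clean_params() does
    for name in pipe.get_params():
        if name == "memory" or name.endswith("__memory") or name.endswith("_path"):
            pipe.set_params(**{name: None})
        elif name == "n_jobs" or name.endswith("__n_jobs"):
            pipe.set_params(**{name: 1})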
b
diff -r ce2fd1edbc6e -r 13b9ac5d277c stacking_ensembles.py
--- a/stacking_ensembles.py Fri Oct 02 08:59:31 2020 +0000
+++ b/stacking_ensembles.py Tue Apr 13 22:24:07 2021 +0000
[
@@ -1,26 +1,22 @@
 import argparse
 import ast
 import json
-import mlxtend.regressor
-import mlxtend.classifier
-import pandas as pd
 import pickle
-import sklearn
 import sys
 import warnings
-from sklearn import ensemble
 
-from galaxy_ml.utils import (load_model, get_cv, get_estimator,
-                             get_search_params)
+import mlxtend.classifier
+import mlxtend.regressor
+import pandas as pd
+from galaxy_ml.utils import get_cv, get_estimator, get_search_params, load_model
 
 
-warnings.filterwarnings('ignore')
+warnings.filterwarnings("ignore")
 
-N_JOBS = int(__import__('os').environ.get('GALAXY_SLOTS', 1))
+N_JOBS = int(__import__("os").environ.get("GALAXY_SLOTS", 1))
 
 
-def main(inputs_path, output_obj, base_paths=None, meta_path=None,
-         outfile_params=None):
+def main(inputs_path, output_obj, base_paths=None, meta_path=None, outfile_params=None):
     """
     Parameter
     ---------
@@ -39,87 +35,79 @@
     outfile_params : str
         File path for params output
     """
-    with open(inputs_path, 'r') as param_handler:
+    with open(inputs_path, "r") as param_handler:
         params = json.load(param_handler)
 
-    estimator_type = params['algo_selection']['estimator_type']
+    estimator_type = params["algo_selection"]["estimator_type"]
     # get base estimators
     base_estimators = []
-    for idx, base_file in enumerate(base_paths.split(',')):
-        if base_file and base_file != 'None':
-            with open(base_file, 'rb') as handler:
+    for idx, base_file in enumerate(base_paths.split(",")):
+        if base_file and base_file != "None":
+            with open(base_file, "rb") as handler:
                 model = load_model(handler)
         else:
-            estimator_json = (params['base_est_builder'][idx]
-                              ['estimator_selector'])
+            estimator_json = params["base_est_builder"][idx]["estimator_selector"]
             model = get_estimator(estimator_json)
 
-        if estimator_type.startswith('sklearn'):
+        if estimator_type.startswith("sklearn"):
             named = model.__class__.__name__.lower()
-            named = 'base_%d_%s' % (idx, named)
+            named = "base_%d_%s" % (idx, named)
             base_estimators.append((named, model))
         else:
             base_estimators.append(model)
 
     # get meta estimator, if applicable
-    if estimator_type.startswith('mlxtend'):
+    if estimator_type.startswith("mlxtend"):
         if meta_path:
-            with open(meta_path, 'rb') as f:
+            with open(meta_path, "rb") as f:
                 meta_estimator = load_model(f)
         else:
-            estimator_json = (params['algo_selection']
-                              ['meta_estimator']['estimator_selector'])
+            estimator_json = params["algo_selection"]["meta_estimator"]["estimator_selector"]
             meta_estimator = get_estimator(estimator_json)
 
-    options = params['algo_selection']['options']
+    options = params["algo_selection"]["options"]
 
-    cv_selector = options.pop('cv_selector', None)
+    cv_selector = options.pop("cv_selector", None)
     if cv_selector:
-        splitter, groups = get_cv(cv_selector)
-        options['cv'] = splitter
+        splitter, _groups = get_cv(cv_selector)
+        options["cv"] = splitter
         # set n_jobs
-        options['n_jobs'] = N_JOBS
+        options["n_jobs"] = N_JOBS
 
-    weights = options.pop('weights', None)
+    weights = options.pop("weights", None)
     if weights:
         weights = ast.literal_eval(weights)
         if weights:
-            options['weights'] = weights
+            options["weights"] = weights
 
-    mod_and_name = estimator_type.split('_')
+    mod_and_name = estimator_type.split("_")
     mod = sys.modules[mod_and_name[0]]
     klass = getattr(mod, mod_and_name[1])
 
-    if estimator_type.startswith('sklearn'):
-        options['n_jobs'] = N_JOBS
+    if estimator_type.startswith("sklearn"):
+        options["n_jobs"] = N_JOBS
         ensemble_estimator = klass(base_estimators, **options)
 
     elif mod == mlxtend.classifier:
-        ensemble_estimator = klass(
-            classifiers=base_estimators,
-            meta_classifier=meta_estimator,
-            **options)
+        ensemble_estimator = klass(classifiers=base_estimators, meta_classifier=meta_estimator, **options)
 
     else:
-        ensemble_estimator = klass(
-            regressors=base_estimators,
-            meta_regressor=meta_estimator,
-            **options)
+        ensemble_estimator = klass(regressors=base_estimators, meta_regressor=meta_estimator, **options)
 
     print(ensemble_estimator)
     for base_est in base_estimators:
         print(base_est)
 
-    with open(output_obj, 'wb') as out_handler:
+    with open(output_obj, "wb") as out_handler:
         pickle.dump(ensemble_estimator, out_handler, pickle.HIGHEST_PROTOCOL)
 
-    if params['get_params'] and outfile_params:
+    if params["get_params"] and outfile_params:
         results = get_search_params(ensemble_estimator)
-        df = pd.DataFrame(results, columns=['', 'Parameter', 'Value'])
-        df.to_csv(outfile_params, sep='\t', index=False)
+        df = pd.DataFrame(results, columns=["", "Parameter", "Value"])
+        df.to_csv(outfile_params, sep="\t", index=False)
 
 
-if __name__ == '__main__':
+if __name__ == "__main__":
     aparser = argparse.ArgumentParser()
     aparser.add_argument("-b", "--bases", dest="bases")
     aparser.add_argument("-m", "--meta", dest="meta")
@@ -128,5 +116,10 @@
     aparser.add_argument("-p", "--outfile_params", dest="outfile_params")
     args = aparser.parse_args()
 
-    main(args.inputs, args.outfile, base_paths=args.bases,
-         meta_path=args.meta, outfile_params=args.outfile_params)
+    main(
+        args.inputs,
+        args.outfile,
+        base_paths=args.bases,
+        meta_path=args.meta,
+        outfile_params=args.outfile_params,
+    )
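For the mlxtend branch above, the ensemble is built by passing the base estimators and a meta estimator as keyword arguments. A minimal sketch, assuming mlxtend is installed and using illustrative estimator choices rather than the tool's configured ones:

    from mlxtend.classifier import StackingCVClassifier
    from sklearn.linear_model import LogisticRegression
    from sklearn.neighbors import KNeighborsClassifier
    from sklearn.tree import DecisionTreeClassifier

    # base estimators and meta estimator, analogous to the lists assembled above
    base_estimators = [KNeighborsClassifier(), DecisionTreeClassifier()]
    meta_estimator = LogisticRegression()

    ensemble_estimator = StackingCVClassifier(
        classifiers=base_estimators,
        meta_classifier=meta_estimator,
        cv=5,
    )
    print(ensemble_estimator)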
b
diff -r ce2fd1edbc6e -r 13b9ac5d277c test-data/keras_batch_params01.tabular
--- a/test-data/keras_batch_params01.tabular Fri Oct 02 08:59:31 2020 +0000
+++ b/test-data/keras_batch_params01.tabular Tue Apr 13 22:24:07 2021 +0000
b
@@ -27,7 +27,7 @@
 @ schedule_decay schedule_decay: None
 @ seed seed: None
 @ steps_per_epoch steps_per_epoch: None
-@ validation_data validation_data: None
+@ validation_fraction validation_fraction: 0.1
 @ validation_steps validation_steps: None
 @ verbose verbose: 0
 * data_batch_generator__fasta_path data_batch_generator__fasta_path: 'to_be_determined'
b
diff -r ce2fd1edbc6e -r 13b9ac5d277c test-data/keras_batch_params04.tabular
--- a/test-data/keras_batch_params04.tabular Fri Oct 02 08:59:31 2020 +0000
+++ b/test-data/keras_batch_params04.tabular Tue Apr 13 22:24:07 2021 +0000
b
@@ -26,7 +26,7 @@
 @ schedule_decay schedule_decay: None
 @ seed seed: None
 @ steps_per_epoch steps_per_epoch: None
-@ validation_data validation_data: None
+@ validation_fraction validation_fraction: 0.1
 @ validation_steps validation_steps: None
 @ verbose verbose: 0
 * layers_0_Dense__class_name layers_0_Dense__class_name: 'Dense'
b
diff -r ce2fd1edbc6e -r 13b9ac5d277c test-data/keras_model01
b
Binary file test-data/keras_model01 has changed
b
diff -r ce2fd1edbc6e -r 13b9ac5d277c test-data/keras_model02
b
Binary file test-data/keras_model02 has changed
b
diff -r ce2fd1edbc6e -r 13b9ac5d277c test-data/keras_model04
b
Binary file test-data/keras_model04 has changed
b
diff -r ce2fd1edbc6e -r 13b9ac5d277c test-data/keras_params04.tabular
--- a/test-data/keras_params04.tabular Fri Oct 02 08:59:31 2020 +0000
+++ b/test-data/keras_params04.tabular Tue Apr 13 22:24:07 2021 +0000
b
@@ -22,7 +22,7 @@
 @ schedule_decay schedule_decay: None
 @ seed seed: 42
 @ steps_per_epoch steps_per_epoch: None
-@ validation_data validation_data: None
+@ validation_fraction validation_fraction: 0.1
 @ validation_steps validation_steps: None
 @ verbose verbose: 0
 * layers_0_Dense__class_name layers_0_Dense__class_name: 'Dense'
b
diff -r ce2fd1edbc6e -r 13b9ac5d277c test-data/ohe_in_w_header.tabular
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/ohe_in_w_header.tabular Tue Apr 13 22:24:07 2021 +0000
b
@@ -0,0 +1,9 @@
+Label
+0
+1
+2
+3
+3
+2
+1
+0
b
diff -r ce2fd1edbc6e -r 13b9ac5d277c test-data/ohe_in_wo_header.tabular
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/ohe_in_wo_header.tabular Tue Apr 13 22:24:07 2021 +0000
b
@@ -0,0 +1,8 @@
+0
+1
+2
+3
+3
+2
+1
+0
b
diff -r ce2fd1edbc6e -r 13b9ac5d277c test-data/ohe_out_4.tabular
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/ohe_out_4.tabular Tue Apr 13 22:24:07 2021 +0000
b
@@ -0,0 +1,8 @@
+1 0 0 0
+0 1 0 0
+0 0 1 0
+0 0 0 1
+0 0 0 1
+0 0 1 0
+0 1 0 0
+1 0 0 0
b
diff -r ce2fd1edbc6e -r 13b9ac5d277c test-data/ohe_out_5.tabular
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/ohe_out_5.tabular Tue Apr 13 22:24:07 2021 +0000
b
@@ -0,0 +1,8 @@
+1 0 0 0 0
+0 1 0 0 0
+0 0 1 0 0
+0 0 0 1 0
+0 0 0 1 0
+0 0 1 0 0
+0 1 0 0 0
+1 0 0 0 0
b
diff -r ce2fd1edbc6e -r 13b9ac5d277c test-data/pipeline_params05.tabular
--- a/test-data/pipeline_params05.tabular Fri Oct 02 08:59:31 2020 +0000
+++ b/test-data/pipeline_params05.tabular Tue Apr 13 22:24:07 2021 +0000
b
@@ -13,6 +13,6 @@
 * n_jobs n_jobs: 1
 @ oob_score oob_score: False
 @ random_state random_state: 42
-* verbose verbose: 0
+@ verbose verbose: 0
 @ warm_start warm_start: False
  Note: @, params eligible for search in searchcv tool.
b
diff -r ce2fd1edbc6e -r 13b9ac5d277c test-data/pipeline_params18
--- a/test-data/pipeline_params18 Fri Oct 02 08:59:31 2020 +0000
+++ b/test-data/pipeline_params18 Tue Apr 13 22:24:07 2021 +0000
b
@@ -47,7 +47,7 @@
                                                            output_distribution='uniform',
                                                            random_state=10,
                                                            subsample=100000))"
-* verbose verbose: False
+@ verbose verbose: False
 @ powertransformer__copy powertransformer__copy: True
 @ powertransformer__method powertransformer__method: 'yeo-johnson'
 @ powertransformer__standardize powertransformer__standardize: True
@@ -75,7 +75,7 @@
 * transformedtargetregressor__regressor__n_jobs transformedtargetregressor__regressor__n_jobs: 1
 @ transformedtargetregressor__regressor__oob_score transformedtargetregressor__regressor__oob_score: False
 @ transformedtargetregressor__regressor__random_state transformedtargetregressor__regressor__random_state: 10
-* transformedtargetregressor__regressor__verbose transformedtargetregressor__regressor__verbose: 0
+@ transformedtargetregressor__regressor__verbose transformedtargetregressor__regressor__verbose: 0
 @ transformedtargetregressor__regressor__warm_start transformedtargetregressor__regressor__warm_start: False
 @ transformedtargetregressor__transformer "transformedtargetregressor__transformer: QuantileTransformer(copy=True, ignore_implicit_zeros=False, n_quantiles=1000,
                     output_distribution='uniform', random_state=10,
b
diff -r ce2fd1edbc6e -r 13b9ac5d277c test-data/train_test_eval_model01
b
Binary file test-data/train_test_eval_model01 has changed
b
diff -r ce2fd1edbc6e -r 13b9ac5d277c test-data/train_test_eval_weights01.h5
b
Binary file test-data/train_test_eval_weights01.h5 has changed
b
diff -r ce2fd1edbc6e -r 13b9ac5d277c test-data/train_test_eval_weights02.h5
b
Binary file test-data/train_test_eval_weights02.h5 has changed
b
diff -r ce2fd1edbc6e -r 13b9ac5d277c to_categorical.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/to_categorical.py Tue Apr 13 22:24:07 2021 +0000
[
@@ -0,0 +1,50 @@
+import argparse
+import json
+import warnings
+
+import numpy as np
+import pandas as pd
+from keras.utils import to_categorical
+
+
+def main(inputs, infile, outfile, num_classes=None):
+    """
+    Parameter
+    ---------
+    input : str
+        File path to galaxy tool parameter
+
+    infile : str
+        File paths of input vector
+
+    outfile : str
+        File path to output matrix
+
+    num_classes : str
+        Total number of classes. If None, this would be inferred as the (largest number in y) + 1
+
+    """
+    warnings.simplefilter("ignore")
+
+    with open(inputs, "r") as param_handler:
+        params = json.load(param_handler)
+
+    input_header = params["header0"]
+    header = "infer" if input_header else None
+
+    input_vector = pd.read_csv(infile, sep="\t", header=header)
+
+    output_matrix = to_categorical(input_vector, num_classes=num_classes)
+
+    np.savetxt(outfile, output_matrix, fmt="%d", delimiter="\t")
+
+
+if __name__ == "__main__":
+    aparser = argparse.ArgumentParser()
+    aparser.add_argument("-i", "--inputs", dest="inputs", required=True)
+    aparser.add_argument("-y", "--infile", dest="infile")
+    aparser.add_argument("-n", "--num_classes", dest="num_classes", type=int, default=None)
+    aparser.add_argument("-o", "--outfile", dest="outfile")
+    args = aparser.parse_args()
+
+    main(args.inputs, args.infile, args.outfile, args.num_classes)
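The new tool wraps keras.utils.to_categorical around a tabular label vector. A short sketch that reproduces the shipped test data from the label column above (to_categorical returns floats; the tool writes them as integers with fmt="%d"):

    import numpy as np
    from keras.utils import to_categorical

    labels = np.array([0, 1, 2, 3, 3, 2, 1, 0])   # test-data/ohe_in_wo_header.tabular

    print(to_categorical(labels))                 # 4 columns, matches ohe_out_4.tabular
    print(to_categorical(labels, num_classes=5))  # adds an all-zero column, matches ohe_out_5.tabular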
b
diff -r ce2fd1edbc6e -r 13b9ac5d277c train_test_eval.py
--- a/train_test_eval.py Fri Oct 02 08:59:31 2020 +0000
+++ b/train_test_eval.py Tue Apr 13 22:24:07 2021 +0000
[
@@ -1,59 +1,64 @@
 import argparse
-import joblib
 import json
-import numpy as np
 import os
-import pandas as pd
 import pickle
 import warnings
 from itertools import chain
+
+import joblib
+import numpy as np
+import pandas as pd
+from galaxy_ml.model_validations import train_test_split
+from galaxy_ml.utils import (
+    get_module,
+    get_scoring,
+    load_model,
+    read_columns,
+    SafeEval,
+    try_get_attr,
+)
 from scipy.io import mmread
-from sklearn.base import clone
-from sklearn import (cluster, compose, decomposition, ensemble,
-                     feature_extraction, feature_selection,
-                     gaussian_process, kernel_approximation, metrics,
-                     model_selection, naive_bayes, neighbors,
-                     pipeline, preprocessing, svm, linear_model,
-                     tree, discriminant_analysis)
-from sklearn.exceptions import FitFailedWarning
+from sklearn import pipeline
 from sklearn.metrics.scorer import _check_multimetric_scoring
-from sklearn.model_selection._validation import _score, cross_validate
 from sklearn.model_selection import _search, _validation
+from sklearn.model_selection._validation import _score
 from sklearn.utils import indexable, safe_indexing
 
-from galaxy_ml.model_validations import train_test_split
-from galaxy_ml.utils import (SafeEval, get_scoring, load_model,
-                             read_columns, try_get_attr, get_module)
 
+_fit_and_score = try_get_attr("galaxy_ml.model_validations", "_fit_and_score")
+setattr(_search, "_fit_and_score", _fit_and_score)
+setattr(_validation, "_fit_and_score", _fit_and_score)
 
-_fit_and_score = try_get_attr('galaxy_ml.model_validations', '_fit_and_score')
-setattr(_search, '_fit_and_score', _fit_and_score)
-setattr(_validation, '_fit_and_score', _fit_and_score)
-
-N_JOBS = int(os.environ.get('GALAXY_SLOTS', 1))
-CACHE_DIR = os.path.join(os.getcwd(), 'cached')
+N_JOBS = int(os.environ.get("GALAXY_SLOTS", 1))
+CACHE_DIR = os.path.join(os.getcwd(), "cached")
 del os
-NON_SEARCHABLE = ('n_jobs', 'pre_dispatch', 'memory', '_path',
-                  'nthread', 'callbacks')
-ALLOWED_CALLBACKS = ('EarlyStopping', 'TerminateOnNaN', 'ReduceLROnPlateau',
-                     'CSVLogger', 'None')
+NON_SEARCHABLE = ("n_jobs", "pre_dispatch", "memory", "_path", "nthread", "callbacks")
+ALLOWED_CALLBACKS = (
+    "EarlyStopping",
+    "TerminateOnNaN",
+    "ReduceLROnPlateau",
+    "CSVLogger",
+    "None",
+)
 
 
 def _eval_swap_params(params_builder):
     swap_params = {}
 
-    for p in params_builder['param_set']:
-        swap_value = p['sp_value'].strip()
-        if swap_value == '':
+    for p in params_builder["param_set"]:
+        swap_value = p["sp_value"].strip()
+        if swap_value == "":
             continue
 
-        param_name = p['sp_name']
+        param_name = p["sp_name"]
         if param_name.lower().endswith(NON_SEARCHABLE):
-            warnings.warn("Warning: `%s` is not eligible for search and was "
-                          "omitted!" % param_name)
+            warnings.warn(
+                "Warning: `%s` is not eligible for search and was "
+                "omitted!" % param_name
+            )
             continue
 
-        if not swap_value.startswith(':'):
+        if not swap_value.startswith(":"):
             safe_eval = SafeEval(load_scipy=True, load_numpy=True)
             ev = safe_eval(swap_value)
         else:
@@ -80,23 +85,24 @@
         else:
             new_arrays.append(arr)
 
-    if kwargs['shuffle'] == 'None':
-        kwargs['shuffle'] = None
+    if kwargs["shuffle"] == "None":
+        kwargs["shuffle"] = None
 
-    group_names = kwargs.pop('group_names', None)
+    group_names = kwargs.pop("group_names", None)
 
     if group_names is not None and group_names.strip():
-        group_names = [name.strip() for name in
-                       group_names.split(',')]
+        group_names = [name.strip() for name in group_names.split(",")]
[...]
      if exp_scheme == 'train_val_test':
-            estimator.fit(X_train, y_train,
-                          validation_data=(X_val, y_val))
+    if hasattr(estimator, "validation_data"):
+        if exp_scheme == "train_val_test":
+            estimator.fit(X_train, y_train, validation_data=(X_val, y_val))
         else:
-            estimator.fit(X_train, y_train,
-                          validation_data=(X_test, y_test))
+            estimator.fit(X_train, y_train, validation_data=(X_test, y_test))
     else:
         estimator.fit(X_train, y_train)
 
-    if hasattr(estimator, 'evaluate'):
-        scores = estimator.evaluate(X_test, y_test=y_test,
-                                    scorer=scorer,
-                                    is_multimetric=True)
+    if hasattr(estimator, "evaluate"):
+        scores = estimator.evaluate(
+            X_test, y_test=y_test, scorer=scorer, is_multimetric=True
+        )
     else:
-        scores = _score(estimator, X_test, y_test, scorer,
-                        is_multimetric=True)
+        scores = _score(estimator, X_test, y_test, scorer, is_multimetric=True)
     # handle output
     for name, score in scores.items():
         scores[name] = [score]
     df = pd.DataFrame(scores)
     df = df[sorted(df.columns)]
-    df.to_csv(path_or_buf=outfile_result, sep='\t',
-              header=True, index=False)
+    df.to_csv(path_or_buf=outfile_result, sep="\t", header=True, index=False)
 
     memory.clear(warn=False)
 
@@ -395,23 +430,25 @@
         if isinstance(estimator, pipeline.Pipeline):
             main_est = estimator.steps[-1][-1]
 
-        if hasattr(main_est, 'model_') \
-                and hasattr(main_est, 'save_weights'):
+        if hasattr(main_est, "model_") and hasattr(main_est, "save_weights"):
             if outfile_weights:
                 main_est.save_weights(outfile_weights)
-            del main_est.model_
-            del main_est.fit_params
-            del main_est.model_class_
-            del main_est.validation_data
-            if getattr(main_est, 'data_generator_', None):
+            if getattr(main_est, "model_", None):
+                del main_est.model_
+            if getattr(main_est, "fit_params", None):
+                del main_est.fit_params
+            if getattr(main_est, "model_class_", None):
+                del main_est.model_class_
+            if getattr(main_est, "validation_data", None):
+                del main_est.validation_data
+            if getattr(main_est, "data_generator_", None):
                 del main_est.data_generator_
 
-        with open(outfile_object, 'wb') as output_handler:
-            pickle.dump(estimator, output_handler,
-                        pickle.HIGHEST_PROTOCOL)
+        with open(outfile_object, "wb") as output_handler:
+            pickle.dump(estimator, output_handler, pickle.HIGHEST_PROTOCOL)
 
 
-if __name__ == '__main__':
+if __name__ == "__main__":
     aparser = argparse.ArgumentParser()
     aparser.add_argument("-i", "--inputs", dest="inputs", required=True)
     aparser.add_argument("-e", "--estimator", dest="infile_estimator")
@@ -427,8 +464,17 @@
     aparser.add_argument("-f", "--fasta_path", dest="fasta_path")
    args = aparser.parse_args()
 
-    main(args.inputs, args.infile_estimator, args.infile1, args.infile2,
-         args.outfile_result, outfile_object=args.outfile_object,
-         outfile_weights=args.outfile_weights, groups=args.groups,
-         ref_seq=args.ref_seq, intervals=args.intervals,
-         targets=args.targets, fasta_path=args.fasta_path)
+    main(
+        args.inputs,
+        args.infile_estimator,
+        args.infile1,
+        args.infile2,
+        args.outfile_result,
+        outfile_object=args.outfile_object,
+        outfile_weights=args.outfile_weights,
+        groups=args.groups,
+        ref_seq=args.ref_seq,
+        intervals=args.intervals,
+        targets=args.targets,
+        fasta_path=args.fasta_path,
+    )
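The change above replaces unconditional del statements with getattr guards, so cleanup before pickling does not raise AttributeError when an attribute was never set. A minimal sketch of that pattern with a stand-in object (DummyEstimator is a placeholder, not the galaxy_ml keras wrapper):

    class DummyEstimator:
        """Stand-in object; attribute names mirror the wrapper cleaned up above."""
        def __init__(self):
            self.model_ = object()      # present
            # validation_data intentionally never set

    est = DummyEstimator()

    # an unconditional `del est.validation_data` would raise AttributeError;
    # the getattr guard makes the pre-pickling cleanup safe
    for attr in ("model_", "fit_params", "model_class_", "validation_data", "data_generator_"):
        if getattr(est, attr, None) is not None:
            delattr(est, attr)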
b
diff -r ce2fd1edbc6e -r 13b9ac5d277c train_test_split.py
--- a/train_test_split.py Fri Oct 02 08:59:31 2020 +0000
+++ b/train_test_split.py Tue Apr 13 22:24:07 2021 +0000
[
@@ -1,15 +1,14 @@
 import argparse
 import json
-import pandas as pd
 import warnings
 
+import pandas as pd
 from galaxy_ml.model_validations import train_test_split
 from galaxy_ml.utils import get_cv, read_columns
 
 
-def _get_single_cv_split(params, array, infile_labels=None,
-                         infile_groups=None):
-    """ output (train, test) subset from a cv splitter
+def _get_single_cv_split(params, array, infile_labels=None, infile_groups=None):
+    """output (train, test) subset from a cv splitter
 
     Parameters
     ----------
@@ -25,45 +24,50 @@
     y = None
     groups = None
 
-    nth_split = params['mode_selection']['nth_split']
+    nth_split = params["mode_selection"]["nth_split"]
 
     # read groups
     if infile_groups:
-        header = 'infer' if (params['mode_selection']['cv_selector']
-                             ['groups_selector']['header_g']) else None
-        column_option = (params['mode_selection']['cv_selector']
-                         ['groups_selector']['column_selector_options_g']
-                         ['selected_column_selector_option_g'])
-        if column_option in ['by_index_number', 'all_but_by_index_number',
-                             'by_header_name', 'all_but_by_header_name']:
-            c = (params['mode_selection']['cv_selector']['groups_selector']
-                 ['column_selector_options_g']['col_g'])
+        header = "infer" if (params["mode_selection"]["cv_selector"]["groups_selector"]["header_g"]) else None
+        column_option = params["mode_selection"]["cv_selector"]["groups_selector"]["column_selector_options_g"][
+            "selected_column_selector_option_g"
+        ]
+        if column_option in [
+            "by_index_number",
+            "all_but_by_index_number",
+            "by_header_name",
+            "all_but_by_header_name",
+        ]:
+            c = params["mode_selection"]["cv_selector"]["groups_selector"]["column_selector_options_g"]["col_g"]
         else:
             c = None
 
-        groups = read_columns(infile_groups, c=c, c_option=column_option,
-                              sep='\t', header=header, parse_dates=True)
+        groups = read_columns(
+            infile_groups,
+            c=c,
+            c_option=column_option,
+            sep="\t",
+            header=header,
+            parse_dates=True,
+        )
         groups = groups.ravel()
 
-        params['mode_selection']['cv_selector']['groups_selector'] = groups
+        params["mode_selection"]["cv_selector"]["groups_selector"] = groups
 
     # read labels
     if infile_labels:
-        target_input = (params['mode_selection']
-                        ['cv_selector'].pop('target_input'))
-        header = 'infer' if target_input['header1'] else None
-        col_index = target_input['col'][0] - 1
-        df = pd.read_csv(infile_labels, sep='\t', header=header,
-                         parse_dates=True)
+        target_input = params["mode_selection"]["cv_selector"].pop("target_input")
+        header = "infer" if target_input["header1"] else None
+        col_index = target_input["col"][0] - 1
+        df = pd.read_csv(infile_labels, sep="\t", header=header, parse_dates=True)
         y = df.iloc[:, col_index].values
 
     # construct the cv splitter object
-    splitter, groups = get_cv(params['mode_selection']['cv_selector'])
+    splitter, groups = get_cv(params["mode_selection"]["cv_selector"])
 
     total_n_splits = splitter.get_n_splits(array.values, y=y, groups=groups)
     if nth_split > total_n_splits:
-        raise ValueError("Total number of splits is {}, but got `nth_split` "
-                         "= {}".format(total_n_splits, nth_split))
+        raise ValueError("Total number of splits is {}, but got `nth_split` " "= {}".format(total_n_splits, nth_split))
 
     i = 1
     for train_index, test_index in splitter.split(array.values, y=y, groups=groups):
@@ -79,8 +83,14 @@
     return train, test
 
 
-def main(inputs, infile_array, outfile_train, outfile_test,
-         infile_labels=None, infile_groups=None):
+def main(
+    inputs,
+    infile_array,
+    outfile_train,
+    outfile_test,
+    infile_labels=None,
+    infile_groups=None,
+):
     """
     Parameter
     ---------
@@ -102,45 +112,41 @@
     outfile_test : str
         File path to dataset containing test split
     """
-    warnings.simplefilter('ignore')
+    warnings.simplefilter("ignore")
 
-    with open(inputs, 'r') as param_handler:
+    with open(inputs, "r") as param_handler:
         params = json.load(param_handler)
 
-    input_header = params['header0']
-    header = 'infer' if input_header else None
-    array = pd.read_csv(infile_array, sep='\t', header=header,
-                        parse_dates=True)
+    input_header = params["header0"]
+    header = "infer" if input_header else None
+    array = pd.read_csv(infile_array, sep="\t", header=header, parse_dates=True)
 
     # train test split
-    if params['mode_selection']['selected_mode'] == 'train_test_split':
-        options = params['mode_selection']['options']
-        shuffle_selection = options.pop('shuffle_selection')
-        options['shuffle'] = shuffle_selection['shuffle']
+    if params["mode_selection"]["selected_mode"] == "train_test_split":
+        options = params["mode_selection"]["options"]
+        shuffle_selection = options.pop("shuffle_selection")
+        options["shuffle"] = shuffle_selection["shuffle"]
         if infile_labels:
-            header = 'infer' if shuffle_selection['header1'] else None
-            col_index = shuffle_selection['col'][0] - 1
-            df = pd.read_csv(infile_labels, sep='\t', header=header,
-                             parse_dates=True)
+            header = "infer" if shuffle_selection["header1"] else None
+            col_index = shuffle_selection["col"][0] - 1
+            df = pd.read_csv(infile_labels, sep="\t", header=header, parse_dates=True)
             labels = df.iloc[:, col_index].values
-            options['labels'] = labels
+            options["labels"] = labels
 
         train, test = train_test_split(array, **options)
 
     # cv splitter
     else:
-        train, test = _get_single_cv_split(params, array,
-                                           infile_labels=infile_labels,
-                                           infile_groups=infile_groups)
+        train, test = _get_single_cv_split(params, array, infile_labels=infile_labels, infile_groups=infile_groups)
 
     print("Input shape: %s" % repr(array.shape))
     print("Train shape: %s" % repr(train.shape))
     print("Test shape: %s" % repr(test.shape))
-    train.to_csv(outfile_train, sep='\t', header=input_header, index=False)
-    test.to_csv(outfile_test, sep='\t', header=input_header, index=False)
+    train.to_csv(outfile_train, sep="\t", header=input_header, index=False)
+    test.to_csv(outfile_test, sep="\t", header=input_header, index=False)
 
 
-if __name__ == '__main__':
+if __name__ == "__main__":
     aparser = argparse.ArgumentParser()
     aparser.add_argument("-i", "--inputs", dest="inputs", required=True)
     aparser.add_argument("-X", "--infile_array", dest="infile_array")
@@ -150,5 +156,11 @@
     aparser.add_argument("-t", "--outfile_test", dest="outfile_test")
     args = aparser.parse_args()
 
-    main(args.inputs, args.infile_array, args.outfile_train,
-         args.outfile_test, args.infile_labels, args.infile_groups)
+    main(
+        args.inputs,
+        args.infile_array,
+        args.outfile_train,
+        args.outfile_test,
+        args.infile_labels,
+        args.infile_groups,
+    )
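The cv-splitter branch above iterates the splitter and keeps only the requested (train, test) pair. A minimal sketch of that nth-split logic using plain scikit-learn's KFold (the toy frame, split settings, and file names are illustrative assumptions):

    import pandas as pd
    from sklearn.model_selection import KFold

    array = pd.DataFrame({"x": range(10), "y": range(10)})   # toy input table

    splitter = KFold(n_splits=5, shuffle=True, random_state=0)
    nth_split = 2                                            # 1-based, like the tool's option

    # walk the splitter until the requested (train, test) pair is reached
    for i, (train_index, test_index) in enumerate(splitter.split(array.values), start=1):
        if i == nth_split:
            train, test = array.iloc[train_index], array.iloc[test_index]
            break

    train.to_csv("train.tabular", sep="\t", index=False)
    test.to_csv("test.tabular", sep="\t", index=False)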