Repository 'sklearn_clf_metrics'
hg clone https://toolshed.g2.bx.psu.edu/repos/bgruening/sklearn_clf_metrics

Changeset 24:9bf11bbeccc3 (2019-05-14)
Previous changeset 23:3cd0dbc038ec (2018-12-30) Next changeset 25:68afcd163b3d (2019-07-09)
Commit message:
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit c0a3a186966888e5787335a7628bf0a4382637e7
modified:
clf_metrics.xml
main_macros.xml
search_model_validation.py
test-data/mv_result02.tabular
test-data/mv_result03.tabular
test-data/mv_result05.tabular
test-data/nn_model01
test-data/pipeline01
test-data/pipeline02
test-data/pipeline03
test-data/pipeline04
test-data/pipeline05
test-data/pipeline06
test-data/pipeline07
test-data/pipeline08
test-data/pipeline09
test-data/pipeline10
test-data/pipeline11
test-data/pipeline12
test-data/searchCV01
test-data/searchCV02
utils.py
added:
feature_selectors.py
iraps_classifier.py
model_validations.py
pk_whitelist.json
preprocessors.py
stacking_ensembles.py
test-data/GridSearchCV.zip
test-data/LinearRegression01.zip
test-data/LinearRegression02.zip
test-data/RFE.zip
test-data/RandomForestClassifier.zip
test-data/RandomForestRegressor01.zip
test-data/StackingCVRegressor01.zip
test-data/StackingCVRegressor02.zip
test-data/XGBRegressor01.zip
test-data/best_estimator_.zip
test-data/best_params_.txt
test-data/best_score_.tabular
test-data/feature_importances_.tabular
test-data/feature_selection_result13
test-data/final_estimator.zip
test-data/get_params.tabular
test-data/get_params01.tabular
test-data/get_params02.tabular
test-data/get_params03.tabular
test-data/get_params04.tabular
test-data/get_params05.tabular
test-data/get_params06.tabular
test-data/get_params07.tabular
test-data/get_params08.tabular
test-data/get_params09.tabular
test-data/get_params10.tabular
test-data/get_params11.tabular
test-data/get_params12.tabular
test-data/named_steps.txt
test-data/pipeline13
test-data/pipeline14
test-data/pipeline15
test-data/ranking_.tabular
removed:
sk_whitelist.json
test-data/mv_result01.tabular
test-data/mv_result04.tabular
test-data/mv_result06.tabular
b
diff -r 3cd0dbc038ec -r 9bf11bbeccc3 clf_metrics.xml
--- a/clf_metrics.xml Sun Dec 30 01:58:56 2018 -0500
+++ b/clf_metrics.xml Tue May 14 18:20:45 2019 -0400
[
@@ -19,9 +19,11 @@
 import json
 import pandas
 import numpy as np
+import sys
 from sklearn import metrics
 
-exec(open("$__tool_directory__/utils.py").read(), globals())
+sys.path.insert(0, '$__tool_directory__')
+from utils import read_columns
 
 input_json_path = sys.argv[1]
 with open(input_json_path, "r") as param_handler:
@@ -39,8 +41,7 @@
         c_option = column_option,
         sep='\t',
         header=header,
-        parse_dates=True
-)
+        parse_dates=True)
 
 header='infer' if params["clf_metrics"].get("header2", None) else None
 column_option = params["clf_metrics"]["column_selector_options_2"]["selected_column_selector_option2"]
@@ -54,8 +55,7 @@
         c_option = column_option,
         sep='\t',
         header=header,
-        parse_dates=True
-)
+        parse_dates=True)
 
 options = params["clf_metrics"].get("options", {})
 print(options)
b
diff -r 3cd0dbc038ec -r 9bf11bbeccc3 feature_selectors.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/feature_selectors.py Tue May 14 18:20:45 2019 -0400
[
b'@@ -0,0 +1,357 @@\n+"""\n+DyRFE\n+DyRFECV\n+MyPipeline\n+MyimbPipeline\n+check_feature_importances\n+"""\n+import numpy as np\n+\n+from imblearn import under_sampling, over_sampling, combine\n+from imblearn.pipeline import Pipeline as imbPipeline\n+from sklearn import (cluster, compose, decomposition, ensemble,\n+                     feature_extraction, feature_selection,\n+                     gaussian_process, kernel_approximation,\n+                     metrics, model_selection, naive_bayes,\n+                     neighbors, pipeline, preprocessing,\n+                     svm, linear_model, tree, discriminant_analysis)\n+\n+from sklearn.base import BaseEstimator\n+from sklearn.base import MetaEstimatorMixin, clone, is_classifier\n+from sklearn.feature_selection.rfe import _rfe_single_fit, RFE, RFECV\n+from sklearn.model_selection import check_cv\n+from sklearn.metrics.scorer import check_scoring\n+from sklearn.utils import check_X_y, safe_indexing, safe_sqr\n+from sklearn.utils._joblib import Parallel, delayed, effective_n_jobs\n+\n+\n+class DyRFE(RFE):\n+    """\n+    Mainly used with DyRFECV\n+\n+    Parameters\n+    ----------\n+    estimator : object\n+        A supervised learning estimator with a ``fit`` method that provides\n+        information about feature importance either through a ``coef_``\n+        attribute or through a ``feature_importances_`` attribute.\n+    n_features_to_select : int or None (default=None)\n+        The number of features to select. 
If `None`, half of the features\n+        are selected.\n+    step : int, float or list, optional (default=1)\n+        If greater than or equal to 1, then ``step`` corresponds to the\n+        (integer) number of features to remove at each iteration.\n+        If within (0.0, 1.0), then ``step`` corresponds to the percentage\n+        (rounded down) of features to remove at each iteration.\n+        If list, a series of steps of features to remove at each iteration.\n+        Iterations stops when steps finish\n+    verbose : int, (default=0)\n+        Controls verbosity of output.\n+\n+    """\n+    def __init__(self, estimator, n_features_to_select=None, step=1,\n+                 verbose=0):\n+        super(DyRFE, self).__init__(estimator, n_features_to_select,\n+                                    step, verbose)\n+\n+    def _fit(self, X, y, step_score=None):\n+\n+        if type(self.step) is not list:\n+            return super(DyRFE, self)._fit(X, y, step_score)\n+\n+        # dynamic step\n+        X, y = check_X_y(X, y, "csc")\n+        # Initialization\n+        n_features = X.shape[1]\n+        if self.n_features_to_select is None:\n+            n_features_to_select = n_features // 2\n+        else:\n+            n_features_to_select = self.n_features_to_select\n+\n+        step = []\n+        for s in self.step:\n+            if 0.0 < s < 1.0:\n+                step.append(int(max(1, s * n_features)))\n+            else:\n+                step.append(int(s))\n+            if s <= 0:\n+                raise ValueError("Step must be >0")\n+\n+        support_ = np.ones(n_features, dtype=np.bool)\n+        ranking_ = np.ones(n_features, dtype=np.int)\n+\n+        if step_score:\n+            self.scores_ = []\n+\n+        step_i = 0\n+        # Elimination\n+        while np.sum(support_) > n_features_to_select and step_i < len(step):\n+\n+            # if last step is 1, will keep loop\n+            if step_i == len(step) - 1 and step[step_i] != 0:\n+    
            step.append(step[step_i])\n+\n+            # Remaining features\n+            features = np.arange(n_features)[support_]\n+\n+            # Rank the remaining features\n+            estimator = clone(self.estimator)\n+            if self.verbose > 0:\n+                print("Fitting estimator with %d features." % np.sum(support_))\n+\n+            estimator.fit(X[:, features], y)\n+\n+            # Get coefs\n+            if hasattr(estimator, \'coef_\'):\n+                coefs = estimator.coef_\n+            else:\n+                coefs = getattr(estimator, \'feature_importances_\', None)\n+  '..b'        # Note that joblib raises a non-picklable error for bound methods\n+        # even if n_jobs is set to 1 with the default multiprocessing\n+        # backend.\n+        # This branching is done so that to\n+        # make sure that user code that sets n_jobs to 1\n+        # and provides bound methods as scorers is not broken with the\n+        # addition of n_jobs parameter in version 0.18.\n+\n+        if effective_n_jobs(self.n_jobs) == 1:\n+            parallel, func = list, _rfe_single_fit\n+        else:\n+            parallel = Parallel(n_jobs=self.n_jobs)\n+            func = delayed(_rfe_single_fit)\n+\n+        scores = parallel(\n+            func(rfe, self.estimator, X, y, train, test, scorer)\n+            for train, test in cv.split(X, y, groups))\n+\n+        scores = np.sum(scores, axis=0)\n+        diff = int(scores.shape[0]) - len(step)\n+        if diff > 0:\n+            step = np.r_[step, [step[-1]] * diff]\n+        scores_rev = scores[::-1]\n+        argmax_idx = len(scores) - np.argmax(scores_rev) - 1\n+        n_features_to_select = max(\n+            n_features - sum(step[:argmax_idx]),\n+            self.min_features_to_select)\n+\n+        # Re-execute an elimination with best_k over the whole set\n+        rfe = DyRFE(estimator=self.estimator,\n+                    n_features_to_select=n_features_to_select, 
step=self.step,\n+                    verbose=self.verbose)\n+\n+        rfe.fit(X, y)\n+\n+        # Set final attributes\n+        self.support_ = rfe.support_\n+        self.n_features_ = rfe.n_features_\n+        self.ranking_ = rfe.ranking_\n+        self.estimator_ = clone(self.estimator)\n+        self.estimator_.fit(self.transform(X), y)\n+\n+        # Fixing a normalization error, n is equal to get_n_splits(X, y) - 1\n+        # here, the scores are normalized by get_n_splits(X, y)\n+        self.grid_scores_ = scores[::-1] / cv.get_n_splits(X, y, groups)\n+        return self\n+\n+\n+class MyPipeline(pipeline.Pipeline):\n+    """\n+    Extend pipeline object to have feature_importances_ attribute\n+    """\n+    def fit(self, X, y=None, **fit_params):\n+        super(MyPipeline, self).fit(X, y, **fit_params)\n+        estimator = self.steps[-1][-1]\n+        if hasattr(estimator, \'coef_\'):\n+            coefs = estimator.coef_\n+        else:\n+            coefs = getattr(estimator, \'feature_importances_\', None)\n+        if coefs is None:\n+            raise RuntimeError(\'The estimator in the pipeline does not expose \'\n+                               \'"coef_" or "feature_importances_" \'\n+                               \'attributes\')\n+        self.feature_importances_ = coefs\n+        return self\n+\n+\n+class MyimbPipeline(imbPipeline):\n+    """\n+    Extend imblance pipeline object to have feature_importances_ attribute\n+    """\n+    def fit(self, X, y=None, **fit_params):\n+        super(MyimbPipeline, self).fit(X, y, **fit_params)\n+        estimator = self.steps[-1][-1]\n+        if hasattr(estimator, \'coef_\'):\n+            coefs = estimator.coef_\n+        else:\n+            coefs = getattr(estimator, \'feature_importances_\', None)\n+        if coefs is None:\n+            raise RuntimeError(\'The estimator in the pipeline does not expose \'\n+                               \'"coef_" or "feature_importances_" \'\n+               
                \'attributes\')\n+        self.feature_importances_ = coefs\n+        return self\n+\n+\n+def check_feature_importances(estimator):\n+    """\n+    For pipeline object which has no feature_importances_ property,\n+    this function returns the same comfigured pipeline object with\n+    attached the last estimator\'s feature_importances_.\n+    """\n+    if estimator.__class__.__module__ == \'sklearn.pipeline\':\n+        pipeline_steps = estimator.get_params()[\'steps\']\n+        estimator = MyPipeline(pipeline_steps)\n+    elif estimator.__class__.__module__ == \'imblearn.pipeline\':\n+        pipeline_steps = estimator.get_params()[\'steps\']\n+        estimator = MyimbPipeline(pipeline_steps)\n+    else:\n+        return estimator\n'
b
diff -r 3cd0dbc038ec -r 9bf11bbeccc3 iraps_classifier.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/iraps_classifier.py Tue May 14 18:20:45 2019 -0400
[
b'@@ -0,0 +1,569 @@\n+"""\n+class IRAPSCore\n+class IRAPSClassifier\n+class BinarizeTargetClassifier\n+class BinarizeTargetRegressor\n+class _BinarizeTargetScorer\n+class _BinarizeTargetProbaScorer\n+\n+binarize_auc_scorer\n+binarize_average_precision_scorer\n+\n+binarize_accuracy_scorer\n+binarize_balanced_accuracy_scorer\n+binarize_precision_scorer\n+binarize_recall_scorer\n+"""\n+\n+\n+import numpy as np\n+import random\n+import warnings\n+\n+from abc import ABCMeta\n+from scipy.stats import ttest_ind\n+from sklearn import metrics\n+from sklearn.base import BaseEstimator, clone, RegressorMixin\n+from sklearn.externals import six\n+from sklearn.feature_selection.univariate_selection import _BaseFilter\n+from sklearn.metrics.scorer import _BaseScorer\n+from sklearn.pipeline import Pipeline\n+from sklearn.utils import as_float_array, check_X_y\n+from sklearn.utils._joblib import Parallel, delayed\n+from sklearn.utils.validation import (check_array, check_is_fitted,\n+                                      check_memory, column_or_1d)\n+\n+\n+VERSION = \'0.1.1\'\n+\n+\n+class IRAPSCore(six.with_metaclass(ABCMeta, BaseEstimator)):\n+    """\n+    Base class of IRAPSClassifier\n+    From sklearn BaseEstimator:\n+        get_params()\n+        set_params()\n+\n+    Parameters\n+    ----------\n+    n_iter : int\n+        sample count\n+\n+    positive_thres : float\n+        z_score shreshold to discretize positive target values\n+\n+    negative_thres : float\n+        z_score threshold to discretize negative target values\n+\n+    verbose : int\n+        0 or geater, if not 0, print progress\n+\n+    n_jobs : int, default=1\n+        The number of CPUs to use to do the computation.\n+\n+    pre_dispatch : int, or string.\n+        Controls the number of jobs that get dispatched during parallel\n+        execution. Reducing this number can be useful to avoid an\n+        explosion of memory consumption when more jobs get dispatched\n+        than CPUs can process. 
This parameter can be:\n+            - None, in which case all the jobs are immediately\n+              created and spawned. Use this for lightweight and\n+              fast-running jobs, to avoid delays due to on-demand\n+              spawning of the jobs\n+            - An int, giving the exact number of total jobs that are\n+              spawned\n+            - A string, giving an expression as a function of n_jobs,\n+              as in \'2*n_jobs\'\n+\n+    random_state : int or None\n+    """\n+\n+    def __init__(self, n_iter=1000, positive_thres=-1, negative_thres=0,\n+                 verbose=0, n_jobs=1, pre_dispatch=\'2*n_jobs\',\n+                 random_state=None):\n+        """\n+        IRAPS turns towwards general Anomaly Detection\n+        It comapares positive_thres with negative_thres,\n+        and decide which portion is the positive target.\n+        e.g.:\n+        (positive_thres=-1, negative_thres=0)\n+                 => positive = Z_score of target < -1\n+        (positive_thres=1, negative_thres=0)\n+                 => positive = Z_score of target > 1\n+\n+        Note: The positive targets here is always the\n+            abnormal minority group.\n+        """\n+        self.n_iter = n_iter\n+        self.positive_thres = positive_thres\n+        self.negative_thres = negative_thres\n+        self.verbose = verbose\n+        self.n_jobs = n_jobs\n+        self.pre_dispatch = pre_dispatch\n+        self.random_state = random_state\n+\n+    def fit(self, X, y):\n+        """\n+        X: array-like (n_samples x n_features)\n+        y: 1-d array-like (n_samples)\n+        """\n+        X, y = check_X_y(X, y, [\'csr\', \'csc\'], multi_output=False)\n+\n+        def _stochastic_sampling(X, y, random_state=None, positive_thres=-1,\n+                                 negative_thres=0):\n+            # each iteration select a random number of random subset of\n+            # training samples. 
this is somewhat different from the original\n+            # IRAPS method, but effect is almost the same.\n+            SAMPLE_SIZE = [0.25, 0.75]\n+            n_samples = X.shape[0'..b'lue = main_estimator.discretize_value\n+        less_is_positive = main_estimator.less_is_positive\n+\n+        if less_is_positive:\n+            y_trans = y < discretize_value\n+        else:\n+            y_trans = y > discretize_value\n+\n+        y_pred = clf.predict(X)\n+        if sample_weight is not None:\n+            return self._sign * self._score_func(y_trans, y_pred,\n+                                                 sample_weight=sample_weight,\n+                                                 **self._kwargs)\n+        else:\n+            return self._sign * self._score_func(y_trans, y_pred,\n+                                                 **self._kwargs)\n+\n+\n+# roc_auc\n+binarize_auc_scorer =\\\n+        _BinarizeTargetProbaScorer(metrics.roc_auc_score, 1, {})\n+\n+# average_precision_scorer\n+binarize_average_precision_scorer =\\\n+        _BinarizeTargetProbaScorer(metrics.average_precision_score, 1, {})\n+\n+# roc_auc_scorer\n+iraps_auc_scorer = binarize_auc_scorer\n+\n+# average_precision_scorer\n+iraps_average_precision_scorer = binarize_average_precision_scorer\n+\n+\n+class BinarizeTargetRegressor(BaseEstimator, RegressorMixin):\n+    """\n+    Extend regression estimator to have discretize_value\n+\n+    Parameters\n+    ----------\n+    regressor: object\n+        Estimator object such as derived from sklearn `RegressionMixin`.\n+\n+    z_score: float, default=-1.0\n+        Threshold value based on z_score. 
Will be ignored when\n+        fixed_value is set\n+\n+    value: float, default=None\n+        Threshold value\n+\n+    less_is_positive: boolean, default=True\n+        When target is less the threshold value, it will be converted\n+        to True, False otherwise.\n+\n+    Attributes\n+    ----------\n+    regressor_: object\n+        Fitted regressor\n+\n+    discretize_value: float\n+        The threshold value used to discretize True and False targets\n+    """\n+\n+    def __init__(self, regressor, z_score=-1, value=None,\n+                 less_is_positive=True):\n+        self.regressor = regressor\n+        self.z_score = z_score\n+        self.value = value\n+        self.less_is_positive = less_is_positive\n+\n+    def fit(self, X, y, sample_weight=None):\n+        """\n+        Calculate the discretize_value fit the regressor with traning data\n+\n+        Returns\n+        ------\n+        self: object\n+        """\n+        y = check_array(y, accept_sparse=False, force_all_finite=True,\n+                        ensure_2d=False, dtype=\'numeric\')\n+        y = column_or_1d(y)\n+\n+        if self.value is None:\n+            discretize_value = y.mean() + y.std() * self.z_score\n+        else:\n+            discretize_value = self.Value\n+        self.discretize_value = discretize_value\n+\n+        self.regressor_ = clone(self.regressor)\n+\n+        if sample_weight is not None:\n+            self.regressor_.fit(X, y, sample_weight=sample_weight)\n+        else:\n+            self.regressor_.fit(X, y)\n+\n+        # attach classifier attributes\n+        if hasattr(self.regressor_, \'feature_importances_\'):\n+            self.feature_importances_ = self.regressor_.feature_importances_\n+        if hasattr(self.regressor_, \'coef_\'):\n+            self.coef_ = self.regressor_.coef_\n+        if hasattr(self.regressor_, \'n_outputs_\'):\n+            self.n_outputs_ = self.regressor_.n_outputs_\n+        if hasattr(self.regressor_, 
\'n_features_\'):\n+            self.n_features_ = self.regressor_.n_features_\n+\n+        return self\n+\n+    def predict(self, X):\n+        """Predict target value of X\n+        """\n+        check_is_fitted(self, \'regressor_\')\n+        y_pred = self.regressor_.predict(X)\n+        if not np.all((y_pred >= 0) & (y_pred <= 1)):\n+            y_pred = (y_pred - y_pred.min()) / (y_pred.max() - y_pred.min())\n+        if self.less_is_positive:\n+            y_pred = 1 - y_pred\n+        return y_pred\n+\n+\n+# roc_auc_scorer\n+regression_auc_scorer = binarize_auc_scorer\n+\n+# average_precision_scorer\n+regression_average_precision_scorer = binarize_average_precision_scorer\n'
b
diff -r 3cd0dbc038ec -r 9bf11bbeccc3 main_macros.xml
--- a/main_macros.xml Sun Dec 30 01:58:56 2018 -0500
+++ b/main_macros.xml Tue May 14 18:20:45 2019 -0400
[
b'@@ -1,14 +1,17 @@\n <macros>\n-  <token name="@VERSION@">1.0</token>\n+  <token name="@VERSION@">1.0.0.4</token>\n \n   <xml name="python_requirements">\n       <requirements>\n           <requirement type="package" version="3.6">python</requirement>\n-          <requirement type="package" version="0.20.2">scikit-learn</requirement>\n-          <requirement type="package" version="0.23.4">pandas</requirement>\n+          <requirement type="package" version="0.20.3">scikit-learn</requirement>\n+          <requirement type="package" version="0.24.2">pandas</requirement>\n           <requirement type="package" version="0.80">xgboost</requirement>\n           <requirement type="package" version="0.9.13">asteval</requirement>\n-          <yield />\n+          <requirement type="package" version="0.6">skrebate</requirement>\n+          <requirement type="package" version="0.4.2">imbalanced-learn</requirement>\n+          <requirement type="package" version="0.16.0">mlxtend</requirement>\n+          <yield/>\n       </requirements>\n   </xml>\n \n@@ -352,10 +355,10 @@\n       <option value="all_columns">All columns</option>\n     </param>\n     <when value="by_index_number">\n-      <param name="@COL_NAME@" multiple="@MULTIPLE@" type="data_column" data_ref="@INFILE@" label="Select target column(s):"/>\n+      <param name="@COL_NAME@" multiple="@MULTIPLE@" type="data_column" use_header_names="true" data_ref="@INFILE@" label="Select target column(s):"/>\n     </when>\n     <when value="all_but_by_index_number">\n-      <param name="@COL_NAME@" multiple="@MULTIPLE@" type="data_column" data_ref="@INFILE@" label="Select target column(s):"/>\n+      <param name="@COL_NAME@" multiple="@MULTIPLE@" type="data_column" use_header_names="true" data_ref="@INFILE@" label="Select target column(s):"/>\n     </when>\n     <when value="by_header_name">\n       <param name="@COL_NAME@" type="text" value="" label="Type header name(s):" help="Comma-separated string. 
For example: target1,target2"/>\n@@ -428,7 +431,7 @@\n           <option value="sparse">sparse matrix</option>\n       </param>\n       <when value="tabular">\n-          <expand macro="samples_tabular" multiple1="true"/>\n+          <expand macro="samples_tabular" multiple1="true" multiple2="false"/>\n       </when>\n       <when value="sparse">\n           <expand macro="sparse_target"/>\n@@ -823,6 +826,8 @@\n     <option value="StratifiedShuffleSplit">StratifiedShuffleSplit</option>\n     <option value="TimeSeriesSplit">TimeSeriesSplit</option>\n     <option value="PredefinedSplit">PredefinedSplit</option>\n+    <option value="OrderedKFold">OrderedKFold</option>\n+    <option value="RepeatedOrderedKFold">RepeatedOrderedKFold</option>\n     <yield/>\n   </xml>\n \n@@ -872,6 +877,16 @@\n     <when value="PredefinedSplit">\n       <param argument="test_fold" type="text" value="" area="true" label="test_fold" help="List, e.g., [0, 1, -1, 1], represents two test sets, [X[0]] and [X[1], X[3]], X[2] is excluded from any test set due to \'-1\'."/>\n     </when>\n+    <when value="OrderedKFold">\n+      <expand macro="cv_n_splits"/>\n+      <expand macro="cv_shuffle"/>\n+      <expand macro="random_state"/>\n+    </when>\n+    <when value="RepeatedOrderedKFold">\n+      <expand macro="cv_n_splits"/>\n+      <param argument="n_repeats" type="integer" value="5"/>\n+      <expand macro="random_state"/>\n+    </when>\n     <yield/>\n   </xml>\n \n@@ -929,7 +944,13 @@\n   </xml>\n \n   <xml name="cv_groups" >\n-    <param argument="groups" type="text" value="" area="true" label="Groups" help="Group lables in a list. 
e.g., [1, 1, 2, 2, 3, 3, 3]"/>\n+    <section name="groups_selector" title="Groups column selector" expanded="true">\n+      <param name="infile_g" type="data" format="tabular" label="Choose dataset containing groups info:"/>\n+      <param name="header_g" type="boolean" optional="true" truevalue="booltrue" falsevalue="boolfalse" checked="False" label="Does the dataset contain header:" />\n+      <conditional name="column_selector_options_g">\n+        <expand macro="sa'..b'   </sanitizer>\n-    </param>\n-  </xml>\n-\n   <xml name="search_cv_options">\n       <expand macro="scoring_selection"/>\n       <expand macro="model_validation_common_options"/>\n-      <expand macro="pre_dispatch" value="2*n_jobs" help="Controls the number of jobs that get dispatched during parallel execution"/>\n+      <!--expand macro="pre_dispatch" default_value="2*n_jobs" help="Controls the number of jobs that get dispatched during parallel execution"/-->\n       <param argument="iid" type="boolean" truevalue="booltrue" falsevalue="boolfalse" checked="true" label="iid" help="If True, data is identically distributed across the folds"/>\n       <param argument="refit" type="boolean" truevalue="booltrue" falsevalue="boolfalse" checked="true" label="refit" help="Refit an estimator using the best found parameters on the whole dataset."/>\n       <param argument="error_score" type="boolean" truevalue="booltrue" falsevalue="boolfalse" checked="true" label="Raise fit error:" help="If false, the metric score is assigned to NaN if an error occurs in estimator fitting and FitFailedWarning is raised."/>\n@@ -1403,12 +1454,12 @@\n     <conditional name="estimator_selector">\n       <param name="selected_module" type="select" label="Choose the module that contains target estimator:" >\n         <expand macro="estimator_module_options">\n-            <option value="customer_estimator">Load a customer estimator</option>\n+            <option value="custom_estimator">Load a custom estimator</option>\n   
      </expand>\n       </param>\n       <expand macro="estimator_suboptions">\n-        <when value="customer_estimator">\n-            <param name="c_estimator" type="data" format="zip" label="Choose the dataset containing the customer estimator or pipeline:"/>\n+        <when value="custom_estimator">\n+            <param name="c_estimator" type="data" format="zip" label="Choose the dataset containing the custom estimator or pipeline:"/>\n         </when>\n       </expand>\n     </conditional>\n@@ -1591,6 +1642,7 @@\n         <option value="over_sampling.SMOTENC">over_sampling.SMOTENC</option>\n         <option value="combine.SMOTEENN">combine.SMOTEENN</option>\n         <option value="combine.SMOTETomek">combine.SMOTETomek</option>\n+        <option value="Z_RandomOverSampler">Z_RandomOverSampler - for regression</option>\n       </param>\n       <when value="under_sampling.ClusterCentroids">\n         <expand macro="estimator_params_text"\n@@ -1668,6 +1720,33 @@\n         <expand macro="estimator_params_text"\n               help="Default(=blank): sampling_strategy=\'auto\', random_state=None, smote=None, tomek=None."/>\n       </when>\n+      <when value="Z_RandomOverSampler">\n+        <expand macro="estimator_params_text"\n+              help="Default(=blank): sampling_strategy=\'auto\', random_state=None, negative_thres=0, positive_thres=-1."/>\n+      </when>\n+    </conditional>\n+  </xml>\n+\n+  <xml name="stacking_ensemble_inputs">\n+    <section name="options" title="Advanced Options" expanded="false">\n+        <yield/>\n+        <param argument="use_features_in_secondary" type="boolean" truevalue="booltrue" falsevalue="boolfalse" checked="false"/>\n+        <param argument="store_train_meta_features" type="boolean" truevalue="booltrue" falsevalue="boolfalse" checked="false"/>\n+    </section>\n+  </xml>\n+\n+  <xml name="stacking_base_estimator">\n+    <conditional name="estimator_selector">\n+        <param name="selected_module" type="select" 
label="Choose the module that contains target estimator:" >\n+            <expand macro="estimator_module_options">\n+                <option value="custom_estimator">Load a custom estimator</option>\n+            </expand>\n+        </param>\n+        <expand macro="estimator_suboptions">\n+            <when value="custom_estimator">\n+                <param name="c_estimator" type="data" format="zip" label="Choose the dataset containing the custom estimator or pipeline"/>\n+            </when>\n+        </expand>\n     </conditional>\n   </xml>\n \n'
b
diff -r 3cd0dbc038ec -r 9bf11bbeccc3 model_validations.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/model_validations.py Tue May 14 18:20:45 2019 -0400
[
b'@@ -0,0 +1,252 @@\n+"""\n+class\n+-----\n+OrderedKFold\n+RepeatedOrderedKold\n+\n+\n+function\n+--------\n+train_test_split\n+"""\n+\n+import numpy as np\n+import warnings\n+\n+from itertools import chain\n+from math import ceil, floor\n+from sklearn.model_selection import (GroupShuffleSplit, ShuffleSplit,\n+                                     StratifiedShuffleSplit)\n+from sklearn.model_selection._split import _BaseKFold, _RepeatedSplits\n+from sklearn.utils import check_random_state, indexable, safe_indexing\n+from sklearn.utils.validation import _num_samples, check_array\n+\n+\n+def _validate_shuffle_split(n_samples, test_size, train_size,\n+                            default_test_size=None):\n+    """\n+    Validation helper to check if the test/test sizes are meaningful wrt to the\n+    size of the data (n_samples)\n+    """\n+    if test_size is None and train_size is None:\n+        test_size = default_test_size\n+\n+    test_size_type = np.asarray(test_size).dtype.kind\n+    train_size_type = np.asarray(train_size).dtype.kind\n+\n+    if (test_size_type == \'i\' and (test_size >= n_samples or test_size <= 0)\n+       or test_size_type == \'f\' and (test_size <= 0 or test_size >= 1)):\n+        raise ValueError(\'test_size={0} should be either positive and smaller\'\n+                         \' than the number of samples {1} or a float in the \'\n+                         \'(0, 1) range\'.format(test_size, n_samples))\n+\n+    if (train_size_type == \'i\' and (train_size >= n_samples or train_size <= 0)\n+       or train_size_type == \'f\' and (train_size <= 0 or train_size >= 1)):\n+        raise ValueError(\'train_size={0} should be either positive and smaller\'\n+                         \' than the number of samples {1} or a float in the \'\n+                         \'(0, 1) range\'.format(train_size, n_samples))\n+\n+    if train_size is not None and train_size_type not in (\'i\', \'f\'):\n+        raise ValueError("Invalid value for train_size: 
{}".format(train_size))\n+    if test_size is not None and test_size_type not in (\'i\', \'f\'):\n+        raise ValueError("Invalid value for test_size: {}".format(test_size))\n+\n+    if (train_size_type == \'f\' and test_size_type == \'f\' and\n+            train_size + test_size > 1):\n+        raise ValueError(\n+            \'The sum of test_size and train_size = {}, should be in the (0, 1)\'\n+            \' range. Reduce test_size and/or train_size.\'\n+            .format(train_size + test_size))\n+\n+    if test_size_type == \'f\':\n+        n_test = ceil(test_size * n_samples)\n+    elif test_size_type == \'i\':\n+        n_test = float(test_size)\n+\n+    if train_size_type == \'f\':\n+        n_train = floor(train_size * n_samples)\n+    elif train_size_type == \'i\':\n+        n_train = float(train_size)\n+\n+    if train_size is None:\n+        n_train = n_samples - n_test\n+    elif test_size is None:\n+        n_test = n_samples - n_train\n+\n+    if n_train + n_test > n_samples:\n+        raise ValueError(\'The sum of train_size and test_size = %d, \'\n+                         \'should be smaller than the number of \'\n+                         \'samples %d. Reduce test_size and/or \'\n+                         \'train_size.\' % (n_train + n_test, n_samples))\n+\n+    n_train, n_test = int(n_train), int(n_test)\n+\n+    if n_train == 0:\n+        raise ValueError(\n+            \'With n_samples={}, test_size={} and train_size={}, the \'\n+            \'resulting train set will be empty. 
Adjust any of the \'\n+            \'aforementioned parameters.\'.format(n_samples, test_size,\n+                                                train_size)\n+        )\n+\n+    return n_train, n_test\n+\n+\n+def train_test_split(*arrays, **options):\n+    """Extend sklearn.model_selection.train_test_slit to have group split.\n+\n+    Parameters\n+    ----------\n+    *arrays : sequence of indexables with same length / shape[0]\n+        Allowed inputs are lists, numpy arrays, scipy-sparse\n+        matrices or pandas dataframes.\n+\n+    test_size : float, int or None, optional (default=None)\n+        If float, should be betw'..b'arrays == 0:\n+        raise ValueError("At least one array required as input")\n+    test_size = options.pop(\'test_size\', None)\n+    train_size = options.pop(\'train_size\', None)\n+    random_state = options.pop(\'random_state\', None)\n+    shuffle = options.pop(\'shuffle\', \'simple\')\n+    labels = options.pop(\'labels\', None)\n+\n+    if options:\n+        raise TypeError("Invalid parameters passed: %s" % str(options))\n+\n+    arrays = indexable(*arrays)\n+\n+    n_samples = _num_samples(arrays[0])\n+    if shuffle == \'group\':\n+        if labels is None:\n+            raise ValueError("When shuffle=\'group\', "\n+                             "labels should not be None!")\n+        labels = check_array(labels, ensure_2d=False, dtype=None)\n+        uniques = np.unique(labels)\n+        n_samples = uniques.size\n+\n+    n_train, n_test = _validate_shuffle_split(n_samples, test_size, train_size,\n+                                              default_test_size=0.25)\n+\n+    shuffle_options = dict(test_size=n_test,\n+                           train_size=n_train,\n+                           random_state=random_state)\n+\n+    if shuffle is None:\n+        if labels is not None:\n+            warnings.warn("The `labels` is ignored for "\n+                          "shuffle being None!")\n+\n+        train = 
np.arange(n_train)\n+        test = np.arange(n_train, n_train + n_test)\n+\n+    elif shuffle == \'simple\':\n+        if labels is not None:\n+            warnings.warn("The `labels` is not needed and therefore "\n+                          "ignored for ShuffleSplit, as shuffle=\'simple\'!")\n+\n+        cv = ShuffleSplit(**shuffle_options)\n+        train, test = next(cv.split(X=arrays[0], y=None))\n+\n+    elif shuffle == \'stratified\':\n+        cv = StratifiedShuffleSplit(**shuffle_options)\n+        train, test = next(cv.split(X=arrays[0], y=labels))\n+\n+    elif shuffle == \'group\':\n+        cv = GroupShuffleSplit(**shuffle_options)\n+        train, test = next(cv.split(X=arrays[0], y=None, groups=labels))\n+\n+    else:\n+        raise ValueError("The argument `shuffle` only supports None, "\n+                         "\'simple\', \'stratified\' and \'group\', but got `%s`!"\n+                         % shuffle)\n+\n+    return list(chain.from_iterable((safe_indexing(a, train),\n+                                    safe_indexing(a, test)) for a in arrays))\n+\n+\n+class OrderedKFold(_BaseKFold):\n+    """\n+    Split into K fold based on ordered target value\n+\n+    Parameters\n+    ----------\n+    n_splits : int, default=3\n+        Number of folds. 
Must be at least 2.\n+    shuffle: bool\n+    random_state: None or int\n+    """\n+\n+    def __init__(self, n_splits=3, shuffle=False, random_state=None):\n+        super(OrderedKFold, self).__init__(n_splits, shuffle, random_state)\n+\n+    def _iter_test_indices(self, X, y, groups=None):\n+        n_samples = _num_samples(X)\n+        n_splits = self.n_splits\n+        y = np.asarray(y)\n+        sorted_index = np.argsort(y)\n+        if self.shuffle:\n+            current = 0\n+            rng = check_random_state(self.random_state)\n+            for i in range(n_samples // int(n_splits)):\n+                start, stop = current, current + n_splits\n+                rng.shuffle(sorted_index[start:stop])\n+                current = stop\n+            rng.shuffle(sorted_index[current:])\n+\n+        for i in range(n_splits):\n+            yield sorted_index[i:n_samples:n_splits]\n+\n+\n+class RepeatedOrderedKFold(_RepeatedSplits):\n+    """ Repeated OrderedKFold runs mutiple times with different randomization.\n+\n+    Parameters\n+    ----------\n+    n_splits : int, default=5\n+        Number of folds. Must be at least 2.\n+\n+    n_repeats : int, default=5\n+        Number of times cross-validator to be repeated.\n+\n+    random_state: int, RandomState instance or None. Optional\n+    """\n+    def __init__(self, n_splits=5, n_repeats=5, random_state=None):\n+        super(RepeatedOrderedKFold, self).__init__(\n+            OrderedKFold, n_repeats, random_state, n_splits=n_splits)\n'
b
diff -r 3cd0dbc038ec -r 9bf11bbeccc3 pk_whitelist.json
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/pk_whitelist.json Tue May 14 18:20:45 2019 -0400
[
b'@@ -0,0 +1,768 @@\n+{ "SK_NAMES": [\n+    "sklearn._ASSUME_FINITE", "sklearn._isotonic._inplace_contiguous_isotonic_regression",\n+    "sklearn._isotonic._make_unique", "sklearn.base.BaseEstimator",\n+    "sklearn.base.BiclusterMixin", "sklearn.base.ClassifierMixin",\n+    "sklearn.base.ClusterMixin", "sklearn.base.DensityMixin",\n+    "sklearn.base.MetaEstimatorMixin", "sklearn.base.RegressorMixin",\n+    "sklearn.base.TransformerMixin", "sklearn.base._first_and_last_element",\n+    "sklearn.base._pprint", "sklearn.base.clone",\n+    "sklearn.base.is_classifier", "sklearn.base.is_regressor",\n+    "sklearn.clone", "sklearn.cluster.AffinityPropagation",\n+    "sklearn.cluster.AgglomerativeClustering", "sklearn.cluster.Birch",\n+    "sklearn.cluster.DBSCAN", "sklearn.cluster.FeatureAgglomeration",\n+    "sklearn.cluster.KMeans", "sklearn.cluster.MeanShift",\n+    "sklearn.cluster.MiniBatchKMeans", "sklearn.cluster.SpectralBiclustering",\n+    "sklearn.cluster.SpectralClustering", "sklearn.cluster.SpectralCoclustering",\n+    "sklearn.cluster._dbscan_inner.dbscan_inner", "sklearn.cluster._feature_agglomeration.AgglomerationTransform",\n+    "sklearn.cluster._hierarchical.WeightedEdge", "sklearn.cluster._hierarchical._get_parents",\n+    "sklearn.cluster._hierarchical._hc_get_descendent", "sklearn.cluster._hierarchical.average_merge",\n+    "sklearn.cluster._hierarchical.compute_ward_dist", "sklearn.cluster._hierarchical.hc_get_heads",\n+    "sklearn.cluster._hierarchical.max_merge", "sklearn.cluster._k_means._assign_labels_array",\n+    "sklearn.cluster._k_means._assign_labels_csr", "sklearn.cluster._k_means._centers_dense",\n+    "sklearn.cluster._k_means._centers_sparse", "sklearn.cluster._k_means._mini_batch_update_csr",\n+    "sklearn.cluster._k_means_elkan.k_means_elkan", "sklearn.cluster.affinity_propagation",\n+    "sklearn.cluster.affinity_propagation_.AffinityPropagation", "sklearn.cluster.affinity_propagation_.affinity_propagation",\n+    
"sklearn.cluster.bicluster.BaseSpectral", "sklearn.cluster.bicluster.SpectralBiclustering",\n+    "sklearn.cluster.bicluster.SpectralCoclustering", "sklearn.cluster.bicluster._bistochastic_normalize",\n+    "sklearn.cluster.bicluster._log_normalize", "sklearn.cluster.bicluster._scale_normalize",\n+    "sklearn.cluster.birch.Birch", "sklearn.cluster.birch._CFNode",\n+    "sklearn.cluster.birch._CFSubcluster", "sklearn.cluster.birch._iterate_sparse_X",\n+    "sklearn.cluster.birch._split_node", "sklearn.cluster.dbscan",\n+    "sklearn.cluster.dbscan_.DBSCAN", "sklearn.cluster.dbscan_.dbscan",\n+    "sklearn.cluster.estimate_bandwidth", "sklearn.cluster.get_bin_seeds",\n+    "sklearn.cluster.hierarchical.AgglomerativeClustering", "sklearn.cluster.hierarchical.FeatureAgglomeration",\n+    "sklearn.cluster.hierarchical._TREE_BUILDERS", "sklearn.cluster.hierarchical._average_linkage",\n+    "sklearn.cluster.hierarchical._complete_linkage", "sklearn.cluster.hierarchical._fix_connectivity",\n+    "sklearn.cluster.hierarchical._hc_cut", "sklearn.cluster.hierarchical.linkage_tree",\n+    "sklearn.cluster.hierarchical.ward_tree", "sklearn.cluster.k_means",\n+    "sklearn.cluster.k_means_.FLOAT_DTYPES", "sklearn.cluster.k_means_.KMeans",\n+    "sklearn.cluster.k_means_.MiniBatchKMeans", "sklearn.cluster.k_means_._init_centroids",\n+    "sklearn.cluster.k_means_._k_init", "sklearn.cluster.k_means_._kmeans_single_elkan",\n+    "sklearn.cluster.k_means_._kmeans_single_lloyd", "sklearn.cluster.k_means_._labels_inertia",\n+    "sklearn.cluster.k_means_._labels_inertia_precompute_dense", "sklearn.cluster.k_means_._mini_batch_convergence",\n+    "sklearn.cluster.k_means_._mini_batch_step", "sklearn.cluster.k_means_._tolerance",\n+    "sklearn.cluster.k_means_._validate_center_shape", "sklearn.cluster.k_means_.k_means",\n+    "sklearn.cluster.k_means_.string_types", "sklearn.cluster.linkage_tree",\n+    "sklearn.cluster.mean_shift", "sklearn.cluster.mean_shift_.MeanShift",\n+    
"sklearn.cluster.mean_shift_._mean_shift_single_seed", "sklearn.cluster'..b'ltiSURFstar",\n+    "skrebate.ReliefF", "skrebate.SURF",\n+    "skrebate.SURFstar", "skrebate.TuRF",\n+    "skrebate.multisurf.MultiSURF", "skrebate.multisurfstar.MultiSURFstar",\n+    "skrebate.relieff.ReliefF", "skrebate.scoring_utils.MultiSURF_compute_scores",\n+    "skrebate.scoring_utils.MultiSURFstar_compute_scores", "skrebate.scoring_utils.ReliefF_compute_scores",\n+    "skrebate.scoring_utils.SURF_compute_scores", "skrebate.scoring_utils.SURFstar_compute_scores",\n+    "skrebate.scoring_utils.compute_score", "skrebate.scoring_utils.get_row_missing",\n+    "skrebate.scoring_utils.ramp_function", "skrebate.surf.SURF",\n+    "skrebate.surfstar.SURFstar", "skrebate.turf.TuRF"\n+  ],\n+\n+  "XGB_NAMES": [\n+    "xgboost.Booster", "xgboost.DMatrix",\n+    "xgboost.VERSION_FILE", "xgboost.XGBClassifier",\n+    "xgboost.XGBModel", "xgboost.XGBRegressor",\n+    "xgboost.callback._fmt_metric", "xgboost.callback._get_callback_context",\n+    "xgboost.callback.early_stop", "xgboost.callback.print_evaluation",\n+    "xgboost.callback.record_evaluation", "xgboost.callback.reset_learning_rate",\n+    "xgboost.compat.PANDAS_INSTALLED", "xgboost.compat.PY3",\n+    "xgboost.compat.SKLEARN_INSTALLED", "xgboost.compat.STRING_TYPES",\n+    "xgboost.compat.py_str", "xgboost.core.Booster",\n+    "xgboost.core.CallbackEnv", "xgboost.core.DMatrix",\n+    "xgboost.core.EarlyStopException", "xgboost.core.PANDAS_DTYPE_MAPPER",\n+    "xgboost.core.PANDAS_INSTALLED", "xgboost.core.PY3",\n+    "xgboost.core.STRING_TYPES", "xgboost.core.XGBoostError",\n+    "xgboost.core._check_call", "xgboost.core._load_lib",\n+    "xgboost.core._maybe_pandas_data", "xgboost.core._maybe_pandas_label",\n+    "xgboost.core.c_array", "xgboost.core.c_str",\n+    "xgboost.core.ctypes2buffer", "xgboost.core.ctypes2numpy",\n+    "xgboost.core.from_cstr_to_pystr", "xgboost.core.from_pystr_to_cstr",\n+    "xgboost.cv", "xgboost.f",\n+    
"xgboost.libpath.XGBoostLibraryNotFound", "xgboost.libpath.find_lib_path",\n+    "xgboost.plot_importance", "xgboost.plot_tree",\n+    "xgboost.plotting._EDGEPAT", "xgboost.plotting._EDGEPAT2",\n+    "xgboost.plotting._LEAFPAT", "xgboost.plotting._NODEPAT",\n+    "xgboost.plotting._parse_edge", "xgboost.plotting._parse_node",\n+    "xgboost.plotting.plot_importance", "xgboost.plotting.plot_tree",\n+    "xgboost.plotting.to_graphviz", "xgboost.rabit.DTYPE_ENUM__",\n+    "xgboost.rabit.STRING_TYPES", "xgboost.rabit._init_rabit",\n+    "xgboost.rabit.allreduce", "xgboost.rabit.broadcast",\n+    "xgboost.rabit.finalize", "xgboost.rabit.get_processor_name",\n+    "xgboost.rabit.get_rank", "xgboost.rabit.get_world_size",\n+    "xgboost.rabit.init", "xgboost.rabit.tracker_print",\n+    "xgboost.rabit.version_number", "xgboost.sklearn.SKLEARN_INSTALLED",\n+    "xgboost.sklearn.XGBClassifier", "xgboost.sklearn.XGBModel",\n+    "xgboost.sklearn.XGBRegressor", "xgboost.sklearn._objective_decorator",\n+    "xgboost.to_graphviz", "xgboost.train",\n+    "xgboost.training.CVPack", "xgboost.training.SKLEARN_INSTALLED",\n+    "xgboost.training.STRING_TYPES", "xgboost.training._train_internal",\n+    "xgboost.training.aggcv", "xgboost.training.cv",\n+    "xgboost.training.mknfold", "xgboost.training.train"\n+  ],\n+\n+\n+  "NUMPY_NAMES": [\n+    "numpy.core.multiarray._reconstruct", "numpy.ndarray",\n+    "numpy.dtype", "numpy.core.multiarray.scalar", "numpy.random.__RandomState_ctor",\n+    "numpy.ma.core._mareconstruct", "numpy.ma.core.MaskedArray"\n+  ],\n+\n+  "IMBLEARN_NAMES":[\n+    "imblearn.pipeline.Pipeline", "imblearn.over_sampling._random_over_sampler.RandomOverSampler",\n+    "imblearn.under_sampling._prototype_selection._edited_nearest_neighbours.EditedNearestNeighbours"\n+  ],\n+\n+  "MLXTEND_NAMES":[\n+    "mlxtend.classifier.stacking_cv_classification.StackingCVClassifier",\n+    "mlxtend.classifier.stacking_classification.StackingClassifier",\n+    
"mlxtend.regressor.stacking_cv_regression.StackingCVRegressor",\n+    "mlxtend.regressor.stacking_regression.StackingRegressor"\n+  ]\n+}\n\\ No newline at end of file\n'
b
diff -r 3cd0dbc038ec -r 9bf11bbeccc3 preprocessors.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/preprocessors.py Tue May 14 18:20:45 2019 -0400
[
@@ -0,0 +1,184 @@
+"""
+Z_RandomOverSampler
+"""
+
+import imblearn
+import numpy as np
+
+from collections import Counter
+from imblearn.over_sampling.base import BaseOverSampler
+from imblearn.over_sampling import RandomOverSampler
+from imblearn.pipeline import Pipeline as imbPipeline
+from imblearn.utils import check_target_type
+from scipy import sparse
+from sklearn.base import BaseEstimator, TransformerMixin
+from sklearn.preprocessing.data import _handle_zeros_in_scale
+from sklearn.utils import check_array, safe_indexing
+from sklearn.utils.fixes import nanpercentile
+from sklearn.utils.validation import (check_is_fitted, check_X_y,
+                                      FLOAT_DTYPES)
+
+
+class Z_RandomOverSampler(BaseOverSampler):
+    """
+    Random over-sampler for a numeric target, driven by z-scores.
+
+    The target ``y`` is standardized to z-scores and temporarily
+    discretized into three pseudo-classes. A ``RandomOverSampler`` is
+    fitted on those pseudo-classes and the indices it selects are used to
+    resample the original ``X`` and ``y``.
+
+    Parameters
+    ----------
+    sampling_strategy : default='auto'
+        Forwarded unchanged to the base class and ``RandomOverSampler``.
+    return_indices : bool, default=False
+        If True, ``_fit_resample`` also returns the selected indices.
+    random_state : default=None
+        Forwarded unchanged to ``RandomOverSampler``.
+    ratio : default=None
+        Forwarded unchanged to the base class and ``RandomOverSampler``.
+    negative_thres : number, default=0
+        Samples whose z-score is strictly greater than this value get the
+        pseudo-label 0.
+    positive_thres : number, default=-1
+        Samples whose z-score is less than or equal to this value get the
+        pseudo-label 1.
+
+    Attributes
+    ----------
+    sample_indices_ : ndarray
+        Indices selected by the inner ``RandomOverSampler``; set by
+        ``_fit_resample``.
+    """
+
+    def __init__(self, sampling_strategy='auto',
+                 return_indices=False,
+                 random_state=None,
+                 ratio=None,
+                 negative_thres=0,
+                 positive_thres=-1):
+        super(Z_RandomOverSampler, self).__init__(
+            sampling_strategy=sampling_strategy, ratio=ratio)
+        self.random_state = random_state
+        self.return_indices = return_indices
+        self.negative_thres = negative_thres
+        self.positive_thres = positive_thres
+
+    @staticmethod
+    def _check_X_y(X, y):
+        """
+        Validate ``X`` and ``y`` without forcing a dtype.
+
+        Returns
+        -------
+        X, y : validated inputs (CSR/CSC sparse ``X`` is accepted)
+        binarize_y : flag returned by ``check_target_type``
+        """
+        y, binarize_y = check_target_type(y, indicate_one_vs_all=True)
+        X, y = check_X_y(X, y, accept_sparse=['csr', 'csc'], dtype=None)
+        return X, y, binarize_y
+
+    def _fit_resample(self, X, y):
+        """
+        Over-sample ``X`` and ``y`` based on z-score pseudo-classes of ``y``.
+
+        Returns
+        -------
+        X_resampled, y_resampled
+            ``X`` and the original (not discretized) ``y`` indexed by the
+            selected sample indices.
+        sample_indices : ndarray
+            Only returned when ``self.return_indices`` is True.
+        """
+        # convert y to z_score
+        y_z = (y - y.mean()) / y.std()
+
+        # Both masks are computed before y_z is overwritten with labels.
+        # Boolean masks replace the former per-element membership scan.
+        negative_mask = y_z > self.negative_thres
+        positive_mask = y_z <= self.positive_thres
+
+        y_z[negative_mask] = 0
+        # Assigned second: wins when the two thresholds overlap.
+        y_z[positive_mask] = 1
+        # Everything matched by neither threshold is "unclassified".
+        y_z[~(negative_mask | positive_mask)] = -1
+
+        ros = RandomOverSampler(
+            sampling_strategy=self.sampling_strategy,
+            random_state=self.random_state,
+            ratio=self.ratio)
+        # Only the chosen indices matter; the resampled pseudo-labels
+        # themselves are discarded.
+        _, _ = ros.fit_resample(X, y_z)
+        sample_indices = ros.sample_indices_
+
+        print("Before sampler: %s. Total after: %s"
+              % (Counter(y_z), sample_indices.shape))
+
+        self.sample_indices_ = np.array(sample_indices)
+
+        if self.return_indices:
+            return (safe_indexing(X, sample_indices),
+                    safe_indexing(y, sample_indices),
+                    sample_indices)
+        return (safe_indexing(X, sample_indices),
+                safe_indexing(y, sample_indices))
+
+
+def _get_quantiles(X, quantile_range):
+    """
+    Calculate column percentiles for 2d array
+
+    Parameters
+    ----------
+    X : {ndarray, sparse matrix}, shape [n_samples, n_features]
+        Sparse input of any scipy format is converted to CSC first.
+    quantile_range : sequence of numbers
+        Percentiles forwarded unchanged to ``nanpercentile``.
+
+    Returns
+    -------
+    quantiles : ndarray, shape [len(quantile_range), n_features]
+        Row ``i`` holds the ``quantile_range[i]`` percentile of every
+        column.
+    """
+    is_sparse = sparse.issparse(X)
+    if is_sparse:
+        # The indptr slicing below addresses columns only in CSC layout;
+        # without this conversion a CSR matrix would be read row-wise.
+        X = X.tocsc()
+
+    quantiles = []
+    for feature_idx in range(X.shape[1]):
+        if is_sparse:
+            column_nnz_data = X.data[
+                X.indptr[feature_idx]: X.indptr[feature_idx + 1]]
+            # Rebuild the dense column: stored values first, implicit
+            # zeros after (element order is irrelevant for percentiles).
+            column_data = np.zeros(shape=X.shape[0], dtype=X.dtype)
+            column_data[:len(column_nnz_data)] = column_nnz_data
+        else:
+            column_data = X[:, feature_idx]
+        quantiles.append(nanpercentile(column_data, quantile_range))
+
+    # list of per-feature results -> one row per requested percentile
+    return np.transpose(quantiles)
+
+
+class TDMScaler(BaseEstimator, TransformerMixin):
+    """
+    Scale features using Training Distribution Matching (TDM) algorithm
+
+    ``fit`` records the lower/upper percentiles, their difference and the
+    minimum/maximum of the training matrix; all of them are computed over
+    the whole matrix rather than per feature. ``transform`` clips new data
+    to bounds derived from its own percentiles and the training
+    statistics, then rescales it linearly onto the training
+    ``[min_, max_]`` range.
+
+    Parameters
+    ----------
+    q_lower : float, default=25.0
+        Lower percentile; must satisfy ``0 <= q_lower <= q_upper``.
+    q_upper : float, default=75.0
+        Upper percentile; must satisfy ``q_lower <= q_upper <= 100``.
+
+    Attributes
+    ----------
+    q_lower_, q_upper_ : float
+        Training values at the ``q_lower`` / ``q_upper`` percentiles.
+    iqr_ : float
+        ``q_upper_ - q_lower_`` after ``_handle_zeros_in_scale``.
+    min_, max_ : float
+        Minimum and maximum of the training matrix.
+
+    References
+    ----------
+    .. [1] Thompson JA, Tan J and Greene CS (2016) Cross-platform
+           normalization of microarray and RNA-seq data for machine
+           learning applications. PeerJ 4, e1621.
+    """
+
+    def __init__(self, q_lower=25.0, q_upper=75.0):
+        self.q_lower = q_lower
+        self.q_upper = q_upper
+
+    def fit(self, X, y=None):
+        """
+        Record the training distribution statistics.
+
+        Parameters
+        ----------
+        X : array-like, shape [n_samples, n_features]
+        y : ignored
+
+        Returns
+        -------
+        self
+
+        Raises
+        ------
+        ValueError
+            If not ``0 <= q_lower <= q_upper <= 100``.
+        """
+        X = check_array(X, copy=True, estimator=self, dtype=FLOAT_DTYPES,
+                        force_all_finite=True)
+
+        if not 0 <= self.q_lower <= self.q_upper <= 100:
+            raise ValueError("Invalid quantile parameter values: "
+                             "q_lower %s, q_upper: %s"
+                             % (str(self.q_lower), str(self.q_upper)))
+
+        # TODO sparse data
+        # No axis is given, so the percentiles describe the whole matrix.
+        quantiles = nanpercentile(X, (self.q_lower, self.q_upper))
+        iqr = quantiles[1] - quantiles[0]
+
+        self.q_lower_ = quantiles[0]
+        self.q_upper_ = quantiles[1]
+        self.iqr_ = _handle_zeros_in_scale(iqr, copy=False)
+
+        self.max_ = np.nanmax(X)
+        self.min_ = np.nanmin(X)
+
+        return self
+
+    def transform(self, X):
+        """
+        Clip ``X`` to TDM bounds and rescale it to the training range.
+
+        Parameters
+        ----------
+        X : array-like
+            The data to scale. It is copied during validation, so the
+            caller's array is not modified. Sparse input is not handled
+            yet (see the TODO below).
+
+        Returns
+        -------
+        X : ndarray
+            Scaled copy of the input.
+        """
+        # The attribute names must be passed as ONE list: a second
+        # positional string would be taken as the error message.
+        check_is_fitted(self, ['iqr_', 'max_'])
+        X = check_array(X, copy=True, estimator=self, dtype=FLOAT_DTYPES,
+                        force_all_finite=True)
+
+        # TODO sparse data
+        # Distance of the training extremes from the training quartiles,
+        # expressed in units of the training inter-quantile range.
+        train_upper_scale = (self.max_ - self.q_upper_) / self.iqr_
+        train_lower_scale = (self.q_lower_ - self.min_) / self.iqr_
+
+        test_quantiles = nanpercentile(X, (self.q_lower, self.q_upper))
+        test_iqr = _handle_zeros_in_scale(
+            test_quantiles[1] - test_quantiles[0], copy=False)
+
+        # Apply the same relative distances to the new data's quantiles.
+        test_upper_bound = test_quantiles[1] + train_upper_scale * test_iqr
+        test_lower_bound = test_quantiles[0] - train_lower_scale * test_iqr
+
+        # The lower bound never drops below the smallest observed value.
+        test_min = np.nanmin(X)
+        if test_lower_bound < test_min:
+            test_lower_bound = test_min
+
+        X[X > test_upper_bound] = test_upper_bound
+        X[X < test_lower_bound] = test_lower_bound
+
+        # Map [test_lower_bound, test_upper_bound] onto [min_, max_].
+        X = (X - test_lower_bound) / (test_upper_bound - test_lower_bound)\
+            * (self.max_ - self.min_) + self.min_
+
+        return X
+
+    def inverse_transform(self, X):
+        """
+        Scale the data back to the original state
+
+        Raises
+        ------
+        NotImplementedError
+            Always; the inverse transformation is not implemented.
+        """
+        raise NotImplementedError("Inverse transformation is not implemented!")
b
diff -r 3cd0dbc038ec -r 9bf11bbeccc3 search_model_validation.py
--- a/search_model_validation.py Sun Dec 30 01:58:56 2018 -0500
+++ b/search_model_validation.py Tue May 14 18:20:45 2019 -0400
[
b'@@ -1,7 +1,8 @@\n+import argparse\n+import collections\n import imblearn\n import json\n import numpy as np\n-import os\n import pandas\n import pickle\n import skrebate\n@@ -9,93 +10,124 @@\n import sys\n import xgboost\n import warnings\n+import iraps_classifier\n+import model_validations\n+import preprocessors\n+import feature_selectors\n from imblearn import under_sampling, over_sampling, combine\n-from imblearn.pipeline import Pipeline as imbPipeline\n-from sklearn import (cluster, compose, decomposition, ensemble, feature_extraction,\n-                    feature_selection, gaussian_process, kernel_approximation, metrics,\n-                    model_selection, naive_bayes, neighbors, pipeline, preprocessing,\n-                    svm, linear_model, tree, discriminant_analysis)\n+from scipy.io import mmread\n+from mlxtend import classifier, regressor\n+from sklearn import (cluster, compose, decomposition, ensemble,\n+                     feature_extraction, feature_selection,\n+                     gaussian_process, kernel_approximation, metrics,\n+                     model_selection, naive_bayes, neighbors,\n+                     pipeline, preprocessing, svm, linear_model,\n+                     tree, discriminant_analysis)\n from sklearn.exceptions import FitFailedWarning\n from sklearn.externals import joblib\n-from utils import get_cv, get_scoring, get_X_y, load_model, read_columns, SafeEval\n+from sklearn.model_selection._validation import _score\n+\n+from utils import (SafeEval, get_cv, get_scoring, get_X_y,\n+                   load_model, read_columns)\n+from model_validations import train_test_split\n \n \n-N_JOBS = int(os.environ.get(\'GALAXY_SLOTS\', 1))\n+N_JOBS = int(__import__(\'os\').environ.get(\'GALAXY_SLOTS\', 1))\n+CACHE_DIR = \'./cached\'\n+NON_SEARCHABLE = (\'n_jobs\', \'pre_dispatch\', \'memory\', \'steps\',\n+                  \'nthread\', \'verbose\')\n \n \n-def get_search_params(params_builder):\n+def 
_eval_search_params(params_builder):\n     search_params = {}\n-    safe_eval = SafeEval(load_scipy=True, load_numpy=True)\n-    safe_eval_es = SafeEval(load_estimators=True)\n \n     for p in params_builder[\'param_set\']:\n-        search_p = p[\'search_param_selector\'][\'search_p\']\n-        if search_p.strip() == \'\':\n+        search_list = p[\'sp_list\'].strip()\n+        if search_list == \'\':\n             continue\n-        param_type = p[\'search_param_selector\'][\'selected_param_type\']\n+\n+        param_name = p[\'sp_name\']\n+        if param_name.lower().endswith(NON_SEARCHABLE):\n+            print("Warning: `%s` is not eligible for search and was "\n+                  "omitted!" % param_name)\n+            continue\n \n-        lst = search_p.split(\':\')\n-        assert (len(lst) == 2), "Error, make sure there is one and only one colon in search parameter input."\n-        literal = lst[1].strip()\n-        param_name = lst[0].strip()\n-        if param_name:\n-            if param_name.lower() == \'n_jobs\':\n-                sys.exit("Parameter `%s` is invalid for search." 
%param_name)\n-            elif not param_name.endswith(\'-\'):\n-                ev = safe_eval(literal)\n-                if param_type == \'final_estimator_p\':\n-                    search_params[\'estimator__\' + param_name] = ev\n-                else:\n-                    search_params[\'preprocessing_\' + param_type[5:6] + \'__\' + param_name] = ev\n-            else:\n-                # only for estimator eval, add `-` to the end of param\n-                #TODO maybe add regular express check\n-                ev = safe_eval_es(literal)\n-                for obj in ev:\n-                    if \'n_jobs\' in obj.get_params():\n-                        obj.set_params( n_jobs=N_JOBS )\n-                if param_type == \'final_estimator_p\':\n-                    search_params[\'estimator__\' + param_name[:-1]] = ev\n-                else:\n-                    search_params[\'preprocessing_\' + param_type[5:6] + \'__\' + param_name[:-1]] = ev\n-        elif param_type != \'final_estimator_p\':\n-            #TODO regular express check ?\n-           '..b'_train_test_split == \'yes\':\n+        # make sure refit is choosen\n+        if not options[\'refit\']:\n+            raise ValueError("Refit must be `True` for shuffle splitting!")\n+        split_options = params[\'train_test_split\']\n+\n+        # splits\n+        if split_options[\'shuffle\'] == \'stratified\':\n+            split_options[\'labels\'] = y\n+            X, X_test, y, y_test = train_test_split(X, y, **split_options)\n+        elif split_options[\'shuffle\'] == \'group\':\n+            if not groups:\n+                raise ValueError("No group based CV option was "\n+                                 "choosen for group shuffle!")\n+            split_options[\'labels\'] = groups\n+            X, X_test, y, y_test, groups, _ =\\\n+                train_test_split(X, y, **split_options)\n+        else:\n+            if split_options[\'shuffle\'] == \'None\':\n+                
split_options[\'shuffle\'] = None\n+            X, X_test, y, y_test =\\\n+                train_test_split(X, y, **split_options)\n+    # end train_test_split\n \n     if options[\'error_score\'] == \'raise\':\n-        searcher.fit(X, y)\n+        searcher.fit(X, y, groups=groups)\n     else:\n         warnings.simplefilter(\'always\', FitFailedWarning)\n         with warnings.catch_warnings(record=True) as w:\n             try:\n-                searcher.fit(X, y)\n+                searcher.fit(X, y, groups=groups)\n             except ValueError:\n                 pass\n             for warning in w:\n                 print(repr(warning.message))\n \n-    cv_result = pandas.DataFrame(searcher.cv_results_)\n-    cv_result.rename(inplace=True, columns={\'mean_test_primary\': \'mean_test_\'+primary_scoring, \'rank_test_primary\': \'rank_test_\'+primary_scoring})\n-    cv_result.to_csv(path_or_buf=outfile_result, sep=\'\\t\', header=True, index=False)\n+    if do_train_test_split == \'no\':\n+        # save results\n+        cv_results = pandas.DataFrame(searcher.cv_results_)\n+        cv_results = cv_results[sorted(cv_results.columns)]\n+        cv_results.to_csv(path_or_buf=outfile_result, sep=\'\\t\',\n+                          header=True, index=False)\n+\n+    # output test result using best_estimator_\n+    else:\n+        best_estimator_ = searcher.best_estimator_\n+        if isinstance(options[\'scoring\'], collections.Mapping):\n+            is_multimetric = True\n+        else:\n+            is_multimetric = False\n \n-    if outfile_estimator:\n-        with open(outfile_estimator, \'wb\') as output_handler:\n-            pickle.dump(searcher.best_estimator_, output_handler, pickle.HIGHEST_PROTOCOL)\n+        test_score = _score(best_estimator_, X_test,\n+                            y_test, options[\'scoring\'],\n+                            is_multimetric=is_multimetric)\n+        if not is_multimetric:\n+            test_score = {primary_scoring: 
test_score}\n+        for key, value in test_score.items():\n+            test_score[key] = [value]\n+        result_df = pandas.DataFrame(test_score)\n+        result_df.to_csv(path_or_buf=outfile_result, sep=\'\\t\',\n+                         header=True, index=False)\n+\n+    memory.clear(warn=False)\n+\n+    if outfile_object:\n+        with open(outfile_object, \'wb\') as output_handler:\n+            pickle.dump(searcher, output_handler, pickle.HIGHEST_PROTOCOL)\n+\n+\n+if __name__ == \'__main__\':\n+    aparser = argparse.ArgumentParser()\n+    aparser.add_argument("-i", "--inputs", dest="inputs", required=True)\n+    aparser.add_argument("-e", "--estimator", dest="infile_estimator")\n+    aparser.add_argument("-X", "--infile1", dest="infile1")\n+    aparser.add_argument("-y", "--infile2", dest="infile2")\n+    aparser.add_argument("-r", "--outfile_result", dest="outfile_result")\n+    aparser.add_argument("-o", "--outfile_object", dest="outfile_object")\n+    aparser.add_argument("-g", "--groups", dest="groups")\n+    args = aparser.parse_args()\n+\n+    main(args.inputs, args.infile_estimator, args.infile1, args.infile2,\n+         args.outfile_result, outfile_object=args.outfile_object,\n+         groups=args.groups)\n'
b
diff -r 3cd0dbc038ec -r 9bf11bbeccc3 sk_whitelist.json
--- a/sk_whitelist.json Sun Dec 30 01:58:56 2018 -0500
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
[
b'@@ -1,761 +0,0 @@\n-{ "SK_NAMES": [\n-    "sklearn._ASSUME_FINITE", "sklearn._isotonic._inplace_contiguous_isotonic_regression",\n-    "sklearn._isotonic._make_unique", "sklearn.base.BaseEstimator",\n-    "sklearn.base.BiclusterMixin", "sklearn.base.ClassifierMixin",\n-    "sklearn.base.ClusterMixin", "sklearn.base.DensityMixin",\n-    "sklearn.base.MetaEstimatorMixin", "sklearn.base.RegressorMixin",\n-    "sklearn.base.TransformerMixin", "sklearn.base._first_and_last_element",\n-    "sklearn.base._pprint", "sklearn.base.clone",\n-    "sklearn.base.is_classifier", "sklearn.base.is_regressor",\n-    "sklearn.clone", "sklearn.cluster.AffinityPropagation",\n-    "sklearn.cluster.AgglomerativeClustering", "sklearn.cluster.Birch",\n-    "sklearn.cluster.DBSCAN", "sklearn.cluster.FeatureAgglomeration",\n-    "sklearn.cluster.KMeans", "sklearn.cluster.MeanShift",\n-    "sklearn.cluster.MiniBatchKMeans", "sklearn.cluster.SpectralBiclustering",\n-    "sklearn.cluster.SpectralClustering", "sklearn.cluster.SpectralCoclustering",\n-    "sklearn.cluster._dbscan_inner.dbscan_inner", "sklearn.cluster._feature_agglomeration.AgglomerationTransform",\n-    "sklearn.cluster._hierarchical.WeightedEdge", "sklearn.cluster._hierarchical._get_parents",\n-    "sklearn.cluster._hierarchical._hc_get_descendent", "sklearn.cluster._hierarchical.average_merge",\n-    "sklearn.cluster._hierarchical.compute_ward_dist", "sklearn.cluster._hierarchical.hc_get_heads",\n-    "sklearn.cluster._hierarchical.max_merge", "sklearn.cluster._k_means._assign_labels_array",\n-    "sklearn.cluster._k_means._assign_labels_csr", "sklearn.cluster._k_means._centers_dense",\n-    "sklearn.cluster._k_means._centers_sparse", "sklearn.cluster._k_means._mini_batch_update_csr",\n-    "sklearn.cluster._k_means_elkan.k_means_elkan", "sklearn.cluster.affinity_propagation",\n-    "sklearn.cluster.affinity_propagation_.AffinityPropagation", "sklearn.cluster.affinity_propagation_.affinity_propagation",\n-    
"sklearn.cluster.bicluster.BaseSpectral", "sklearn.cluster.bicluster.SpectralBiclustering",\n-    "sklearn.cluster.bicluster.SpectralCoclustering", "sklearn.cluster.bicluster._bistochastic_normalize",\n-    "sklearn.cluster.bicluster._log_normalize", "sklearn.cluster.bicluster._scale_normalize",\n-    "sklearn.cluster.birch.Birch", "sklearn.cluster.birch._CFNode",\n-    "sklearn.cluster.birch._CFSubcluster", "sklearn.cluster.birch._iterate_sparse_X",\n-    "sklearn.cluster.birch._split_node", "sklearn.cluster.dbscan",\n-    "sklearn.cluster.dbscan_.DBSCAN", "sklearn.cluster.dbscan_.dbscan",\n-    "sklearn.cluster.estimate_bandwidth", "sklearn.cluster.get_bin_seeds",\n-    "sklearn.cluster.hierarchical.AgglomerativeClustering", "sklearn.cluster.hierarchical.FeatureAgglomeration",\n-    "sklearn.cluster.hierarchical._TREE_BUILDERS", "sklearn.cluster.hierarchical._average_linkage",\n-    "sklearn.cluster.hierarchical._complete_linkage", "sklearn.cluster.hierarchical._fix_connectivity",\n-    "sklearn.cluster.hierarchical._hc_cut", "sklearn.cluster.hierarchical.linkage_tree",\n-    "sklearn.cluster.hierarchical.ward_tree", "sklearn.cluster.k_means",\n-    "sklearn.cluster.k_means_.FLOAT_DTYPES", "sklearn.cluster.k_means_.KMeans",\n-    "sklearn.cluster.k_means_.MiniBatchKMeans", "sklearn.cluster.k_means_._init_centroids",\n-    "sklearn.cluster.k_means_._k_init", "sklearn.cluster.k_means_._kmeans_single_elkan",\n-    "sklearn.cluster.k_means_._kmeans_single_lloyd", "sklearn.cluster.k_means_._labels_inertia",\n-    "sklearn.cluster.k_means_._labels_inertia_precompute_dense", "sklearn.cluster.k_means_._mini_batch_convergence",\n-    "sklearn.cluster.k_means_._mini_batch_step", "sklearn.cluster.k_means_._tolerance",\n-    "sklearn.cluster.k_means_._validate_center_shape", "sklearn.cluster.k_means_.k_means",\n-    "sklearn.cluster.k_means_.string_types", "sklearn.cluster.linkage_tree",\n-    "sklearn.cluster.mean_shift", "sklearn.cluster.mean_shift_.MeanShift",\n-    
"sklearn.cluster.mean_shift_._mean_shift_single_seed", "sklearn.cluster'..b'lidation.check_non_negative", "sklearn.utils.validation.check_random_state",\n-    "sklearn.utils.validation.check_symmetric", "sklearn.utils.validation.column_or_1d",\n-    "sklearn.utils.validation.has_fit_parameter", "sklearn.utils.validation.indexable",\n-    "sklearn.utils.weight_vector.WeightVector"\n-],\n-\n-  "SKR_NAMES": [\n-    "skrebate.MultiSURF", "skrebate.MultiSURFstar",\n-    "skrebate.ReliefF", "skrebate.SURF",\n-    "skrebate.SURFstar", "skrebate.TuRF",\n-    "skrebate.multisurf.MultiSURF", "skrebate.multisurfstar.MultiSURFstar",\n-    "skrebate.relieff.ReliefF", "skrebate.scoring_utils.MultiSURF_compute_scores",\n-    "skrebate.scoring_utils.MultiSURFstar_compute_scores", "skrebate.scoring_utils.ReliefF_compute_scores",\n-    "skrebate.scoring_utils.SURF_compute_scores", "skrebate.scoring_utils.SURFstar_compute_scores",\n-    "skrebate.scoring_utils.compute_score", "skrebate.scoring_utils.get_row_missing",\n-    "skrebate.scoring_utils.ramp_function", "skrebate.surf.SURF",\n-    "skrebate.surfstar.SURFstar", "skrebate.turf.TuRF"\n-  ],\n-\n-  "XGB_NAMES": [\n-    "xgboost.Booster", "xgboost.DMatrix",\n-    "xgboost.VERSION_FILE", "xgboost.XGBClassifier",\n-    "xgboost.XGBModel", "xgboost.XGBRegressor",\n-    "xgboost.callback._fmt_metric", "xgboost.callback._get_callback_context",\n-    "xgboost.callback.early_stop", "xgboost.callback.print_evaluation",\n-    "xgboost.callback.record_evaluation", "xgboost.callback.reset_learning_rate",\n-    "xgboost.compat.PANDAS_INSTALLED", "xgboost.compat.PY3",\n-    "xgboost.compat.SKLEARN_INSTALLED", "xgboost.compat.STRING_TYPES",\n-    "xgboost.compat.py_str", "xgboost.core.Booster",\n-    "xgboost.core.CallbackEnv", "xgboost.core.DMatrix",\n-    "xgboost.core.EarlyStopException", "xgboost.core.PANDAS_DTYPE_MAPPER",\n-    "xgboost.core.PANDAS_INSTALLED", "xgboost.core.PY3",\n-    "xgboost.core.STRING_TYPES", 
"xgboost.core.XGBoostError",\n-    "xgboost.core._check_call", "xgboost.core._load_lib",\n-    "xgboost.core._maybe_pandas_data", "xgboost.core._maybe_pandas_label",\n-    "xgboost.core.c_array", "xgboost.core.c_str",\n-    "xgboost.core.ctypes2buffer", "xgboost.core.ctypes2numpy",\n-    "xgboost.core.from_cstr_to_pystr", "xgboost.core.from_pystr_to_cstr",\n-    "xgboost.cv", "xgboost.f",\n-    "xgboost.libpath.XGBoostLibraryNotFound", "xgboost.libpath.find_lib_path",\n-    "xgboost.plot_importance", "xgboost.plot_tree",\n-    "xgboost.plotting._EDGEPAT", "xgboost.plotting._EDGEPAT2",\n-    "xgboost.plotting._LEAFPAT", "xgboost.plotting._NODEPAT",\n-    "xgboost.plotting._parse_edge", "xgboost.plotting._parse_node",\n-    "xgboost.plotting.plot_importance", "xgboost.plotting.plot_tree",\n-    "xgboost.plotting.to_graphviz", "xgboost.rabit.DTYPE_ENUM__",\n-    "xgboost.rabit.STRING_TYPES", "xgboost.rabit._init_rabit",\n-    "xgboost.rabit.allreduce", "xgboost.rabit.broadcast",\n-    "xgboost.rabit.finalize", "xgboost.rabit.get_processor_name",\n-    "xgboost.rabit.get_rank", "xgboost.rabit.get_world_size",\n-    "xgboost.rabit.init", "xgboost.rabit.tracker_print",\n-    "xgboost.rabit.version_number", "xgboost.sklearn.SKLEARN_INSTALLED",\n-    "xgboost.sklearn.XGBClassifier", "xgboost.sklearn.XGBModel",\n-    "xgboost.sklearn.XGBRegressor", "xgboost.sklearn._objective_decorator",\n-    "xgboost.to_graphviz", "xgboost.train",\n-    "xgboost.training.CVPack", "xgboost.training.SKLEARN_INSTALLED",\n-    "xgboost.training.STRING_TYPES", "xgboost.training._train_internal",\n-    "xgboost.training.aggcv", "xgboost.training.cv",\n-    "xgboost.training.mknfold", "xgboost.training.train"\n-  ],\n-\n-\n-  "NUMPY_NAMES": [\n-    "numpy.core.multiarray._reconstruct", "numpy.ndarray",\n-    "numpy.dtype", "numpy.core.multiarray.scalar",\n-    "numpy.random.__RandomState_ctor"\n-  ],\n-\n-  "IMBLEARN_NAMES":[\n-    "imblearn.pipeline.Pipeline", 
"imblearn.over_sampling._random_over_sampler.RandomOverSampler",\n-    "imblearn.under_sampling._prototype_selection._edited_nearest_neighbours.EditedNearestNeighbours"\n-  ]\n-}\n\\ No newline at end of file\n'
b
diff -r 3cd0dbc038ec -r 9bf11bbeccc3 stacking_ensembles.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/stacking_ensembles.py Tue May 14 18:20:45 2019 -0400
[
@@ -0,0 +1,128 @@
+import argparse
+import json
+import pandas as pd
+import pickle
+import xgboost
+import warnings
+from sklearn import (cluster, compose, decomposition, ensemble,
+                     feature_extraction, feature_selection,
+                     gaussian_process, kernel_approximation, metrics,
+                     model_selection, naive_bayes, neighbors,
+                     pipeline, preprocessing, svm, linear_model,
+                     tree, discriminant_analysis)
+from sklearn.model_selection._split import check_cv
+from feature_selectors import (DyRFE, DyRFECV,
+                               MyPipeline, MyimbPipeline)
+from iraps_classifier import (IRAPSCore, IRAPSClassifier,
+                              BinarizeTargetClassifier,
+                              BinarizeTargetRegressor)
+from preprocessors import Z_RandomOverSampler
+from utils import load_model, get_cv, get_estimator, get_search_params
+
+from mlxtend.regressor import StackingCVRegressor, StackingRegressor
+from mlxtend.classifier import StackingCVClassifier, StackingClassifier
+
+
+# Suppress every warning category for the whole process.
+warnings.filterwarnings('ignore')
+
+# Degree of parallelism handed to the CV-based stacking estimators below;
+# read from the GALAXY_SLOTS environment variable, falling back to 1 when
+# the variable is not set.
+N_JOBS = int(__import__('os').environ.get('GALAXY_SLOTS', 1))
+
+
+def main(inputs_path, output_obj, base_paths=None, meta_path=None,
+         outfile_params=None):
+    """
+    Parameter
+    ---------
+    inputs_path : str
+        File path for Galaxy parameters
+
+    output_obj : str
+        File path for ensemble estimator ouput
+
+    base_paths : str
+        File path or paths concatenated by comma.
+
+    meta_path : str
+        File path
+
+    outfile_params : str
+        File path for params output
+    """
+    with open(inputs_path, 'r') as param_handler:
+        params = json.load(param_handler)
+
+    base_estimators = []
+    for idx, base_file in enumerate(base_paths.split(',')):
+        if base_file and base_file != 'None':
+            with open(base_file, 'rb') as handler:
+                model = load_model(handler)
+        else:
+            estimator_json = (params['base_est_builder'][idx]
+                              ['estimator_selector'])
+            model = get_estimator(estimator_json)
+        base_estimators.append(model)
+
+    if meta_path:
+        with open(meta_path, 'rb') as f:
+            meta_estimator = load_model(f)
+    else:
+        estimator_json = params['meta_estimator']['estimator_selector']
+        meta_estimator = get_estimator(estimator_json)
+
+    options = params['algo_selection']['options']
+
+    cv_selector = options.pop('cv_selector', None)
+    if cv_selector:
+        splitter, groups = get_cv(cv_selector)
+        options['cv'] = splitter
+        # set n_jobs
+        options['n_jobs'] = N_JOBS
+
+    if params['algo_selection']['estimator_type'] == 'StackingCVClassifier':
+        ensemble_estimator = StackingCVClassifier(
+            classifiers=base_estimators,
+            meta_classifier=meta_estimator,
+            **options)
+
+    elif params['algo_selection']['estimator_type'] == 'StackingClassifier':
+        ensemble_estimator = StackingClassifier(
+            classifiers=base_estimators,
+            meta_classifier=meta_estimator,
+            **options)
+
+    elif params['algo_selection']['estimator_type'] == 'StackingCVRegressor':
+        ensemble_estimator = StackingCVRegressor(
+            regressors=base_estimators,
+            meta_regressor=meta_estimator,
+            **options)
+
+    else:
+        ensemble_estimator = StackingRegressor(
+            regressors=base_estimators,
+            meta_regressor=meta_estimator,
+            **options)
+
+    print(ensemble_estimator)
+    for base_est in base_estimators:
+        print(base_est)
+
+    with open(output_obj, 'wb') as out_handler:
+        pickle.dump(ensemble_estimator, out_handler, pickle.HIGHEST_PROTOCOL)
+
+    if params['get_params'] and outfile_params:
+        results = get_search_params(ensemble_estimator)
+        df = pd.DataFrame(results, columns=['', 'Parameter', 'Value'])
+        df.to_csv(outfile_params, sep='\t', index=False)
+
+
+if __name__ == '__main__':
+    aparser = argparse.ArgumentParser()
+    aparser.add_argument("-b", "--bases", dest="bases")
+    aparser.add_argument("-m", "--meta", dest="meta")
+    aparser.add_argument("-i", "--inputs", dest="inputs")
+    aparser.add_argument("-o", "--outfile", dest="outfile")
+    aparser.add_argument("-p", "--outfile_params", dest="outfile_params")
+    args = aparser.parse_args()
+
+    main(args.inputs, args.outfile, base_paths=args.bases,
+         meta_path=args.meta, outfile_params=args.outfile_params)
b
diff -r 3cd0dbc038ec -r 9bf11bbeccc3 test-data/GridSearchCV.zip
b
Binary file test-data/GridSearchCV.zip has changed
b
diff -r 3cd0dbc038ec -r 9bf11bbeccc3 test-data/LinearRegression01.zip
b
Binary file test-data/LinearRegression01.zip has changed
b
diff -r 3cd0dbc038ec -r 9bf11bbeccc3 test-data/LinearRegression02.zip
b
Binary file test-data/LinearRegression02.zip has changed
b
diff -r 3cd0dbc038ec -r 9bf11bbeccc3 test-data/RFE.zip
b
Binary file test-data/RFE.zip has changed
b
diff -r 3cd0dbc038ec -r 9bf11bbeccc3 test-data/RandomForestClassifier.zip
b
Binary file test-data/RandomForestClassifier.zip has changed
b
diff -r 3cd0dbc038ec -r 9bf11bbeccc3 test-data/RandomForestRegressor01.zip
b
Binary file test-data/RandomForestRegressor01.zip has changed
b
diff -r 3cd0dbc038ec -r 9bf11bbeccc3 test-data/StackingCVRegressor01.zip
b
Binary file test-data/StackingCVRegressor01.zip has changed
b
diff -r 3cd0dbc038ec -r 9bf11bbeccc3 test-data/StackingCVRegressor02.zip
b
Binary file test-data/StackingCVRegressor02.zip has changed
b
diff -r 3cd0dbc038ec -r 9bf11bbeccc3 test-data/XGBRegressor01.zip
b
Binary file test-data/XGBRegressor01.zip has changed
b
diff -r 3cd0dbc038ec -r 9bf11bbeccc3 test-data/best_estimator_.zip
b
Binary file test-data/best_estimator_.zip has changed
b
diff -r 3cd0dbc038ec -r 9bf11bbeccc3 test-data/best_params_.txt
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/best_params_.txt Tue May 14 18:20:45 2019 -0400
b
@@ -0,0 +1,1 @@
+{'estimator__n_estimators': 100}
\ No newline at end of file
b
diff -r 3cd0dbc038ec -r 9bf11bbeccc3 test-data/best_score_.tabular
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/best_score_.tabular Tue May 14 18:20:45 2019 -0400
b
@@ -0,0 +1,2 @@
+best_score_
+0.7976348550293088
b
diff -r 3cd0dbc038ec -r 9bf11bbeccc3 test-data/feature_importances_.tabular
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/feature_importances_.tabular Tue May 14 18:20:45 2019 -0400
b
@@ -0,0 +1,11 @@
+feature_importances_
+0.15959252
+0.20373514
+0.22071308
+0.06281833
+0.098471984
+0.06960951
+0.13073005
+0.027164686
+0.022071308
+0.0050933785
b
diff -r 3cd0dbc038ec -r 9bf11bbeccc3 test-data/feature_selection_result13
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/feature_selection_result13 Tue May 14 18:20:45 2019 -0400
b
@@ -0,0 +1,262 @@
+temp_1 average forecast_noaa friend
+69.0 69.7 65.0 88.0
+59.0 58.1 57.0 66.0
+88.0 77.3 75.0 70.0
+65.0 64.7 63.0 58.0
+50.0 47.5 44.0 58.0
+51.0 48.2 45.0 63.0
+52.0 48.6 45.0 41.0
+78.0 76.7 75.0 66.0
+35.0 45.2 43.0 38.0
+40.0 46.1 45.0 36.0
+47.0 45.3 41.0 58.0
+72.0 76.3 76.0 88.0
+76.0 74.4 73.0 72.0
+39.0 45.3 45.0 46.0
+78.0 72.2 70.0 84.0
+71.0 67.3 63.0 85.0
+48.0 47.7 44.0 61.0
+72.0 77.0 77.0 68.0
+57.0 54.7 50.0 70.0
+40.0 45.1 44.0 39.0
+54.0 47.6 47.0 53.0
+58.0 53.2 52.0 71.0
+68.0 58.6 58.0 54.0
+65.0 55.3 55.0 65.0
+47.0 48.8 46.0 51.0
+44.0 45.6 43.0 42.0
+64.0 67.1 64.0 69.0
+62.0 57.1 57.0 67.0
+66.0 65.7 64.0 74.0
+70.0 71.8 67.0 90.0
+57.0 54.2 54.0 70.0
+50.0 50.5 46.0 57.0
+55.0 51.8 49.0 71.0
+55.0 49.5 46.0 67.0
+42.0 45.2 41.0 47.0
+65.0 60.1 57.0 41.0
+63.0 65.6 63.0 73.0
+48.0 47.3 45.0 28.0
+42.0 46.3 44.0 62.0
+51.0 46.2 45.0 38.0
+64.0 68.0 65.0 64.0
+75.0 74.6 74.0 63.0
+52.0 46.7 42.0 39.0
+67.0 68.6 66.0 80.0
+68.0 68.7 65.0 56.0
+54.0 55.0 53.0 42.0
+62.0 56.8 52.0 70.0
+76.0 76.1 76.0 61.0
+73.0 73.1 71.0 93.0
+52.0 50.3 50.0 35.0
+70.0 73.9 71.0 68.0
+77.0 77.4 75.0 62.0
+60.0 56.6 52.0 72.0
+52.0 53.3 50.0 54.0
+79.0 75.0 71.0 85.0
+76.0 57.2 53.0 74.0
+66.0 66.5 64.0 85.0
+57.0 61.8 58.0 62.0
+66.0 57.4 57.0 60.0
+61.0 58.4 58.0 41.0
+55.0 53.1 52.0 65.0
+48.0 48.1 46.0 54.0
+49.0 49.2 46.0 63.0
+65.0 66.7 64.0 73.0
+60.0 62.5 58.0 56.0
+56.0 53.0 53.0 36.0
+59.0 57.4 56.0 44.0
+44.0 45.7 41.0 35.0
+82.0 63.2 62.0 83.0
+64.0 67.0 65.0 76.0
+43.0 45.5 41.0 46.0
+64.0 55.7 51.0 57.0
+63.0 52.7 49.0 49.0
+70.0 70.6 67.0 79.0
+71.0 52.4 48.0 42.0
+76.0 73.5 69.0 85.0
+68.0 62.1 58.0 55.0
+39.0 45.3 44.0 39.0
+71.0 70.7 70.0 52.0
+69.0 71.7 68.0 89.0
+74.0 71.5 71.0 82.0
+81.0 64.1 62.0 81.0
+51.0 49.3 49.0 34.0
+45.0 46.8 44.0 61.0
+87.0 76.8 73.0 73.0
+71.0 73.8 71.0 86.0
+55.0 60.3 56.0 77.0
+80.0 76.9 72.0 81.0
+67.0 69.0 65.0 76.0
+61.0 61.4 60.0 78.0
+46.0 46.6 43.0 65.0
+39.0 45.1 42.0 51.0
+67.0 68.3 67.0 61.0
+52.0 47.8 43.0 50.0
+67.0 69.8 68.0 87.0
+75.0 71.2 67.0 77.0
+68.0 73.3 73.0 79.0
+92.0 68.2 65.0 71.0
+67.0 72.8 69.0 56.0
+44.0 45.8 43.0 56.0
+61.0 61.0 56.0 73.0
+65.0 53.4 49.0 41.0
+68.0 73.0 72.0 70.0
+87.0 62.1 62.0 69.0
+117.0 54.8 51.0 62.0
+80.0 76.4 75.0 66.0
+57.0 51.0 47.0 46.0
+67.0 63.6 61.0 68.0
+58.0 54.0 51.0 56.0
+65.0 56.2 53.0 41.0
+52.0 48.6 45.0 47.0
+59.0 55.3 52.0 39.0
+57.0 53.9 53.0 35.0
+81.0 59.2 56.0 66.0
+75.0 77.1 76.0 75.0
+76.0 77.4 76.0 95.0
+57.0 64.8 61.0 53.0
+69.0 74.2 72.0 86.0
+77.0 66.8 66.0 64.0
+55.0 49.9 47.0 55.0
+49.0 46.8 45.0 53.0
+54.0 52.7 48.0 57.0
+55.0 51.2 49.0 42.0
+56.0 55.6 53.0 45.0
+68.0 74.6 72.0 77.0
+54.0 53.4 49.0 44.0
+67.0 69.0 69.0 87.0
+49.0 46.9 45.0 33.0
+49.0 49.1 47.0 45.0
+56.0 48.5 48.0 49.0
+73.0 71.0 66.0 78.0
+66.0 66.4 65.0 60.0
+69.0 66.5 66.0 62.0
+82.0 64.5 64.0 65.0
+90.0 76.7 75.0 65.0
+51.0 50.7 49.0 43.0
+77.0 57.1 57.0 41.0
+60.0 61.4 58.0 58.0
+74.0 72.8 71.0 87.0
+85.0 77.2 73.0 74.0
+68.0 62.8 61.0 64.0
+56.0 49.5 46.0 37.0
+71.0 56.2 55.0 45.0
+62.0 59.5 57.0 40.0
+83.0 77.3 76.0 76.0
+64.0 65.4 62.0 56.0
+56.0 48.4 45.0 54.0
+41.0 45.1 42.0 31.0
+65.0 66.2 66.0 67.0
+65.0 53.7 49.0 38.0
+40.0 46.0 46.0 41.0
+45.0 45.6 43.0 29.0
+52.0 48.4 48.0 58.0
+63.0 51.7 50.0 63.0
+52.0 47.6 47.0 44.0
+60.0 57.9 55.0 77.0
+81.0 75.7 73.0 89.0
+75.0 75.8 74.0 77.0
+59.0 51.4 48.0 64.0
+73.0 77.1 77.0 94.0
+75.0 77.3 73.0 66.0
+60.0 58.5 56.0 59.0
+75.0 71.3 68.0 56.0
+59.0 57.6 56.0 40.0
+53.0 49.1 47.0 56.0
+79.0 77.2 76.0 60.0
+57.0 52.1 49.0 46.0
+75.0 67.6 64.0 77.0
+71.0 69.4 67.0 81.0
+53.0 50.2 50.0 42.0
+46.0 48.8 48.0 56.0
+81.0 76.9 72.0 70.0
+49.0 48.9 47.0 29.0
+57.0 48.4 44.0 34.0
+60.0 58.8 54.0 53.0
+67.0 73.7 72.0 64.0
+61.0 64.1 62.0 60.0
+66.0 69.5 66.0 85.0
+64.0 51.9 50.0 55.0
+66.0 65.7 62.0 49.0
+64.0 52.2 52.0 49.0
+71.0 65.2 61.0 56.0
+75.0 63.8 62.0 60.0
+48.0 46.4 46.0 47.0
+53.0 52.5 48.0 70.0
+49.0 47.1 46.0 65.0
+85.0 68.5 67.0 81.0
+62.0 49.4 48.0 30.0
+50.0 47.0 42.0 58.0
+58.0 55.9 51.0 39.0
+72.0 77.2 74.0 95.0
+55.0 50.7 50.0 34.0
+74.0 72.3 70.0 91.0
+85.0 77.3 77.0 77.0
+73.0 77.3 77.0 93.0
+52.0 47.4 44.0 39.0
+67.0 67.6 64.0 62.0
+45.0 45.1 45.0 35.0
+46.0 47.2 46.0 41.0
+66.0 60.6 60.0 57.0
+71.0 77.0 75.0 86.0
+70.0 69.3 66.0 79.0
+58.0 49.9 46.0 53.0
+72.0 77.1 76.0 65.0
+74.0 75.4 74.0 71.0
+65.0 64.5 63.0 49.0
+77.0 58.8 55.0 39.0
+59.0 50.9 49.0 35.0
+45.0 45.7 41.0 61.0
+53.0 50.5 49.0 46.0
+53.0 54.9 54.0 72.0
+79.0 77.3 73.0 79.0
+49.0 49.0 44.0 44.0
+63.0 62.9 62.0 78.0
+69.0 56.5 54.0 45.0
+60.0 50.8 47.0 46.0
+64.0 62.5 60.0 73.0
+79.0 71.0 66.0 64.0
+55.0 47.0 43.0 58.0
+73.0 56.0 54.0 41.0
+60.0 59.1 57.0 62.0
+67.0 70.2 67.0 77.0
+42.0 45.2 45.0 58.0
+60.0 65.0 62.0 55.0
+57.0 49.8 47.0 30.0
+35.0 45.2 44.0 36.0
+75.0 70.3 66.0 84.0
+61.0 51.1 48.0 65.0
+51.0 50.6 46.0 59.0
+71.0 71.9 67.0 70.0
+74.0 75.3 74.0 71.0
+48.0 45.4 44.0 42.0
+74.0 74.9 70.0 60.0
+76.0 70.8 68.0 57.0
+58.0 51.6 47.0 37.0
+51.0 50.4 48.0 43.0
+72.0 72.6 68.0 78.0
+76.0 67.2 64.0 74.0
+52.0 47.9 47.0 60.0
+53.0 48.2 48.0 53.0
+65.0 69.1 65.0 83.0
+58.0 58.1 58.0 43.0
+77.0 75.6 74.0 56.0
+61.0 52.9 51.0 35.0
+67.0 65.3 64.0 54.0
+54.0 49.3 46.0 58.0
+79.0 67.4 65.0 58.0
+77.0 64.3 63.0 67.0
+71.0 67.7 64.0 55.0
+58.0 57.7 54.0 61.0
+68.0 55.9 55.0 56.0
+40.0 45.4 45.0 49.0
+80.0 77.3 75.0 71.0
+74.0 62.3 59.0 61.0
+57.0 45.5 42.0 57.0
+52.0 47.8 43.0 57.0
+71.0 75.1 71.0 95.0
+49.0 53.6 49.0 70.0
+89.0 59.0 59.0 61.0
+60.0 60.2 56.0 78.0
+59.0 58.3 58.0 40.0
b
diff -r 3cd0dbc038ec -r 9bf11bbeccc3 test-data/final_estimator.zip
b
Binary file test-data/final_estimator.zip has changed
b
diff -r 3cd0dbc038ec -r 9bf11bbeccc3 test-data/get_params.tabular
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/get_params.tabular Tue May 14 18:20:45 2019 -0400
b
@@ -0,0 +1,6 @@
+ Parameter Value
+@ copy_X copy_X: True
+@ fit_intercept fit_intercept: True
+* n_jobs n_jobs: 1
+@ normalize normalize: False
+ Note: @, params eligible for search in searchcv tool.
b
diff -r 3cd0dbc038ec -r 9bf11bbeccc3 test-data/get_params01.tabular
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/get_params01.tabular Tue May 14 18:20:45 2019 -0400
[
@@ -0,0 +1,30 @@
+ Parameter Value
+* memory memory: None
+* steps "steps: [('robustscaler', RobustScaler(copy=True, quantile_range=(25.0, 75.0), with_centering=True,
+       with_scaling=True)), ('selectkbest', SelectKBest(k=10, score_func=<function f_classif at 0x111ef0158>)), ('svr', SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1,
+  gamma='auto_deprecated', kernel='linear', max_iter=-1, shrinking=True,
+  tol=0.001, verbose=False))]"
+@ robustscaler "robustscaler: RobustScaler(copy=True, quantile_range=(25.0, 75.0), with_centering=True,
+       with_scaling=True)"
+@ selectkbest selectkbest: SelectKBest(k=10, score_func=<function f_classif at 0x111ef0158>)
+@ svr "svr: SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1,
+  gamma='auto_deprecated', kernel='linear', max_iter=-1, shrinking=True,
+  tol=0.001, verbose=False)"
+@ robustscaler__copy robustscaler__copy: True
+@ robustscaler__quantile_range robustscaler__quantile_range: (25.0, 75.0)
+@ robustscaler__with_centering robustscaler__with_centering: True
+@ robustscaler__with_scaling robustscaler__with_scaling: True
+@ selectkbest__k selectkbest__k: 10
+@ selectkbest__score_func selectkbest__score_func: <function f_classif at 0x111ef0158>
+@ svr__C svr__C: 1.0
+@ svr__cache_size svr__cache_size: 200
+@ svr__coef0 svr__coef0: 0.0
+@ svr__degree svr__degree: 3
+@ svr__epsilon svr__epsilon: 0.1
+@ svr__gamma svr__gamma: 'auto_deprecated'
+@ svr__kernel svr__kernel: 'linear'
+@ svr__max_iter svr__max_iter: -1
+@ svr__shrinking svr__shrinking: True
+@ svr__tol svr__tol: 0.001
+* svr__verbose svr__verbose: False
+ Note: @, searchable params in searchcv too.
b
diff -r 3cd0dbc038ec -r 9bf11bbeccc3 test-data/get_params02.tabular
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/get_params02.tabular Tue May 14 18:20:45 2019 -0400
[
@@ -0,0 +1,33 @@
+ Parameter Value
+* memory memory: None
+* steps "steps: [('robustscaler', RobustScaler(copy=True, quantile_range=(25.0, 75.0), with_centering=True,
+       with_scaling=True)), ('lassocv', LassoCV(alphas=None, copy_X=True, cv='warn', eps=0.001, fit_intercept=True,
+    max_iter=1000, n_alphas=100, n_jobs=1, normalize=False, positive=False,
+    precompute='auto', random_state=None, selection='cyclic', tol=0.0001,
+    verbose=False))]"
+@ robustscaler "robustscaler: RobustScaler(copy=True, quantile_range=(25.0, 75.0), with_centering=True,
+       with_scaling=True)"
+@ lassocv "lassocv: LassoCV(alphas=None, copy_X=True, cv='warn', eps=0.001, fit_intercept=True,
+    max_iter=1000, n_alphas=100, n_jobs=1, normalize=False, positive=False,
+    precompute='auto', random_state=None, selection='cyclic', tol=0.0001,
+    verbose=False)"
+@ robustscaler__copy robustscaler__copy: True
+@ robustscaler__quantile_range robustscaler__quantile_range: (25.0, 75.0)
+@ robustscaler__with_centering robustscaler__with_centering: True
+@ robustscaler__with_scaling robustscaler__with_scaling: True
+@ lassocv__alphas lassocv__alphas: None
+@ lassocv__copy_X lassocv__copy_X: True
+@ lassocv__cv lassocv__cv: 'warn'
+@ lassocv__eps lassocv__eps: 0.001
+@ lassocv__fit_intercept lassocv__fit_intercept: True
+@ lassocv__max_iter lassocv__max_iter: 1000
+@ lassocv__n_alphas lassocv__n_alphas: 100
+* lassocv__n_jobs lassocv__n_jobs: 1
+@ lassocv__normalize lassocv__normalize: False
+@ lassocv__positive lassocv__positive: False
+@ lassocv__precompute lassocv__precompute: 'auto'
+@ lassocv__random_state lassocv__random_state: None
+@ lassocv__selection lassocv__selection: 'cyclic'
+@ lassocv__tol lassocv__tol: 0.0001
+* lassocv__verbose lassocv__verbose: False
+ Note: @, searchable params in searchcv too.
b
diff -r 3cd0dbc038ec -r 9bf11bbeccc3 test-data/get_params03.tabular
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/get_params03.tabular Tue May 14 18:20:45 2019 -0400
[
@@ -0,0 +1,43 @@
+ Parameter Value
+* memory memory: None
+* steps "steps: [('robustscaler', RobustScaler(copy=True, quantile_range=(25.0, 75.0), with_centering=True,
+       with_scaling=True)), ('xgbclassifier', XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
+       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
+       max_depth=3, min_child_weight=1, missing=nan, n_estimators=100,
+       n_jobs=1, nthread=None, objective='binary:logistic', random_state=0,
+       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
+       silent=True, subsample=1))]"
+@ robustscaler "robustscaler: RobustScaler(copy=True, quantile_range=(25.0, 75.0), with_centering=True,
+       with_scaling=True)"
+@ xgbclassifier "xgbclassifier: XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
+       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
+       max_depth=3, min_child_weight=1, missing=nan, n_estimators=100,
+       n_jobs=1, nthread=None, objective='binary:logistic', random_state=0,
+       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
+       silent=True, subsample=1)"
+@ robustscaler__copy robustscaler__copy: True
+@ robustscaler__quantile_range robustscaler__quantile_range: (25.0, 75.0)
+@ robustscaler__with_centering robustscaler__with_centering: True
+@ robustscaler__with_scaling robustscaler__with_scaling: True
+@ xgbclassifier__base_score xgbclassifier__base_score: 0.5
+@ xgbclassifier__booster xgbclassifier__booster: 'gbtree'
+@ xgbclassifier__colsample_bylevel xgbclassifier__colsample_bylevel: 1
+@ xgbclassifier__colsample_bytree xgbclassifier__colsample_bytree: 1
+@ xgbclassifier__gamma xgbclassifier__gamma: 0
+@ xgbclassifier__learning_rate xgbclassifier__learning_rate: 0.1
+@ xgbclassifier__max_delta_step xgbclassifier__max_delta_step: 0
+@ xgbclassifier__max_depth xgbclassifier__max_depth: 3
+@ xgbclassifier__min_child_weight xgbclassifier__min_child_weight: 1
+@ xgbclassifier__missing xgbclassifier__missing: nan
+@ xgbclassifier__n_estimators xgbclassifier__n_estimators: 100
+* xgbclassifier__n_jobs xgbclassifier__n_jobs: 1
+* xgbclassifier__nthread xgbclassifier__nthread: None
+@ xgbclassifier__objective xgbclassifier__objective: 'binary:logistic'
+@ xgbclassifier__random_state xgbclassifier__random_state: 0
+@ xgbclassifier__reg_alpha xgbclassifier__reg_alpha: 0
+@ xgbclassifier__reg_lambda xgbclassifier__reg_lambda: 1
+@ xgbclassifier__scale_pos_weight xgbclassifier__scale_pos_weight: 1
+@ xgbclassifier__seed xgbclassifier__seed: None
+@ xgbclassifier__silent xgbclassifier__silent: True
+@ xgbclassifier__subsample xgbclassifier__subsample: 1
+ Note: @, searchable params in searchcv too.
b
diff -r 3cd0dbc038ec -r 9bf11bbeccc3 test-data/get_params04.tabular
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/get_params04.tabular Tue May 14 18:20:45 2019 -0400
[
@@ -0,0 +1,39 @@
+ Parameter Value
+* memory memory: None
+* steps "steps: [('selectfrommodel', SelectFromModel(estimator=AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None,
+          learning_rate=1.0, n_estimators=50, random_state=None),
+        max_features=None, norm_order=1, prefit=False, threshold=None)), ('linearsvc', LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
+     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
+     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
+     verbose=0))]"
+@ selectfrommodel "selectfrommodel: SelectFromModel(estimator=AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None,
+          learning_rate=1.0, n_estimators=50, random_state=None),
+        max_features=None, norm_order=1, prefit=False, threshold=None)"
+@ linearsvc "linearsvc: LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
+     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
+     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
+     verbose=0)"
+@ selectfrommodel__estimator__algorithm selectfrommodel__estimator__algorithm: 'SAMME.R'
+@ selectfrommodel__estimator__base_estimator selectfrommodel__estimator__base_estimator: None
+@ selectfrommodel__estimator__learning_rate selectfrommodel__estimator__learning_rate: 1.0
+@ selectfrommodel__estimator__n_estimators selectfrommodel__estimator__n_estimators: 50
+@ selectfrommodel__estimator__random_state selectfrommodel__estimator__random_state: None
+@ selectfrommodel__estimator "selectfrommodel__estimator: AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None,
+          learning_rate=1.0, n_estimators=50, random_state=None)"
+@ selectfrommodel__max_features selectfrommodel__max_features: None
+@ selectfrommodel__norm_order selectfrommodel__norm_order: 1
+@ selectfrommodel__prefit selectfrommodel__prefit: False
+@ selectfrommodel__threshold selectfrommodel__threshold: None
+@ linearsvc__C linearsvc__C: 1.0
+@ linearsvc__class_weight linearsvc__class_weight: None
+@ linearsvc__dual linearsvc__dual: True
+@ linearsvc__fit_intercept linearsvc__fit_intercept: True
+@ linearsvc__intercept_scaling linearsvc__intercept_scaling: 1
+@ linearsvc__loss linearsvc__loss: 'squared_hinge'
+@ linearsvc__max_iter linearsvc__max_iter: 1000
+@ linearsvc__multi_class linearsvc__multi_class: 'ovr'
+@ linearsvc__penalty linearsvc__penalty: 'l2'
+@ linearsvc__random_state linearsvc__random_state: None
+@ linearsvc__tol linearsvc__tol: 0.0001
+* linearsvc__verbose linearsvc__verbose: 0
+ Note: @, searchable params in searchcv too.
b
diff -r 3cd0dbc038ec -r 9bf11bbeccc3 test-data/get_params05.tabular
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/get_params05.tabular Tue May 14 18:20:45 2019 -0400
[
@@ -0,0 +1,31 @@
+ Parameter Value
+* memory memory: None
+* steps "steps: [('randomforestregressor', RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
+           max_features='auto', max_leaf_nodes=None,
+           min_impurity_decrease=0.0, min_impurity_split=None,
+           min_samples_leaf=1, min_samples_split=2,
+           min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
+           oob_score=False, random_state=42, verbose=0, warm_start=False))]"
+@ randomforestregressor "randomforestregressor: RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
+           max_features='auto', max_leaf_nodes=None,
+           min_impurity_decrease=0.0, min_impurity_split=None,
+           min_samples_leaf=1, min_samples_split=2,
+           min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
+           oob_score=False, random_state=42, verbose=0, warm_start=False)"
+@ randomforestregressor__bootstrap randomforestregressor__bootstrap: True
+@ randomforestregressor__criterion randomforestregressor__criterion: 'mse'
+@ randomforestregressor__max_depth randomforestregressor__max_depth: None
+@ randomforestregressor__max_features randomforestregressor__max_features: 'auto'
+@ randomforestregressor__max_leaf_nodes randomforestregressor__max_leaf_nodes: None
+@ randomforestregressor__min_impurity_decrease randomforestregressor__min_impurity_decrease: 0.0
+@ randomforestregressor__min_impurity_split randomforestregressor__min_impurity_split: None
+@ randomforestregressor__min_samples_leaf randomforestregressor__min_samples_leaf: 1
+@ randomforestregressor__min_samples_split randomforestregressor__min_samples_split: 2
+@ randomforestregressor__min_weight_fraction_leaf randomforestregressor__min_weight_fraction_leaf: 0.0
+@ randomforestregressor__n_estimators randomforestregressor__n_estimators: 100
+* randomforestregressor__n_jobs randomforestregressor__n_jobs: 1
+@ randomforestregressor__oob_score randomforestregressor__oob_score: False
+@ randomforestregressor__random_state randomforestregressor__random_state: 42
+* randomforestregressor__verbose randomforestregressor__verbose: 0
+@ randomforestregressor__warm_start randomforestregressor__warm_start: False
+ Note: @, searchable params in searchcv too.
b
diff -r 3cd0dbc038ec -r 9bf11bbeccc3 test-data/get_params06.tabular
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/get_params06.tabular Tue May 14 18:20:45 2019 -0400
[
@@ -0,0 +1,22 @@
+ Parameter Value
+* memory memory: None
+* steps "steps: [('pca', PCA(copy=True, iterated_power='auto', n_components=None, random_state=None,
+  svd_solver='auto', tol=0.0, whiten=False)), ('adaboostregressor', AdaBoostRegressor(base_estimator=None, learning_rate=1.0, loss='linear',
+         n_estimators=50, random_state=None))]"
+@ pca "pca: PCA(copy=True, iterated_power='auto', n_components=None, random_state=None,
+  svd_solver='auto', tol=0.0, whiten=False)"
+@ adaboostregressor "adaboostregressor: AdaBoostRegressor(base_estimator=None, learning_rate=1.0, loss='linear',
+         n_estimators=50, random_state=None)"
+@ pca__copy pca__copy: True
+@ pca__iterated_power pca__iterated_power: 'auto'
+@ pca__n_components pca__n_components: None
+@ pca__random_state pca__random_state: None
+@ pca__svd_solver pca__svd_solver: 'auto'
+@ pca__tol pca__tol: 0.0
+@ pca__whiten pca__whiten: False
+@ adaboostregressor__base_estimator adaboostregressor__base_estimator: None
+@ adaboostregressor__learning_rate adaboostregressor__learning_rate: 1.0
+@ adaboostregressor__loss adaboostregressor__loss: 'linear'
+@ adaboostregressor__n_estimators adaboostregressor__n_estimators: 50
+@ adaboostregressor__random_state adaboostregressor__random_state: None
+ Note: @, searchable params in searchcv too.
b
diff -r 3cd0dbc038ec -r 9bf11bbeccc3 test-data/get_params07.tabular
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/get_params07.tabular Tue May 14 18:20:45 2019 -0400
[
@@ -0,0 +1,16 @@
+ Parameter Value
+* memory memory: None
+* steps "steps: [('rbfsampler', RBFSampler(gamma=2.0, n_components=10, random_state=None)), ('adaboostclassifier', AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None,
+          learning_rate=1.0, n_estimators=50, random_state=None))]"
+@ rbfsampler rbfsampler: RBFSampler(gamma=2.0, n_components=10, random_state=None)
+@ adaboostclassifier "adaboostclassifier: AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None,
+          learning_rate=1.0, n_estimators=50, random_state=None)"
+@ rbfsampler__gamma rbfsampler__gamma: 2.0
+@ rbfsampler__n_components rbfsampler__n_components: 10
+@ rbfsampler__random_state rbfsampler__random_state: None
+@ adaboostclassifier__algorithm adaboostclassifier__algorithm: 'SAMME.R'
+@ adaboostclassifier__base_estimator adaboostclassifier__base_estimator: None
+@ adaboostclassifier__learning_rate adaboostclassifier__learning_rate: 1.0
+@ adaboostclassifier__n_estimators adaboostclassifier__n_estimators: 50
+@ adaboostclassifier__random_state adaboostclassifier__random_state: None
+ Note: @, searchable params in searchcv too.
b
diff -r 3cd0dbc038ec -r 9bf11bbeccc3 test-data/get_params08.tabular
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/get_params08.tabular Tue May 14 18:20:45 2019 -0400
[
@@ -0,0 +1,24 @@
+ Parameter Value
+* memory memory: None
+* steps "steps: [('featureagglomeration', FeatureAgglomeration(affinity='euclidean', compute_full_tree='auto',
+           connectivity=None, linkage='ward', memory=None, n_clusters=3,
+           pooling_func=<function mean at 0x1123f1620>)), ('adaboostclassifier', AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None,
+          learning_rate=1.0, n_estimators=50, random_state=None))]"
+@ featureagglomeration "featureagglomeration: FeatureAgglomeration(affinity='euclidean', compute_full_tree='auto',
+           connectivity=None, linkage='ward', memory=None, n_clusters=3,
+           pooling_func=<function mean at 0x1123f1620>)"
+@ adaboostclassifier "adaboostclassifier: AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None,
+          learning_rate=1.0, n_estimators=50, random_state=None)"
+@ featureagglomeration__affinity featureagglomeration__affinity: 'euclidean'
+@ featureagglomeration__compute_full_tree featureagglomeration__compute_full_tree: 'auto'
+@ featureagglomeration__connectivity featureagglomeration__connectivity: None
+@ featureagglomeration__linkage featureagglomeration__linkage: 'ward'
+* featureagglomeration__memory featureagglomeration__memory: None
+@ featureagglomeration__n_clusters featureagglomeration__n_clusters: 3
+@ featureagglomeration__pooling_func featureagglomeration__pooling_func: <function mean at 0x1123f1620>
+@ adaboostclassifier__algorithm adaboostclassifier__algorithm: 'SAMME.R'
+@ adaboostclassifier__base_estimator adaboostclassifier__base_estimator: None
+@ adaboostclassifier__learning_rate adaboostclassifier__learning_rate: 1.0
+@ adaboostclassifier__n_estimators adaboostclassifier__n_estimators: 50
+@ adaboostclassifier__random_state adaboostclassifier__random_state: None
+ Note: @, searchable params in searchcv too.
b
diff -r 3cd0dbc038ec -r 9bf11bbeccc3 test-data/get_params09.tabular
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/get_params09.tabular Tue May 14 18:20:45 2019 -0400
[
@@ -0,0 +1,39 @@
+ Parameter Value
+* memory memory: None
+* steps "steps: [('relieff', ReliefF(discrete_threshold=10, n_features_to_select=3, n_jobs=1,
+    n_neighbors=100, verbose=False)), ('randomforestregressor', RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
+           max_features='auto', max_leaf_nodes=None,
+           min_impurity_decrease=0.0, min_impurity_split=None,
+           min_samples_leaf=1, min_samples_split=2,
+           min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=1,
+           oob_score=False, random_state=None, verbose=0, warm_start=False))]"
+@ relieff "relieff: ReliefF(discrete_threshold=10, n_features_to_select=3, n_jobs=1,
+    n_neighbors=100, verbose=False)"
+@ randomforestregressor "randomforestregressor: RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
+           max_features='auto', max_leaf_nodes=None,
+           min_impurity_decrease=0.0, min_impurity_split=None,
+           min_samples_leaf=1, min_samples_split=2,
+           min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=1,
+           oob_score=False, random_state=None, verbose=0, warm_start=False)"
+@ relieff__discrete_threshold relieff__discrete_threshold: 10
+@ relieff__n_features_to_select relieff__n_features_to_select: 3
+* relieff__n_jobs relieff__n_jobs: 1
+@ relieff__n_neighbors relieff__n_neighbors: 100
+* relieff__verbose relieff__verbose: False
+@ randomforestregressor__bootstrap randomforestregressor__bootstrap: True
+@ randomforestregressor__criterion randomforestregressor__criterion: 'mse'
+@ randomforestregressor__max_depth randomforestregressor__max_depth: None
+@ randomforestregressor__max_features randomforestregressor__max_features: 'auto'
+@ randomforestregressor__max_leaf_nodes randomforestregressor__max_leaf_nodes: None
+@ randomforestregressor__min_impurity_decrease randomforestregressor__min_impurity_decrease: 0.0
+@ randomforestregressor__min_impurity_split randomforestregressor__min_impurity_split: None
+@ randomforestregressor__min_samples_leaf randomforestregressor__min_samples_leaf: 1
+@ randomforestregressor__min_samples_split randomforestregressor__min_samples_split: 2
+@ randomforestregressor__min_weight_fraction_leaf randomforestregressor__min_weight_fraction_leaf: 0.0
+@ randomforestregressor__n_estimators randomforestregressor__n_estimators: 'warn'
+* randomforestregressor__n_jobs randomforestregressor__n_jobs: 1
+@ randomforestregressor__oob_score randomforestregressor__oob_score: False
+@ randomforestregressor__random_state randomforestregressor__random_state: None
+* randomforestregressor__verbose randomforestregressor__verbose: 0
+@ randomforestregressor__warm_start randomforestregressor__warm_start: False
+ Note: @, searchable params in searchcv too.
b
diff -r 3cd0dbc038ec -r 9bf11bbeccc3 test-data/get_params10.tabular
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/get_params10.tabular Tue May 14 18:20:45 2019 -0400
[
@@ -0,0 +1,12 @@
+ Parameter Value
+* memory memory: None
+* steps "steps: [('adaboostregressor', AdaBoostRegressor(base_estimator=None, learning_rate=1.0, loss='linear',
+         n_estimators=50, random_state=None))]"
+@ adaboostregressor "adaboostregressor: AdaBoostRegressor(base_estimator=None, learning_rate=1.0, loss='linear',
+         n_estimators=50, random_state=None)"
+@ adaboostregressor__base_estimator adaboostregressor__base_estimator: None
+@ adaboostregressor__learning_rate adaboostregressor__learning_rate: 1.0
+@ adaboostregressor__loss adaboostregressor__loss: 'linear'
+@ adaboostregressor__n_estimators adaboostregressor__n_estimators: 50
+@ adaboostregressor__random_state adaboostregressor__random_state: None
+ Note: @, params eligible for search in searchcv tool.
b
diff -r 3cd0dbc038ec -r 9bf11bbeccc3 test-data/get_params11.tabular
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/get_params11.tabular Tue May 14 18:20:45 2019 -0400
[
@@ -0,0 +1,46 @@
+ Parameter Value
+* memory memory: None
+* steps "steps: [('editednearestneighbours', EditedNearestNeighbours(kind_sel='all', n_jobs=1, n_neighbors=3,
+            random_state=None, ratio=None, return_indices=False,
+            sampling_strategy='auto')), ('randomforestclassifier', RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
+            max_depth=None, max_features='auto', max_leaf_nodes=None,
+            min_impurity_decrease=0.0, min_impurity_split=None,
+            min_samples_leaf=1, min_samples_split=2,
+            min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=1,
+            oob_score=False, random_state=None, verbose=0,
+            warm_start=False))]"
+@ editednearestneighbours "editednearestneighbours: EditedNearestNeighbours(kind_sel='all', n_jobs=1, n_neighbors=3,
+            random_state=None, ratio=None, return_indices=False,
+            sampling_strategy='auto')"
+@ randomforestclassifier "randomforestclassifier: RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
+            max_depth=None, max_features='auto', max_leaf_nodes=None,
+            min_impurity_decrease=0.0, min_impurity_split=None,
+            min_samples_leaf=1, min_samples_split=2,
+            min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=1,
+            oob_score=False, random_state=None, verbose=0,
+            warm_start=False)"
+@ editednearestneighbours__kind_sel editednearestneighbours__kind_sel: 'all'
+* editednearestneighbours__n_jobs editednearestneighbours__n_jobs: 1
+@ editednearestneighbours__n_neighbors editednearestneighbours__n_neighbors: 3
+@ editednearestneighbours__random_state editednearestneighbours__random_state: None
+@ editednearestneighbours__ratio editednearestneighbours__ratio: None
+@ editednearestneighbours__return_indices editednearestneighbours__return_indices: False
+@ editednearestneighbours__sampling_strategy editednearestneighbours__sampling_strategy: 'auto'
+@ randomforestclassifier__bootstrap randomforestclassifier__bootstrap: True
+@ randomforestclassifier__class_weight randomforestclassifier__class_weight: None
+@ randomforestclassifier__criterion randomforestclassifier__criterion: 'gini'
+@ randomforestclassifier__max_depth randomforestclassifier__max_depth: None
+@ randomforestclassifier__max_features randomforestclassifier__max_features: 'auto'
+@ randomforestclassifier__max_leaf_nodes randomforestclassifier__max_leaf_nodes: None
+@ randomforestclassifier__min_impurity_decrease randomforestclassifier__min_impurity_decrease: 0.0
+@ randomforestclassifier__min_impurity_split randomforestclassifier__min_impurity_split: None
+@ randomforestclassifier__min_samples_leaf randomforestclassifier__min_samples_leaf: 1
+@ randomforestclassifier__min_samples_split randomforestclassifier__min_samples_split: 2
+@ randomforestclassifier__min_weight_fraction_leaf randomforestclassifier__min_weight_fraction_leaf: 0.0
+@ randomforestclassifier__n_estimators randomforestclassifier__n_estimators: 'warn'
+* randomforestclassifier__n_jobs randomforestclassifier__n_jobs: 1
+@ randomforestclassifier__oob_score randomforestclassifier__oob_score: False
+@ randomforestclassifier__random_state randomforestclassifier__random_state: None
+* randomforestclassifier__verbose randomforestclassifier__verbose: 0
+@ randomforestclassifier__warm_start randomforestclassifier__warm_start: False
+ Note: @, searchable params in searchcv too.
b
diff -r 3cd0dbc038ec -r 9bf11bbeccc3 test-data/get_params12.tabular
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/get_params12.tabular Tue May 14 18:20:45 2019 -0400
[
@@ -0,0 +1,47 @@
+ Parameter Value
+* memory memory: None
+* steps "steps: [('rfe', RFE(estimator=XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
+       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
+       max_depth=3, min_child_weight=1, missing=nan, n_estimators=100,
+       n_jobs=1, nthread=None, objective='reg:linear', random_state=0,
+       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
+       silent=True, subsample=1),
+  n_features_to_select=None, step=1, verbose=0))]"
+@ rfe "rfe: RFE(estimator=XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
+       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
+       max_depth=3, min_child_weight=1, missing=nan, n_estimators=100,
+       n_jobs=1, nthread=None, objective='reg:linear', random_state=0,
+       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
+       silent=True, subsample=1),
+  n_features_to_select=None, step=1, verbose=0)"
+@ rfe__estimator__base_score rfe__estimator__base_score: 0.5
+@ rfe__estimator__booster rfe__estimator__booster: 'gbtree'
+@ rfe__estimator__colsample_bylevel rfe__estimator__colsample_bylevel: 1
+@ rfe__estimator__colsample_bytree rfe__estimator__colsample_bytree: 1
+@ rfe__estimator__gamma rfe__estimator__gamma: 0
+@ rfe__estimator__learning_rate rfe__estimator__learning_rate: 0.1
+@ rfe__estimator__max_delta_step rfe__estimator__max_delta_step: 0
+@ rfe__estimator__max_depth rfe__estimator__max_depth: 3
+@ rfe__estimator__min_child_weight rfe__estimator__min_child_weight: 1
+@ rfe__estimator__missing rfe__estimator__missing: nan
+@ rfe__estimator__n_estimators rfe__estimator__n_estimators: 100
+* rfe__estimator__n_jobs rfe__estimator__n_jobs: 1
+* rfe__estimator__nthread rfe__estimator__nthread: None
+@ rfe__estimator__objective rfe__estimator__objective: 'reg:linear'
+@ rfe__estimator__random_state rfe__estimator__random_state: 0
+@ rfe__estimator__reg_alpha rfe__estimator__reg_alpha: 0
+@ rfe__estimator__reg_lambda rfe__estimator__reg_lambda: 1
+@ rfe__estimator__scale_pos_weight rfe__estimator__scale_pos_weight: 1
+@ rfe__estimator__seed rfe__estimator__seed: None
+@ rfe__estimator__silent rfe__estimator__silent: True
+@ rfe__estimator__subsample rfe__estimator__subsample: 1
+@ rfe__estimator "rfe__estimator: XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
+       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
+       max_depth=3, min_child_weight=1, missing=nan, n_estimators=100,
+       n_jobs=1, nthread=None, objective='reg:linear', random_state=0,
+       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
+       silent=True, subsample=1)"
+@ rfe__n_features_to_select rfe__n_features_to_select: None
+@ rfe__step rfe__step: 1
+* rfe__verbose rfe__verbose: 0
+ Note: @, searchable params in searchcv too.
b
diff -r 3cd0dbc038ec -r 9bf11bbeccc3 test-data/mv_result01.tabular
--- a/test-data/mv_result01.tabular Sun Dec 30 01:58:56 2018 -0500
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
b
@@ -1,3 +0,0 @@
-0.9452947345848994
-0.9926363525448115
--0.4384003222944141
b
diff -r 3cd0dbc038ec -r 9bf11bbeccc3 test-data/mv_result02.tabular
--- a/test-data/mv_result02.tabular Sun Dec 30 01:58:56 2018 -0500
+++ b/test-data/mv_result02.tabular Tue May 14 18:20:45 2019 -0400
b
@@ -1,10 +1,11 @@
-1.6957921248350636
--0.9248588846061156
--0.48640795813792376
-0.647707440306449
-0.32740690920811427
--0.8229559569886034
-1.2150108977866847
-0.14723254190255275
-0.6053186541119763
-0.3972102859168325
+Predicted
+1.578912095858962
+-1.199072894940544
+-0.7173258906076226
+0.3255908318822695
+0.21919344304093213
+-0.6841926371423699
+1.1144698671662865
+0.19379531649046616
+0.9405094785593062
+1.2581284896870837
b
diff -r 3cd0dbc038ec -r 9bf11bbeccc3 test-data/mv_result03.tabular
--- a/test-data/mv_result03.tabular Sun Dec 30 01:58:56 2018 -0500
+++ b/test-data/mv_result03.tabular Tue May 14 18:20:45 2019 -0400
b
@@ -1,3 +1,6 @@
-0.9452947345848994
-0.9926363525448115
--0.4384003222944141
+train_sizes_abs mean_train_scores std_train_scores mean_test_scores std_test_scores
+17 0.9668700841937653 0.00277836829836518 0.7008862995946905 0.03857541198731935
+56 0.9730008602419361 0.006839342612121988 0.7963376762427242 0.004846330083938778
+95 0.9728783377589098 0.0037790183626530663 0.814592845745573 0.020457691766770824
+134 0.9739086338111185 0.001627343246847077 0.7985540571195479 0.03954641079310707
+174 0.9726218628287785 0.0032867750457225182 0.8152971572131146 0.04280261115004303
b
diff -r 3cd0dbc038ec -r 9bf11bbeccc3 test-data/mv_result04.tabular
--- a/test-data/mv_result04.tabular Sun Dec 30 01:58:56 2018 -0500
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
b
@@ -1,5 +0,0 @@
-17
-56
-95
-134
-174
b
diff -r 3cd0dbc038ec -r 9bf11bbeccc3 test-data/mv_result05.tabular
--- a/test-data/mv_result05.tabular Sun Dec 30 01:58:56 2018 -0500
+++ b/test-data/mv_result05.tabular Tue May 14 18:20:45 2019 -0400
b
@@ -1,1 +1,262 @@
-0.4998435882784322
+Predicted
+70.16
+62.06
+83.04
+62.84
+48.63
+51.25
+54.98
+80.3
+42.84
+41.52
+43.83
+73.15
+74.22
+42.88
+74.93
+72.9
+53.74
+78.86
+59.0
+40.28
+54.52
+58.34
+62.74
+62.35
+49.15
+41.92
+65.59
+59.91
+66.49
+72.08
+60.44
+53.84
+54.82
+52.66
+42.37
+61.3
+63.14
+50.62
+42.75
+47.39
+67.8
+73.58
+49.97
+67.04
+67.45
+54.67
+64.87
+77.23
+73.52
+53.55
+70.53
+77.98
+61.99
+53.08
+78.12
+66.55
+63.95
+60.57
+61.6
+60.37
+55.29
+54.31
+52.54
+65.31
+61.51
+57.3
+60.02
+43.64
+74.78
+68.26
+42.72
+61.26
+61.25
+71.58
+61.03
+70.53
+70.25
+43.4
+71.39
+72.31
+72.7
+72.11
+53.55
+43.4
+80.6
+73.72
+58.86
+76.71
+68.36
+60.26
+48.56
+38.96
+69.67
+52.9
+67.63
+75.12
+70.92
+70.89
+67.05
+43.89
+59.94
+62.98
+71.1
+79.22
+77.31
+79.06
+61.11
+66.32
+54.7
+61.1
+54.59
+58.7
+59.6
+73.79
+72.69
+81.83
+61.08
+69.21
+74.8
+54.37
+50.85
+53.07
+58.53
+55.44
+72.62
+54.14
+68.12
+48.81
+50.11
+56.06
+73.63
+63.29
+71.0
+74.87
+81.24
+54.67
+66.96
+61.37
+74.84
+76.71
+69.27
+56.53
+71.91
+58.74
+77.83
+64.57
+51.93
+42.84
+64.11
+59.47
+42.46
+43.79
+51.75
+63.98
+54.71
+64.95
+79.72
+72.12
+60.66
+79.3
+71.26
+59.9
+74.25
+59.68
+52.37
+78.52
+58.52
+71.98
+71.77
+54.48
+48.96
+81.42
+54.08
+53.52
+64.38
+70.79
+63.95
+67.48
+61.76
+66.15
+62.1
+75.68
+69.72
+43.8
+56.27
+53.38
+81.31
+57.54
+48.15
+59.47
+78.01
+56.39
+72.33
+78.8
+78.66
+52.01
+66.68
+48.56
+47.75
+65.67
+77.93
+72.68
+58.0
+77.83
+73.37
+65.39
+69.79
+55.98
+46.35
+54.31
+55.58
+79.69
+52.76
+62.62
+66.54
+60.29
+62.57
+74.86
+48.05
+65.09
+65.02
+67.84
+41.86
+62.28
+57.05
+43.68
+72.0
+63.04
+54.41
+73.37
+75.11
+42.65
+73.16
+71.68
+58.61
+53.54
+73.33
+72.16
+49.96
+54.78
+64.24
+60.13
+76.46
+61.53
+68.36
+53.1
+71.33
+76.12
+70.86
+61.35
+67.12
+43.25
+80.2
+71.16
+58.63
+52.37
+74.93
+53.34
+76.41
+63.87
+59.97
b
diff -r 3cd0dbc038ec -r 9bf11bbeccc3 test-data/mv_result06.tabular
--- a/test-data/mv_result06.tabular Sun Dec 30 01:58:56 2018 -0500
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
b
@@ -1,5 +0,0 @@
-0.07547169811320754 0.10344827586206896 0.10294117647058823
-0.07547169811320754 0.10344827586206896 0.10294117647058823
-0.07547169811320754 0.10344827586206896 0.10294117647058823
-0.07547169811320754 0.10344827586206896 0.10294117647058823
-0.07547169811320754 0.10344827586206896 0.10294117647058823
b
diff -r 3cd0dbc038ec -r 9bf11bbeccc3 test-data/named_steps.txt
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/named_steps.txt Tue May 14 18:20:45 2019 -0400
b
@@ -0,0 +1,6 @@
+{'preprocessing_1': SelectKBest(k=10, score_func=<function f_regression at 0x113310ea0>), 'estimator': XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
+       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
+       max_depth=3, min_child_weight=1, missing=nan, n_estimators=100,
+       n_jobs=1, nthread=None, objective='reg:linear', random_state=10,
+       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
+       silent=True, subsample=1)}
\ No newline at end of file
b
diff -r 3cd0dbc038ec -r 9bf11bbeccc3 test-data/nn_model01
b
Binary file test-data/nn_model01 has changed
b
diff -r 3cd0dbc038ec -r 9bf11bbeccc3 test-data/pipeline01
b
Binary file test-data/pipeline01 has changed
b
diff -r 3cd0dbc038ec -r 9bf11bbeccc3 test-data/pipeline02
b
Binary file test-data/pipeline02 has changed
b
diff -r 3cd0dbc038ec -r 9bf11bbeccc3 test-data/pipeline03
b
Binary file test-data/pipeline03 has changed
b
diff -r 3cd0dbc038ec -r 9bf11bbeccc3 test-data/pipeline04
b
Binary file test-data/pipeline04 has changed
b
diff -r 3cd0dbc038ec -r 9bf11bbeccc3 test-data/pipeline05
b
Binary file test-data/pipeline05 has changed
b
diff -r 3cd0dbc038ec -r 9bf11bbeccc3 test-data/pipeline06
b
Binary file test-data/pipeline06 has changed
b
diff -r 3cd0dbc038ec -r 9bf11bbeccc3 test-data/pipeline07
b
Binary file test-data/pipeline07 has changed
b
diff -r 3cd0dbc038ec -r 9bf11bbeccc3 test-data/pipeline08
b
Binary file test-data/pipeline08 has changed
b
diff -r 3cd0dbc038ec -r 9bf11bbeccc3 test-data/pipeline09
b
Binary file test-data/pipeline09 has changed
b
diff -r 3cd0dbc038ec -r 9bf11bbeccc3 test-data/pipeline10
b
Binary file test-data/pipeline10 has changed
b
diff -r 3cd0dbc038ec -r 9bf11bbeccc3 test-data/pipeline11
b
Binary file test-data/pipeline11 has changed
b
diff -r 3cd0dbc038ec -r 9bf11bbeccc3 test-data/pipeline12
b
Binary file test-data/pipeline12 has changed
b
diff -r 3cd0dbc038ec -r 9bf11bbeccc3 test-data/pipeline13
b
Binary file test-data/pipeline13 has changed
b
diff -r 3cd0dbc038ec -r 9bf11bbeccc3 test-data/pipeline14
b
Binary file test-data/pipeline14 has changed
b
diff -r 3cd0dbc038ec -r 9bf11bbeccc3 test-data/pipeline15
b
Binary file test-data/pipeline15 has changed
b
diff -r 3cd0dbc038ec -r 9bf11bbeccc3 test-data/ranking_.tabular
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/ranking_.tabular Tue May 14 18:20:45 2019 -0400
b
@@ -0,0 +1,18 @@
+ranking_
+17
+7
+4
+5
+2
+1
+9
+6
+8
+3
+10
+15
+14
+11
+13
+12
+16
b
diff -r 3cd0dbc038ec -r 9bf11bbeccc3 test-data/searchCV01
b
Binary file test-data/searchCV01 has changed
b
diff -r 3cd0dbc038ec -r 9bf11bbeccc3 test-data/searchCV02
b
Binary file test-data/searchCV02 has changed
b
diff -r 3cd0dbc038ec -r 9bf11bbeccc3 utils.py
--- a/utils.py Sun Dec 30 01:58:56 2018 -0500
+++ b/utils.py Tue May 14 18:20:45 2019 -0400
[
b'@@ -1,80 +1,134 @@\n+import ast\n import json\n+import imblearn\n import numpy as np\n-import os\n import pandas\n import pickle\n import re\n import scipy\n import sklearn\n+import skrebate\n import sys\n import warnings\n import xgboost\n \n+from collections import Counter\n from asteval import Interpreter, make_symbol_table\n-from sklearn import (cluster, compose, decomposition, ensemble, feature_extraction,\n-                    feature_selection, gaussian_process, kernel_approximation, metrics,\n-                    model_selection, naive_bayes, neighbors, pipeline, preprocessing,\n-                    svm, linear_model, tree, discriminant_analysis)\n+from imblearn import under_sampling, over_sampling, combine\n+from imblearn.pipeline import Pipeline as imbPipeline\n+from mlxtend import regressor, classifier\n+from scipy.io import mmread\n+from sklearn import (\n+    cluster, compose, decomposition, ensemble, feature_extraction,\n+    feature_selection, gaussian_process, kernel_approximation, metrics,\n+    model_selection, naive_bayes, neighbors, pipeline, preprocessing,\n+    svm, linear_model, tree, discriminant_analysis)\n+\n+try:\n+    import iraps_classifier\n+except ImportError:\n+    pass\n \n try:\n-    import skrebate\n-except ModuleNotFoundError:\n+    import model_validations\n+except ImportError:\n+    pass\n+\n+try:\n+    import feature_selectors\n+except ImportError:\n     pass\n \n-\n-N_JOBS = int(os.environ.get(\'GALAXY_SLOTS\', 1))\n+try:\n+    import preprocessors\n+except ImportError:\n+    pass\n \n-try:\n-    sk_whitelist\n-except NameError:\n-    sk_whitelist = None\n+# handle pickle white list file\n+WL_FILE = __import__(\'os\').path.join(\n+    __import__(\'os\').path.dirname(__file__), \'pk_whitelist.json\')\n+\n+N_JOBS = int(__import__(\'os\').environ.get(\'GALAXY_SLOTS\', 1))\n \n \n-class SafePickler(pickle.Unpickler):\n+class _SafePickler(pickle.Unpickler, object):\n     """\n-    Used to safely deserialize scikit-learn model 
objects serialized by cPickle.dump\n+    Used to safely deserialize scikit-learn model objects\n     Usage:\n-        eg.: SafePickler.load(pickled_file_object)\n+        eg.: _SafePickler.load(pickled_file_object)\n     """\n-    def find_class(self, module, name):\n+    def __init__(self, file):\n+        super(_SafePickler, self).__init__(file)\n+        # load global white list\n+        with open(WL_FILE, \'r\') as f:\n+            self.pk_whitelist = json.load(f)\n \n-        # sk_whitelist could be read from tool\n-        global sk_whitelist\n-        if not sk_whitelist:\n-            whitelist_file = os.path.join(os.path.dirname(__file__), \'sk_whitelist.json\')\n-            with open(whitelist_file, \'r\') as f:\n-                sk_whitelist = json.load(f)\n+        self.bad_names = (\n+            \'and\', \'as\', \'assert\', \'break\', \'class\', \'continue\',\n+            \'def\', \'del\', \'elif\', \'else\', \'except\', \'exec\',\n+            \'finally\', \'for\', \'from\', \'global\', \'if\', \'import\',\n+            \'in\', \'is\', \'lambda\', \'not\', \'or\', \'pass\', \'print\',\n+            \'raise\', \'return\', \'try\', \'system\', \'while\', \'with\',\n+            \'True\', \'False\', \'None\', \'eval\', \'execfile\', \'__import__\',\n+            \'__package__\', \'__subclasses__\', \'__bases__\', \'__globals__\',\n+            \'__code__\', \'__closure__\', \'__func__\', \'__self__\', \'__module__\',\n+            \'__dict__\', \'__class__\', \'__call__\', \'__get__\',\n+            \'__getattribute__\', \'__subclasshook__\', \'__new__\',\n+            \'__init__\', \'func_globals\', \'func_code\', \'func_closure\',\n+            \'im_class\', \'im_func\', \'im_self\', \'gi_code\', \'gi_frame\',\n+            \'__asteval__\', \'f_locals\', \'__mro__\')\n \n-        bad_names = (\'and\', \'as\', \'assert\', \'break\', \'class\', \'continue\',\n-                    \'def\', \'del\', \'elif\', \'else\', \'except\', \'exec\',\n-          
          \'finally\', \'for\', \'from\', \'global\', \'if\', \'import\',\n-                    \'in\', \'is\', \'lambda\', \'not\', \'or\', \'pass\', \'print\',\n-                    \'raise\', \'return\', \'try\', \'system\', \'while\', \'with\',\n-                    \'True\', \'False\', \'None\', \'eval\', \'execfile\', \'__impo'..b'eader_name\', \'all_but_by_header_name\']:\n+            c = groups[\'column_selector_options_g\'][\'col_g\']\n+        else:\n+            c = None\n+        groups = read_columns(\n+                infile_g,\n+                c=c,\n+                c_option=column_option,\n+                sep=\'\\t\',\n+                header=header,\n+                parse_dates=True)\n+        groups = groups.ravel()\n \n     for k, v in cv_json.items():\n         if v == \'\':\n@@ -341,7 +502,12 @@\n     if test_size and test_size > 1.0:\n         cv_json[\'test_size\'] = int(test_size)\n \n-    cv_class = getattr(model_selection, cv)\n+    if cv == \'OrderedKFold\':\n+        cv_class = try_get_attr(\'model_validations\', \'OrderedKFold\')\n+    elif cv == \'RepeatedOrderedKFold\':\n+        cv_class = try_get_attr(\'model_validations\', \'RepeatedOrderedKFold\')\n+    else:\n+        cv_class = getattr(model_selection, cv)\n     splitter = cv_class(**cv_json)\n \n     return splitter, groups\n@@ -349,6 +515,9 @@\n \n # needed when sklearn < v0.20\n def balanced_accuracy_score(y_true, y_pred):\n+    """Compute balanced accuracy score, which is now available in\n+        scikit-learn from v0.20.0.\n+    """\n     C = metrics.confusion_matrix(y_true, y_pred)\n     with np.errstate(divide=\'ignore\', invalid=\'ignore\'):\n         per_class = np.diag(C) / C.sum(axis=1)\n@@ -360,21 +529,71 @@\n \n \n def get_scoring(scoring_json):\n-\n+    """Return single sklearn scorer class\n+        or multiple scoers in dictionary\n+    """\n     if scoring_json[\'primary_scoring\'] == \'default\':\n         return None\n \n     my_scorers = 
metrics.SCORERS\n+    my_scorers[\'binarize_auc_scorer\'] =\\\n+        try_get_attr(\'iraps_classifier\', \'binarize_auc_scorer\')\n+    my_scorers[\'binarize_average_precision_scorer\'] =\\\n+        try_get_attr(\'iraps_classifier\', \'binarize_average_precision_scorer\')\n     if \'balanced_accuracy\' not in my_scorers:\n-        my_scorers[\'balanced_accuracy\'] = metrics.make_scorer(balanced_accuracy_score)\n+        my_scorers[\'balanced_accuracy\'] =\\\n+            metrics.make_scorer(balanced_accuracy_score)\n \n     if scoring_json[\'secondary_scoring\'] != \'None\'\\\n-            and scoring_json[\'secondary_scoring\'] != scoring_json[\'primary_scoring\']:\n-        scoring = {}\n-        scoring[\'primary\'] = my_scorers[scoring_json[\'primary_scoring\']]\n+            and scoring_json[\'secondary_scoring\'] !=\\\n+            scoring_json[\'primary_scoring\']:\n+        return_scoring = {}\n+        primary_scoring = scoring_json[\'primary_scoring\']\n+        return_scoring[primary_scoring] = my_scorers[primary_scoring]\n         for scorer in scoring_json[\'secondary_scoring\'].split(\',\'):\n             if scorer != scoring_json[\'primary_scoring\']:\n-                scoring[scorer] = my_scorers[scorer]\n-        return scoring\n+                return_scoring[scorer] = my_scorers[scorer]\n+        return return_scoring\n \n     return my_scorers[scoring_json[\'primary_scoring\']]\n+\n+\n+def get_search_params(estimator):\n+    """Format the output of `estimator.get_params()`\n+    """\n+    params = estimator.get_params()\n+    results = []\n+    for k, v in params.items():\n+        # params below won\'t be shown for search in the searchcv tool\n+        keywords = (\'n_jobs\', \'pre_dispatch\', \'memory\', \'steps\',\n+                    \'nthread\', \'verbose\')\n+        if k.endswith(keywords):\n+            results.append([\'*\', k, k+": "+repr(v)])\n+        else:\n+            results.append([\'@\', k, k+": "+repr(v)])\n+    
results.append(\n+        ["", "Note:",\n+         "@, params eligible for search in searchcv tool."])\n+\n+    return results\n+\n+\n+def try_get_attr(module, name):\n+    """try to get attribute from a custom module\n+\n+    Parameters\n+    ----------\n+    module : str\n+        Module name\n+    name : str\n+        Attribute (class/function) name.\n+\n+    Returns\n+    -------\n+    class or function\n+    """\n+    mod = sys.modules.get(module, None)\n+    if mod:\n+        return getattr(mod, name)\n+    else:\n+        raise Exception("No module named %s." % module)\n'