Previous changeset 17:2bbbac61e48d (2018-12-30) Next changeset 19:231e9a9849e8 (2019-07-09) |
Commit message:
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit c0a3a186966888e5787335a7628bf0a4382637e7 |
modified:
feature_selection.xml main_macros.xml search_model_validation.py test-data/mv_result02.tabular test-data/mv_result03.tabular test-data/mv_result05.tabular test-data/nn_model01 test-data/pipeline01 test-data/pipeline02 test-data/pipeline03 test-data/pipeline04 test-data/pipeline05 test-data/pipeline06 test-data/pipeline07 test-data/pipeline08 test-data/pipeline09 test-data/pipeline10 test-data/pipeline11 test-data/pipeline12 test-data/searchCV01 test-data/searchCV02 utils.py |
added:
feature_selectors.py iraps_classifier.py model_validations.py pk_whitelist.json preprocessors.py stacking_ensembles.py test-data/GridSearchCV.zip test-data/LinearRegression01.zip test-data/LinearRegression02.zip test-data/RFE.zip test-data/RandomForestClassifier.zip test-data/RandomForestRegressor01.zip test-data/StackingCVRegressor01.zip test-data/StackingCVRegressor02.zip test-data/XGBRegressor01.zip test-data/best_estimator_.zip test-data/best_params_.txt test-data/best_score_.tabular test-data/feature_importances_.tabular test-data/feature_selection_result13 test-data/final_estimator.zip test-data/get_params.tabular test-data/get_params01.tabular test-data/get_params02.tabular test-data/get_params03.tabular test-data/get_params04.tabular test-data/get_params05.tabular test-data/get_params06.tabular test-data/get_params07.tabular test-data/get_params08.tabular test-data/get_params09.tabular test-data/get_params10.tabular test-data/get_params11.tabular test-data/get_params12.tabular test-data/named_steps.txt test-data/pipeline13 test-data/pipeline14 test-data/pipeline15 test-data/ranking_.tabular |
removed:
sk_whitelist.json test-data/mv_result01.tabular test-data/mv_result04.tabular test-data/mv_result06.tabular |
b |
diff -r 2bbbac61e48d -r ec25331946b8 feature_selection.xml --- a/feature_selection.xml Sun Dec 30 01:57:11 2018 -0500 +++ b/feature_selection.xml Tue May 14 18:17:57 2019 -0400 |
[ |
@@ -4,6 +4,7 @@ <import>main_macros.xml</import> </macros> <expand macro="python_requirements"/> + <!--TODO: Add imblearn package support--> <expand macro="macro_stdio"/> <version_command>echo "@VERSION@"</version_command> <command> @@ -17,10 +18,21 @@ <![CDATA[ import json import sklearn.feature_selection +import skrebate +import pandas +import sys +import warnings +import xgboost +from sklearn import ( + cluster, compose, decomposition, ensemble, feature_extraction, + feature_selection, gaussian_process, kernel_approximation, metrics, + model_selection, naive_bayes, neighbors, pipeline, preprocessing, + svm, linear_model, tree, discriminant_analysis) +from imblearn.pipeline import Pipeline as imbPipeline +from sklearn.pipeline import Pipeline -with open('$__tool_directory__/sk_whitelist.json', 'r') as f: - sk_whitelist = json.load(f) -exec(open('$__tool_directory__/utils.py').read(), globals()) +sys.path.insert(0, '$__tool_directory__') +from utils import SafeEval, feature_selector, read_columns warnings.simplefilter('ignore') @@ -30,7 +42,7 @@ with open(input_json_path, 'r') as param_handler: params = json.load(param_handler) -#handle cheetah +## handle cheetah #if $fs_algorithm_selector.selected_algorithm == 'SelectFromModel'\ and $fs_algorithm_selector.model_inputter.input_mode == 'prefitted': params['fs_algorithm_selector']['model_inputter']['fitted_estimator'] =\ @@ -39,18 +51,25 @@ #if $fs_algorithm_selector.selected_algorithm == 'SelectFromModel'\ and $fs_algorithm_selector.model_inputter.input_mode == 'new'\ - and $fs_algorithm_selector.model_inputter.estimator_selector.selected_module == 'customer_estimator': + and $fs_algorithm_selector.model_inputter.estimator_selector.selected_module == 'custom_estimator': params['fs_algorithm_selector']['model_inputter']['estimator_selector']['c_estimator'] =\ '$fs_algorithm_selector.model_inputter.estimator_selector.c_estimator' #end if -#if $fs_algorithm_selector.selected_algorithm in ['RFE', 'RFECV']\ - and 
$fs_algorithm_selector.estimator_selector.selected_module == 'customer_estimator': +#if $fs_algorithm_selector.selected_algorithm in ['RFE', 'RFECV', 'DyRFECV']\ + and $fs_algorithm_selector.estimator_selector.selected_module == 'custom_estimator': params['fs_algorithm_selector']['estimator_selector']['c_estimator'] =\ '$fs_algorithm_selector.estimator_selector.c_estimator' #end if -# Read features +#if $fs_algorithm_selector.selected_algorithm in ['RFECV', 'DyRFECV']\ + and $fs_algorithm_selector.options.cv_selector.selected_cv\ + in ['GroupKFold', 'GroupShuffleSplit', 'LeaveOneGroupOut', 'LeavePGroupsOut']: +params['fs_algorithm_selector']['options']['cv_selector']['groups_selector']['infile_g'] =\ + '$fs_algorithm_selector.options.cv_selector.groups_selector.infile_g' +#end if + +## Read features features_has_header = params['input_options']['header1'] input_type = params['input_options']['selected_input'] if input_type == 'tabular': @@ -67,12 +86,12 @@ return_df = True, sep='\t', header=header, - parse_dates=True - ) + parse_dates=True) + X = X.astype(float) else: X = mmread('$input_options.infile1') -# Read labels +## Read labels header = 'infer' if params['input_options']['header2'] else None column_option = params['input_options']['column_selector_options_2']['selected_column_selector_option2'] if column_option in ['by_index_number', 'all_but_by_index_number', 'by_header_name', 'all_but_by_header_name']: @@ -85,12 +104,11 @@ c_option = column_option, sep='\t', header=header, - parse_dates=True -) -y=y.ravel() + parse_dates=True) +y = y.ravel() -# Create feature selector -new_selector = feature_selector(params['fs_algorithm_selector']) +## Create feature selector +new_selector = feature_selector(params['fs_algorithm_selector'], X=X, y=y) if params['fs_algorithm_selector']['selected_algorithm'] != 'SelectFromModel'\ or params['fs_algorithm_selector']['model_inputter']['input_mode'] != 'prefitted' : new_selector.fit(X, y) @@ -266,6 +284,28 @@ <param 
name="header2" value="false"/> <output name="outfile" file="feature_selection_result12"/> </test> + <test> + <param name="selected_algorithm" value="RFECV"/> + <param name="input_mode" value="new"/> + <param name="selected_module" value="ensemble"/> + <param name="selected_estimator" value="RandomForestRegressor"/> + <param name="text_params" value="n_estimators=10, random_state=10"/> + <section name="groups_selector"> + <param name="infile_groups" value="regression_y.tabular" ftype="tabular"/> + <param name="header_g" value="true"/> + <param name="selected_column_selector_option_g" value="by_index_number"/> + <param name="col_g" value="1"/> + </section> + <param name="selected_cv" value="GroupShuffleSplit"/> + <param name="random_state" value="0"/> + <param name="infile1" value="regression_X.tabular" ftype="tabular"/> + <param name="header1" value="true"/> + <param name="col1" value="1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17"/> + <param name="infile2" value="regression_y.tabular" ftype="tabular"/> + <param name="col2" value="1"/> + <param name="header2" value="true"/> + <output name="outfile" file="feature_selection_result13"/> + </test> </tests> <help> <![CDATA[ |
b |
diff -r 2bbbac61e48d -r ec25331946b8 feature_selectors.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/feature_selectors.py Tue May 14 18:17:57 2019 -0400 |
[ |
b'@@ -0,0 +1,357 @@\n+"""\n+DyRFE\n+DyRFECV\n+MyPipeline\n+MyimbPipeline\n+check_feature_importances\n+"""\n+import numpy as np\n+\n+from imblearn import under_sampling, over_sampling, combine\n+from imblearn.pipeline import Pipeline as imbPipeline\n+from sklearn import (cluster, compose, decomposition, ensemble,\n+ feature_extraction, feature_selection,\n+ gaussian_process, kernel_approximation,\n+ metrics, model_selection, naive_bayes,\n+ neighbors, pipeline, preprocessing,\n+ svm, linear_model, tree, discriminant_analysis)\n+\n+from sklearn.base import BaseEstimator\n+from sklearn.base import MetaEstimatorMixin, clone, is_classifier\n+from sklearn.feature_selection.rfe import _rfe_single_fit, RFE, RFECV\n+from sklearn.model_selection import check_cv\n+from sklearn.metrics.scorer import check_scoring\n+from sklearn.utils import check_X_y, safe_indexing, safe_sqr\n+from sklearn.utils._joblib import Parallel, delayed, effective_n_jobs\n+\n+\n+class DyRFE(RFE):\n+ """\n+ Mainly used with DyRFECV\n+\n+ Parameters\n+ ----------\n+ estimator : object\n+ A supervised learning estimator with a ``fit`` method that provides\n+ information about feature importance either through a ``coef_``\n+ attribute or through a ``feature_importances_`` attribute.\n+ n_features_to_select : int or None (default=None)\n+ The number of features to select. 
If `None`, half of the features\n+ are selected.\n+ step : int, float or list, optional (default=1)\n+ If greater than or equal to 1, then ``step`` corresponds to the\n+ (integer) number of features to remove at each iteration.\n+ If within (0.0, 1.0), then ``step`` corresponds to the percentage\n+ (rounded down) of features to remove at each iteration.\n+ If list, a series of steps of features to remove at each iteration.\n+ Iterations stops when steps finish\n+ verbose : int, (default=0)\n+ Controls verbosity of output.\n+\n+ """\n+ def __init__(self, estimator, n_features_to_select=None, step=1,\n+ verbose=0):\n+ super(DyRFE, self).__init__(estimator, n_features_to_select,\n+ step, verbose)\n+\n+ def _fit(self, X, y, step_score=None):\n+\n+ if type(self.step) is not list:\n+ return super(DyRFE, self)._fit(X, y, step_score)\n+\n+ # dynamic step\n+ X, y = check_X_y(X, y, "csc")\n+ # Initialization\n+ n_features = X.shape[1]\n+ if self.n_features_to_select is None:\n+ n_features_to_select = n_features // 2\n+ else:\n+ n_features_to_select = self.n_features_to_select\n+\n+ step = []\n+ for s in self.step:\n+ if 0.0 < s < 1.0:\n+ step.append(int(max(1, s * n_features)))\n+ else:\n+ step.append(int(s))\n+ if s <= 0:\n+ raise ValueError("Step must be >0")\n+\n+ support_ = np.ones(n_features, dtype=np.bool)\n+ ranking_ = np.ones(n_features, dtype=np.int)\n+\n+ if step_score:\n+ self.scores_ = []\n+\n+ step_i = 0\n+ # Elimination\n+ while np.sum(support_) > n_features_to_select and step_i < len(step):\n+\n+ # if last step is 1, will keep loop\n+ if step_i == len(step) - 1 and step[step_i] != 0:\n+ step.append(step[step_i])\n+\n+ # Remaining features\n+ features = np.arange(n_features)[support_]\n+\n+ # Rank the remaining features\n+ estimator = clone(self.estimator)\n+ if self.verbose > 0:\n+ print("Fitting estimator with %d features." 
% np.sum(support_))\n+\n+ estimator.fit(X[:, features], y)\n+\n+ # Get coefs\n+ if hasattr(estimator, \'coef_\'):\n+ coefs = estimator.coef_\n+ else:\n+ coefs = getattr(estimator, \'feature_importances_\', None)\n+ '..b' # Note that joblib raises a non-picklable error for bound methods\n+ # even if n_jobs is set to 1 with the default multiprocessing\n+ # backend.\n+ # This branching is done so that to\n+ # make sure that user code that sets n_jobs to 1\n+ # and provides bound methods as scorers is not broken with the\n+ # addition of n_jobs parameter in version 0.18.\n+\n+ if effective_n_jobs(self.n_jobs) == 1:\n+ parallel, func = list, _rfe_single_fit\n+ else:\n+ parallel = Parallel(n_jobs=self.n_jobs)\n+ func = delayed(_rfe_single_fit)\n+\n+ scores = parallel(\n+ func(rfe, self.estimator, X, y, train, test, scorer)\n+ for train, test in cv.split(X, y, groups))\n+\n+ scores = np.sum(scores, axis=0)\n+ diff = int(scores.shape[0]) - len(step)\n+ if diff > 0:\n+ step = np.r_[step, [step[-1]] * diff]\n+ scores_rev = scores[::-1]\n+ argmax_idx = len(scores) - np.argmax(scores_rev) - 1\n+ n_features_to_select = max(\n+ n_features - sum(step[:argmax_idx]),\n+ self.min_features_to_select)\n+\n+ # Re-execute an elimination with best_k over the whole set\n+ rfe = DyRFE(estimator=self.estimator,\n+ n_features_to_select=n_features_to_select, step=self.step,\n+ verbose=self.verbose)\n+\n+ rfe.fit(X, y)\n+\n+ # Set final attributes\n+ self.support_ = rfe.support_\n+ self.n_features_ = rfe.n_features_\n+ self.ranking_ = rfe.ranking_\n+ self.estimator_ = clone(self.estimator)\n+ self.estimator_.fit(self.transform(X), y)\n+\n+ # Fixing a normalization error, n is equal to get_n_splits(X, y) - 1\n+ # here, the scores are normalized by get_n_splits(X, y)\n+ self.grid_scores_ = scores[::-1] / cv.get_n_splits(X, y, groups)\n+ return self\n+\n+\n+class MyPipeline(pipeline.Pipeline):\n+ """\n+ Extend pipeline object to have feature_importances_ attribute\n+ """\n+ def fit(self, X, 
y=None, **fit_params):\n+ super(MyPipeline, self).fit(X, y, **fit_params)\n+ estimator = self.steps[-1][-1]\n+ if hasattr(estimator, \'coef_\'):\n+ coefs = estimator.coef_\n+ else:\n+ coefs = getattr(estimator, \'feature_importances_\', None)\n+ if coefs is None:\n+ raise RuntimeError(\'The estimator in the pipeline does not expose \'\n+ \'"coef_" or "feature_importances_" \'\n+ \'attributes\')\n+ self.feature_importances_ = coefs\n+ return self\n+\n+\n+class MyimbPipeline(imbPipeline):\n+ """\n+ Extend imblance pipeline object to have feature_importances_ attribute\n+ """\n+ def fit(self, X, y=None, **fit_params):\n+ super(MyimbPipeline, self).fit(X, y, **fit_params)\n+ estimator = self.steps[-1][-1]\n+ if hasattr(estimator, \'coef_\'):\n+ coefs = estimator.coef_\n+ else:\n+ coefs = getattr(estimator, \'feature_importances_\', None)\n+ if coefs is None:\n+ raise RuntimeError(\'The estimator in the pipeline does not expose \'\n+ \'"coef_" or "feature_importances_" \'\n+ \'attributes\')\n+ self.feature_importances_ = coefs\n+ return self\n+\n+\n+def check_feature_importances(estimator):\n+ """\n+ For pipeline object which has no feature_importances_ property,\n+ this function returns the same comfigured pipeline object with\n+ attached the last estimator\'s feature_importances_.\n+ """\n+ if estimator.__class__.__module__ == \'sklearn.pipeline\':\n+ pipeline_steps = estimator.get_params()[\'steps\']\n+ estimator = MyPipeline(pipeline_steps)\n+ elif estimator.__class__.__module__ == \'imblearn.pipeline\':\n+ pipeline_steps = estimator.get_params()[\'steps\']\n+ estimator = MyimbPipeline(pipeline_steps)\n+ else:\n+ return estimator\n' |
b |
diff -r 2bbbac61e48d -r ec25331946b8 iraps_classifier.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/iraps_classifier.py Tue May 14 18:17:57 2019 -0400 |
[ |
b'@@ -0,0 +1,569 @@\n+"""\n+class IRAPSCore\n+class IRAPSClassifier\n+class BinarizeTargetClassifier\n+class BinarizeTargetRegressor\n+class _BinarizeTargetScorer\n+class _BinarizeTargetProbaScorer\n+\n+binarize_auc_scorer\n+binarize_average_precision_scorer\n+\n+binarize_accuracy_scorer\n+binarize_balanced_accuracy_scorer\n+binarize_precision_scorer\n+binarize_recall_scorer\n+"""\n+\n+\n+import numpy as np\n+import random\n+import warnings\n+\n+from abc import ABCMeta\n+from scipy.stats import ttest_ind\n+from sklearn import metrics\n+from sklearn.base import BaseEstimator, clone, RegressorMixin\n+from sklearn.externals import six\n+from sklearn.feature_selection.univariate_selection import _BaseFilter\n+from sklearn.metrics.scorer import _BaseScorer\n+from sklearn.pipeline import Pipeline\n+from sklearn.utils import as_float_array, check_X_y\n+from sklearn.utils._joblib import Parallel, delayed\n+from sklearn.utils.validation import (check_array, check_is_fitted,\n+ check_memory, column_or_1d)\n+\n+\n+VERSION = \'0.1.1\'\n+\n+\n+class IRAPSCore(six.with_metaclass(ABCMeta, BaseEstimator)):\n+ """\n+ Base class of IRAPSClassifier\n+ From sklearn BaseEstimator:\n+ get_params()\n+ set_params()\n+\n+ Parameters\n+ ----------\n+ n_iter : int\n+ sample count\n+\n+ positive_thres : float\n+ z_score shreshold to discretize positive target values\n+\n+ negative_thres : float\n+ z_score threshold to discretize negative target values\n+\n+ verbose : int\n+ 0 or geater, if not 0, print progress\n+\n+ n_jobs : int, default=1\n+ The number of CPUs to use to do the computation.\n+\n+ pre_dispatch : int, or string.\n+ Controls the number of jobs that get dispatched during parallel\n+ execution. Reducing this number can be useful to avoid an\n+ explosion of memory consumption when more jobs get dispatched\n+ than CPUs can process. This parameter can be:\n+ - None, in which case all the jobs are immediately\n+ created and spawned. 
Use this for lightweight and\n+ fast-running jobs, to avoid delays due to on-demand\n+ spawning of the jobs\n+ - An int, giving the exact number of total jobs that are\n+ spawned\n+ - A string, giving an expression as a function of n_jobs,\n+ as in \'2*n_jobs\'\n+\n+ random_state : int or None\n+ """\n+\n+ def __init__(self, n_iter=1000, positive_thres=-1, negative_thres=0,\n+ verbose=0, n_jobs=1, pre_dispatch=\'2*n_jobs\',\n+ random_state=None):\n+ """\n+ IRAPS turns towwards general Anomaly Detection\n+ It comapares positive_thres with negative_thres,\n+ and decide which portion is the positive target.\n+ e.g.:\n+ (positive_thres=-1, negative_thres=0)\n+ => positive = Z_score of target < -1\n+ (positive_thres=1, negative_thres=0)\n+ => positive = Z_score of target > 1\n+\n+ Note: The positive targets here is always the\n+ abnormal minority group.\n+ """\n+ self.n_iter = n_iter\n+ self.positive_thres = positive_thres\n+ self.negative_thres = negative_thres\n+ self.verbose = verbose\n+ self.n_jobs = n_jobs\n+ self.pre_dispatch = pre_dispatch\n+ self.random_state = random_state\n+\n+ def fit(self, X, y):\n+ """\n+ X: array-like (n_samples x n_features)\n+ y: 1-d array-like (n_samples)\n+ """\n+ X, y = check_X_y(X, y, [\'csr\', \'csc\'], multi_output=False)\n+\n+ def _stochastic_sampling(X, y, random_state=None, positive_thres=-1,\n+ negative_thres=0):\n+ # each iteration select a random number of random subset of\n+ # training samples. 
this is somewhat different from the original\n+ # IRAPS method, but effect is almost the same.\n+ SAMPLE_SIZE = [0.25, 0.75]\n+ n_samples = X.shape[0'..b'lue = main_estimator.discretize_value\n+ less_is_positive = main_estimator.less_is_positive\n+\n+ if less_is_positive:\n+ y_trans = y < discretize_value\n+ else:\n+ y_trans = y > discretize_value\n+\n+ y_pred = clf.predict(X)\n+ if sample_weight is not None:\n+ return self._sign * self._score_func(y_trans, y_pred,\n+ sample_weight=sample_weight,\n+ **self._kwargs)\n+ else:\n+ return self._sign * self._score_func(y_trans, y_pred,\n+ **self._kwargs)\n+\n+\n+# roc_auc\n+binarize_auc_scorer =\\\n+ _BinarizeTargetProbaScorer(metrics.roc_auc_score, 1, {})\n+\n+# average_precision_scorer\n+binarize_average_precision_scorer =\\\n+ _BinarizeTargetProbaScorer(metrics.average_precision_score, 1, {})\n+\n+# roc_auc_scorer\n+iraps_auc_scorer = binarize_auc_scorer\n+\n+# average_precision_scorer\n+iraps_average_precision_scorer = binarize_average_precision_scorer\n+\n+\n+class BinarizeTargetRegressor(BaseEstimator, RegressorMixin):\n+ """\n+ Extend regression estimator to have discretize_value\n+\n+ Parameters\n+ ----------\n+ regressor: object\n+ Estimator object such as derived from sklearn `RegressionMixin`.\n+\n+ z_score: float, default=-1.0\n+ Threshold value based on z_score. 
Will be ignored when\n+ fixed_value is set\n+\n+ value: float, default=None\n+ Threshold value\n+\n+ less_is_positive: boolean, default=True\n+ When target is less the threshold value, it will be converted\n+ to True, False otherwise.\n+\n+ Attributes\n+ ----------\n+ regressor_: object\n+ Fitted regressor\n+\n+ discretize_value: float\n+ The threshold value used to discretize True and False targets\n+ """\n+\n+ def __init__(self, regressor, z_score=-1, value=None,\n+ less_is_positive=True):\n+ self.regressor = regressor\n+ self.z_score = z_score\n+ self.value = value\n+ self.less_is_positive = less_is_positive\n+\n+ def fit(self, X, y, sample_weight=None):\n+ """\n+ Calculate the discretize_value fit the regressor with traning data\n+\n+ Returns\n+ ------\n+ self: object\n+ """\n+ y = check_array(y, accept_sparse=False, force_all_finite=True,\n+ ensure_2d=False, dtype=\'numeric\')\n+ y = column_or_1d(y)\n+\n+ if self.value is None:\n+ discretize_value = y.mean() + y.std() * self.z_score\n+ else:\n+ discretize_value = self.Value\n+ self.discretize_value = discretize_value\n+\n+ self.regressor_ = clone(self.regressor)\n+\n+ if sample_weight is not None:\n+ self.regressor_.fit(X, y, sample_weight=sample_weight)\n+ else:\n+ self.regressor_.fit(X, y)\n+\n+ # attach classifier attributes\n+ if hasattr(self.regressor_, \'feature_importances_\'):\n+ self.feature_importances_ = self.regressor_.feature_importances_\n+ if hasattr(self.regressor_, \'coef_\'):\n+ self.coef_ = self.regressor_.coef_\n+ if hasattr(self.regressor_, \'n_outputs_\'):\n+ self.n_outputs_ = self.regressor_.n_outputs_\n+ if hasattr(self.regressor_, \'n_features_\'):\n+ self.n_features_ = self.regressor_.n_features_\n+\n+ return self\n+\n+ def predict(self, X):\n+ """Predict target value of X\n+ """\n+ check_is_fitted(self, \'regressor_\')\n+ y_pred = self.regressor_.predict(X)\n+ if not np.all((y_pred >= 0) & (y_pred <= 1)):\n+ y_pred = (y_pred - y_pred.min()) / (y_pred.max() - y_pred.min())\n+ if 
self.less_is_positive:\n+ y_pred = 1 - y_pred\n+ return y_pred\n+\n+\n+# roc_auc_scorer\n+regression_auc_scorer = binarize_auc_scorer\n+\n+# average_precision_scorer\n+regression_average_precision_scorer = binarize_average_precision_scorer\n' |
b |
diff -r 2bbbac61e48d -r ec25331946b8 main_macros.xml --- a/main_macros.xml Sun Dec 30 01:57:11 2018 -0500 +++ b/main_macros.xml Tue May 14 18:17:57 2019 -0400 |
[ |
b'@@ -1,14 +1,17 @@\n <macros>\n- <token name="@VERSION@">1.0</token>\n+ <token name="@VERSION@">1.0.0.4</token>\n \n <xml name="python_requirements">\n <requirements>\n <requirement type="package" version="3.6">python</requirement>\n- <requirement type="package" version="0.20.2">scikit-learn</requirement>\n- <requirement type="package" version="0.23.4">pandas</requirement>\n+ <requirement type="package" version="0.20.3">scikit-learn</requirement>\n+ <requirement type="package" version="0.24.2">pandas</requirement>\n <requirement type="package" version="0.80">xgboost</requirement>\n <requirement type="package" version="0.9.13">asteval</requirement>\n- <yield />\n+ <requirement type="package" version="0.6">skrebate</requirement>\n+ <requirement type="package" version="0.4.2">imbalanced-learn</requirement>\n+ <requirement type="package" version="0.16.0">mlxtend</requirement>\n+ <yield/>\n </requirements>\n </xml>\n \n@@ -352,10 +355,10 @@\n <option value="all_columns">All columns</option>\n </param>\n <when value="by_index_number">\n- <param name="@COL_NAME@" multiple="@MULTIPLE@" type="data_column" data_ref="@INFILE@" label="Select target column(s):"/>\n+ <param name="@COL_NAME@" multiple="@MULTIPLE@" type="data_column" use_header_names="true" data_ref="@INFILE@" label="Select target column(s):"/>\n </when>\n <when value="all_but_by_index_number">\n- <param name="@COL_NAME@" multiple="@MULTIPLE@" type="data_column" data_ref="@INFILE@" label="Select target column(s):"/>\n+ <param name="@COL_NAME@" multiple="@MULTIPLE@" type="data_column" use_header_names="true" data_ref="@INFILE@" label="Select target column(s):"/>\n </when>\n <when value="by_header_name">\n <param name="@COL_NAME@" type="text" value="" label="Type header name(s):" help="Comma-separated string. 
For example: target1,target2"/>\n@@ -428,7 +431,7 @@\n <option value="sparse">sparse matrix</option>\n </param>\n <when value="tabular">\n- <expand macro="samples_tabular" multiple1="true"/>\n+ <expand macro="samples_tabular" multiple1="true" multiple2="false"/>\n </when>\n <when value="sparse">\n <expand macro="sparse_target"/>\n@@ -823,6 +826,8 @@\n <option value="StratifiedShuffleSplit">StratifiedShuffleSplit</option>\n <option value="TimeSeriesSplit">TimeSeriesSplit</option>\n <option value="PredefinedSplit">PredefinedSplit</option>\n+ <option value="OrderedKFold">OrderedKFold</option>\n+ <option value="RepeatedOrderedKFold">RepeatedOrderedKFold</option>\n <yield/>\n </xml>\n \n@@ -872,6 +877,16 @@\n <when value="PredefinedSplit">\n <param argument="test_fold" type="text" value="" area="true" label="test_fold" help="List, e.g., [0, 1, -1, 1], represents two test sets, [X[0]] and [X[1], X[3]], X[2] is excluded from any test set due to \'-1\'."/>\n </when>\n+ <when value="OrderedKFold">\n+ <expand macro="cv_n_splits"/>\n+ <expand macro="cv_shuffle"/>\n+ <expand macro="random_state"/>\n+ </when>\n+ <when value="RepeatedOrderedKFold">\n+ <expand macro="cv_n_splits"/>\n+ <param argument="n_repeats" type="integer" value="5"/>\n+ <expand macro="random_state"/>\n+ </when>\n <yield/>\n </xml>\n \n@@ -929,7 +944,13 @@\n </xml>\n \n <xml name="cv_groups" >\n- <param argument="groups" type="text" value="" area="true" label="Groups" help="Group lables in a list. 
e.g., [1, 1, 2, 2, 3, 3, 3]"/>\n+ <section name="groups_selector" title="Groups column selector" expanded="true">\n+ <param name="infile_g" type="data" format="tabular" label="Choose dataset containing groups info:"/>\n+ <param name="header_g" type="boolean" optional="true" truevalue="booltrue" falsevalue="boolfalse" checked="False" label="Does the dataset contain header:" />\n+ <conditional name="column_selector_options_g">\n+ <expand macro="sa'..b' </sanitizer>\n- </param>\n- </xml>\n-\n <xml name="search_cv_options">\n <expand macro="scoring_selection"/>\n <expand macro="model_validation_common_options"/>\n- <expand macro="pre_dispatch" value="2*n_jobs" help="Controls the number of jobs that get dispatched during parallel execution"/>\n+ <!--expand macro="pre_dispatch" default_value="2*n_jobs" help="Controls the number of jobs that get dispatched during parallel execution"/-->\n <param argument="iid" type="boolean" truevalue="booltrue" falsevalue="boolfalse" checked="true" label="iid" help="If True, data is identically distributed across the folds"/>\n <param argument="refit" type="boolean" truevalue="booltrue" falsevalue="boolfalse" checked="true" label="refit" help="Refit an estimator using the best found parameters on the whole dataset."/>\n <param argument="error_score" type="boolean" truevalue="booltrue" falsevalue="boolfalse" checked="true" label="Raise fit error:" help="If false, the metric score is assigned to NaN if an error occurs in estimator fitting and FitFailedWarning is raised."/>\n@@ -1403,12 +1454,12 @@\n <conditional name="estimator_selector">\n <param name="selected_module" type="select" label="Choose the module that contains target estimator:" >\n <expand macro="estimator_module_options">\n- <option value="customer_estimator">Load a customer estimator</option>\n+ <option value="custom_estimator">Load a custom estimator</option>\n </expand>\n </param>\n <expand macro="estimator_suboptions">\n- <when value="customer_estimator">\n- <param 
name="c_estimator" type="data" format="zip" label="Choose the dataset containing the customer estimator or pipeline:"/>\n+ <when value="custom_estimator">\n+ <param name="c_estimator" type="data" format="zip" label="Choose the dataset containing the custom estimator or pipeline:"/>\n </when>\n </expand>\n </conditional>\n@@ -1591,6 +1642,7 @@\n <option value="over_sampling.SMOTENC">over_sampling.SMOTENC</option>\n <option value="combine.SMOTEENN">combine.SMOTEENN</option>\n <option value="combine.SMOTETomek">combine.SMOTETomek</option>\n+ <option value="Z_RandomOverSampler">Z_RandomOverSampler - for regression</option>\n </param>\n <when value="under_sampling.ClusterCentroids">\n <expand macro="estimator_params_text"\n@@ -1668,6 +1720,33 @@\n <expand macro="estimator_params_text"\n help="Default(=blank): sampling_strategy=\'auto\', random_state=None, smote=None, tomek=None."/>\n </when>\n+ <when value="Z_RandomOverSampler">\n+ <expand macro="estimator_params_text"\n+ help="Default(=blank): sampling_strategy=\'auto\', random_state=None, negative_thres=0, positive_thres=-1."/>\n+ </when>\n+ </conditional>\n+ </xml>\n+\n+ <xml name="stacking_ensemble_inputs">\n+ <section name="options" title="Advanced Options" expanded="false">\n+ <yield/>\n+ <param argument="use_features_in_secondary" type="boolean" truevalue="booltrue" falsevalue="boolfalse" checked="false"/>\n+ <param argument="store_train_meta_features" type="boolean" truevalue="booltrue" falsevalue="boolfalse" checked="false"/>\n+ </section>\n+ </xml>\n+\n+ <xml name="stacking_base_estimator">\n+ <conditional name="estimator_selector">\n+ <param name="selected_module" type="select" label="Choose the module that contains target estimator:" >\n+ <expand macro="estimator_module_options">\n+ <option value="custom_estimator">Load a custom estimator</option>\n+ </expand>\n+ </param>\n+ <expand macro="estimator_suboptions">\n+ <when value="custom_estimator">\n+ <param name="c_estimator" type="data" format="zip" 
label="Choose the dataset containing the custom estimator or pipeline"/>\n+ </when>\n+ </expand>\n </conditional>\n </xml>\n \n' |
b |
diff -r 2bbbac61e48d -r ec25331946b8 model_validations.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/model_validations.py Tue May 14 18:17:57 2019 -0400 |
[ |
b'@@ -0,0 +1,252 @@\n+"""\n+class\n+-----\n+OrderedKFold\n+RepeatedOrderedKold\n+\n+\n+function\n+--------\n+train_test_split\n+"""\n+\n+import numpy as np\n+import warnings\n+\n+from itertools import chain\n+from math import ceil, floor\n+from sklearn.model_selection import (GroupShuffleSplit, ShuffleSplit,\n+ StratifiedShuffleSplit)\n+from sklearn.model_selection._split import _BaseKFold, _RepeatedSplits\n+from sklearn.utils import check_random_state, indexable, safe_indexing\n+from sklearn.utils.validation import _num_samples, check_array\n+\n+\n+def _validate_shuffle_split(n_samples, test_size, train_size,\n+ default_test_size=None):\n+ """\n+ Validation helper to check if the test/test sizes are meaningful wrt to the\n+ size of the data (n_samples)\n+ """\n+ if test_size is None and train_size is None:\n+ test_size = default_test_size\n+\n+ test_size_type = np.asarray(test_size).dtype.kind\n+ train_size_type = np.asarray(train_size).dtype.kind\n+\n+ if (test_size_type == \'i\' and (test_size >= n_samples or test_size <= 0)\n+ or test_size_type == \'f\' and (test_size <= 0 or test_size >= 1)):\n+ raise ValueError(\'test_size={0} should be either positive and smaller\'\n+ \' than the number of samples {1} or a float in the \'\n+ \'(0, 1) range\'.format(test_size, n_samples))\n+\n+ if (train_size_type == \'i\' and (train_size >= n_samples or train_size <= 0)\n+ or train_size_type == \'f\' and (train_size <= 0 or train_size >= 1)):\n+ raise ValueError(\'train_size={0} should be either positive and smaller\'\n+ \' than the number of samples {1} or a float in the \'\n+ \'(0, 1) range\'.format(train_size, n_samples))\n+\n+ if train_size is not None and train_size_type not in (\'i\', \'f\'):\n+ raise ValueError("Invalid value for train_size: {}".format(train_size))\n+ if test_size is not None and test_size_type not in (\'i\', \'f\'):\n+ raise ValueError("Invalid value for test_size: {}".format(test_size))\n+\n+ if (train_size_type == \'f\' and test_size_type == \'f\' 
and\n+ train_size + test_size > 1):\n+ raise ValueError(\n+ \'The sum of test_size and train_size = {}, should be in the (0, 1)\'\n+ \' range. Reduce test_size and/or train_size.\'\n+ .format(train_size + test_size))\n+\n+ if test_size_type == \'f\':\n+ n_test = ceil(test_size * n_samples)\n+ elif test_size_type == \'i\':\n+ n_test = float(test_size)\n+\n+ if train_size_type == \'f\':\n+ n_train = floor(train_size * n_samples)\n+ elif train_size_type == \'i\':\n+ n_train = float(train_size)\n+\n+ if train_size is None:\n+ n_train = n_samples - n_test\n+ elif test_size is None:\n+ n_test = n_samples - n_train\n+\n+ if n_train + n_test > n_samples:\n+ raise ValueError(\'The sum of train_size and test_size = %d, \'\n+ \'should be smaller than the number of \'\n+ \'samples %d. Reduce test_size and/or \'\n+ \'train_size.\' % (n_train + n_test, n_samples))\n+\n+ n_train, n_test = int(n_train), int(n_test)\n+\n+ if n_train == 0:\n+ raise ValueError(\n+ \'With n_samples={}, test_size={} and train_size={}, the \'\n+ \'resulting train set will be empty. 
Adjust any of the \'\n+ \'aforementioned parameters.\'.format(n_samples, test_size,\n+ train_size)\n+ )\n+\n+ return n_train, n_test\n+\n+\n+def train_test_split(*arrays, **options):\n+ """Extend sklearn.model_selection.train_test_slit to have group split.\n+\n+ Parameters\n+ ----------\n+ *arrays : sequence of indexables with same length / shape[0]\n+ Allowed inputs are lists, numpy arrays, scipy-sparse\n+ matrices or pandas dataframes.\n+\n+ test_size : float, int or None, optional (default=None)\n+ If float, should be betw'..b'arrays == 0:\n+ raise ValueError("At least one array required as input")\n+ test_size = options.pop(\'test_size\', None)\n+ train_size = options.pop(\'train_size\', None)\n+ random_state = options.pop(\'random_state\', None)\n+ shuffle = options.pop(\'shuffle\', \'simple\')\n+ labels = options.pop(\'labels\', None)\n+\n+ if options:\n+ raise TypeError("Invalid parameters passed: %s" % str(options))\n+\n+ arrays = indexable(*arrays)\n+\n+ n_samples = _num_samples(arrays[0])\n+ if shuffle == \'group\':\n+ if labels is None:\n+ raise ValueError("When shuffle=\'group\', "\n+ "labels should not be None!")\n+ labels = check_array(labels, ensure_2d=False, dtype=None)\n+ uniques = np.unique(labels)\n+ n_samples = uniques.size\n+\n+ n_train, n_test = _validate_shuffle_split(n_samples, test_size, train_size,\n+ default_test_size=0.25)\n+\n+ shuffle_options = dict(test_size=n_test,\n+ train_size=n_train,\n+ random_state=random_state)\n+\n+ if shuffle is None:\n+ if labels is not None:\n+ warnings.warn("The `labels` is ignored for "\n+ "shuffle being None!")\n+\n+ train = np.arange(n_train)\n+ test = np.arange(n_train, n_train + n_test)\n+\n+ elif shuffle == \'simple\':\n+ if labels is not None:\n+ warnings.warn("The `labels` is not needed and therefore "\n+ "ignored for ShuffleSplit, as shuffle=\'simple\'!")\n+\n+ cv = ShuffleSplit(**shuffle_options)\n+ train, test = next(cv.split(X=arrays[0], y=None))\n+\n+ elif shuffle == \'stratified\':\n+ cv = 
StratifiedShuffleSplit(**shuffle_options)\n+ train, test = next(cv.split(X=arrays[0], y=labels))\n+\n+ elif shuffle == \'group\':\n+ cv = GroupShuffleSplit(**shuffle_options)\n+ train, test = next(cv.split(X=arrays[0], y=None, groups=labels))\n+\n+ else:\n+ raise ValueError("The argument `shuffle` only supports None, "\n+ "\'simple\', \'stratified\' and \'group\', but got `%s`!"\n+ % shuffle)\n+\n+ return list(chain.from_iterable((safe_indexing(a, train),\n+ safe_indexing(a, test)) for a in arrays))\n+\n+\n+class OrderedKFold(_BaseKFold):\n+ """\n+ Split into K fold based on ordered target value\n+\n+ Parameters\n+ ----------\n+ n_splits : int, default=3\n+ Number of folds. Must be at least 2.\n+ shuffle: bool\n+ random_state: None or int\n+ """\n+\n+ def __init__(self, n_splits=3, shuffle=False, random_state=None):\n+ super(OrderedKFold, self).__init__(n_splits, shuffle, random_state)\n+\n+ def _iter_test_indices(self, X, y, groups=None):\n+ n_samples = _num_samples(X)\n+ n_splits = self.n_splits\n+ y = np.asarray(y)\n+ sorted_index = np.argsort(y)\n+ if self.shuffle:\n+ current = 0\n+ rng = check_random_state(self.random_state)\n+ for i in range(n_samples // int(n_splits)):\n+ start, stop = current, current + n_splits\n+ rng.shuffle(sorted_index[start:stop])\n+ current = stop\n+ rng.shuffle(sorted_index[current:])\n+\n+ for i in range(n_splits):\n+ yield sorted_index[i:n_samples:n_splits]\n+\n+\n+class RepeatedOrderedKFold(_RepeatedSplits):\n+ """ Repeated OrderedKFold runs mutiple times with different randomization.\n+\n+ Parameters\n+ ----------\n+ n_splits : int, default=5\n+ Number of folds. Must be at least 2.\n+\n+ n_repeats : int, default=5\n+ Number of times cross-validator to be repeated.\n+\n+ random_state: int, RandomState instance or None. Optional\n+ """\n+ def __init__(self, n_splits=5, n_repeats=5, random_state=None):\n+ super(RepeatedOrderedKFold, self).__init__(\n+ OrderedKFold, n_repeats, random_state, n_splits=n_splits)\n' |
b |
diff -r 2bbbac61e48d -r ec25331946b8 pk_whitelist.json --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/pk_whitelist.json Tue May 14 18:17:57 2019 -0400 |
[ |
b'@@ -0,0 +1,768 @@\n+{ "SK_NAMES": [\n+ "sklearn._ASSUME_FINITE", "sklearn._isotonic._inplace_contiguous_isotonic_regression",\n+ "sklearn._isotonic._make_unique", "sklearn.base.BaseEstimator",\n+ "sklearn.base.BiclusterMixin", "sklearn.base.ClassifierMixin",\n+ "sklearn.base.ClusterMixin", "sklearn.base.DensityMixin",\n+ "sklearn.base.MetaEstimatorMixin", "sklearn.base.RegressorMixin",\n+ "sklearn.base.TransformerMixin", "sklearn.base._first_and_last_element",\n+ "sklearn.base._pprint", "sklearn.base.clone",\n+ "sklearn.base.is_classifier", "sklearn.base.is_regressor",\n+ "sklearn.clone", "sklearn.cluster.AffinityPropagation",\n+ "sklearn.cluster.AgglomerativeClustering", "sklearn.cluster.Birch",\n+ "sklearn.cluster.DBSCAN", "sklearn.cluster.FeatureAgglomeration",\n+ "sklearn.cluster.KMeans", "sklearn.cluster.MeanShift",\n+ "sklearn.cluster.MiniBatchKMeans", "sklearn.cluster.SpectralBiclustering",\n+ "sklearn.cluster.SpectralClustering", "sklearn.cluster.SpectralCoclustering",\n+ "sklearn.cluster._dbscan_inner.dbscan_inner", "sklearn.cluster._feature_agglomeration.AgglomerationTransform",\n+ "sklearn.cluster._hierarchical.WeightedEdge", "sklearn.cluster._hierarchical._get_parents",\n+ "sklearn.cluster._hierarchical._hc_get_descendent", "sklearn.cluster._hierarchical.average_merge",\n+ "sklearn.cluster._hierarchical.compute_ward_dist", "sklearn.cluster._hierarchical.hc_get_heads",\n+ "sklearn.cluster._hierarchical.max_merge", "sklearn.cluster._k_means._assign_labels_array",\n+ "sklearn.cluster._k_means._assign_labels_csr", "sklearn.cluster._k_means._centers_dense",\n+ "sklearn.cluster._k_means._centers_sparse", "sklearn.cluster._k_means._mini_batch_update_csr",\n+ "sklearn.cluster._k_means_elkan.k_means_elkan", "sklearn.cluster.affinity_propagation",\n+ "sklearn.cluster.affinity_propagation_.AffinityPropagation", "sklearn.cluster.affinity_propagation_.affinity_propagation",\n+ "sklearn.cluster.bicluster.BaseSpectral", 
"sklearn.cluster.bicluster.SpectralBiclustering",\n+ "sklearn.cluster.bicluster.SpectralCoclustering", "sklearn.cluster.bicluster._bistochastic_normalize",\n+ "sklearn.cluster.bicluster._log_normalize", "sklearn.cluster.bicluster._scale_normalize",\n+ "sklearn.cluster.birch.Birch", "sklearn.cluster.birch._CFNode",\n+ "sklearn.cluster.birch._CFSubcluster", "sklearn.cluster.birch._iterate_sparse_X",\n+ "sklearn.cluster.birch._split_node", "sklearn.cluster.dbscan",\n+ "sklearn.cluster.dbscan_.DBSCAN", "sklearn.cluster.dbscan_.dbscan",\n+ "sklearn.cluster.estimate_bandwidth", "sklearn.cluster.get_bin_seeds",\n+ "sklearn.cluster.hierarchical.AgglomerativeClustering", "sklearn.cluster.hierarchical.FeatureAgglomeration",\n+ "sklearn.cluster.hierarchical._TREE_BUILDERS", "sklearn.cluster.hierarchical._average_linkage",\n+ "sklearn.cluster.hierarchical._complete_linkage", "sklearn.cluster.hierarchical._fix_connectivity",\n+ "sklearn.cluster.hierarchical._hc_cut", "sklearn.cluster.hierarchical.linkage_tree",\n+ "sklearn.cluster.hierarchical.ward_tree", "sklearn.cluster.k_means",\n+ "sklearn.cluster.k_means_.FLOAT_DTYPES", "sklearn.cluster.k_means_.KMeans",\n+ "sklearn.cluster.k_means_.MiniBatchKMeans", "sklearn.cluster.k_means_._init_centroids",\n+ "sklearn.cluster.k_means_._k_init", "sklearn.cluster.k_means_._kmeans_single_elkan",\n+ "sklearn.cluster.k_means_._kmeans_single_lloyd", "sklearn.cluster.k_means_._labels_inertia",\n+ "sklearn.cluster.k_means_._labels_inertia_precompute_dense", "sklearn.cluster.k_means_._mini_batch_convergence",\n+ "sklearn.cluster.k_means_._mini_batch_step", "sklearn.cluster.k_means_._tolerance",\n+ "sklearn.cluster.k_means_._validate_center_shape", "sklearn.cluster.k_means_.k_means",\n+ "sklearn.cluster.k_means_.string_types", "sklearn.cluster.linkage_tree",\n+ "sklearn.cluster.mean_shift", "sklearn.cluster.mean_shift_.MeanShift",\n+ "sklearn.cluster.mean_shift_._mean_shift_single_seed", "sklearn.cluster'..b'ltiSURFstar",\n+ "skrebate.ReliefF", 
"skrebate.SURF",\n+ "skrebate.SURFstar", "skrebate.TuRF",\n+ "skrebate.multisurf.MultiSURF", "skrebate.multisurfstar.MultiSURFstar",\n+ "skrebate.relieff.ReliefF", "skrebate.scoring_utils.MultiSURF_compute_scores",\n+ "skrebate.scoring_utils.MultiSURFstar_compute_scores", "skrebate.scoring_utils.ReliefF_compute_scores",\n+ "skrebate.scoring_utils.SURF_compute_scores", "skrebate.scoring_utils.SURFstar_compute_scores",\n+ "skrebate.scoring_utils.compute_score", "skrebate.scoring_utils.get_row_missing",\n+ "skrebate.scoring_utils.ramp_function", "skrebate.surf.SURF",\n+ "skrebate.surfstar.SURFstar", "skrebate.turf.TuRF"\n+ ],\n+\n+ "XGB_NAMES": [\n+ "xgboost.Booster", "xgboost.DMatrix",\n+ "xgboost.VERSION_FILE", "xgboost.XGBClassifier",\n+ "xgboost.XGBModel", "xgboost.XGBRegressor",\n+ "xgboost.callback._fmt_metric", "xgboost.callback._get_callback_context",\n+ "xgboost.callback.early_stop", "xgboost.callback.print_evaluation",\n+ "xgboost.callback.record_evaluation", "xgboost.callback.reset_learning_rate",\n+ "xgboost.compat.PANDAS_INSTALLED", "xgboost.compat.PY3",\n+ "xgboost.compat.SKLEARN_INSTALLED", "xgboost.compat.STRING_TYPES",\n+ "xgboost.compat.py_str", "xgboost.core.Booster",\n+ "xgboost.core.CallbackEnv", "xgboost.core.DMatrix",\n+ "xgboost.core.EarlyStopException", "xgboost.core.PANDAS_DTYPE_MAPPER",\n+ "xgboost.core.PANDAS_INSTALLED", "xgboost.core.PY3",\n+ "xgboost.core.STRING_TYPES", "xgboost.core.XGBoostError",\n+ "xgboost.core._check_call", "xgboost.core._load_lib",\n+ "xgboost.core._maybe_pandas_data", "xgboost.core._maybe_pandas_label",\n+ "xgboost.core.c_array", "xgboost.core.c_str",\n+ "xgboost.core.ctypes2buffer", "xgboost.core.ctypes2numpy",\n+ "xgboost.core.from_cstr_to_pystr", "xgboost.core.from_pystr_to_cstr",\n+ "xgboost.cv", "xgboost.f",\n+ "xgboost.libpath.XGBoostLibraryNotFound", "xgboost.libpath.find_lib_path",\n+ "xgboost.plot_importance", "xgboost.plot_tree",\n+ "xgboost.plotting._EDGEPAT", "xgboost.plotting._EDGEPAT2",\n+ 
"xgboost.plotting._LEAFPAT", "xgboost.plotting._NODEPAT",\n+ "xgboost.plotting._parse_edge", "xgboost.plotting._parse_node",\n+ "xgboost.plotting.plot_importance", "xgboost.plotting.plot_tree",\n+ "xgboost.plotting.to_graphviz", "xgboost.rabit.DTYPE_ENUM__",\n+ "xgboost.rabit.STRING_TYPES", "xgboost.rabit._init_rabit",\n+ "xgboost.rabit.allreduce", "xgboost.rabit.broadcast",\n+ "xgboost.rabit.finalize", "xgboost.rabit.get_processor_name",\n+ "xgboost.rabit.get_rank", "xgboost.rabit.get_world_size",\n+ "xgboost.rabit.init", "xgboost.rabit.tracker_print",\n+ "xgboost.rabit.version_number", "xgboost.sklearn.SKLEARN_INSTALLED",\n+ "xgboost.sklearn.XGBClassifier", "xgboost.sklearn.XGBModel",\n+ "xgboost.sklearn.XGBRegressor", "xgboost.sklearn._objective_decorator",\n+ "xgboost.to_graphviz", "xgboost.train",\n+ "xgboost.training.CVPack", "xgboost.training.SKLEARN_INSTALLED",\n+ "xgboost.training.STRING_TYPES", "xgboost.training._train_internal",\n+ "xgboost.training.aggcv", "xgboost.training.cv",\n+ "xgboost.training.mknfold", "xgboost.training.train"\n+ ],\n+\n+\n+ "NUMPY_NAMES": [\n+ "numpy.core.multiarray._reconstruct", "numpy.ndarray",\n+ "numpy.dtype", "numpy.core.multiarray.scalar", "numpy.random.__RandomState_ctor",\n+ "numpy.ma.core._mareconstruct", "numpy.ma.core.MaskedArray"\n+ ],\n+\n+ "IMBLEARN_NAMES":[\n+ "imblearn.pipeline.Pipeline", "imblearn.over_sampling._random_over_sampler.RandomOverSampler",\n+ "imblearn.under_sampling._prototype_selection._edited_nearest_neighbours.EditedNearestNeighbours"\n+ ],\n+\n+ "MLXTEND_NAMES":[\n+ "mlxtend.classifier.stacking_cv_classification.StackingCVClassifier",\n+ "mlxtend.classifier.stacking_classification.StackingClassifier",\n+ "mlxtend.regressor.stacking_cv_regression.StackingCVRegressor",\n+ "mlxtend.regressor.stacking_regression.StackingRegressor"\n+ ]\n+}\n\\ No newline at end of file\n' |
b |
diff -r 2bbbac61e48d -r ec25331946b8 preprocessors.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/preprocessors.py Tue May 14 18:17:57 2019 -0400 |
[ |
"""
Custom preprocessors.

class
-----
Z_RandomOverSampler
TDMScaler

function
--------
_get_quantiles
"""

import imblearn
import numpy as np

from collections import Counter
from imblearn.over_sampling.base import BaseOverSampler
from imblearn.over_sampling import RandomOverSampler
from imblearn.pipeline import Pipeline as imbPipeline
from imblearn.utils import check_target_type
from scipy import sparse
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing.data import _handle_zeros_in_scale
from sklearn.utils import check_array, safe_indexing
from sklearn.utils.fixes import nanpercentile
from sklearn.utils.validation import (check_is_fitted, check_X_y,
                                      FLOAT_DTYPES)


class Z_RandomOverSampler(BaseOverSampler):
    """
    Random over-sampler driven by the z-score of a continuous target.

    The target `y` is standardized, every sample is given a temporary
    label (0, 1 or -1) by comparing its z-score with two thresholds, and
    `imblearn.over_sampling.RandomOverSampler` is fitted on those
    temporary labels. The sample indices it selects are then used to
    index the original `X` and the original (unlabelled) `y`.

    Parameters
    ----------
    sampling_strategy : passed through to `RandomOverSampler`
    return_indices : bool, default=False
        If True, `_fit_resample` also returns the selected indices.
    random_state : passed through to `RandomOverSampler`
    ratio : passed through to `RandomOverSampler`
    negative_thres : float, default=0
        Samples whose z-score is `> negative_thres` get label 0.
    positive_thres : float, default=-1
        Samples whose z-score is `<= positive_thres` get label 1.

    Attributes
    ----------
    sample_indices_ : ndarray
        Indices selected by the inner `RandomOverSampler`.
    """

    def __init__(self, sampling_strategy='auto',
                 return_indices=False,
                 random_state=None,
                 ratio=None,
                 negative_thres=0,
                 positive_thres=-1):
        super(Z_RandomOverSampler, self).__init__(
            sampling_strategy=sampling_strategy, ratio=ratio)
        self.random_state = random_state
        self.return_indices = return_indices
        self.negative_thres = negative_thres
        self.positive_thres = positive_thres

    @staticmethod
    def _check_X_y(X, y):
        """Validate `X` and `y`; return them with the `binarize_y` flag."""
        y, binarize_y = check_target_type(y, indicate_one_vs_all=True)
        X, y = check_X_y(X, y, accept_sparse=['csr', 'csc'], dtype=None)
        return X, y, binarize_y

    def _fit_resample(self, X, y):
        """
        Over-sample `X` and `y` according to z-score derived labels.

        Returns
        -------
        (X_resampled, y_resampled) or, when `return_indices` is True,
        (X_resampled, y_resampled, sample_indices).
        """
        # convert y to z_score
        y_z = (y - y.mean()) / y.std()

        # Both masks are taken from the z-scores *before* any label is
        # written. Boolean masks replace the former per-element
        # `x not in ndarray` scan, which was quadratic in n_samples.
        negative_mask = y_z > self.negative_thres
        positive_mask = y_z <= self.positive_thres
        unclassified_mask = ~(negative_mask | positive_mask)

        # Assignment order matters if the thresholds overlap: a sample
        # matching both masks ends up labelled 1.
        y_z[negative_mask] = 0
        y_z[positive_mask] = 1
        y_z[unclassified_mask] = -1

        ros = RandomOverSampler(
            sampling_strategy=self.sampling_strategy,
            random_state=self.random_state,
            ratio=self.ratio)
        # Only the chosen indices are needed; the resampled arrays are
        # rebuilt below from the original y rather than the labels.
        ros.fit_resample(X, y_z)
        sample_indices = ros.sample_indices_

        print("Before sampler: %s. Total after: %s"
              % (Counter(y_z), sample_indices.shape))

        self.sample_indices_ = np.array(sample_indices)

        X_resampled = safe_indexing(X, sample_indices)
        y_resampled = safe_indexing(y, sample_indices)
        if self.return_indices:
            return X_resampled, y_resampled, sample_indices
        return X_resampled, y_resampled


def _get_quantiles(X, quantile_range):
    """
    Calculate column percentiles for 2d array

    Parameters
    ----------
    X : {array-like, sparse matrix}, shape [n_samples, n_features]
    quantile_range : sequence of percentiles passed to `nanpercentile`

    Returns
    -------
    quantiles : ndarray, shape [len(quantile_range), n_features]
    """
    is_sparse = sparse.issparse(X)
    if is_sparse:
        # The indptr slicing below addresses one *column* at a time,
        # which is only valid for CSC storage. Convert once up front so
        # CSR/COO input no longer yields wrong (or failing) results.
        X = X.tocsc()

    quantiles = []
    for feature_idx in range(X.shape[1]):
        if is_sparse:
            column_nnz_data = X.data[
                X.indptr[feature_idx]: X.indptr[feature_idx + 1]]
            # Implicit zeros are restored as padding; their position is
            # irrelevant for a percentile.
            column_data = np.zeros(shape=X.shape[0], dtype=X.dtype)
            column_data[:len(column_nnz_data)] = column_nnz_data
        else:
            column_data = X[:, feature_idx]
        quantiles.append(nanpercentile(column_data, quantile_range))

    return np.transpose(quantiles)


class TDMScaler(BaseEstimator, TransformerMixin):
    """
    Scale features using Training Distribution Matching (TDM) algorithm

    Parameters
    ----------
    q_lower : float, default=25.0
        Lower percentile, must satisfy 0 <= q_lower <= q_upper.
    q_upper : float, default=75.0
        Upper percentile, must satisfy q_lower <= q_upper <= 100.

    Attributes
    ----------
    q_lower_, q_upper_ : float
        Percentiles of the training data (computed over the whole array).
    iqr_ : float
        `q_upper_ - q_lower_`, with zero replaced by 1.
    max_, min_ : float
        Global maximum / minimum of the training data.

    References
    ----------
    .. [1] Thompson JA, Tan J and Greene CS (2016) Cross-platform
           normalization of microarray and RNA-seq data for machine
           learning applications. PeerJ 4, e1621.
    """

    def __init__(self, q_lower=25.0, q_upper=75.0, ):
        self.q_lower = q_lower
        self.q_upper = q_upper

    def fit(self, X, y=None):
        """
        Record the quantiles and value range of the training data.

        Parameters
        ----------
        X : array-like, shape [n_samples, n_features]
        y : ignored

        Raises
        ------
        ValueError
            If not 0 <= q_lower <= q_upper <= 100.
        """
        X = check_array(X, copy=True, estimator=self, dtype=FLOAT_DTYPES,
                        force_all_finite=True)

        if not 0 <= self.q_lower <= self.q_upper <= 100:
            raise ValueError("Invalid quantile parameter values: "
                             "q_lower %s, q_upper: %s"
                             % (str(self.q_lower), str(self.q_upper)))

        # TODO sparse data
        quantiles = nanpercentile(X, (self.q_lower, self.q_upper))
        iqr = quantiles[1] - quantiles[0]

        self.q_lower_ = quantiles[0]
        self.q_upper_ = quantiles[1]
        self.iqr_ = _handle_zeros_in_scale(iqr, copy=False)

        self.max_ = np.nanmax(X)
        self.min_ = np.nanmin(X)

        return self

    def transform(self, X):
        """
        Clip `X` to bounds derived from the training distribution and
        rescale it to the training data range [min_, max_].

        Parameters
        ----------
        X : array-like
            The data to scale. A copy is made; the input is not modified.

        Returns
        -------
        X : ndarray of the same shape as the input
        """
        # The attributes must be given as one list: a second positional
        # string would be interpreted as the error message, not as an
        # attribute to check.
        check_is_fitted(self, ['iqr_', 'max_'])
        X = check_array(X, copy=True, estimator=self, dtype=FLOAT_DTYPES,
                        force_all_finite=True)

        # TODO sparse data
        # Distance of the training extremes from the quantiles, in IQRs.
        train_upper_scale = (self.max_ - self.q_upper_) / self.iqr_
        train_lower_scale = (self.q_lower_ - self.min_) / self.iqr_

        test_quantiles = nanpercentile(X, (self.q_lower, self.q_upper))
        test_iqr = _handle_zeros_in_scale(
            test_quantiles[1] - test_quantiles[0], copy=False)

        test_upper_bound = test_quantiles[1] + train_upper_scale * test_iqr
        test_lower_bound = test_quantiles[0] - train_lower_scale * test_iqr

        # The lower bound never goes below the smallest observed value.
        test_min = np.nanmin(X)
        if test_lower_bound < test_min:
            test_lower_bound = test_min

        X[X > test_upper_bound] = test_upper_bound
        X[X < test_lower_bound] = test_lower_bound

        # Guard against a degenerate (zero-width) interval, which would
        # otherwise turn the whole output into NaN/inf.
        bound_range = _handle_zeros_in_scale(
            test_upper_bound - test_lower_bound, copy=False)

        X = (X - test_lower_bound) / bound_range\
            * (self.max_ - self.min_) + self.min_

        return X

    def inverse_transform(self, X):
        """
        Scale the data back to the original state

        Raises
        ------
        NotImplementedError
            Always; the inverse transformation is not available.
        """
        raise NotImplementedError("Inverse transformation is not implemented!")
b |
diff -r 2bbbac61e48d -r ec25331946b8 search_model_validation.py --- a/search_model_validation.py Sun Dec 30 01:57:11 2018 -0500 +++ b/search_model_validation.py Tue May 14 18:17:57 2019 -0400 |
[ |
b'@@ -1,7 +1,8 @@\n+import argparse\n+import collections\n import imblearn\n import json\n import numpy as np\n-import os\n import pandas\n import pickle\n import skrebate\n@@ -9,93 +10,124 @@\n import sys\n import xgboost\n import warnings\n+import iraps_classifier\n+import model_validations\n+import preprocessors\n+import feature_selectors\n from imblearn import under_sampling, over_sampling, combine\n-from imblearn.pipeline import Pipeline as imbPipeline\n-from sklearn import (cluster, compose, decomposition, ensemble, feature_extraction,\n- feature_selection, gaussian_process, kernel_approximation, metrics,\n- model_selection, naive_bayes, neighbors, pipeline, preprocessing,\n- svm, linear_model, tree, discriminant_analysis)\n+from scipy.io import mmread\n+from mlxtend import classifier, regressor\n+from sklearn import (cluster, compose, decomposition, ensemble,\n+ feature_extraction, feature_selection,\n+ gaussian_process, kernel_approximation, metrics,\n+ model_selection, naive_bayes, neighbors,\n+ pipeline, preprocessing, svm, linear_model,\n+ tree, discriminant_analysis)\n from sklearn.exceptions import FitFailedWarning\n from sklearn.externals import joblib\n-from utils import get_cv, get_scoring, get_X_y, load_model, read_columns, SafeEval\n+from sklearn.model_selection._validation import _score\n+\n+from utils import (SafeEval, get_cv, get_scoring, get_X_y,\n+ load_model, read_columns)\n+from model_validations import train_test_split\n \n \n-N_JOBS = int(os.environ.get(\'GALAXY_SLOTS\', 1))\n+N_JOBS = int(__import__(\'os\').environ.get(\'GALAXY_SLOTS\', 1))\n+CACHE_DIR = \'./cached\'\n+NON_SEARCHABLE = (\'n_jobs\', \'pre_dispatch\', \'memory\', \'steps\',\n+ \'nthread\', \'verbose\')\n \n \n-def get_search_params(params_builder):\n+def _eval_search_params(params_builder):\n search_params = {}\n- safe_eval = SafeEval(load_scipy=True, load_numpy=True)\n- safe_eval_es = SafeEval(load_estimators=True)\n \n for p in params_builder[\'param_set\']:\n- search_p 
= p[\'search_param_selector\'][\'search_p\']\n- if search_p.strip() == \'\':\n+ search_list = p[\'sp_list\'].strip()\n+ if search_list == \'\':\n continue\n- param_type = p[\'search_param_selector\'][\'selected_param_type\']\n+\n+ param_name = p[\'sp_name\']\n+ if param_name.lower().endswith(NON_SEARCHABLE):\n+ print("Warning: `%s` is not eligible for search and was "\n+ "omitted!" % param_name)\n+ continue\n \n- lst = search_p.split(\':\')\n- assert (len(lst) == 2), "Error, make sure there is one and only one colon in search parameter input."\n- literal = lst[1].strip()\n- param_name = lst[0].strip()\n- if param_name:\n- if param_name.lower() == \'n_jobs\':\n- sys.exit("Parameter `%s` is invalid for search." %param_name)\n- elif not param_name.endswith(\'-\'):\n- ev = safe_eval(literal)\n- if param_type == \'final_estimator_p\':\n- search_params[\'estimator__\' + param_name] = ev\n- else:\n- search_params[\'preprocessing_\' + param_type[5:6] + \'__\' + param_name] = ev\n- else:\n- # only for estimator eval, add `-` to the end of param\n- #TODO maybe add regular express check\n- ev = safe_eval_es(literal)\n- for obj in ev:\n- if \'n_jobs\' in obj.get_params():\n- obj.set_params( n_jobs=N_JOBS )\n- if param_type == \'final_estimator_p\':\n- search_params[\'estimator__\' + param_name[:-1]] = ev\n- else:\n- search_params[\'preprocessing_\' + param_type[5:6] + \'__\' + param_name[:-1]] = ev\n- elif param_type != \'final_estimator_p\':\n- #TODO regular express check ?\n- '..b'_train_test_split == \'yes\':\n+ # make sure refit is choosen\n+ if not options[\'refit\']:\n+ raise ValueError("Refit must be `True` for shuffle splitting!")\n+ split_options = params[\'train_test_split\']\n+\n+ # splits\n+ if split_options[\'shuffle\'] == \'stratified\':\n+ split_options[\'labels\'] = y\n+ X, X_test, y, y_test = train_test_split(X, y, **split_options)\n+ elif split_options[\'shuffle\'] == \'group\':\n+ if not groups:\n+ raise ValueError("No group based CV option was "\n+ "choosen 
for group shuffle!")\n+ split_options[\'labels\'] = groups\n+ X, X_test, y, y_test, groups, _ =\\\n+ train_test_split(X, y, **split_options)\n+ else:\n+ if split_options[\'shuffle\'] == \'None\':\n+ split_options[\'shuffle\'] = None\n+ X, X_test, y, y_test =\\\n+ train_test_split(X, y, **split_options)\n+ # end train_test_split\n \n if options[\'error_score\'] == \'raise\':\n- searcher.fit(X, y)\n+ searcher.fit(X, y, groups=groups)\n else:\n warnings.simplefilter(\'always\', FitFailedWarning)\n with warnings.catch_warnings(record=True) as w:\n try:\n- searcher.fit(X, y)\n+ searcher.fit(X, y, groups=groups)\n except ValueError:\n pass\n for warning in w:\n print(repr(warning.message))\n \n- cv_result = pandas.DataFrame(searcher.cv_results_)\n- cv_result.rename(inplace=True, columns={\'mean_test_primary\': \'mean_test_\'+primary_scoring, \'rank_test_primary\': \'rank_test_\'+primary_scoring})\n- cv_result.to_csv(path_or_buf=outfile_result, sep=\'\\t\', header=True, index=False)\n+ if do_train_test_split == \'no\':\n+ # save results\n+ cv_results = pandas.DataFrame(searcher.cv_results_)\n+ cv_results = cv_results[sorted(cv_results.columns)]\n+ cv_results.to_csv(path_or_buf=outfile_result, sep=\'\\t\',\n+ header=True, index=False)\n+\n+ # output test result using best_estimator_\n+ else:\n+ best_estimator_ = searcher.best_estimator_\n+ if isinstance(options[\'scoring\'], collections.Mapping):\n+ is_multimetric = True\n+ else:\n+ is_multimetric = False\n \n- if outfile_estimator:\n- with open(outfile_estimator, \'wb\') as output_handler:\n- pickle.dump(searcher.best_estimator_, output_handler, pickle.HIGHEST_PROTOCOL)\n+ test_score = _score(best_estimator_, X_test,\n+ y_test, options[\'scoring\'],\n+ is_multimetric=is_multimetric)\n+ if not is_multimetric:\n+ test_score = {primary_scoring: test_score}\n+ for key, value in test_score.items():\n+ test_score[key] = [value]\n+ result_df = pandas.DataFrame(test_score)\n+ result_df.to_csv(path_or_buf=outfile_result, 
sep=\'\\t\',\n+ header=True, index=False)\n+\n+ memory.clear(warn=False)\n+\n+ if outfile_object:\n+ with open(outfile_object, \'wb\') as output_handler:\n+ pickle.dump(searcher, output_handler, pickle.HIGHEST_PROTOCOL)\n+\n+\n+if __name__ == \'__main__\':\n+ aparser = argparse.ArgumentParser()\n+ aparser.add_argument("-i", "--inputs", dest="inputs", required=True)\n+ aparser.add_argument("-e", "--estimator", dest="infile_estimator")\n+ aparser.add_argument("-X", "--infile1", dest="infile1")\n+ aparser.add_argument("-y", "--infile2", dest="infile2")\n+ aparser.add_argument("-r", "--outfile_result", dest="outfile_result")\n+ aparser.add_argument("-o", "--outfile_object", dest="outfile_object")\n+ aparser.add_argument("-g", "--groups", dest="groups")\n+ args = aparser.parse_args()\n+\n+ main(args.inputs, args.infile_estimator, args.infile1, args.infile2,\n+ args.outfile_result, outfile_object=args.outfile_object,\n+ groups=args.groups)\n' |
b |
diff -r 2bbbac61e48d -r ec25331946b8 sk_whitelist.json --- a/sk_whitelist.json Sun Dec 30 01:57:11 2018 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 |
[ |
b'@@ -1,761 +0,0 @@\n-{ "SK_NAMES": [\n- "sklearn._ASSUME_FINITE", "sklearn._isotonic._inplace_contiguous_isotonic_regression",\n- "sklearn._isotonic._make_unique", "sklearn.base.BaseEstimator",\n- "sklearn.base.BiclusterMixin", "sklearn.base.ClassifierMixin",\n- "sklearn.base.ClusterMixin", "sklearn.base.DensityMixin",\n- "sklearn.base.MetaEstimatorMixin", "sklearn.base.RegressorMixin",\n- "sklearn.base.TransformerMixin", "sklearn.base._first_and_last_element",\n- "sklearn.base._pprint", "sklearn.base.clone",\n- "sklearn.base.is_classifier", "sklearn.base.is_regressor",\n- "sklearn.clone", "sklearn.cluster.AffinityPropagation",\n- "sklearn.cluster.AgglomerativeClustering", "sklearn.cluster.Birch",\n- "sklearn.cluster.DBSCAN", "sklearn.cluster.FeatureAgglomeration",\n- "sklearn.cluster.KMeans", "sklearn.cluster.MeanShift",\n- "sklearn.cluster.MiniBatchKMeans", "sklearn.cluster.SpectralBiclustering",\n- "sklearn.cluster.SpectralClustering", "sklearn.cluster.SpectralCoclustering",\n- "sklearn.cluster._dbscan_inner.dbscan_inner", "sklearn.cluster._feature_agglomeration.AgglomerationTransform",\n- "sklearn.cluster._hierarchical.WeightedEdge", "sklearn.cluster._hierarchical._get_parents",\n- "sklearn.cluster._hierarchical._hc_get_descendent", "sklearn.cluster._hierarchical.average_merge",\n- "sklearn.cluster._hierarchical.compute_ward_dist", "sklearn.cluster._hierarchical.hc_get_heads",\n- "sklearn.cluster._hierarchical.max_merge", "sklearn.cluster._k_means._assign_labels_array",\n- "sklearn.cluster._k_means._assign_labels_csr", "sklearn.cluster._k_means._centers_dense",\n- "sklearn.cluster._k_means._centers_sparse", "sklearn.cluster._k_means._mini_batch_update_csr",\n- "sklearn.cluster._k_means_elkan.k_means_elkan", "sklearn.cluster.affinity_propagation",\n- "sklearn.cluster.affinity_propagation_.AffinityPropagation", "sklearn.cluster.affinity_propagation_.affinity_propagation",\n- "sklearn.cluster.bicluster.BaseSpectral", 
"sklearn.cluster.bicluster.SpectralBiclustering",\n- "sklearn.cluster.bicluster.SpectralCoclustering", "sklearn.cluster.bicluster._bistochastic_normalize",\n- "sklearn.cluster.bicluster._log_normalize", "sklearn.cluster.bicluster._scale_normalize",\n- "sklearn.cluster.birch.Birch", "sklearn.cluster.birch._CFNode",\n- "sklearn.cluster.birch._CFSubcluster", "sklearn.cluster.birch._iterate_sparse_X",\n- "sklearn.cluster.birch._split_node", "sklearn.cluster.dbscan",\n- "sklearn.cluster.dbscan_.DBSCAN", "sklearn.cluster.dbscan_.dbscan",\n- "sklearn.cluster.estimate_bandwidth", "sklearn.cluster.get_bin_seeds",\n- "sklearn.cluster.hierarchical.AgglomerativeClustering", "sklearn.cluster.hierarchical.FeatureAgglomeration",\n- "sklearn.cluster.hierarchical._TREE_BUILDERS", "sklearn.cluster.hierarchical._average_linkage",\n- "sklearn.cluster.hierarchical._complete_linkage", "sklearn.cluster.hierarchical._fix_connectivity",\n- "sklearn.cluster.hierarchical._hc_cut", "sklearn.cluster.hierarchical.linkage_tree",\n- "sklearn.cluster.hierarchical.ward_tree", "sklearn.cluster.k_means",\n- "sklearn.cluster.k_means_.FLOAT_DTYPES", "sklearn.cluster.k_means_.KMeans",\n- "sklearn.cluster.k_means_.MiniBatchKMeans", "sklearn.cluster.k_means_._init_centroids",\n- "sklearn.cluster.k_means_._k_init", "sklearn.cluster.k_means_._kmeans_single_elkan",\n- "sklearn.cluster.k_means_._kmeans_single_lloyd", "sklearn.cluster.k_means_._labels_inertia",\n- "sklearn.cluster.k_means_._labels_inertia_precompute_dense", "sklearn.cluster.k_means_._mini_batch_convergence",\n- "sklearn.cluster.k_means_._mini_batch_step", "sklearn.cluster.k_means_._tolerance",\n- "sklearn.cluster.k_means_._validate_center_shape", "sklearn.cluster.k_means_.k_means",\n- "sklearn.cluster.k_means_.string_types", "sklearn.cluster.linkage_tree",\n- "sklearn.cluster.mean_shift", "sklearn.cluster.mean_shift_.MeanShift",\n- "sklearn.cluster.mean_shift_._mean_shift_single_seed", "sklearn.cluster'..b'lidation.check_non_negative", 
"sklearn.utils.validation.check_random_state",\n- "sklearn.utils.validation.check_symmetric", "sklearn.utils.validation.column_or_1d",\n- "sklearn.utils.validation.has_fit_parameter", "sklearn.utils.validation.indexable",\n- "sklearn.utils.weight_vector.WeightVector"\n-],\n-\n- "SKR_NAMES": [\n- "skrebate.MultiSURF", "skrebate.MultiSURFstar",\n- "skrebate.ReliefF", "skrebate.SURF",\n- "skrebate.SURFstar", "skrebate.TuRF",\n- "skrebate.multisurf.MultiSURF", "skrebate.multisurfstar.MultiSURFstar",\n- "skrebate.relieff.ReliefF", "skrebate.scoring_utils.MultiSURF_compute_scores",\n- "skrebate.scoring_utils.MultiSURFstar_compute_scores", "skrebate.scoring_utils.ReliefF_compute_scores",\n- "skrebate.scoring_utils.SURF_compute_scores", "skrebate.scoring_utils.SURFstar_compute_scores",\n- "skrebate.scoring_utils.compute_score", "skrebate.scoring_utils.get_row_missing",\n- "skrebate.scoring_utils.ramp_function", "skrebate.surf.SURF",\n- "skrebate.surfstar.SURFstar", "skrebate.turf.TuRF"\n- ],\n-\n- "XGB_NAMES": [\n- "xgboost.Booster", "xgboost.DMatrix",\n- "xgboost.VERSION_FILE", "xgboost.XGBClassifier",\n- "xgboost.XGBModel", "xgboost.XGBRegressor",\n- "xgboost.callback._fmt_metric", "xgboost.callback._get_callback_context",\n- "xgboost.callback.early_stop", "xgboost.callback.print_evaluation",\n- "xgboost.callback.record_evaluation", "xgboost.callback.reset_learning_rate",\n- "xgboost.compat.PANDAS_INSTALLED", "xgboost.compat.PY3",\n- "xgboost.compat.SKLEARN_INSTALLED", "xgboost.compat.STRING_TYPES",\n- "xgboost.compat.py_str", "xgboost.core.Booster",\n- "xgboost.core.CallbackEnv", "xgboost.core.DMatrix",\n- "xgboost.core.EarlyStopException", "xgboost.core.PANDAS_DTYPE_MAPPER",\n- "xgboost.core.PANDAS_INSTALLED", "xgboost.core.PY3",\n- "xgboost.core.STRING_TYPES", "xgboost.core.XGBoostError",\n- "xgboost.core._check_call", "xgboost.core._load_lib",\n- "xgboost.core._maybe_pandas_data", "xgboost.core._maybe_pandas_label",\n- "xgboost.core.c_array", "xgboost.core.c_str",\n- 
"xgboost.core.ctypes2buffer", "xgboost.core.ctypes2numpy",\n- "xgboost.core.from_cstr_to_pystr", "xgboost.core.from_pystr_to_cstr",\n- "xgboost.cv", "xgboost.f",\n- "xgboost.libpath.XGBoostLibraryNotFound", "xgboost.libpath.find_lib_path",\n- "xgboost.plot_importance", "xgboost.plot_tree",\n- "xgboost.plotting._EDGEPAT", "xgboost.plotting._EDGEPAT2",\n- "xgboost.plotting._LEAFPAT", "xgboost.plotting._NODEPAT",\n- "xgboost.plotting._parse_edge", "xgboost.plotting._parse_node",\n- "xgboost.plotting.plot_importance", "xgboost.plotting.plot_tree",\n- "xgboost.plotting.to_graphviz", "xgboost.rabit.DTYPE_ENUM__",\n- "xgboost.rabit.STRING_TYPES", "xgboost.rabit._init_rabit",\n- "xgboost.rabit.allreduce", "xgboost.rabit.broadcast",\n- "xgboost.rabit.finalize", "xgboost.rabit.get_processor_name",\n- "xgboost.rabit.get_rank", "xgboost.rabit.get_world_size",\n- "xgboost.rabit.init", "xgboost.rabit.tracker_print",\n- "xgboost.rabit.version_number", "xgboost.sklearn.SKLEARN_INSTALLED",\n- "xgboost.sklearn.XGBClassifier", "xgboost.sklearn.XGBModel",\n- "xgboost.sklearn.XGBRegressor", "xgboost.sklearn._objective_decorator",\n- "xgboost.to_graphviz", "xgboost.train",\n- "xgboost.training.CVPack", "xgboost.training.SKLEARN_INSTALLED",\n- "xgboost.training.STRING_TYPES", "xgboost.training._train_internal",\n- "xgboost.training.aggcv", "xgboost.training.cv",\n- "xgboost.training.mknfold", "xgboost.training.train"\n- ],\n-\n-\n- "NUMPY_NAMES": [\n- "numpy.core.multiarray._reconstruct", "numpy.ndarray",\n- "numpy.dtype", "numpy.core.multiarray.scalar",\n- "numpy.random.__RandomState_ctor"\n- ],\n-\n- "IMBLEARN_NAMES":[\n- "imblearn.pipeline.Pipeline", "imblearn.over_sampling._random_over_sampler.RandomOverSampler",\n- "imblearn.under_sampling._prototype_selection._edited_nearest_neighbours.EditedNearestNeighbours"\n- ]\n-}\n\\ No newline at end of file\n' |
b |
diff -r 2bbbac61e48d -r ec25331946b8 stacking_ensembles.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/stacking_ensembles.py Tue May 14 18:17:57 2019 -0400 |
[ |
import argparse
import json
import os
import pandas as pd
import pickle
import xgboost
import warnings
from sklearn import (cluster, compose, decomposition, ensemble,
                     feature_extraction, feature_selection,
                     gaussian_process, kernel_approximation, metrics,
                     model_selection, naive_bayes, neighbors,
                     pipeline, preprocessing, svm, linear_model,
                     tree, discriminant_analysis)
from sklearn.model_selection._split import check_cv
from feature_selectors import (DyRFE, DyRFECV,
                               MyPipeline, MyimbPipeline)
from iraps_classifier import (IRAPSCore, IRAPSClassifier,
                              BinarizeTargetClassifier,
                              BinarizeTargetRegressor)
from preprocessors import Z_RandomOverSampler
from utils import load_model, get_cv, get_estimator, get_search_params

from mlxtend.regressor import StackingCVRegressor, StackingRegressor
from mlxtend.classifier import StackingCVClassifier, StackingClassifier


warnings.filterwarnings('ignore')

# Number of parallel jobs, read from the GALAXY_SLOTS environment variable
# (defaults to 1 when it is not set).
N_JOBS = int(os.environ.get('GALAXY_SLOTS', 1))


def main(inputs_path, output_obj, base_paths=None, meta_path=None,
         outfile_params=None):
    """
    Build a stacking ensemble estimator and pickle it to `output_obj`.

    Parameter
    ---------
    inputs_path : str
        File path for Galaxy parameters (JSON)

    output_obj : str
        File path for ensemble estimator output

    base_paths : str or None
        File path or paths concatenated by comma. An empty entry or the
        literal string 'None' means the base estimator at that position
        is built from `params['base_est_builder']` instead of being
        loaded from file. When `base_paths` itself is None, every base
        estimator is built from `params['base_est_builder']`.

    meta_path : str or None
        File path of a saved meta estimator. When falsy, the meta
        estimator is built from `params['meta_estimator']`.

    outfile_params : str or None
        File path for params output
    """
    with open(inputs_path, 'r') as param_handler:
        params = json.load(param_handler)

    # `base_paths` defaults to None; calling .split() on it would raise
    # AttributeError, so fall back to building every base estimator from
    # the Galaxy parameters in that case.
    if base_paths is None:
        base_files = [None] * len(params['base_est_builder'])
    else:
        base_files = base_paths.split(',')

    base_estimators = []
    for idx, base_file in enumerate(base_files):
        if base_file and base_file != 'None':
            with open(base_file, 'rb') as handler:
                model = load_model(handler)
        else:
            estimator_json = (params['base_est_builder'][idx]
                              ['estimator_selector'])
            model = get_estimator(estimator_json)
        base_estimators.append(model)

    if meta_path:
        with open(meta_path, 'rb') as f:
            meta_estimator = load_model(f)
    else:
        estimator_json = params['meta_estimator']['estimator_selector']
        meta_estimator = get_estimator(estimator_json)

    options = params['algo_selection']['options']

    cv_selector = options.pop('cv_selector', None)
    if cv_selector:
        # get_cv returns a (splitter, groups) pair; groups is not needed
        # to construct the estimator.
        splitter, _ = get_cv(cv_selector)
        options['cv'] = splitter
        # set n_jobs
        # NOTE(review): kept inside the cv branch, presumably because only
        # the CV stacking variants take `n_jobs` -- confirm against the
        # original file's indentation.
        options['n_jobs'] = N_JOBS

    estimator_type = params['algo_selection']['estimator_type']

    if estimator_type == 'StackingCVClassifier':
        ensemble_estimator = StackingCVClassifier(
            classifiers=base_estimators,
            meta_classifier=meta_estimator,
            **options)

    elif estimator_type == 'StackingClassifier':
        ensemble_estimator = StackingClassifier(
            classifiers=base_estimators,
            meta_classifier=meta_estimator,
            **options)

    elif estimator_type == 'StackingCVRegressor':
        ensemble_estimator = StackingCVRegressor(
            regressors=base_estimators,
            meta_regressor=meta_estimator,
            **options)

    else:
        # Any other value falls through to the plain stacking regressor.
        ensemble_estimator = StackingRegressor(
            regressors=base_estimators,
            meta_regressor=meta_estimator,
            **options)

    print(ensemble_estimator)
    for base_est in base_estimators:
        print(base_est)

    with open(output_obj, 'wb') as out_handler:
        pickle.dump(ensemble_estimator, out_handler, pickle.HIGHEST_PROTOCOL)

    # Optionally write the ensemble's parameters as a tab-separated table.
    if params['get_params'] and outfile_params:
        results = get_search_params(ensemble_estimator)
        df = pd.DataFrame(results, columns=['', 'Parameter', 'Value'])
        df.to_csv(outfile_params, sep='\t', index=False)


if __name__ == '__main__':
    aparser = argparse.ArgumentParser()
    aparser.add_argument("-b", "--bases", dest="bases")
    aparser.add_argument("-m", "--meta", dest="meta")
    aparser.add_argument("-i", "--inputs", dest="inputs")
    aparser.add_argument("-o", "--outfile", dest="outfile")
    aparser.add_argument("-p", "--outfile_params", dest="outfile_params")
    args = aparser.parse_args()

    main(args.inputs, args.outfile, base_paths=args.bases,
         meta_path=args.meta, outfile_params=args.outfile_params)
b |
diff -r 2bbbac61e48d -r ec25331946b8 test-data/GridSearchCV.zip |
b |
Binary file test-data/GridSearchCV.zip has changed |
b |
diff -r 2bbbac61e48d -r ec25331946b8 test-data/LinearRegression01.zip |
b |
Binary file test-data/LinearRegression01.zip has changed |
b |
diff -r 2bbbac61e48d -r ec25331946b8 test-data/LinearRegression02.zip |
b |
Binary file test-data/LinearRegression02.zip has changed |
b |
diff -r 2bbbac61e48d -r ec25331946b8 test-data/RFE.zip |
b |
Binary file test-data/RFE.zip has changed |
b |
diff -r 2bbbac61e48d -r ec25331946b8 test-data/RandomForestClassifier.zip |
b |
Binary file test-data/RandomForestClassifier.zip has changed |
b |
diff -r 2bbbac61e48d -r ec25331946b8 test-data/RandomForestRegressor01.zip |
b |
Binary file test-data/RandomForestRegressor01.zip has changed |
b |
diff -r 2bbbac61e48d -r ec25331946b8 test-data/StackingCVRegressor01.zip |
b |
Binary file test-data/StackingCVRegressor01.zip has changed |
b |
diff -r 2bbbac61e48d -r ec25331946b8 test-data/StackingCVRegressor02.zip |
b |
Binary file test-data/StackingCVRegressor02.zip has changed |
b |
diff -r 2bbbac61e48d -r ec25331946b8 test-data/XGBRegressor01.zip |
b |
Binary file test-data/XGBRegressor01.zip has changed |
b |
diff -r 2bbbac61e48d -r ec25331946b8 test-data/best_estimator_.zip |
b |
Binary file test-data/best_estimator_.zip has changed |
b |
diff -r 2bbbac61e48d -r ec25331946b8 test-data/best_params_.txt --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/best_params_.txt Tue May 14 18:17:57 2019 -0400 |
b |
@@ -0,0 +1,1 @@ +{'estimator__n_estimators': 100} \ No newline at end of file |
b |
diff -r 2bbbac61e48d -r ec25331946b8 test-data/best_score_.tabular --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/best_score_.tabular Tue May 14 18:17:57 2019 -0400 |
b |
@@ -0,0 +1,2 @@ +best_score_ +0.7976348550293088 |
b |
diff -r 2bbbac61e48d -r ec25331946b8 test-data/feature_importances_.tabular --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/feature_importances_.tabular Tue May 14 18:17:57 2019 -0400 |
b |
@@ -0,0 +1,11 @@ +feature_importances_ +0.15959252 +0.20373514 +0.22071308 +0.06281833 +0.098471984 +0.06960951 +0.13073005 +0.027164686 +0.022071308 +0.0050933785 |
b |
diff -r 2bbbac61e48d -r ec25331946b8 test-data/feature_selection_result13 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/feature_selection_result13 Tue May 14 18:17:57 2019 -0400 |
b |
@@ -0,0 +1,262 @@ +temp_1 average forecast_noaa friend +69.0 69.7 65.0 88.0 +59.0 58.1 57.0 66.0 +88.0 77.3 75.0 70.0 +65.0 64.7 63.0 58.0 +50.0 47.5 44.0 58.0 +51.0 48.2 45.0 63.0 +52.0 48.6 45.0 41.0 +78.0 76.7 75.0 66.0 +35.0 45.2 43.0 38.0 +40.0 46.1 45.0 36.0 +47.0 45.3 41.0 58.0 +72.0 76.3 76.0 88.0 +76.0 74.4 73.0 72.0 +39.0 45.3 45.0 46.0 +78.0 72.2 70.0 84.0 +71.0 67.3 63.0 85.0 +48.0 47.7 44.0 61.0 +72.0 77.0 77.0 68.0 +57.0 54.7 50.0 70.0 +40.0 45.1 44.0 39.0 +54.0 47.6 47.0 53.0 +58.0 53.2 52.0 71.0 +68.0 58.6 58.0 54.0 +65.0 55.3 55.0 65.0 +47.0 48.8 46.0 51.0 +44.0 45.6 43.0 42.0 +64.0 67.1 64.0 69.0 +62.0 57.1 57.0 67.0 +66.0 65.7 64.0 74.0 +70.0 71.8 67.0 90.0 +57.0 54.2 54.0 70.0 +50.0 50.5 46.0 57.0 +55.0 51.8 49.0 71.0 +55.0 49.5 46.0 67.0 +42.0 45.2 41.0 47.0 +65.0 60.1 57.0 41.0 +63.0 65.6 63.0 73.0 +48.0 47.3 45.0 28.0 +42.0 46.3 44.0 62.0 +51.0 46.2 45.0 38.0 +64.0 68.0 65.0 64.0 +75.0 74.6 74.0 63.0 +52.0 46.7 42.0 39.0 +67.0 68.6 66.0 80.0 +68.0 68.7 65.0 56.0 +54.0 55.0 53.0 42.0 +62.0 56.8 52.0 70.0 +76.0 76.1 76.0 61.0 +73.0 73.1 71.0 93.0 +52.0 50.3 50.0 35.0 +70.0 73.9 71.0 68.0 +77.0 77.4 75.0 62.0 +60.0 56.6 52.0 72.0 +52.0 53.3 50.0 54.0 +79.0 75.0 71.0 85.0 +76.0 57.2 53.0 74.0 +66.0 66.5 64.0 85.0 +57.0 61.8 58.0 62.0 +66.0 57.4 57.0 60.0 +61.0 58.4 58.0 41.0 +55.0 53.1 52.0 65.0 +48.0 48.1 46.0 54.0 +49.0 49.2 46.0 63.0 +65.0 66.7 64.0 73.0 +60.0 62.5 58.0 56.0 +56.0 53.0 53.0 36.0 +59.0 57.4 56.0 44.0 +44.0 45.7 41.0 35.0 +82.0 63.2 62.0 83.0 +64.0 67.0 65.0 76.0 +43.0 45.5 41.0 46.0 +64.0 55.7 51.0 57.0 +63.0 52.7 49.0 49.0 +70.0 70.6 67.0 79.0 +71.0 52.4 48.0 42.0 +76.0 73.5 69.0 85.0 +68.0 62.1 58.0 55.0 +39.0 45.3 44.0 39.0 +71.0 70.7 70.0 52.0 +69.0 71.7 68.0 89.0 +74.0 71.5 71.0 82.0 +81.0 64.1 62.0 81.0 +51.0 49.3 49.0 34.0 +45.0 46.8 44.0 61.0 +87.0 76.8 73.0 73.0 +71.0 73.8 71.0 86.0 +55.0 60.3 56.0 77.0 +80.0 76.9 72.0 81.0 +67.0 69.0 65.0 76.0 +61.0 61.4 60.0 78.0 +46.0 46.6 43.0 65.0 +39.0 45.1 42.0 51.0 +67.0 68.3 
67.0 61.0 +52.0 47.8 43.0 50.0 +67.0 69.8 68.0 87.0 +75.0 71.2 67.0 77.0 +68.0 73.3 73.0 79.0 +92.0 68.2 65.0 71.0 +67.0 72.8 69.0 56.0 +44.0 45.8 43.0 56.0 +61.0 61.0 56.0 73.0 +65.0 53.4 49.0 41.0 +68.0 73.0 72.0 70.0 +87.0 62.1 62.0 69.0 +117.0 54.8 51.0 62.0 +80.0 76.4 75.0 66.0 +57.0 51.0 47.0 46.0 +67.0 63.6 61.0 68.0 +58.0 54.0 51.0 56.0 +65.0 56.2 53.0 41.0 +52.0 48.6 45.0 47.0 +59.0 55.3 52.0 39.0 +57.0 53.9 53.0 35.0 +81.0 59.2 56.0 66.0 +75.0 77.1 76.0 75.0 +76.0 77.4 76.0 95.0 +57.0 64.8 61.0 53.0 +69.0 74.2 72.0 86.0 +77.0 66.8 66.0 64.0 +55.0 49.9 47.0 55.0 +49.0 46.8 45.0 53.0 +54.0 52.7 48.0 57.0 +55.0 51.2 49.0 42.0 +56.0 55.6 53.0 45.0 +68.0 74.6 72.0 77.0 +54.0 53.4 49.0 44.0 +67.0 69.0 69.0 87.0 +49.0 46.9 45.0 33.0 +49.0 49.1 47.0 45.0 +56.0 48.5 48.0 49.0 +73.0 71.0 66.0 78.0 +66.0 66.4 65.0 60.0 +69.0 66.5 66.0 62.0 +82.0 64.5 64.0 65.0 +90.0 76.7 75.0 65.0 +51.0 50.7 49.0 43.0 +77.0 57.1 57.0 41.0 +60.0 61.4 58.0 58.0 +74.0 72.8 71.0 87.0 +85.0 77.2 73.0 74.0 +68.0 62.8 61.0 64.0 +56.0 49.5 46.0 37.0 +71.0 56.2 55.0 45.0 +62.0 59.5 57.0 40.0 +83.0 77.3 76.0 76.0 +64.0 65.4 62.0 56.0 +56.0 48.4 45.0 54.0 +41.0 45.1 42.0 31.0 +65.0 66.2 66.0 67.0 +65.0 53.7 49.0 38.0 +40.0 46.0 46.0 41.0 +45.0 45.6 43.0 29.0 +52.0 48.4 48.0 58.0 +63.0 51.7 50.0 63.0 +52.0 47.6 47.0 44.0 +60.0 57.9 55.0 77.0 +81.0 75.7 73.0 89.0 +75.0 75.8 74.0 77.0 +59.0 51.4 48.0 64.0 +73.0 77.1 77.0 94.0 +75.0 77.3 73.0 66.0 +60.0 58.5 56.0 59.0 +75.0 71.3 68.0 56.0 +59.0 57.6 56.0 40.0 +53.0 49.1 47.0 56.0 +79.0 77.2 76.0 60.0 +57.0 52.1 49.0 46.0 +75.0 67.6 64.0 77.0 +71.0 69.4 67.0 81.0 +53.0 50.2 50.0 42.0 +46.0 48.8 48.0 56.0 +81.0 76.9 72.0 70.0 +49.0 48.9 47.0 29.0 +57.0 48.4 44.0 34.0 +60.0 58.8 54.0 53.0 +67.0 73.7 72.0 64.0 +61.0 64.1 62.0 60.0 +66.0 69.5 66.0 85.0 +64.0 51.9 50.0 55.0 +66.0 65.7 62.0 49.0 +64.0 52.2 52.0 49.0 +71.0 65.2 61.0 56.0 +75.0 63.8 62.0 60.0 +48.0 46.4 46.0 47.0 +53.0 52.5 48.0 70.0 +49.0 47.1 46.0 65.0 +85.0 68.5 67.0 81.0 +62.0 49.4 
48.0 30.0 +50.0 47.0 42.0 58.0 +58.0 55.9 51.0 39.0 +72.0 77.2 74.0 95.0 +55.0 50.7 50.0 34.0 +74.0 72.3 70.0 91.0 +85.0 77.3 77.0 77.0 +73.0 77.3 77.0 93.0 +52.0 47.4 44.0 39.0 +67.0 67.6 64.0 62.0 +45.0 45.1 45.0 35.0 +46.0 47.2 46.0 41.0 +66.0 60.6 60.0 57.0 +71.0 77.0 75.0 86.0 +70.0 69.3 66.0 79.0 +58.0 49.9 46.0 53.0 +72.0 77.1 76.0 65.0 +74.0 75.4 74.0 71.0 +65.0 64.5 63.0 49.0 +77.0 58.8 55.0 39.0 +59.0 50.9 49.0 35.0 +45.0 45.7 41.0 61.0 +53.0 50.5 49.0 46.0 +53.0 54.9 54.0 72.0 +79.0 77.3 73.0 79.0 +49.0 49.0 44.0 44.0 +63.0 62.9 62.0 78.0 +69.0 56.5 54.0 45.0 +60.0 50.8 47.0 46.0 +64.0 62.5 60.0 73.0 +79.0 71.0 66.0 64.0 +55.0 47.0 43.0 58.0 +73.0 56.0 54.0 41.0 +60.0 59.1 57.0 62.0 +67.0 70.2 67.0 77.0 +42.0 45.2 45.0 58.0 +60.0 65.0 62.0 55.0 +57.0 49.8 47.0 30.0 +35.0 45.2 44.0 36.0 +75.0 70.3 66.0 84.0 +61.0 51.1 48.0 65.0 +51.0 50.6 46.0 59.0 +71.0 71.9 67.0 70.0 +74.0 75.3 74.0 71.0 +48.0 45.4 44.0 42.0 +74.0 74.9 70.0 60.0 +76.0 70.8 68.0 57.0 +58.0 51.6 47.0 37.0 +51.0 50.4 48.0 43.0 +72.0 72.6 68.0 78.0 +76.0 67.2 64.0 74.0 +52.0 47.9 47.0 60.0 +53.0 48.2 48.0 53.0 +65.0 69.1 65.0 83.0 +58.0 58.1 58.0 43.0 +77.0 75.6 74.0 56.0 +61.0 52.9 51.0 35.0 +67.0 65.3 64.0 54.0 +54.0 49.3 46.0 58.0 +79.0 67.4 65.0 58.0 +77.0 64.3 63.0 67.0 +71.0 67.7 64.0 55.0 +58.0 57.7 54.0 61.0 +68.0 55.9 55.0 56.0 +40.0 45.4 45.0 49.0 +80.0 77.3 75.0 71.0 +74.0 62.3 59.0 61.0 +57.0 45.5 42.0 57.0 +52.0 47.8 43.0 57.0 +71.0 75.1 71.0 95.0 +49.0 53.6 49.0 70.0 +89.0 59.0 59.0 61.0 +60.0 60.2 56.0 78.0 +59.0 58.3 58.0 40.0 |
b |
diff -r 2bbbac61e48d -r ec25331946b8 test-data/final_estimator.zip |
b |
Binary file test-data/final_estimator.zip has changed |
b |
diff -r 2bbbac61e48d -r ec25331946b8 test-data/get_params.tabular --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/get_params.tabular Tue May 14 18:17:57 2019 -0400 |
b |
@@ -0,0 +1,6 @@ + Parameter Value +@ copy_X copy_X: True +@ fit_intercept fit_intercept: True +* n_jobs n_jobs: 1 +@ normalize normalize: False + Note: @, params eligible for search in searchcv tool. |
b |
diff -r 2bbbac61e48d -r ec25331946b8 test-data/get_params01.tabular --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/get_params01.tabular Tue May 14 18:17:57 2019 -0400 |
[ |
@@ -0,0 +1,30 @@ + Parameter Value +* memory memory: None +* steps "steps: [('robustscaler', RobustScaler(copy=True, quantile_range=(25.0, 75.0), with_centering=True, + with_scaling=True)), ('selectkbest', SelectKBest(k=10, score_func=<function f_classif at 0x111ef0158>)), ('svr', SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, + gamma='auto_deprecated', kernel='linear', max_iter=-1, shrinking=True, + tol=0.001, verbose=False))]" +@ robustscaler "robustscaler: RobustScaler(copy=True, quantile_range=(25.0, 75.0), with_centering=True, + with_scaling=True)" +@ selectkbest selectkbest: SelectKBest(k=10, score_func=<function f_classif at 0x111ef0158>) +@ svr "svr: SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, + gamma='auto_deprecated', kernel='linear', max_iter=-1, shrinking=True, + tol=0.001, verbose=False)" +@ robustscaler__copy robustscaler__copy: True +@ robustscaler__quantile_range robustscaler__quantile_range: (25.0, 75.0) +@ robustscaler__with_centering robustscaler__with_centering: True +@ robustscaler__with_scaling robustscaler__with_scaling: True +@ selectkbest__k selectkbest__k: 10 +@ selectkbest__score_func selectkbest__score_func: <function f_classif at 0x111ef0158> +@ svr__C svr__C: 1.0 +@ svr__cache_size svr__cache_size: 200 +@ svr__coef0 svr__coef0: 0.0 +@ svr__degree svr__degree: 3 +@ svr__epsilon svr__epsilon: 0.1 +@ svr__gamma svr__gamma: 'auto_deprecated' +@ svr__kernel svr__kernel: 'linear' +@ svr__max_iter svr__max_iter: -1 +@ svr__shrinking svr__shrinking: True +@ svr__tol svr__tol: 0.001 +* svr__verbose svr__verbose: False + Note: @, searchable params in searchcv too. |
b |
diff -r 2bbbac61e48d -r ec25331946b8 test-data/get_params02.tabular --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/get_params02.tabular Tue May 14 18:17:57 2019 -0400 |
[ |
@@ -0,0 +1,33 @@ + Parameter Value +* memory memory: None +* steps "steps: [('robustscaler', RobustScaler(copy=True, quantile_range=(25.0, 75.0), with_centering=True, + with_scaling=True)), ('lassocv', LassoCV(alphas=None, copy_X=True, cv='warn', eps=0.001, fit_intercept=True, + max_iter=1000, n_alphas=100, n_jobs=1, normalize=False, positive=False, + precompute='auto', random_state=None, selection='cyclic', tol=0.0001, + verbose=False))]" +@ robustscaler "robustscaler: RobustScaler(copy=True, quantile_range=(25.0, 75.0), with_centering=True, + with_scaling=True)" +@ lassocv "lassocv: LassoCV(alphas=None, copy_X=True, cv='warn', eps=0.001, fit_intercept=True, + max_iter=1000, n_alphas=100, n_jobs=1, normalize=False, positive=False, + precompute='auto', random_state=None, selection='cyclic', tol=0.0001, + verbose=False)" +@ robustscaler__copy robustscaler__copy: True +@ robustscaler__quantile_range robustscaler__quantile_range: (25.0, 75.0) +@ robustscaler__with_centering robustscaler__with_centering: True +@ robustscaler__with_scaling robustscaler__with_scaling: True +@ lassocv__alphas lassocv__alphas: None +@ lassocv__copy_X lassocv__copy_X: True +@ lassocv__cv lassocv__cv: 'warn' +@ lassocv__eps lassocv__eps: 0.001 +@ lassocv__fit_intercept lassocv__fit_intercept: True +@ lassocv__max_iter lassocv__max_iter: 1000 +@ lassocv__n_alphas lassocv__n_alphas: 100 +* lassocv__n_jobs lassocv__n_jobs: 1 +@ lassocv__normalize lassocv__normalize: False +@ lassocv__positive lassocv__positive: False +@ lassocv__precompute lassocv__precompute: 'auto' +@ lassocv__random_state lassocv__random_state: None +@ lassocv__selection lassocv__selection: 'cyclic' +@ lassocv__tol lassocv__tol: 0.0001 +* lassocv__verbose lassocv__verbose: False + Note: @, searchable params in searchcv too. |
b |
diff -r 2bbbac61e48d -r ec25331946b8 test-data/get_params03.tabular --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/get_params03.tabular Tue May 14 18:17:57 2019 -0400 |
[ |
@@ -0,0 +1,43 @@ + Parameter Value +* memory memory: None +* steps "steps: [('robustscaler', RobustScaler(copy=True, quantile_range=(25.0, 75.0), with_centering=True, + with_scaling=True)), ('xgbclassifier', XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1, + colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0, + max_depth=3, min_child_weight=1, missing=nan, n_estimators=100, + n_jobs=1, nthread=None, objective='binary:logistic', random_state=0, + reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None, + silent=True, subsample=1))]" +@ robustscaler "robustscaler: RobustScaler(copy=True, quantile_range=(25.0, 75.0), with_centering=True, + with_scaling=True)" +@ xgbclassifier "xgbclassifier: XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1, + colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0, + max_depth=3, min_child_weight=1, missing=nan, n_estimators=100, + n_jobs=1, nthread=None, objective='binary:logistic', random_state=0, + reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None, + silent=True, subsample=1)" +@ robustscaler__copy robustscaler__copy: True +@ robustscaler__quantile_range robustscaler__quantile_range: (25.0, 75.0) +@ robustscaler__with_centering robustscaler__with_centering: True +@ robustscaler__with_scaling robustscaler__with_scaling: True +@ xgbclassifier__base_score xgbclassifier__base_score: 0.5 +@ xgbclassifier__booster xgbclassifier__booster: 'gbtree' +@ xgbclassifier__colsample_bylevel xgbclassifier__colsample_bylevel: 1 +@ xgbclassifier__colsample_bytree xgbclassifier__colsample_bytree: 1 +@ xgbclassifier__gamma xgbclassifier__gamma: 0 +@ xgbclassifier__learning_rate xgbclassifier__learning_rate: 0.1 +@ xgbclassifier__max_delta_step xgbclassifier__max_delta_step: 0 +@ xgbclassifier__max_depth xgbclassifier__max_depth: 3 +@ xgbclassifier__min_child_weight xgbclassifier__min_child_weight: 1 +@ xgbclassifier__missing xgbclassifier__missing: nan +@ xgbclassifier__n_estimators 
xgbclassifier__n_estimators: 100 +* xgbclassifier__n_jobs xgbclassifier__n_jobs: 1 +* xgbclassifier__nthread xgbclassifier__nthread: None +@ xgbclassifier__objective xgbclassifier__objective: 'binary:logistic' +@ xgbclassifier__random_state xgbclassifier__random_state: 0 +@ xgbclassifier__reg_alpha xgbclassifier__reg_alpha: 0 +@ xgbclassifier__reg_lambda xgbclassifier__reg_lambda: 1 +@ xgbclassifier__scale_pos_weight xgbclassifier__scale_pos_weight: 1 +@ xgbclassifier__seed xgbclassifier__seed: None +@ xgbclassifier__silent xgbclassifier__silent: True +@ xgbclassifier__subsample xgbclassifier__subsample: 1 + Note: @, searchable params in searchcv too. |
b |
diff -r 2bbbac61e48d -r ec25331946b8 test-data/get_params04.tabular --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/get_params04.tabular Tue May 14 18:17:57 2019 -0400 |
[ |
@@ -0,0 +1,39 @@ + Parameter Value +* memory memory: None +* steps "steps: [('selectfrommodel', SelectFromModel(estimator=AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None, + learning_rate=1.0, n_estimators=50, random_state=None), + max_features=None, norm_order=1, prefit=False, threshold=None)), ('linearsvc', LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True, + intercept_scaling=1, loss='squared_hinge', max_iter=1000, + multi_class='ovr', penalty='l2', random_state=None, tol=0.0001, + verbose=0))]" +@ selectfrommodel "selectfrommodel: SelectFromModel(estimator=AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None, + learning_rate=1.0, n_estimators=50, random_state=None), + max_features=None, norm_order=1, prefit=False, threshold=None)" +@ linearsvc "linearsvc: LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True, + intercept_scaling=1, loss='squared_hinge', max_iter=1000, + multi_class='ovr', penalty='l2', random_state=None, tol=0.0001, + verbose=0)" +@ selectfrommodel__estimator__algorithm selectfrommodel__estimator__algorithm: 'SAMME.R' +@ selectfrommodel__estimator__base_estimator selectfrommodel__estimator__base_estimator: None +@ selectfrommodel__estimator__learning_rate selectfrommodel__estimator__learning_rate: 1.0 +@ selectfrommodel__estimator__n_estimators selectfrommodel__estimator__n_estimators: 50 +@ selectfrommodel__estimator__random_state selectfrommodel__estimator__random_state: None +@ selectfrommodel__estimator "selectfrommodel__estimator: AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None, + learning_rate=1.0, n_estimators=50, random_state=None)" +@ selectfrommodel__max_features selectfrommodel__max_features: None +@ selectfrommodel__norm_order selectfrommodel__norm_order: 1 +@ selectfrommodel__prefit selectfrommodel__prefit: False +@ selectfrommodel__threshold selectfrommodel__threshold: None +@ linearsvc__C linearsvc__C: 1.0 +@ linearsvc__class_weight linearsvc__class_weight: None +@ 
linearsvc__dual linearsvc__dual: True +@ linearsvc__fit_intercept linearsvc__fit_intercept: True +@ linearsvc__intercept_scaling linearsvc__intercept_scaling: 1 +@ linearsvc__loss linearsvc__loss: 'squared_hinge' +@ linearsvc__max_iter linearsvc__max_iter: 1000 +@ linearsvc__multi_class linearsvc__multi_class: 'ovr' +@ linearsvc__penalty linearsvc__penalty: 'l2' +@ linearsvc__random_state linearsvc__random_state: None +@ linearsvc__tol linearsvc__tol: 0.0001 +* linearsvc__verbose linearsvc__verbose: 0 + Note: @, searchable params in searchcv too. |
b |
diff -r 2bbbac61e48d -r ec25331946b8 test-data/get_params05.tabular --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/get_params05.tabular Tue May 14 18:17:57 2019 -0400 |
[ |
@@ -0,0 +1,31 @@ + Parameter Value +* memory memory: None +* steps "steps: [('randomforestregressor', RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None, + max_features='auto', max_leaf_nodes=None, + min_impurity_decrease=0.0, min_impurity_split=None, + min_samples_leaf=1, min_samples_split=2, + min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1, + oob_score=False, random_state=42, verbose=0, warm_start=False))]" +@ randomforestregressor "randomforestregressor: RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None, + max_features='auto', max_leaf_nodes=None, + min_impurity_decrease=0.0, min_impurity_split=None, + min_samples_leaf=1, min_samples_split=2, + min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1, + oob_score=False, random_state=42, verbose=0, warm_start=False)" +@ randomforestregressor__bootstrap randomforestregressor__bootstrap: True +@ randomforestregressor__criterion randomforestregressor__criterion: 'mse' +@ randomforestregressor__max_depth randomforestregressor__max_depth: None +@ randomforestregressor__max_features randomforestregressor__max_features: 'auto' +@ randomforestregressor__max_leaf_nodes randomforestregressor__max_leaf_nodes: None +@ randomforestregressor__min_impurity_decrease randomforestregressor__min_impurity_decrease: 0.0 +@ randomforestregressor__min_impurity_split randomforestregressor__min_impurity_split: None +@ randomforestregressor__min_samples_leaf randomforestregressor__min_samples_leaf: 1 +@ randomforestregressor__min_samples_split randomforestregressor__min_samples_split: 2 +@ randomforestregressor__min_weight_fraction_leaf randomforestregressor__min_weight_fraction_leaf: 0.0 +@ randomforestregressor__n_estimators randomforestregressor__n_estimators: 100 +* randomforestregressor__n_jobs randomforestregressor__n_jobs: 1 +@ randomforestregressor__oob_score randomforestregressor__oob_score: False +@ randomforestregressor__random_state randomforestregressor__random_state: 42 +* 
randomforestregressor__verbose randomforestregressor__verbose: 0 +@ randomforestregressor__warm_start randomforestregressor__warm_start: False + Note: @, searchable params in searchcv too. |
b |
diff -r 2bbbac61e48d -r ec25331946b8 test-data/get_params06.tabular --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/get_params06.tabular Tue May 14 18:17:57 2019 -0400 |
[ |
@@ -0,0 +1,22 @@ + Parameter Value +* memory memory: None +* steps "steps: [('pca', PCA(copy=True, iterated_power='auto', n_components=None, random_state=None, + svd_solver='auto', tol=0.0, whiten=False)), ('adaboostregressor', AdaBoostRegressor(base_estimator=None, learning_rate=1.0, loss='linear', + n_estimators=50, random_state=None))]" +@ pca "pca: PCA(copy=True, iterated_power='auto', n_components=None, random_state=None, + svd_solver='auto', tol=0.0, whiten=False)" +@ adaboostregressor "adaboostregressor: AdaBoostRegressor(base_estimator=None, learning_rate=1.0, loss='linear', + n_estimators=50, random_state=None)" +@ pca__copy pca__copy: True +@ pca__iterated_power pca__iterated_power: 'auto' +@ pca__n_components pca__n_components: None +@ pca__random_state pca__random_state: None +@ pca__svd_solver pca__svd_solver: 'auto' +@ pca__tol pca__tol: 0.0 +@ pca__whiten pca__whiten: False +@ adaboostregressor__base_estimator adaboostregressor__base_estimator: None +@ adaboostregressor__learning_rate adaboostregressor__learning_rate: 1.0 +@ adaboostregressor__loss adaboostregressor__loss: 'linear' +@ adaboostregressor__n_estimators adaboostregressor__n_estimators: 50 +@ adaboostregressor__random_state adaboostregressor__random_state: None + Note: @, searchable params in searchcv too. |
b |
diff -r 2bbbac61e48d -r ec25331946b8 test-data/get_params07.tabular --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/get_params07.tabular Tue May 14 18:17:57 2019 -0400 |
[ |
@@ -0,0 +1,16 @@ + Parameter Value +* memory memory: None +* steps "steps: [('rbfsampler', RBFSampler(gamma=2.0, n_components=10, random_state=None)), ('adaboostclassifier', AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None, + learning_rate=1.0, n_estimators=50, random_state=None))]" +@ rbfsampler rbfsampler: RBFSampler(gamma=2.0, n_components=10, random_state=None) +@ adaboostclassifier "adaboostclassifier: AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None, + learning_rate=1.0, n_estimators=50, random_state=None)" +@ rbfsampler__gamma rbfsampler__gamma: 2.0 +@ rbfsampler__n_components rbfsampler__n_components: 10 +@ rbfsampler__random_state rbfsampler__random_state: None +@ adaboostclassifier__algorithm adaboostclassifier__algorithm: 'SAMME.R' +@ adaboostclassifier__base_estimator adaboostclassifier__base_estimator: None +@ adaboostclassifier__learning_rate adaboostclassifier__learning_rate: 1.0 +@ adaboostclassifier__n_estimators adaboostclassifier__n_estimators: 50 +@ adaboostclassifier__random_state adaboostclassifier__random_state: None + Note: @, searchable params in searchcv too. |
b |
diff -r 2bbbac61e48d -r ec25331946b8 test-data/get_params08.tabular --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/get_params08.tabular Tue May 14 18:17:57 2019 -0400 |
[ |
@@ -0,0 +1,24 @@ + Parameter Value +* memory memory: None +* steps "steps: [('featureagglomeration', FeatureAgglomeration(affinity='euclidean', compute_full_tree='auto', + connectivity=None, linkage='ward', memory=None, n_clusters=3, + pooling_func=<function mean at 0x1123f1620>)), ('adaboostclassifier', AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None, + learning_rate=1.0, n_estimators=50, random_state=None))]" +@ featureagglomeration "featureagglomeration: FeatureAgglomeration(affinity='euclidean', compute_full_tree='auto', + connectivity=None, linkage='ward', memory=None, n_clusters=3, + pooling_func=<function mean at 0x1123f1620>)" +@ adaboostclassifier "adaboostclassifier: AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None, + learning_rate=1.0, n_estimators=50, random_state=None)" +@ featureagglomeration__affinity featureagglomeration__affinity: 'euclidean' +@ featureagglomeration__compute_full_tree featureagglomeration__compute_full_tree: 'auto' +@ featureagglomeration__connectivity featureagglomeration__connectivity: None +@ featureagglomeration__linkage featureagglomeration__linkage: 'ward' +* featureagglomeration__memory featureagglomeration__memory: None +@ featureagglomeration__n_clusters featureagglomeration__n_clusters: 3 +@ featureagglomeration__pooling_func featureagglomeration__pooling_func: <function mean at 0x1123f1620> +@ adaboostclassifier__algorithm adaboostclassifier__algorithm: 'SAMME.R' +@ adaboostclassifier__base_estimator adaboostclassifier__base_estimator: None +@ adaboostclassifier__learning_rate adaboostclassifier__learning_rate: 1.0 +@ adaboostclassifier__n_estimators adaboostclassifier__n_estimators: 50 +@ adaboostclassifier__random_state adaboostclassifier__random_state: None + Note: @, searchable params in searchcv too. |
b |
diff -r 2bbbac61e48d -r ec25331946b8 test-data/get_params09.tabular --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/get_params09.tabular Tue May 14 18:17:57 2019 -0400 |
[ |
@@ -0,0 +1,39 @@ + Parameter Value +* memory memory: None +* steps "steps: [('relieff', ReliefF(discrete_threshold=10, n_features_to_select=3, n_jobs=1, + n_neighbors=100, verbose=False)), ('randomforestregressor', RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None, + max_features='auto', max_leaf_nodes=None, + min_impurity_decrease=0.0, min_impurity_split=None, + min_samples_leaf=1, min_samples_split=2, + min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=1, + oob_score=False, random_state=None, verbose=0, warm_start=False))]" +@ relieff "relieff: ReliefF(discrete_threshold=10, n_features_to_select=3, n_jobs=1, + n_neighbors=100, verbose=False)" +@ randomforestregressor "randomforestregressor: RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None, + max_features='auto', max_leaf_nodes=None, + min_impurity_decrease=0.0, min_impurity_split=None, + min_samples_leaf=1, min_samples_split=2, + min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=1, + oob_score=False, random_state=None, verbose=0, warm_start=False)" +@ relieff__discrete_threshold relieff__discrete_threshold: 10 +@ relieff__n_features_to_select relieff__n_features_to_select: 3 +* relieff__n_jobs relieff__n_jobs: 1 +@ relieff__n_neighbors relieff__n_neighbors: 100 +* relieff__verbose relieff__verbose: False +@ randomforestregressor__bootstrap randomforestregressor__bootstrap: True +@ randomforestregressor__criterion randomforestregressor__criterion: 'mse' +@ randomforestregressor__max_depth randomforestregressor__max_depth: None +@ randomforestregressor__max_features randomforestregressor__max_features: 'auto' +@ randomforestregressor__max_leaf_nodes randomforestregressor__max_leaf_nodes: None +@ randomforestregressor__min_impurity_decrease randomforestregressor__min_impurity_decrease: 0.0 +@ randomforestregressor__min_impurity_split randomforestregressor__min_impurity_split: None +@ randomforestregressor__min_samples_leaf 
randomforestregressor__min_samples_leaf: 1 +@ randomforestregressor__min_samples_split randomforestregressor__min_samples_split: 2 +@ randomforestregressor__min_weight_fraction_leaf randomforestregressor__min_weight_fraction_leaf: 0.0 +@ randomforestregressor__n_estimators randomforestregressor__n_estimators: 'warn' +* randomforestregressor__n_jobs randomforestregressor__n_jobs: 1 +@ randomforestregressor__oob_score randomforestregressor__oob_score: False +@ randomforestregressor__random_state randomforestregressor__random_state: None +* randomforestregressor__verbose randomforestregressor__verbose: 0 +@ randomforestregressor__warm_start randomforestregressor__warm_start: False + Note: @, searchable params in searchcv too. |
b |
diff -r 2bbbac61e48d -r ec25331946b8 test-data/get_params10.tabular --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/get_params10.tabular Tue May 14 18:17:57 2019 -0400 |
[ |
@@ -0,0 +1,12 @@ + Parameter Value +* memory memory: None +* steps "steps: [('adaboostregressor', AdaBoostRegressor(base_estimator=None, learning_rate=1.0, loss='linear', + n_estimators=50, random_state=None))]" +@ adaboostregressor "adaboostregressor: AdaBoostRegressor(base_estimator=None, learning_rate=1.0, loss='linear', + n_estimators=50, random_state=None)" +@ adaboostregressor__base_estimator adaboostregressor__base_estimator: None +@ adaboostregressor__learning_rate adaboostregressor__learning_rate: 1.0 +@ adaboostregressor__loss adaboostregressor__loss: 'linear' +@ adaboostregressor__n_estimators adaboostregressor__n_estimators: 50 +@ adaboostregressor__random_state adaboostregressor__random_state: None + Note: @, params eligible for search in searchcv tool. |
b |
diff -r 2bbbac61e48d -r ec25331946b8 test-data/get_params11.tabular --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/get_params11.tabular Tue May 14 18:17:57 2019 -0400 |
[ |
@@ -0,0 +1,46 @@ + Parameter Value +* memory memory: None +* steps "steps: [('editednearestneighbours', EditedNearestNeighbours(kind_sel='all', n_jobs=1, n_neighbors=3, + random_state=None, ratio=None, return_indices=False, + sampling_strategy='auto')), ('randomforestclassifier', RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini', + max_depth=None, max_features='auto', max_leaf_nodes=None, + min_impurity_decrease=0.0, min_impurity_split=None, + min_samples_leaf=1, min_samples_split=2, + min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=1, + oob_score=False, random_state=None, verbose=0, + warm_start=False))]" +@ editednearestneighbours "editednearestneighbours: EditedNearestNeighbours(kind_sel='all', n_jobs=1, n_neighbors=3, + random_state=None, ratio=None, return_indices=False, + sampling_strategy='auto')" +@ randomforestclassifier "randomforestclassifier: RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini', + max_depth=None, max_features='auto', max_leaf_nodes=None, + min_impurity_decrease=0.0, min_impurity_split=None, + min_samples_leaf=1, min_samples_split=2, + min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=1, + oob_score=False, random_state=None, verbose=0, + warm_start=False)" +@ editednearestneighbours__kind_sel editednearestneighbours__kind_sel: 'all' +* editednearestneighbours__n_jobs editednearestneighbours__n_jobs: 1 +@ editednearestneighbours__n_neighbors editednearestneighbours__n_neighbors: 3 +@ editednearestneighbours__random_state editednearestneighbours__random_state: None +@ editednearestneighbours__ratio editednearestneighbours__ratio: None +@ editednearestneighbours__return_indices editednearestneighbours__return_indices: False +@ editednearestneighbours__sampling_strategy editednearestneighbours__sampling_strategy: 'auto' +@ randomforestclassifier__bootstrap randomforestclassifier__bootstrap: True +@ randomforestclassifier__class_weight randomforestclassifier__class_weight: None 
+@ randomforestclassifier__criterion randomforestclassifier__criterion: 'gini' +@ randomforestclassifier__max_depth randomforestclassifier__max_depth: None +@ randomforestclassifier__max_features randomforestclassifier__max_features: 'auto' +@ randomforestclassifier__max_leaf_nodes randomforestclassifier__max_leaf_nodes: None +@ randomforestclassifier__min_impurity_decrease randomforestclassifier__min_impurity_decrease: 0.0 +@ randomforestclassifier__min_impurity_split randomforestclassifier__min_impurity_split: None +@ randomforestclassifier__min_samples_leaf randomforestclassifier__min_samples_leaf: 1 +@ randomforestclassifier__min_samples_split randomforestclassifier__min_samples_split: 2 +@ randomforestclassifier__min_weight_fraction_leaf randomforestclassifier__min_weight_fraction_leaf: 0.0 +@ randomforestclassifier__n_estimators randomforestclassifier__n_estimators: 'warn' +* randomforestclassifier__n_jobs randomforestclassifier__n_jobs: 1 +@ randomforestclassifier__oob_score randomforestclassifier__oob_score: False +@ randomforestclassifier__random_state randomforestclassifier__random_state: None +* randomforestclassifier__verbose randomforestclassifier__verbose: 0 +@ randomforestclassifier__warm_start randomforestclassifier__warm_start: False + Note: @, searchable params in searchcv too. |
b |
diff -r 2bbbac61e48d -r ec25331946b8 test-data/get_params12.tabular --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/get_params12.tabular Tue May 14 18:17:57 2019 -0400 |
[ |
@@ -0,0 +1,47 @@ + Parameter Value +* memory memory: None +* steps "steps: [('rfe', RFE(estimator=XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1, + colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0, + max_depth=3, min_child_weight=1, missing=nan, n_estimators=100, + n_jobs=1, nthread=None, objective='reg:linear', random_state=0, + reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None, + silent=True, subsample=1), + n_features_to_select=None, step=1, verbose=0))]" +@ rfe "rfe: RFE(estimator=XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1, + colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0, + max_depth=3, min_child_weight=1, missing=nan, n_estimators=100, + n_jobs=1, nthread=None, objective='reg:linear', random_state=0, + reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None, + silent=True, subsample=1), + n_features_to_select=None, step=1, verbose=0)" +@ rfe__estimator__base_score rfe__estimator__base_score: 0.5 +@ rfe__estimator__booster rfe__estimator__booster: 'gbtree' +@ rfe__estimator__colsample_bylevel rfe__estimator__colsample_bylevel: 1 +@ rfe__estimator__colsample_bytree rfe__estimator__colsample_bytree: 1 +@ rfe__estimator__gamma rfe__estimator__gamma: 0 +@ rfe__estimator__learning_rate rfe__estimator__learning_rate: 0.1 +@ rfe__estimator__max_delta_step rfe__estimator__max_delta_step: 0 +@ rfe__estimator__max_depth rfe__estimator__max_depth: 3 +@ rfe__estimator__min_child_weight rfe__estimator__min_child_weight: 1 +@ rfe__estimator__missing rfe__estimator__missing: nan +@ rfe__estimator__n_estimators rfe__estimator__n_estimators: 100 +* rfe__estimator__n_jobs rfe__estimator__n_jobs: 1 +* rfe__estimator__nthread rfe__estimator__nthread: None +@ rfe__estimator__objective rfe__estimator__objective: 'reg:linear' +@ rfe__estimator__random_state rfe__estimator__random_state: 0 +@ rfe__estimator__reg_alpha rfe__estimator__reg_alpha: 0 +@ rfe__estimator__reg_lambda 
rfe__estimator__reg_lambda: 1 +@ rfe__estimator__scale_pos_weight rfe__estimator__scale_pos_weight: 1 +@ rfe__estimator__seed rfe__estimator__seed: None +@ rfe__estimator__silent rfe__estimator__silent: True +@ rfe__estimator__subsample rfe__estimator__subsample: 1 +@ rfe__estimator "rfe__estimator: XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1, + colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0, + max_depth=3, min_child_weight=1, missing=nan, n_estimators=100, + n_jobs=1, nthread=None, objective='reg:linear', random_state=0, + reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None, + silent=True, subsample=1)" +@ rfe__n_features_to_select rfe__n_features_to_select: None +@ rfe__step rfe__step: 1 +* rfe__verbose rfe__verbose: 0 + Note: @, searchable params in searchcv too. |
b |
diff -r 2bbbac61e48d -r ec25331946b8 test-data/mv_result01.tabular --- a/test-data/mv_result01.tabular Sun Dec 30 01:57:11 2018 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 |
b |
@@ -1,3 +0,0 @@ -0.9452947345848994 -0.9926363525448115 --0.4384003222944141 |
b |
diff -r 2bbbac61e48d -r ec25331946b8 test-data/mv_result02.tabular --- a/test-data/mv_result02.tabular Sun Dec 30 01:57:11 2018 -0500 +++ b/test-data/mv_result02.tabular Tue May 14 18:17:57 2019 -0400 |
b |
@@ -1,10 +1,11 @@ -1.6957921248350636 --0.9248588846061156 --0.48640795813792376 -0.647707440306449 -0.32740690920811427 --0.8229559569886034 -1.2150108977866847 -0.14723254190255275 -0.6053186541119763 -0.3972102859168325 +Predicted +1.578912095858962 +-1.199072894940544 +-0.7173258906076226 +0.3255908318822695 +0.21919344304093213 +-0.6841926371423699 +1.1144698671662865 +0.19379531649046616 +0.9405094785593062 +1.2581284896870837 |
b |
diff -r 2bbbac61e48d -r ec25331946b8 test-data/mv_result03.tabular --- a/test-data/mv_result03.tabular Sun Dec 30 01:57:11 2018 -0500 +++ b/test-data/mv_result03.tabular Tue May 14 18:17:57 2019 -0400 |
b |
@@ -1,3 +1,6 @@ -0.9452947345848994 -0.9926363525448115 --0.4384003222944141 +train_sizes_abs mean_train_scores std_train_scores mean_test_scores std_test_scores +17 0.9668700841937653 0.00277836829836518 0.7008862995946905 0.03857541198731935 +56 0.9730008602419361 0.006839342612121988 0.7963376762427242 0.004846330083938778 +95 0.9728783377589098 0.0037790183626530663 0.814592845745573 0.020457691766770824 +134 0.9739086338111185 0.001627343246847077 0.7985540571195479 0.03954641079310707 +174 0.9726218628287785 0.0032867750457225182 0.8152971572131146 0.04280261115004303 |
b |
diff -r 2bbbac61e48d -r ec25331946b8 test-data/mv_result04.tabular --- a/test-data/mv_result04.tabular Sun Dec 30 01:57:11 2018 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 |
b |
@@ -1,5 +0,0 @@ -17 -56 -95 -134 -174 |
b |
diff -r 2bbbac61e48d -r ec25331946b8 test-data/mv_result05.tabular --- a/test-data/mv_result05.tabular Sun Dec 30 01:57:11 2018 -0500 +++ b/test-data/mv_result05.tabular Tue May 14 18:17:57 2019 -0400 |
b |
@@ -1,1 +1,262 @@ -0.4998435882784322 +Predicted +70.16 +62.06 +83.04 +62.84 +48.63 +51.25 +54.98 +80.3 +42.84 +41.52 +43.83 +73.15 +74.22 +42.88 +74.93 +72.9 +53.74 +78.86 +59.0 +40.28 +54.52 +58.34 +62.74 +62.35 +49.15 +41.92 +65.59 +59.91 +66.49 +72.08 +60.44 +53.84 +54.82 +52.66 +42.37 +61.3 +63.14 +50.62 +42.75 +47.39 +67.8 +73.58 +49.97 +67.04 +67.45 +54.67 +64.87 +77.23 +73.52 +53.55 +70.53 +77.98 +61.99 +53.08 +78.12 +66.55 +63.95 +60.57 +61.6 +60.37 +55.29 +54.31 +52.54 +65.31 +61.51 +57.3 +60.02 +43.64 +74.78 +68.26 +42.72 +61.26 +61.25 +71.58 +61.03 +70.53 +70.25 +43.4 +71.39 +72.31 +72.7 +72.11 +53.55 +43.4 +80.6 +73.72 +58.86 +76.71 +68.36 +60.26 +48.56 +38.96 +69.67 +52.9 +67.63 +75.12 +70.92 +70.89 +67.05 +43.89 +59.94 +62.98 +71.1 +79.22 +77.31 +79.06 +61.11 +66.32 +54.7 +61.1 +54.59 +58.7 +59.6 +73.79 +72.69 +81.83 +61.08 +69.21 +74.8 +54.37 +50.85 +53.07 +58.53 +55.44 +72.62 +54.14 +68.12 +48.81 +50.11 +56.06 +73.63 +63.29 +71.0 +74.87 +81.24 +54.67 +66.96 +61.37 +74.84 +76.71 +69.27 +56.53 +71.91 +58.74 +77.83 +64.57 +51.93 +42.84 +64.11 +59.47 +42.46 +43.79 +51.75 +63.98 +54.71 +64.95 +79.72 +72.12 +60.66 +79.3 +71.26 +59.9 +74.25 +59.68 +52.37 +78.52 +58.52 +71.98 +71.77 +54.48 +48.96 +81.42 +54.08 +53.52 +64.38 +70.79 +63.95 +67.48 +61.76 +66.15 +62.1 +75.68 +69.72 +43.8 +56.27 +53.38 +81.31 +57.54 +48.15 +59.47 +78.01 +56.39 +72.33 +78.8 +78.66 +52.01 +66.68 +48.56 +47.75 +65.67 +77.93 +72.68 +58.0 +77.83 +73.37 +65.39 +69.79 +55.98 +46.35 +54.31 +55.58 +79.69 +52.76 +62.62 +66.54 +60.29 +62.57 +74.86 +48.05 +65.09 +65.02 +67.84 +41.86 +62.28 +57.05 +43.68 +72.0 +63.04 +54.41 +73.37 +75.11 +42.65 +73.16 +71.68 +58.61 +53.54 +73.33 +72.16 +49.96 +54.78 +64.24 +60.13 +76.46 +61.53 +68.36 +53.1 +71.33 +76.12 +70.86 +61.35 +67.12 +43.25 +80.2 +71.16 +58.63 +52.37 +74.93 +53.34 +76.41 +63.87 +59.97 |
b |
diff -r 2bbbac61e48d -r ec25331946b8 test-data/mv_result06.tabular --- a/test-data/mv_result06.tabular Sun Dec 30 01:57:11 2018 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 |
b |
@@ -1,5 +0,0 @@ -0.07547169811320754 0.10344827586206896 0.10294117647058823 -0.07547169811320754 0.10344827586206896 0.10294117647058823 -0.07547169811320754 0.10344827586206896 0.10294117647058823 -0.07547169811320754 0.10344827586206896 0.10294117647058823 -0.07547169811320754 0.10344827586206896 0.10294117647058823 |
b |
diff -r 2bbbac61e48d -r ec25331946b8 test-data/named_steps.txt --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/named_steps.txt Tue May 14 18:17:57 2019 -0400 |
b |
@@ -0,0 +1,6 @@ +{'preprocessing_1': SelectKBest(k=10, score_func=<function f_regression at 0x113310ea0>), 'estimator': XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1, + colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0, + max_depth=3, min_child_weight=1, missing=nan, n_estimators=100, + n_jobs=1, nthread=None, objective='reg:linear', random_state=10, + reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None, + silent=True, subsample=1)} \ No newline at end of file |
b |
diff -r 2bbbac61e48d -r ec25331946b8 test-data/nn_model01 |
b |
Binary file test-data/nn_model01 has changed |
b |
diff -r 2bbbac61e48d -r ec25331946b8 test-data/pipeline01 |
b |
Binary file test-data/pipeline01 has changed |
b |
diff -r 2bbbac61e48d -r ec25331946b8 test-data/pipeline02 |
b |
Binary file test-data/pipeline02 has changed |
b |
diff -r 2bbbac61e48d -r ec25331946b8 test-data/pipeline03 |
b |
Binary file test-data/pipeline03 has changed |
b |
diff -r 2bbbac61e48d -r ec25331946b8 test-data/pipeline04 |
b |
Binary file test-data/pipeline04 has changed |
b |
diff -r 2bbbac61e48d -r ec25331946b8 test-data/pipeline05 |
b |
Binary file test-data/pipeline05 has changed |
b |
diff -r 2bbbac61e48d -r ec25331946b8 test-data/pipeline06 |
b |
Binary file test-data/pipeline06 has changed |
b |
diff -r 2bbbac61e48d -r ec25331946b8 test-data/pipeline07 |
b |
Binary file test-data/pipeline07 has changed |
b |
diff -r 2bbbac61e48d -r ec25331946b8 test-data/pipeline08 |
b |
Binary file test-data/pipeline08 has changed |
b |
diff -r 2bbbac61e48d -r ec25331946b8 test-data/pipeline09 |
b |
Binary file test-data/pipeline09 has changed |
b |
diff -r 2bbbac61e48d -r ec25331946b8 test-data/pipeline10 |
b |
Binary file test-data/pipeline10 has changed |
b |
diff -r 2bbbac61e48d -r ec25331946b8 test-data/pipeline11 |
b |
Binary file test-data/pipeline11 has changed |
b |
diff -r 2bbbac61e48d -r ec25331946b8 test-data/pipeline12 |
b |
Binary file test-data/pipeline12 has changed |
b |
diff -r 2bbbac61e48d -r ec25331946b8 test-data/pipeline13 |
b |
Binary file test-data/pipeline13 has changed |
b |
diff -r 2bbbac61e48d -r ec25331946b8 test-data/pipeline14 |
b |
Binary file test-data/pipeline14 has changed |
b |
diff -r 2bbbac61e48d -r ec25331946b8 test-data/pipeline15 |
b |
Binary file test-data/pipeline15 has changed |
b |
diff -r 2bbbac61e48d -r ec25331946b8 test-data/ranking_.tabular --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/ranking_.tabular Tue May 14 18:17:57 2019 -0400 |
b |
@@ -0,0 +1,18 @@ +ranking_ +17 +7 +4 +5 +2 +1 +9 +6 +8 +3 +10 +15 +14 +11 +13 +12 +16 |
b |
diff -r 2bbbac61e48d -r ec25331946b8 test-data/searchCV01 |
b |
Binary file test-data/searchCV01 has changed |
b |
diff -r 2bbbac61e48d -r ec25331946b8 test-data/searchCV02 |
b |
Binary file test-data/searchCV02 has changed |
b |
diff -r 2bbbac61e48d -r ec25331946b8 utils.py --- a/utils.py Sun Dec 30 01:57:11 2018 -0500 +++ b/utils.py Tue May 14 18:17:57 2019 -0400 |
[ |
b'@@ -1,80 +1,134 @@\n+import ast\n import json\n+import imblearn\n import numpy as np\n-import os\n import pandas\n import pickle\n import re\n import scipy\n import sklearn\n+import skrebate\n import sys\n import warnings\n import xgboost\n \n+from collections import Counter\n from asteval import Interpreter, make_symbol_table\n-from sklearn import (cluster, compose, decomposition, ensemble, feature_extraction,\n- feature_selection, gaussian_process, kernel_approximation, metrics,\n- model_selection, naive_bayes, neighbors, pipeline, preprocessing,\n- svm, linear_model, tree, discriminant_analysis)\n+from imblearn import under_sampling, over_sampling, combine\n+from imblearn.pipeline import Pipeline as imbPipeline\n+from mlxtend import regressor, classifier\n+from scipy.io import mmread\n+from sklearn import (\n+ cluster, compose, decomposition, ensemble, feature_extraction,\n+ feature_selection, gaussian_process, kernel_approximation, metrics,\n+ model_selection, naive_bayes, neighbors, pipeline, preprocessing,\n+ svm, linear_model, tree, discriminant_analysis)\n+\n+try:\n+ import iraps_classifier\n+except ImportError:\n+ pass\n \n try:\n- import skrebate\n-except ModuleNotFoundError:\n+ import model_validations\n+except ImportError:\n+ pass\n+\n+try:\n+ import feature_selectors\n+except ImportError:\n pass\n \n-\n-N_JOBS = int(os.environ.get(\'GALAXY_SLOTS\', 1))\n+try:\n+ import preprocessors\n+except ImportError:\n+ pass\n \n-try:\n- sk_whitelist\n-except NameError:\n- sk_whitelist = None\n+# handle pickle white list file\n+WL_FILE = __import__(\'os\').path.join(\n+ __import__(\'os\').path.dirname(__file__), \'pk_whitelist.json\')\n+\n+N_JOBS = int(__import__(\'os\').environ.get(\'GALAXY_SLOTS\', 1))\n \n \n-class SafePickler(pickle.Unpickler):\n+class _SafePickler(pickle.Unpickler, object):\n """\n- Used to safely deserialize scikit-learn model objects serialized by cPickle.dump\n+ Used to safely deserialize scikit-learn model objects\n Usage:\n- eg.: 
SafePickler.load(pickled_file_object)\n+ eg.: _SafePickler.load(pickled_file_object)\n """\n- def find_class(self, module, name):\n+ def __init__(self, file):\n+ super(_SafePickler, self).__init__(file)\n+ # load global white list\n+ with open(WL_FILE, \'r\') as f:\n+ self.pk_whitelist = json.load(f)\n \n- # sk_whitelist could be read from tool\n- global sk_whitelist\n- if not sk_whitelist:\n- whitelist_file = os.path.join(os.path.dirname(__file__), \'sk_whitelist.json\')\n- with open(whitelist_file, \'r\') as f:\n- sk_whitelist = json.load(f)\n+ self.bad_names = (\n+ \'and\', \'as\', \'assert\', \'break\', \'class\', \'continue\',\n+ \'def\', \'del\', \'elif\', \'else\', \'except\', \'exec\',\n+ \'finally\', \'for\', \'from\', \'global\', \'if\', \'import\',\n+ \'in\', \'is\', \'lambda\', \'not\', \'or\', \'pass\', \'print\',\n+ \'raise\', \'return\', \'try\', \'system\', \'while\', \'with\',\n+ \'True\', \'False\', \'None\', \'eval\', \'execfile\', \'__import__\',\n+ \'__package__\', \'__subclasses__\', \'__bases__\', \'__globals__\',\n+ \'__code__\', \'__closure__\', \'__func__\', \'__self__\', \'__module__\',\n+ \'__dict__\', \'__class__\', \'__call__\', \'__get__\',\n+ \'__getattribute__\', \'__subclasshook__\', \'__new__\',\n+ \'__init__\', \'func_globals\', \'func_code\', \'func_closure\',\n+ \'im_class\', \'im_func\', \'im_self\', \'gi_code\', \'gi_frame\',\n+ \'__asteval__\', \'f_locals\', \'__mro__\')\n \n- bad_names = (\'and\', \'as\', \'assert\', \'break\', \'class\', \'continue\',\n- \'def\', \'del\', \'elif\', \'else\', \'except\', \'exec\',\n- \'finally\', \'for\', \'from\', \'global\', \'if\', \'import\',\n- \'in\', \'is\', \'lambda\', \'not\', \'or\', \'pass\', \'print\',\n- \'raise\', \'return\', \'try\', \'system\', \'while\', \'with\',\n- \'True\', \'False\', \'None\', \'eval\', \'execfile\', \'__impo'..b'eader_name\', \'all_but_by_header_name\']:\n+ c = groups[\'column_selector_options_g\'][\'col_g\']\n+ else:\n+ c = None\n+ groups = 
read_columns(\n+ infile_g,\n+ c=c,\n+ c_option=column_option,\n+ sep=\'\\t\',\n+ header=header,\n+ parse_dates=True)\n+ groups = groups.ravel()\n \n for k, v in cv_json.items():\n if v == \'\':\n@@ -341,7 +502,12 @@\n if test_size and test_size > 1.0:\n cv_json[\'test_size\'] = int(test_size)\n \n- cv_class = getattr(model_selection, cv)\n+ if cv == \'OrderedKFold\':\n+ cv_class = try_get_attr(\'model_validations\', \'OrderedKFold\')\n+ elif cv == \'RepeatedOrderedKFold\':\n+ cv_class = try_get_attr(\'model_validations\', \'RepeatedOrderedKFold\')\n+ else:\n+ cv_class = getattr(model_selection, cv)\n splitter = cv_class(**cv_json)\n \n return splitter, groups\n@@ -349,6 +515,9 @@\n \n # needed when sklearn < v0.20\n def balanced_accuracy_score(y_true, y_pred):\n+ """Compute balanced accuracy score, which is now available in\n+ scikit-learn from v0.20.0.\n+ """\n C = metrics.confusion_matrix(y_true, y_pred)\n with np.errstate(divide=\'ignore\', invalid=\'ignore\'):\n per_class = np.diag(C) / C.sum(axis=1)\n@@ -360,21 +529,71 @@\n \n \n def get_scoring(scoring_json):\n-\n+ """Return single sklearn scorer class\n+ or multiple scoers in dictionary\n+ """\n if scoring_json[\'primary_scoring\'] == \'default\':\n return None\n \n my_scorers = metrics.SCORERS\n+ my_scorers[\'binarize_auc_scorer\'] =\\\n+ try_get_attr(\'iraps_classifier\', \'binarize_auc_scorer\')\n+ my_scorers[\'binarize_average_precision_scorer\'] =\\\n+ try_get_attr(\'iraps_classifier\', \'binarize_average_precision_scorer\')\n if \'balanced_accuracy\' not in my_scorers:\n- my_scorers[\'balanced_accuracy\'] = metrics.make_scorer(balanced_accuracy_score)\n+ my_scorers[\'balanced_accuracy\'] =\\\n+ metrics.make_scorer(balanced_accuracy_score)\n \n if scoring_json[\'secondary_scoring\'] != \'None\'\\\n- and scoring_json[\'secondary_scoring\'] != scoring_json[\'primary_scoring\']:\n- scoring = {}\n- scoring[\'primary\'] = my_scorers[scoring_json[\'primary_scoring\']]\n+ and 
scoring_json[\'secondary_scoring\'] !=\\\n+ scoring_json[\'primary_scoring\']:\n+ return_scoring = {}\n+ primary_scoring = scoring_json[\'primary_scoring\']\n+ return_scoring[primary_scoring] = my_scorers[primary_scoring]\n for scorer in scoring_json[\'secondary_scoring\'].split(\',\'):\n if scorer != scoring_json[\'primary_scoring\']:\n- scoring[scorer] = my_scorers[scorer]\n- return scoring\n+ return_scoring[scorer] = my_scorers[scorer]\n+ return return_scoring\n \n return my_scorers[scoring_json[\'primary_scoring\']]\n+\n+\n+def get_search_params(estimator):\n+ """Format the output of `estimator.get_params()`\n+ """\n+ params = estimator.get_params()\n+ results = []\n+ for k, v in params.items():\n+ # params below won\'t be shown for search in the searchcv tool\n+ keywords = (\'n_jobs\', \'pre_dispatch\', \'memory\', \'steps\',\n+ \'nthread\', \'verbose\')\n+ if k.endswith(keywords):\n+ results.append([\'*\', k, k+": "+repr(v)])\n+ else:\n+ results.append([\'@\', k, k+": "+repr(v)])\n+ results.append(\n+ ["", "Note:",\n+ "@, params eligible for search in searchcv tool."])\n+\n+ return results\n+\n+\n+def try_get_attr(module, name):\n+ """try to get attribute from a custom module\n+\n+ Parameters\n+ ----------\n+ module : str\n+ Module name\n+ name : str\n+ Attribute (class/function) name.\n+\n+ Returns\n+ -------\n+ class or function\n+ """\n+ mod = sys.modules.get(module, None)\n+ if mod:\n+ return getattr(mod, name)\n+ else:\n+ raise Exception("No module named %s." % module)\n' |