Mercurial > repos > bgruening > sklearn_ensemble
changeset 19:4570575d060c draft
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit d00173591e4a783a4c1cb2664e4bb192ab5414f7
author | bgruening |
---|---|
date | Fri, 17 Aug 2018 12:28:21 -0400 |
parents | 0b3144c0b4ee |
children | 038cecaa9e7c |
files | ensemble.xml main_macros.xml test-data/pipeline09 test-data/pipeline10 utils.py |
diffstat | 5 files changed, 448 insertions(+), 271 deletions(-) [+] |
line wrap: on
line diff
--- a/ensemble.xml Tue Aug 07 05:47:03 2018 -0400 +++ b/ensemble.xml Fri Aug 17 12:28:21 2018 -0400 @@ -15,6 +15,7 @@ <configfile name="ensemble_script"> <![CDATA[ import sys +import os import json import numpy as np import sklearn.ensemble @@ -22,8 +23,7 @@ import pickle from scipy.io import mmread -@COLUMNS_FUNCTION@ -@GET_X_y_FUNCTION@ +execfile("$__tool_directory__/utils.py") # Get inputs, outputs. input_json_path = sys.argv[1] @@ -47,6 +47,8 @@ if params["selected_tasks"]["selected_task"] == "train": algorithm = params["selected_tasks"]["selected_algorithms"]["selected_algorithm"] options = params["selected_tasks"]["selected_algorithms"]["options"] + if algorithm in ['RandomForestClassifier', 'RandomForestRegressor']: + options['n_jobs'] = N_JOBS if "select_max_features" in options: if options["select_max_features"]["max_features"] == "number_input": options["select_max_features"]["max_features"] = options["select_max_features"]["num_max_features"] @@ -107,7 +109,6 @@ <expand macro="max_leaf_nodes"/> <expand macro="bootstrap"/> <expand macro="warm_start" checked="false"/> - <expand macro="n_jobs"/> <expand macro="random_state"/> <expand macro="oob_score"/> <!--class_weight=None--> @@ -167,7 +168,6 @@ <expand macro="min_impurity_decrease"/> <expand macro="bootstrap"/> <expand macro="oob_score"/> - <expand macro="n_jobs"/> <expand macro="random_state"/> <expand macro="verbose"/> <expand macro="warm_start" checked="false"/>
--- a/main_macros.xml Tue Aug 07 05:47:03 2018 -0400 +++ b/main_macros.xml Fri Aug 17 12:28:21 2018 -0400 @@ -1,216 +1,13 @@ <macros> <token name="@VERSION@">0.9</token> - <token name="@COLUMNS_FUNCTION@"> -def read_columns(f, c=None, c_option='by_index_number', return_df=False, **args): - data = pandas.read_csv(f, **args) - if c_option == 'by_index_number': - cols = list(map(lambda x: x - 1, c)) - data = data.iloc[:,cols] - if c_option == 'all_but_by_index_number': - cols = list(map(lambda x: x - 1, c)) - data.drop(data.columns[cols], axis=1, inplace=True) - if c_option == 'by_header_name': - cols = [e.strip() for e in c.split(',')] - data = data[cols] - if c_option == 'all_but_by_header_name': - cols = [e.strip() for e in c.split(',')] - data.drop(cols, axis=1, inplace=True) - y = data.values - if return_df: - return y, data - else: - return y - return y - </token> - -## generate an instance for one of sklearn.feature_selection classes - <token name="@FEATURE_SELECTOR_FUNCTION@"> -def feature_selector(inputs): - selector = inputs["selected_algorithm"] - selector = getattr(sklearn.feature_selection, selector) - options = inputs["options"] - - if inputs['selected_algorithm'] == 'SelectFromModel': - if not options['threshold'] or options['threshold'] == 'None': - options['threshold'] = None - if inputs['model_inputter']['input_mode'] == 'prefitted': - model_file = inputs['model_inputter']['fitted_estimator'] - with open(model_file, 'rb') as model_handler: - fitted_estimator = pickle.load(model_handler) - new_selector = selector(fitted_estimator, prefit=True, **options) - else: - estimator_json = inputs['model_inputter']["estimator_selector"] - estimator = get_estimator(estimator_json) - new_selector = selector(estimator, **options) - - elif inputs['selected_algorithm'] in ['RFE', 'RFECV']: - if 'scoring' in options and (not options['scoring'] or options['scoring'] == 'None'): - options['scoring'] = None - estimator=get_estimator(inputs["estimator_selector"]) - 
new_selector = selector(estimator, **options) - - elif inputs['selected_algorithm'] == "VarianceThreshold": - new_selector = selector(**options) - - else: - score_func = inputs["score_func"] - score_func = getattr(sklearn.feature_selection, score_func) - new_selector = selector(score_func, **options) - - return new_selector - </token> - - <token name="@GET_X_y_FUNCTION@"> -def get_X_y(params, file1, file2): - input_type = params["selected_tasks"]["selected_algorithms"]["input_options"]["selected_input"] - if input_type=="tabular": - header = 'infer' if params["selected_tasks"]["selected_algorithms"]["input_options"]["header1"] else None - column_option = params["selected_tasks"]["selected_algorithms"]["input_options"]["column_selector_options_1"]["selected_column_selector_option"] - if column_option in ["by_index_number", "all_but_by_index_number", "by_header_name", "all_but_by_header_name"]: - c = params["selected_tasks"]["selected_algorithms"]["input_options"]["column_selector_options_1"]["col1"] - else: - c = None - X = read_columns( - file1, - c = c, - c_option = column_option, - sep='\t', - header=header, - parse_dates=True - ) - else: - X = mmread(file1) - - header = 'infer' if params["selected_tasks"]["selected_algorithms"]["input_options"]["header2"] else None - column_option = params["selected_tasks"]["selected_algorithms"]["input_options"]["column_selector_options_2"]["selected_column_selector_option2"] - if column_option in ["by_index_number", "all_but_by_index_number", "by_header_name", "all_but_by_header_name"]: - c = params["selected_tasks"]["selected_algorithms"]["input_options"]["column_selector_options_2"]["col2"] - else: - c = None - y = read_columns( - file2, - c = c, - c_option = column_option, - sep='\t', - header=header, - parse_dates=True - ) - y=y.ravel() - return X, y - </token> - - <token name="@SAFE_EVAL_FUNCTION@"> -def safe_eval(literal): - - FROM_SCIPY_STATS = [ 'bernoulli', 'binom', 'boltzmann', 'dlaplace', 'geom', 'hypergeom', - 
'logser', 'nbinom', 'planck', 'poisson', 'randint', 'skellam', 'zipf' ] - - FROM_NUMPY_RANDOM = [ 'beta', 'binomial', 'bytes', 'chisquare', 'choice', 'dirichlet', 'division', - 'exponential', 'f', 'gamma', 'geometric', 'gumbel', 'hypergeometric', - 'laplace', 'logistic', 'lognormal', 'logseries', 'mtrand', 'multinomial', - 'multivariate_normal', 'negative_binomial', 'noncentral_chisquare', 'noncentral_f', - 'normal', 'pareto', 'permutation', 'poisson', 'power', 'rand', 'randint', - 'randn', 'random', 'random_integers', 'random_sample', 'ranf', 'rayleigh', - 'sample', 'seed', 'set_state', 'shuffle', 'standard_cauchy', 'standard_exponential', - 'standard_gamma', 'standard_normal', 'standard_t', 'triangular', 'uniform', - 'vonmises', 'wald', 'weibull', 'zipf' ] - - # File opening and other unneeded functions could be dropped - UNWANTED = ['open', 'type', 'dir', 'id', 'str', 'repr'] - - # Allowed symbol table. Add more if needed. - new_syms = { - 'np_arange': getattr(np, 'arange'), - 'ensemble_ExtraTreesClassifier': getattr(ensemble, 'ExtraTreesClassifier') - } - - syms = make_symbol_table(use_numpy=False, **new_syms) - - for method in FROM_SCIPY_STATS: - syms['scipy_stats_' + method] = getattr(scipy.stats, method) - - for func in FROM_NUMPY_RANDOM: - syms['np_random_' + func] = getattr(np.random, func) - - for key in UNWANTED: - syms.pop(key, None) - - aeval = Interpreter(symtable=syms, use_numpy=False, minimal=False, - no_if=True, no_for=True, no_while=True, no_try=True, - no_functiondef=True, no_ifexp=True, no_listcomp=False, - no_augassign=False, no_assert=True, no_delete=True, - no_raise=True, no_print=True) - - return aeval(literal) - </token> - - <token name="@GET_SEARCH_PARAMS_FUNCTION@"> -def get_search_params(params_builder): - search_params = {} - - for p in params_builder['param_set']: - search_p = p['search_param_selector']['search_p'] - if search_p.strip() == '': - continue - param_type = p['search_param_selector']['selected_param_type'] - - lst = 
search_p.split(":") - assert (len(lst) == 2), "Error, make sure there is one and only one colon in search parameter input." - literal = lst[1].strip() - ev = safe_eval(literal) - if param_type == "final_estimator_p": - search_params["estimator__" + lst[0].strip()] = ev - else: - search_params["preprocessing_" + param_type[5:6] + "__" + lst[0].strip()] = ev - - return search_params - </token> - - <token name="@GET_ESTIMATOR_FUNCTION@"> -def get_estimator(estimator_json): - estimator_module = estimator_json['selected_module'] - estimator_cls = estimator_json['selected_estimator'] - - if estimator_module == "xgboost": - cls = getattr(xgboost, estimator_cls) - else: - module = getattr(sklearn, estimator_module) - cls = getattr(module, estimator_cls) - - estimator = cls() - - estimator_params = estimator_json['text_params'].strip() - if estimator_params != "": - try: - params = ast.literal_eval('{' + estimator_params + '}') - except ValueError: - sys.exit("Unsupported parameter input: `%s`" %estimator_params) - estimator.set_params(**params) - - return estimator - </token> - - <token name="@GET_CV_FUNCTION@"> -def get_cv(literal): - if literal == "": - return None - if re.match(r'^\d+$', literal): - return int(literal) - m = re.match(r'^(?P<method>\w+)\((?P<args>.*)\)$', literal) - if m: - my_class = getattr( model_selection, m.group('method') ) - args = safe_eval( 'dict('+ m.group('args') + ')' ) - return my_class( **args ) - sys.exit("Unsupported CV input: %s" %literal) - </token> - <xml name="python_requirements"> <requirements> <requirement type="package" version="2.7">python</requirement> <requirement type="package" version="0.19.1">scikit-learn</requirement> <requirement type="package" version="0.22.0">pandas</requirement> <requirement type="package" version="0.72.1">xgboost</requirement> + <requirement type="package" version="0.9.12">asteval</requirement> <yield /> </requirements> </xml> @@ -439,10 +236,6 @@ <param argument="fit_intercept" type="boolean" 
optional="true" truevalue="booltrue" falsevalue="boolfalse" checked="@CHECKED@" label="Estimate the intercept" help="If false, the data is assumed to be already centered."/> </xml> - <xml name="n_jobs" token_default_value="1" token_label="The number of jobs to run in parallel for both fit and predict"> - <param argument="n_jobs" type="integer" value="@DEFAULT_VALUE@" optional="true" label="@LABEL@" help="If -1, then the number of jobs is set to the number of cores"/> - </xml> - <xml name="n_iter" token_default_value="5" token_help_text="The number of passes over the training data (aka epochs). "> <param argument="n_iter" type="integer" optional="true" value="@DEFAULT_VALUE@" label="Number of iterations" help="@HELP_TEXT@"/> </xml> @@ -542,7 +335,7 @@ <conditional name="column_selector_options_1"> <expand macro="samples_column_selector_options" multiple="@MULTIPLE1@"/> </conditional> - <param name="infile2" type="data" format="tabular" label="Dataset containing class labels:"/> + <param name="infile2" type="data" format="tabular" label="Dataset containing class labels or target values:"/> <param name="header2" type="boolean" optional="true" truevalue="booltrue" falsevalue="boolfalse" checked="False" label="Does the dataset contain header:" /> <conditional name="column_selector_options_2"> <expand macro="samples_column_selector_options" column_option="selected_column_selector_option2" col_name="col2" multiple="@MULTIPLE2@" infile="infile2"/> @@ -1031,6 +824,16 @@ </when> </xml> + <xml name="cv"> + <param argument="cv" type="text" value="" optional="true" label="cv" help="Optional. Integer or evalable splitter object, e.g., StratifiedKFold(n_splits=3, shuffle=True, random_state=10). Leave blank for default." 
> + <sanitizer> + <valid initial="default"> + <add value="'"/> + </valid> + </sanitizer> + </param> + </xml> + <xml name="feature_selection_all"> <conditional name="fs_algorithm_selector"> <param name="selected_algorithm" type="select" label="Select a feature selection algorithm"> @@ -1109,10 +912,9 @@ <expand macro="estimator_selector_all"/> <section name="options" title="Advanced Options" expanded="False"> <param argument="step" type="float" value="1" label="step" optional="true" help="Default = 1. " /> - <param argument="cv" type="integer" value="" optional="true" label="cv" help="Determines the cross-validation splitting strategy" /> - <param argument="scoring" type="text" value="" optional="true" label="scoring" help="A string (see model evaluation documentation) or a scorer callable object / function with signature scorer(estimator, X, y)."/> + <expand macro="cv"/> + <expand macro="scoring_selection"/> <param argument="verbose" type="integer" value="0" label="verbose" help="Controls verbosity of output." /> - <param argument="n_jobs" type="integer" value="1" label="n_jobs" help="Number of cores to run in parallel while fitting across folds. Defaults to 1 core."/> </section> </when> <when value="VarianceThreshold"> @@ -1159,14 +961,106 @@ </xml> <xml name="model_validation_common_options"> - <param argument="cv" type="text" value="" size="50" optional="true" label="cv" help="Optional. Integer or evalable splitter object, e.g., StratifiedKFold(n_splits=3, shuffle=True, random_state=10). Leave blank for default." 
/> - <expand macro="n_jobs"/> + <expand macro="cv"/> <expand macro="verbose"/> <yield/> </xml> - <xml name="scoring"> - <param argument="scoring" type="text" value="" optional="true" label="scoring" help="A metric used to evaluate the estimator"/> + <xml name="scoring_selection"> + <conditional name="scoring"> + <param name="primary_scoring" type="select" multiple="false" label="Select the primary metric (scoring):" help="Metric to refit the best estimator."> + <option value="default" selected="true">default with estimator</option> + <option value="accuracy">Classification -- 'accuracy'</option> + <option value="balanced_accuracy">Classification -- 'balanced_accuracy'</option> + <option value="average_precision">Classification -- 'average_precision'</option> + <option value="f1">Classification -- 'f1'</option> + <option value="f1_micro">Classification -- 'f1_micro'</option> + <option value="f1_macro">Classification -- 'f1_macro'</option> + <option value="f1_weighted">Classification -- 'f1_weighted'</option> + <option value="f1_samples">Classification -- 'f1_samples'</option> + <option value="neg_log_loss">Classification -- 'neg_log_loss'</option> + <option value="precision">Classification -- 'precision'</option> + <option value="precision_micro">Classification -- 'precision_micro'</option> + <option value="precision_macro">Classification -- 'precision_macro'</option> + <option value="precision_weighted">Classification -- 'precision_weighted'</option> + <option value="precision_samples">Classification -- 'precision_samples'</option> + <option value="recall">Classification -- 'recall'</option> + <option value="recall_micro">Classification -- 'recall_micro'</option> + <option value="recall_macro">Classification -- 'recall_macro'</option> + <option value="recall_weighted">Classification -- 'recall_weighted'</option> + <option value="recall_samples">Classification -- 'recall_samples'</option> + <option value="roc_auc">Classification -- 'roc_auc'</option> + <option
value="explained_variance">Regression -- 'explained_variance'</option> + <option value="neg_mean_absolute_error">Regression -- 'neg_mean_absolute_error'</option> + <option value="neg_mean_squared_error">Regression -- 'neg_mean_squared_error'</option> + <option value="neg_mean_squared_log_error">Regression -- 'neg_mean_squared_log_error'</option> + <option value="neg_median_absolute_error">Regression -- 'neg_median_absolute_error'</option> + <option value="r2">Regression -- 'r2'</option> + </param> + <when value="default"/> + <when value="accuracy"><expand macro="secondary_scoring_selection_classification"/></when> + <when value="balanced_accuracy"><expand macro="secondary_scoring_selection_classification"/></when> + <when value="average_precision"><expand macro="secondary_scoring_selection_classification"/></when> + <when value="f1"><expand macro="secondary_scoring_selection_classification"/></when> + <when value="f1_micro"><expand macro="secondary_scoring_selection_classification"/></when> + <when value="f1_macro"><expand macro="secondary_scoring_selection_classification"/></when> + <when value="f1_weighted"><expand macro="secondary_scoring_selection_classification"/></when> + <when value="f1_samples"><expand macro="secondary_scoring_selection_classification"/></when> + <when value="neg_log_loss"><expand macro="secondary_scoring_selection_classification"/></when> + <when value="precision"><expand macro="secondary_scoring_selection_classification"/></when> + <when value="precision_micro"><expand macro="secondary_scoring_selection_classification"/></when> + <when value="precision_macro"><expand macro="secondary_scoring_selection_classification"/></when> + <when value="precision_weighted"><expand macro="secondary_scoring_selection_classification"/></when> + <when value="precision_samples"><expand macro="secondary_scoring_selection_classification"/></when> + <when value="recall"><expand macro="secondary_scoring_selection_classification"/></when> + <when
value="recall_micro"><expand macro="secondary_scoring_selection_classification"/></when> + <when value="recall_macro"><expand macro="secondary_scoring_selection_classification"/></when> + <when value="recall_weighted"><expand macro="secondary_scoring_selection_classification"/></when> + <when value="recall_samples"><expand macro="secondary_scoring_selection_classification"/></when> + <when value="roc_auc"><expand macro="secondary_scoring_selection_classification"/></when> + <when value="explained_variance"><expand macro="secondary_scoring_selection_regression"/></when> + <when value="neg_mean_absolute_error"><expand macro="secondary_scoring_selection_regression"/></when> + <when value="neg_mean_squared_error"><expand macro="secondary_scoring_selection_regression"/></when> + <when value="neg_mean_squared_log_error"><expand macro="secondary_scoring_selection_regression"/></when> + <when value="neg_median_absolute_error"><expand macro="secondary_scoring_selection_regression"/></when> + <when value="r2"><expand macro="secondary_scoring_selection_regression"/></when> + </conditional> + </xml> + + <xml name="secondary_scoring_selection_classification"> + <param name="secondary_scoring" type="select" multiple="true" label="Additional scoring used in multi-metric mode:" help="If the same metric with the primary is chosen, the metric will be ignored."> + <option value="accuracy">Classification -- 'accuracy'</option> + <option value="balanced_accuracy">Classification -- 'balanced_accuracy'</option> + <option value="average_precision">Classification -- 'average_precision'</option> + <option value="f1">Classification -- 'f1'</option> + <option value="f1_micro">Classification -- 'f1_micro'</option> + <option value="f1_macro">Classification -- 'f1_macro'</option> + <option value="f1_weighted">Classification -- 'f1_weighted'</option> + <option value="f1_samples">Classification -- 'f1_samples'</option> + <option value="neg_log_loss">Classification -- 'neg_log_loss'</option> +
<option value="precision">Classification -- 'precision'</option> + <option value="precision_micro">Classification -- 'precision_micro'</option> + <option value="precision_macro">Classification -- 'precision_macro'</option> + <option value="precision_weighted">Classification -- 'precision_weighted'</option> + <option value="precision_samples">Classification -- 'precision_samples'</option> + <option value="recall">Classification -- 'recall'</option> + <option value="recall_micro">Classification -- 'recall_micro'</option> + <option value="recall_macro">Classification -- 'recall_macro'</option> + <option value="recall_weighted">Classification -- 'recall_weighted'</option> + <option value="recall_samples">Classification -- 'recall_samples'</option> + <option value="roc_auc">Classification -- 'roc_auc'</option> + </param> + </xml> + + <xml name="secondary_scoring_selection_regression"> + <param name="secondary_scoring" type="select" multiple="true" label="Additional scoring used in multi-metric mode:" help="If the same metric with the primary is chosen, the metric will be ignored."> + <option value="explained_variance">Regression -- 'explained_variance'</option> + <option value="neg_mean_absolute_error">Regression -- 'neg_mean_absolute_error'</option> + <option value="neg_mean_squared_error">Regression -- 'neg_mean_squared_error'</option> + <option value="neg_mean_squared_log_error">Regression -- 'neg_mean_squared_log_error'</option> + <option value="neg_median_absolute_error">Regression -- 'neg_median_absolute_error'</option> + <option value="r2">Regression -- 'r2'</option> + </param> </xml> <xml name="pre_dispatch" token_type="hidden" token_default_value="all" token_help="Number of predispatched jobs for parallel execution"> @@ -1210,7 +1104,7 @@ </xml> <xml name="search_param_input" token_label="Estimator parameter:" token_help="One parameter per box. For example: C: [1, 10, 100, 1000].
See bottom for more examples"> - <param name="search_p" type="text" value="" size="100" optional="true" label="@LABEL@" help="@HELP@"> + <param name="search_p" type="text" value="" optional="true" label="@LABEL@" help="@HELP@"> <sanitizer> <valid initial="default"> <add value="'"/> @@ -1223,12 +1117,12 @@ </xml> <xml name="search_cv_options"> - <expand macro="scoring"/> + <expand macro="scoring_selection"/> <expand macro="model_validation_common_options"/> <expand macro="pre_dispatch" value="2*n_jobs" help="Controls the number of jobs that get dispatched during parallel execution"/> <param argument="iid" type="boolean" truevalue="booltrue" falsevalue="boolfalse" checked="true" label="iid" help="If True, data is identically distributed across the folds"/> <param argument="refit" type="boolean" truevalue="booltrue" falsevalue="boolfalse" checked="true" label="refit" help="Refit an estimator using the best found parameters on the whole dataset."/> - <!--error_score--> + <param argument="error_score" type="boolean" truevalue="booltrue" falsevalue="boolfalse" checked="true" label="Raise fit error:" help="If false, the metric score is assigned to 0 if an error occurs in estimator fitting and FitFailedWarning is raised."/> <param argument="return_train_score" type="boolean" truevalue="booltrue" falsevalue="boolfalse" checked="false" label="return_train_score" help=""/> </xml> @@ -1307,7 +1201,7 @@ <option value="RandomForestClassifier">RandomForestClassifier</option> <option value="RandomForestRegressor">RandomForestRegressor</option> <option value="RandomTreesEmbedding">RandomTreesEmbedding</option> - <option value="VotingClassifier">VotingClassifier</option> + <!--option value="VotingClassifier">VotingClassifier</option--> </param> <expand macro="estimator_params_text"/> </when> @@ -1330,12 +1224,11 @@ </when> <when value="neighbors"> <param name="selected_estimator" type="select" label="Choose estimator class:"> - <option value="BallTree" 
selected="true">BallTree</option> - <option value="DistanceMetric">DistanceMetric</option> - <option value="KDTree">KDTree</option> + <option value="KNeighborsClassifier" selected="true">KNeighborsClassifier</option> + <option value="KNeighborsRegressor">KNeighborsRegressor</option> + <!--option value="BallTree">BallTree</option--> + <!--option value="KDTree">KDTree</option--> <option value="KernelDensity">KernelDensity</option> - <option value="KNeighborsClassifier">KNeighborsClassifier</option> - <option value="KNeighborsRegressor">KNeighborsRegressor</option> <option value="LocalOutlierFactor">LocalOutlierFactor</option> <option value="RadiusNeighborsClassifier">RadiusNeighborsClassifier</option> <option value="RadiusNeighborsRegressor">RadiusNeighborsRegressor</option> @@ -1354,9 +1247,9 @@ </conditional> </xml> - <xml name="estimator_params_text" token_label="Type in estimator parameters:" - token_help="Parameters in dictionary without braces ('{}'), e.g., 'C': 1, 'kernel': 'linear'. No double quotes. Leave this box blank for default estimator."> - <param name="text_params" type="text" value="" size="50" optional="true" label="@LABEL@" help="@HELP@"> + <xml name="estimator_params_text" token_label="Type in parameter settings if different from default:" token_default_value='' + token_help="Dictionary-capable, e.g., C=1, kernel='linear'. No double quotes. Leave this box blank for default estimator."> + <param name="text_params" type="text" value="@DEFAULT_VALUE@" optional="true" label="@LABEL@" help="@HELP@"> <sanitizer> <valid initial="default"> <add value="'"/> @@ -1374,20 +1267,20 @@ <option value="SkewedChi2Sampler">SkewedChi2Sampler</option> </param> <when value="Nystroem"> - <expand macro="estimator_params_text" label="Type in kernel approximater parameters:" - help="Parameters in dictionary without braces ('{}'), e.g., 'n_components': 100, 'kernel': 'rbf'. No double quotes. 
Leave this box blank for class default."/> + <expand macro="estimator_params_text" + help="Default(=blank): coef0=None, degree=None, gamma=None, kernel='rbf', kernel_params=None, n_components=100, random_state=None. No double quotes"/> </when> <when value="RBFSampler"> - <expand macro="estimator_params_text" label="Type in kernel approximater parameters:" - help="Parameters in dictionary without braces ('{}'), e.g., 'n_components': 100, 'gamma': 1.0. No double quotes. Leave this box blank for class default."/> + <expand macro="estimator_params_text" + help="Default(=blank): gamma=1.0, n_components=100, random_state=None."/> </when> <when value="AdditiveChi2Sampler"> - <expand macro="estimator_params_text" label="Type in kernel approximater parameters:" - help="Parameters in dictionary without braces ('{}'), e.g., 'sample_steps': 2, 'sample_interval': None. No double quotes. Leave this box blank for class default."/> + <expand macro="estimator_params_text" + help="Default(=blank): sample_interval=None, sample_steps=2."/> </when> <when value="SkewedChi2Sampler"> - <expand macro="estimator_params_text" label="Type in kernel approximater parameters:" - help="Parameters in dictionary without braces ('{}'), e.g., 'n_components': 100, 'skewedness': 1.0. No double quotes. 
Leave this box blank for class default."/> + <expand macro="estimator_params_text" + help="Default(=blank): n_components=100, random_state=None, skewedness=1.0."/> </when> </conditional> </xml> @@ -1406,60 +1299,56 @@ <option value="NMF">NMF</option> <option value="PCA">PCA</option> <option value="SparsePCA">SparsePCA</option> - <option value="SparseCoder">SparseCoder</option> + <!--option value="SparseCoder">SparseCoder</option--> <option value="TruncatedSVD">TruncatedSVD</option> </param> <when value="DictionaryLearning"> - <expand macro="estimator_params_text" label="Type in maxtrix decomposition parameters:" - help="Parameters in dictionary without braces ('{}'), e.g., 'n_components': None, 'alpha': 1.0. No double quotes. Leave this box blank for class default."/> + <expand macro="estimator_params_text" + help="Default(=blank): alpha=1, code_init=None, dict_init=None, fit_algorithm='lars', max_iter=1000, n_components=None, random_state=None, split_sign=False, tol=1e-08, transform_algorithm='omp', transform_alpha=None, transform_n_nonzero_coefs=None, verbose=False."/> </when> <when value="FactorAnalysis"> - <expand macro="estimator_params_text" label="Type in maxtrix decomposition parameters:" - help="Parameters in dictionary without braces ('{}'), e.g., 'n_components': 100, 'random_state': 42. No double quotes. Leave this box blank for class default."/> + <expand macro="estimator_params_text" + help="Default(=blank): copy=True, iterated_power=3, max_iter=1000, n_components=None, noise_variance_init=None, random_state=0, svd_method='randomized', tol=0.01."/> </when> <when value="FastICA"> - <expand macro="estimator_params_text" label="Type in maxtrix decomposition parameters:" - help="Parameters in dictionary without braces ('{}'), e.g., 'n_components': 100, 'random_state': 42. No double quotes. 
Leave this box blank for class default."/> + <expand macro="estimator_params_text" + help="Default(=blank): algorithm='parallel', fun='logcosh', fun_args=None, max_iter=200, n_components=None, random_state=None, tol=0.0001, w_init=None, whiten=True. No double quotes."/> </when> <when value="IncrementalPCA"> - <expand macro="estimator_params_text" label="Type in maxtrix decomposition parameters:" - help="Parameters in dictionary without braces ('{}'), e.g., 'n_components': 100, 'whiten': False. No double quotes. Leave this box blank for class default."/> + <expand macro="estimator_params_text" + help="Default(=blank): batch_size=None, copy=True, n_components=None, whiten=False."/> </when> <when value="KernelPCA"> - <expand macro="estimator_params_text" label="Type in maxtrix decomposition parameters:" - help="Parameters in dictionary without braces ('{}'), e.g., 'n_components': 100, 'random_state': 42. No double quotes. Leave this box blank for class default."/> + <expand macro="estimator_params_text" + help="Default(=blank): alpha=1.0, coef0=1, copy_X=True, degree=3, eigen_solver='auto', fit_inverse_transform=False, gamma=None, kernel='linear', kernel_params=None, max_iter=None, n_components=None, random_state=None, remove_zero_eig=False, tol=0. No double quotes."/> </when> <when value="LatentDirichletAllocation"> - <expand macro="estimator_params_text" label="Type in maxtrix decomposition parameters:" - help="Parameters in dictionary without braces ('{}'), e.g., 'n_components': 100, 'random_state': 42. No double quotes. 
Leave this box blank for class default."/> + <expand macro="estimator_params_text" + help="Default(=blank): batch_size=128, doc_topic_prior=None, evaluate_every=-1, learning_decay=0.7, learning_method=None, learning_offset=10.0, max_doc_update_iter=100, max_iter=10, mean_change_tol=0.001, n_components=10, n_topics=None, perp_tol=0.1, random_state=None, topic_word_prior=None, total_samples=1000000.0, verbose=0."/> </when> <when value="MiniBatchDictionaryLearning"> - <expand macro="estimator_params_text" label="Type in maxtrix decomposition parameters:" - help="Parameters in dictionary without braces ('{}'), e.g., 'n_components': 100, 'random_state': 42. No double quotes. Leave this box blank for class default."/> + <expand macro="estimator_params_text" + help="Default(=blank): alpha=1, batch_size=3, dict_init=None, fit_algorithm='lars', n_components=None, n_iter=1000, random_state=None, shuffle=True, split_sign=False, transform_algorithm='omp', transform_alpha=None, transform_n_nonzero_coefs=None, verbose=False."/> </when> <when value="MiniBatchSparsePCA"> - <expand macro="estimator_params_text" label="Type in maxtrix decomposition parameters:" - help="Parameters in dictionary without braces ('{}'), e.g., 'n_components': 100, 'random_state': 42. No double quotes. Leave this box blank for class default."/> + <expand macro="estimator_params_text" + help="Default(=blank): alpha=1, batch_size=3, callback=None, method='lars', n_components=None, n_iter=100, random_state=None, ridge_alpha=0.01, shuffle=True, verbose=False."/> </when> <when value="NMF"> - <expand macro="estimator_params_text" label="Type in maxtrix decomposition parameters:" - help="Parameters in dictionary without braces ('{}'), e.g., 'n_components': 100, 'init': 'random'. No double quotes. 
Leave this box blank for class default."/> + <expand macro="estimator_params_text" + help="Default(=blank): alpha=0.0, beta_loss='frobenius', init=None, l1_ratio=0.0, max_iter=200, n_components=None, random_state=None, shuffle=False, solver='cd', tol=0.0001, verbose=0."/> </when> <when value="PCA"> - <expand macro="estimator_params_text" label="Type in maxtrix decomposition parameters:" - help="Parameters in dictionary without braces ('{}'), e.g., 'n_components': 100, 'random_state': 42. No double quotes. Leave this box blank for class default."/> + <expand macro="estimator_params_text" + help="Default(=blank): copy=True, iterated_power='auto', n_components=None, random_state=None, svd_solver='auto', tol=0.0, whiten=False."/> </when> <when value="SparsePCA"> - <expand macro="estimator_params_text" label="Type in maxtrix decomposition parameters:" - help="Parameters in dictionary without braces ('{}'), e.g., 'n_components': 100, 'random_state': 42. No double quotes. Leave this box blank for class default."/> - </when> - <when value="SparseCoder"> - <expand macro="estimator_params_text" label="Type in maxtrix decomposition parameters:" - help="Parameters in dictionary without braces ('{}'), e.g., 'transform_algorithm': 'omp', 'transform_alpha': 1.0. No double quotes. Leave this box blank for class default."/> + <expand macro="estimator_params_text" + help="Default(=blank): U_init=None, V_init=None, alpha=1, max_iter=1000, method='lars', n_components=None, random_state=None, ridge_alpha=0.01, tol=1e-08, verbose=False."/> </when> <when value="TruncatedSVD"> - <expand macro="estimator_params_text" label="Type in maxtrix decomposition parameters:" - help="Parameters in dictionary without braces ('{}'), e.g., 'n_components': 2, 'algorithm': 'randomized'. No double quotes. 
Leave this box blank for default estimator."/> + <expand macro="estimator_params_text" + help="Default(=blank): algorithm='randomized', n_components=2, n_iter=5, random_state=None, tol=0.0."/> </when> </conditional> </xml> @@ -1470,8 +1359,45 @@ <option value="FeatureAgglomeration" selected="true">FeatureAgglomeration</option> </param> <when value="FeatureAgglomeration"> - <expand macro="estimator_params_text" label="Type in parameters:" - help="Parameters in dictionary without braces ('{}'), e.g., 'n_clusters': 2, 'affinity': 'euclidean'. No double quotes. Leave this box blank for class default."/> + <expand macro="estimator_params_text" + help="Default(=blank): affinity='euclidean', compute_full_tree='auto', connectivity=None, linkage='ward', memory=None, n_clusters=2, pooling_func=np.mean."/> + </when> + </conditional> + </xml> + + <xml name="skrebate"> + <conditional name="skrebate_selector"> + <param name="select_algorithm" type="select" label="Choose the algorithm:"> + <option value="ReliefF">ReliefF</option> + <option value="SURF">SURF</option> + <option value="SURFstar">SURFstar</option> + <option value="MultiSURF">MultiSURF</option> + <option value="MultiSURFstar">MultiSURFstar</option> + <option value="TuRF">TuRF</option> + </param> + <when value="ReliefF"> + <expand macro="estimator_params_text" + help="Default(=blank): discrete_threshold=10, n_features_to_select=10, n_neighbors=100, verbose=False."/> + </when> + <when value="SURF"> + <expand macro="estimator_params_text" + help="Default(=blank): discrete_threshold=10, n_features_to_select=10, verbose=False."/> + </when> + <when value="SURFstar"> + <expand macro="estimator_params_text" + help="Default(=blank): discrete_threshold=10, n_features_to_select=10, verbose=False."/> + </when> + <when value="MultiSURF"> + <expand macro="estimator_params_text" + help="Default(=blank): discrete_threshold=10, n_features_to_select=10, verbose=False."/> + </when> + <when value="MultiSURFstar"> + <expand 
macro="estimator_params_text" + help="Default(=blank): discrete_threshold=10, n_features_to_select=10, verbose=False."/> + </when> + <when value="TuRF"> + <expand macro="estimator_params_text" + help="Default(=blank): core_algorithm='ReliefF', discrete_threshold=10, n_features_to_select=10, n_neighbors=100, pct=0.5, verbose=False."/> </when> </conditional> </xml>
# utils.py -- shared helpers for the scikit-learn Galaxy tools.
#
# The tool scripts pull this file in with ``execfile`` (see ensemble.xml), so
# every top-level name defined or imported here lands in the including
# script's namespace.  For that reason no import below may be removed, even
# when this file itself does not use the name.
import sys
import os
import pandas
import re
import pickle
import warnings
import numpy as np
import xgboost
import scipy
import scipy.stats
import sklearn
import sklearn.feature_selection
import ast
from asteval import Interpreter, make_symbol_table
from scipy.io import mmread
from sklearn import metrics, model_selection, ensemble, svm, linear_model, naive_bayes, tree, neighbors

# Worker count handed to every estimator/selector that accepts ``n_jobs``;
# read from the GALAXY_SLOTS environment variable, defaulting to 1.
N_JOBS = int(os.environ.get('GALAXY_SLOTS', 1))


def read_columns(f, c=None, c_option='by_index_number', return_df=False, **args):
    """Read a delimited file with pandas and keep a subset of its columns.

    Parameters
    ----------
    f : path or file-like object passed to ``pandas.read_csv``.
    c : column selection.  For the ``*by_index_number`` options an iterable
        of integers (each is decremented by one before positional use); for
        the ``*by_header_name`` options a comma separated string of names.
    c_option : one of ``'by_index_number'``, ``'all_but_by_index_number'``,
        ``'by_header_name'``, ``'all_but_by_header_name'``.  Any other value
        keeps all columns.
    return_df : when true, also return the filtered DataFrame.
    **args : forwarded unchanged to ``pandas.read_csv``.

    Returns
    -------
    ``data.values`` of the filtered frame, or ``(values, DataFrame)`` when
    ``return_df`` is true.
    """
    data = pandas.read_csv(f, **args)

    # The four options are mutually exclusive, hence a single elif chain.
    if c_option == 'by_index_number':
        cols = [idx - 1 for idx in c]
        data = data.iloc[:, cols]
    elif c_option == 'all_but_by_index_number':
        cols = [idx - 1 for idx in c]
        data.drop(data.columns[cols], axis=1, inplace=True)
    elif c_option == 'by_header_name':
        cols = [name.strip() for name in c.split(',')]
        data = data[cols]
    elif c_option == 'all_but_by_header_name':
        cols = [name.strip() for name in c.split(',')]
        data.drop(cols, axis=1, inplace=True)

    y = data.values
    if return_df:
        return y, data
    return y


## generate an instance for one of sklearn.feature_selection classes
def feature_selector(inputs):
    """Build an (unfitted) selector from ``sklearn.feature_selection``.

    ``inputs`` is the tool's parameter dictionary.  The keys read here are
    ``selected_algorithm`` and ``options`` plus, depending on the algorithm,
    ``model_inputter``, ``estimator_selector`` or ``score_func``.

    Note that ``inputs['options']`` is modified in place (``threshold``,
    ``scoring``, ``n_jobs`` and ``cv`` may be rewritten) before being passed
    to the selector constructor as keyword arguments.
    """
    algorithm = inputs["selected_algorithm"]
    selector = getattr(sklearn.feature_selection, algorithm)
    options = inputs["options"]

    if algorithm == 'SelectFromModel':
        # An empty value or the literal string 'None' both mean "no threshold".
        if not options['threshold'] or options['threshold'] == 'None':
            options['threshold'] = None
        if inputs['model_inputter']['input_mode'] == 'prefitted':
            model_file = inputs['model_inputter']['fitted_estimator']
            # NOTE(security): unpickling executes arbitrary code; the model
            # file must come from a trusted source.
            with open(model_file, 'rb') as model_handler:
                fitted_estimator = pickle.load(model_handler)
            new_selector = selector(fitted_estimator, prefit=True, **options)
        else:
            estimator_json = inputs['model_inputter']["estimator_selector"]
            estimator = get_estimator(estimator_json)
            new_selector = selector(estimator, **options)

    elif algorithm == 'RFE':
        estimator = get_estimator(inputs["estimator_selector"])
        new_selector = selector(estimator, **options)

    elif algorithm == 'RFECV':
        options['scoring'] = get_scoring(options['scoring'])
        options['n_jobs'] = N_JOBS
        options['cv'] = get_cv(options['cv'].strip())
        estimator = get_estimator(inputs["estimator_selector"])
        new_selector = selector(estimator, **options)

    elif algorithm == "VarianceThreshold":
        new_selector = selector(**options)

    else:
        # Every remaining selector takes a scoring function as first argument.
        score_func = getattr(sklearn.feature_selection, inputs["score_func"])
        new_selector = selector(score_func, **options)

    return new_selector


def get_X_y(params, file1, file2):
    """Load the feature matrix ``X`` and the target vector ``y``.

    ``X`` comes from ``file1``: read with ``read_columns`` when the selected
    input type is ``"tabular"``, otherwise with ``scipy.io.mmread``.
    ``y`` always comes from ``file2`` through ``read_columns`` and is
    flattened with ``ravel()``.

    All settings are taken from
    ``params["selected_tasks"]["selected_algorithms"]["input_options"]``.
    """
    input_options = params["selected_tasks"]["selected_algorithms"]["input_options"]
    # Selector options for which an explicit column list is supplied.
    explicit_options = ["by_index_number", "all_but_by_index_number",
                        "by_header_name", "all_but_by_header_name"]

    if input_options["selected_input"] == "tabular":
        header = 'infer' if input_options["header1"] else None
        selector_1 = input_options["column_selector_options_1"]
        column_option = selector_1["selected_column_selector_option"]
        c = selector_1["col1"] if column_option in explicit_options else None
        X = read_columns(
            file1,
            c=c,
            c_option=column_option,
            sep='\t',
            header=header,
            parse_dates=True
        )
    else:
        X = mmread(file1)

    header = 'infer' if input_options["header2"] else None
    selector_2 = input_options["column_selector_options_2"]
    column_option = selector_2["selected_column_selector_option2"]
    c = selector_2["col2"] if column_option in explicit_options else None
    y = read_columns(
        file2,
        c=c,
        c_option=column_option,
        sep='\t',
        header=header,
        parse_dates=True
    )
    y = y.ravel()
    return X, y


class SafeEval(Interpreter):
    """``asteval.Interpreter`` with a restricted symbol table.

    The table starts from ``make_symbol_table(use_numpy=False)`` extended
    with ``np_arange`` and ``ensemble_ExtraTreesClassifier``.  Optionally the
    scipy.stats distribution objects (as ``scipy_stats_<name>``) and a fixed
    list of ``numpy.random`` members (as ``np_random_<name>``) are added.
    A handful of builtins are removed, and the interpreter is created with
    if/for/while/try/def/if-expression/assert/del/raise/print disabled.
    """

    def __init__(self, load_scipy=False, load_numpy=False):

        # File opening and other unneeded functions could be dropped
        unwanted = ['open', 'type', 'dir', 'id', 'str', 'repr']

        # Allowed symbol table. Add more if needed.
        new_syms = {
            'np_arange': getattr(np, 'arange'),
            'ensemble_ExtraTreesClassifier': getattr(ensemble, 'ExtraTreesClassifier')
        }

        syms = make_symbol_table(use_numpy=False, **new_syms)

        if load_scipy:
            # Expose only distribution *instances*, not everything in the module.
            distribution_types = (scipy.stats.rv_continuous, scipy.stats.rv_discrete)
            for key, value in scipy.stats.distributions.__dict__.items():
                if isinstance(value, distribution_types):
                    syms['scipy_stats_' + key] = value

        if load_numpy:
            from_numpy_random = ['beta', 'binomial', 'bytes', 'chisquare', 'choice', 'dirichlet', 'division',
                                 'exponential', 'f', 'gamma', 'geometric', 'gumbel', 'hypergeometric',
                                 'laplace', 'logistic', 'lognormal', 'logseries', 'mtrand', 'multinomial',
                                 'multivariate_normal', 'negative_binomial', 'noncentral_chisquare', 'noncentral_f',
                                 'normal', 'pareto', 'permutation', 'poisson', 'power', 'rand', 'randint',
                                 'randn', 'random', 'random_integers', 'random_sample', 'ranf', 'rayleigh',
                                 'sample', 'seed', 'set_state', 'shuffle', 'standard_cauchy', 'standard_exponential',
                                 'standard_gamma', 'standard_normal', 'standard_t', 'triangular', 'uniform',
                                 'vonmises', 'wald', 'weibull', 'zipf']
            for name in from_numpy_random:
                # Not every numpy release provides every name in the list;
                # skip the missing ones instead of failing at construction.
                member = getattr(np.random, name, None)
                if member is not None:
                    syms['np_random_' + name] = member

        for key in unwanted:
            syms.pop(key, None)

        super(SafeEval, self).__init__(symtable=syms, use_numpy=False, minimal=False,
                                       no_if=True, no_for=True, no_while=True, no_try=True,
                                       no_functiondef=True, no_ifexp=True, no_listcomp=False,
                                       no_augassign=False, no_assert=True, no_delete=True,
                                       no_raise=True, no_print=True)


def get_search_params(params_builder):
    """Turn the tool's search-parameter inputs into a parameter grid dict.

    Each entry of ``params_builder['param_set']`` carries a string of the
    form ``"<name>: <expression>"`` in
    ``['search_param_selector']['search_p']``; blank entries are skipped.
    The expression is evaluated with ``SafeEval`` (scipy and numpy symbols
    loaded).  The resulting key is ``estimator__<name>`` when the selected
    param type is ``"final_estimator_p"``, otherwise
    ``preprocessing_<ch>__<name>`` where ``<ch>`` is the single character at
    index 5 of the param type.

    Raises ``AssertionError`` when an entry does not contain exactly one colon.
    """
    search_params = {}
    safe_eval = SafeEval(load_scipy=True, load_numpy=True)

    for p in params_builder['param_set']:
        selector = p['search_param_selector']
        search_p = selector['search_p']
        if search_p.strip() == '':
            continue
        param_type = selector['selected_param_type']

        lst = search_p.split(":")
        assert (len(lst) == 2), "Error, make sure there is one and only one colon in search parameter input."
        name = lst[0].strip()
        ev = safe_eval(lst[1].strip())
        if param_type == "final_estimator_p":
            search_params["estimator__" + name] = ev
        else:
            search_params["preprocessing_" + param_type[5:6] + "__" + name] = ev

    return search_params


def get_estimator(estimator_json):
    """Instantiate the estimator described by ``estimator_json``.

    Keys read: ``selected_module`` (``"xgboost"`` or the name of a
    ``sklearn`` submodule), ``selected_estimator`` (class name) and
    ``text_params`` (keyword arguments as text, e.g. ``"n_estimators=10"``;
    blank keeps the class defaults).  If the estimator has an ``n_jobs``
    parameter it is always set to ``N_JOBS``.

    Exits the process via ``sys.exit`` when ``text_params`` cannot be
    evaluated into a dictionary.
    """
    estimator_module = estimator_json['selected_module']
    estimator_cls = estimator_json['selected_estimator']

    if estimator_module == "xgboost":
        cls = getattr(xgboost, estimator_cls)
    else:
        module = getattr(sklearn, estimator_module)
        cls = getattr(module, estimator_cls)

    estimator = cls()

    estimator_params = estimator_json['text_params'].strip()
    if estimator_params != "":
        # The evaluator must be created here: no ``safe_eval`` name exists at
        # module level, so relying on one raised NameError.
        safe_eval = SafeEval()
        try:
            params = safe_eval('dict(' + estimator_params + ')')
        except ValueError:
            sys.exit("Unsupported parameter input: `%s`" % estimator_params)
        # NOTE(review): asteval appears to report evaluation errors by
        # returning None rather than raising -- confirm against the pinned
        # asteval version.  Guard so the user gets the intended message.
        if not isinstance(params, dict):
            sys.exit("Unsupported parameter input: `%s`" % estimator_params)
        estimator.set_params(**params)
    if 'n_jobs' in estimator.get_params():
        estimator.set_params(n_jobs=N_JOBS)

    return estimator


def get_cv(literal):
    """Translate the textual ``cv`` option into a value scikit-learn accepts.

    * ``""``              -> ``None``
    * all digits          -> ``int(literal)``
    * ``Name(arg=val, …)`` -> instance of ``sklearn.model_selection.Name``
      built from the evaluated keyword arguments

    Anything else terminates the process via ``sys.exit``.
    """
    if literal == "":
        return None
    if literal.isdigit():
        return int(literal)
    m = re.match(r'^(?P<method>\w+)\((?P<args>.*)\)$', literal)
    if m:
        safe_eval = SafeEval()
        my_class = getattr(model_selection, m.group('method'))
        args = safe_eval('dict(' + m.group('args') + ')')
        # A failed evaluation does not yield a dict; report it as bad input
        # instead of crashing on the ``**`` expansion below.
        if not isinstance(args, dict):
            sys.exit("Unsupported CV input: %s" % literal)
        return my_class(**args)
    sys.exit("Unsupported CV input: %s" % literal)


def get_scoring(scoring_json):
    """Resolve the scoring selection into scorer object(s).

    Returns ``None`` when ``primary_scoring`` is ``"default"``; a single
    scorer when no distinct secondary scoring is requested; otherwise a dict
    with the primary scorer under ``'primary'`` and each secondary scorer
    (comma separated in ``secondary_scoring``) under its own name.

    Side effect: registers a ``'balanced_accuracy'`` scorer in
    ``sklearn.metrics.SCORERS`` when that key is missing.
    """
    def balanced_accuracy_score(y_true, y_pred):
        # Mean of per-class recall, taken from the confusion matrix.
        C = metrics.confusion_matrix(y_true, y_pred)
        with np.errstate(divide='ignore', invalid='ignore'):
            # Force true division: the confusion matrix holds integers and
            # ``/`` on integer arrays is floor division under Python 2.
            per_class = np.diag(C).astype(float) / C.sum(axis=1)
        if np.any(np.isnan(per_class)):
            warnings.warn('y_pred contains classes not in y_true')
            per_class = per_class[~np.isnan(per_class)]
        score = np.mean(per_class)
        return score

    primary = scoring_json['primary_scoring']
    if primary == "default":
        return None

    my_scorers = metrics.SCORERS
    if 'balanced_accuracy' not in my_scorers:
        my_scorers['balanced_accuracy'] = metrics.make_scorer(balanced_accuracy_score)

    secondary = scoring_json['secondary_scoring']
    if secondary != 'None' and secondary != primary:
        scoring = {}
        scoring['primary'] = my_scorers[primary]
        for scorer in secondary.split(','):
            if scorer != primary:
                scoring[scorer] = my_scorers[scorer]
        return scoring

    return my_scorers[primary]