Mercurial > repos > bgruening > sklearn_model_validation
changeset 2:dd502cb0d567 draft
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit 4ed8c4f6ef9ece81797a398b17a99bbaf49a6978
author      bgruening
date        Wed, 30 May 2018 08:27:01 -0400
parents     02eadaaa4bf7
children    424d8d21744d
files       main_macros.xml model_validation.xml test-data/mv_result07.tabular
diffstat    3 files changed, 164 insertions(+), 60 deletions(-)
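At a glance, the changeset below makes the model_validation tool assemble a scikit-learn Pipeline (an optional feature-selection step followed by the estimator) and pass that pipeline, rather than the bare estimator, to the selected validation function; it also adds a GridSearchCV option whose param_grid addresses the pipeline steps through the feature_selector__ and estimator__ prefixes. A minimal standalone sketch of that pattern follows, with made-up data and parameter values rather than the tool's actual Galaxy inputs:

# Standalone sketch of the pipeline + GridSearchCV pattern introduced by this
# changeset; the data, estimator, and grid values below are illustrative assumptions.
import numpy as np
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.model_selection import GridSearchCV, cross_validate
from sklearn.pipeline import Pipeline
from sklearn.svm import SVR

X = np.random.rand(100, 17)
y = np.random.rand(100)

# Optional feature selector followed by the estimator, mirroring
# pipeline_steps = [('feature_selector', ...), ('estimator', ...)].
pipeline = Pipeline([
    ('feature_selector', SelectKBest(f_regression)),
    ('estimator', SVR(kernel='linear')),
])

# The whole pipeline is handed to a validator such as cross_validate ...
scores = cross_validate(pipeline, X, y, cv=5)

# ... or to GridSearchCV, where step-name prefixes route each parameter
# to the matching pipeline step (as in the tool's new param_grid default).
param_grid = [{'feature_selector__k': [3, 5, 7, 9], 'estimator__C': [1, 10, 100, 1000]}]
grid = GridSearchCV(pipeline, param_grid, cv=5)
grid.fit(X, y)
print(grid.best_score_, grid.best_params_)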
--- a/main_macros.xml	Tue May 22 19:33:14 2018 -0400
+++ b/main_macros.xml	Wed May 30 08:27:01 2018 -0400
@@ -16,6 +16,47 @@
     return y
 </token>
 
+## generate an instance for one of sklearn.feature_selection classes
+## must call "@COLUMNS_FUNCTION@"
+  <token name="@FEATURE_SELECTOR_FUNCTION@">
+def feature_selector(inputs):
+    selector = inputs["selected_algorithm"]
+    selector = getattr(sklearn.feature_selection, selector)
+    options = inputs["options"]
+
+    if inputs['selected_algorithm'] == 'SelectFromModel':
+        if not options['threshold'] or options['threshold'] == 'None':
+            options['threshold'] = None
+        if 'extra_estimator' in inputs and inputs['extra_estimator']['has_estimator'] == 'no_load':
+            fitted_estimator = pickle.load(open("inputs['extra_estimator']['fitted_estimator']", 'r'))
+            new_selector = selector(fitted_estimator, prefit=True, **options)
+        else:
+            estimator=inputs["estimator"]
+            if inputs["extra_estimator"]["has_estimator"]=='no':
+                estimator=inputs["extra_estimator"]["new_estimator"]
+            estimator=eval(estimator.replace('__dq__', '"').replace("__sq__","'"))
+            new_selector = selector(estimator, **options)
+
+    elif inputs['selected_algorithm'] in ['RFE', 'RFECV']:
+        if 'scoring' in options and (not options['scoring'] or options['scoring'] == 'None'):
+            options['scoring'] = None
+        estimator=inputs["estimator"]
+        if inputs["extra_estimator"]["has_estimator"]=='no':
+            estimator=inputs["extra_estimator"]["new_estimator"]
+        estimator=eval(estimator.replace('__dq__', '"').replace("__sq__","'"))
+        new_selector = selector(estimator, **options)
+
+    elif inputs['selected_algorithm'] == "VarianceThreshold":
+        new_selector = selector(**options)
+
+    else:
+        score_func = inputs["score_func"]
+        score_func = getattr(sklearn.feature_selection, score_func)
+        new_selector = selector(score_func, **options)
+
+    return new_selector
+  </token>
+
   <xml name="python_requirements">
     <requirements>
       <requirement type="package" version="2.7">python</requirement>
@@ -794,6 +835,13 @@
     </when>
     <yield/>
   </xml>
+  <xml name="estimator_input_no_fit">
+    <expand macro="feature_selection_estimator" />
+    <conditional name="extra_estimator">
+      <expand macro="feature_selection_extra_estimator" />
+      <expand macro="feature_selection_estimator_choices" />
+    </conditional>
+  </xml>
   <xml name="feature_selection_all">
     <conditional name="feature_selection_algorithms">
       <param name="selected_algorithm" type="select" label="Select a feature selection algorithm">
@@ -975,8 +1023,8 @@
     <param argument="scoring" type="text" value="" optional="true" label="scoring" help="A metric used to evaluate the estimator"/>
   </xml>
 
-  <xml name="pre_dispatch">
-    <param argument="pre_dispatch" type="text" value="all" optional="true" label="pre_dispatch" help="Number of predispatched jobs for parallel execution"/>
+  <xml name="pre_dispatch" token_type="text" token_default_value="all" token_help="Number of predispatched jobs for parallel execution">
+    <param argument="pre_dispatch" type="@TYPE@" value="@DEFAULT_VALUE@" optional="true" label="pre_dispatch" help="@HELP@"/>
   </xml>
 
   <!-- Outputs -->
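The @FEATURE_SELECTOR_FUNCTION@ token added above resolves the selected class name against sklearn.feature_selection via getattr and instantiates it with the user-supplied options, falling back to a score-function based selector such as SelectKBest. A hedged sketch of that dispatch, using a made-up inputs dict rather than the tool's exact JSON structure:

# Sketch of the getattr-based dispatch performed by feature_selector();
# the 'inputs' dict is an illustrative stand-in for the Galaxy JSON params.
import sklearn.feature_selection

inputs = {
    "selected_algorithm": "SelectKBest",
    "score_func": "chi2",
    "options": {"k": 3},
}

selector_cls = getattr(sklearn.feature_selection, inputs["selected_algorithm"])
score_func = getattr(sklearn.feature_selection, inputs["score_func"])
new_selector = selector_cls(score_func, **inputs["options"])  # equivalent to SelectKBest(chi2, k=3)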
--- a/model_validation.xml	Tue May 22 19:33:14 2018 -0400
+++ b/model_validation.xml	Wed May 30 08:27:01 2018 -0400
@@ -18,13 +18,17 @@
 import sys
 import json
 import pandas
+import ast
 import pickle
 import numpy as np
 import sklearn.model_selection
 from sklearn import svm, linear_model, ensemble
+from sklearn.pipeline import Pipeline
 
 @COLUMNS_FUNCTION@
 
+@FEATURE_SELECTOR_FUNCTION@
+
 input_json_path = sys.argv[1]
 params = json.load(open(input_json_path, "r"))
 
@@ -51,50 +55,90 @@
 )
 y=y.ravel()
 
-validator = params["model_validation_functions"]["selected_function"]
-validator = getattr(sklearn.model_selection, validator)
 options = params["model_validation_functions"]["options"]
 if 'scoring' in options and options['scoring'] == '':
     options['scoring'] = None
+if 'pre_dispatch' in options and options['pre_dispatch'] == '':
+    options['pre_dispatch'] = None
+
+pipeline_steps = []
+
+## Set up feature selector and add to pipeline steps.
+if params['feature_selection']['do_feature_selection'] == 'Yes':
+    feature_selector = feature_selector(params['feature_selection']['feature_selection_algorithms'])
+    pipeline_steps.append( ('feature_selector', feature_selector) )
+
+## Set up estimator and add to pipeline.
 estimator=params["model_validation_functions"]["estimator"]
 if params["model_validation_functions"]["extra_estimator"]["has_estimator"] == 'no':
     estimator = params["model_validation_functions"]["extra_estimator"]["new_estimator"]
 estimator = eval(estimator.replace('__dq__', '"').replace("__sq__","'"))
 
-#if $model_validation_functions.selected_function == 'cross_validate':
-res = validator(estimator, X, y, **options)
-rval = res["$model_validation_functions.return_type"]
+pipeline_steps.append( ('estimator', estimator) )
+
+pipeline = Pipeline(pipeline_steps)
+
+## Set up validator, run pipeline through validator and return results.
-#elif $model_validation_functions.selected_function == 'learning_curve':
-options['train_sizes'] = eval(options['train_sizes'])
-train_sizes_abs, train_scores, test_scores = validator(estimator, X, y, **options)
-rval = eval("$model_validation_functions.return_type")
+validator = params["model_validation_functions"]["selected_function"]
+validator = getattr(sklearn.model_selection, validator)
+
+selected_function = params["model_validation_functions"]["selected_function"]
+rval_type = params["model_validation_functions"].get("return_type", None)
 
-#elif $model_validation_functions.selected_function == 'permutation_test_score':
-score, permutation_scores, pvalue = validator(estimator, X, y, **options)
-rval = eval("$model_validation_functions.return_type")
-if "$model_validation_functions.return_type" in ["score", "pvalue"]:
-    rval = [rval]
-
-#elif $model_validation_functions.selected_function == 'validation_curve':
-options['param_range'] = eval(options['param_range'])
-train_scores, test_scores = validator(estimator, X, y, **options)
-rval = eval("$model_validation_functions.return_type")
-
-#else:
-rval = validator(estimator, X, y, **options)
-#end if
+if selected_function == 'cross_validate':
+    res = validator(pipeline, X, y, **options)
+    rval = res[rval_type]
+elif selected_function == 'learning_curve':
+    options['train_sizes'] = eval(options['train_sizes'])
+    train_sizes_abs, train_scores, test_scores = validator(pipeline, X, y, **options)
+    rval = eval(rval_type)
+elif selected_function == 'permutation_test_score':
+    score, permutation_scores, pvalue = validator(pipeline, X, y, **options)
+    rval = eval(rval_type)
+    if rval_type in ["score", "pvalue"]:
+        rval = [rval]
+elif selected_function == 'validation_curve':
+    options['param_name'] = 'estimator__' + options['param_name']
+    options['param_range'] = eval(options['param_range'])
+    train_scores, test_scores = validator(pipeline, X, y, **options)
+    rval = eval(rval_type)
+elif selected_function == 'GridSearchCV':
+    param_grid = params["model_validation_functions"]["param_grid"].replace("__sq__","'")\
+        .replace('__dq__','"').replace("__oc__", "{").replace("__cc__", "}")\
+        .replace("__ob__", "[").replace("__cb__", "]")
+    param_grid = ast.literal_eval(param_grid)
+    grid = validator(pipeline, param_grid, **options)
+    grid.fit(X, y)
+    rval = getattr(grid, rval_type)
+    if rval_type in ["best_estimator_", "best_score_", "best_index_"]:
+        rval = [rval]
+else:
+    rval = validator(pipeline, X, y, **options)
 
 rval = pandas.DataFrame(rval)
-rval.to_csv(path_or_buf="$outfile", sep='\t', header=False, index=False)
+if rval_type and rval_type == "cv_results_":
+    rval.to_csv(path_or_buf="$outfile", sep='\t', header=True, index=False)
+else:
+    rval.to_csv(path_or_buf="$outfile", sep='\t', header=False, index=False)
 ]]>
     </configfile>
   </configfiles>
 
   <inputs>
+    <conditional name="feature_selection">
+      <param name="do_feature_selection" type="select" label="Do feature selection?">
+        <option value="No" selected="true"/>
+        <option value="Yes"/>
+      </param>
+      <when value="No"/>
+      <when value="Yes">
+        <expand macro="feature_selection_all"/>
+      </when>
+    </conditional>
    <conditional name="model_validation_functions">
       <param name="selected_function" type="select" label="Select a model validation function">
+        <option value="GridSearchCV">GridSearchCV - Exhaustive search over specified parameter values for an estimator</option>
        <option value="cross_validate">cross_validate - Evaluate metric(s) by cross-validation and also record fit/score times</option>
        <option value="cross_val_predict">cross_val_predict - Generate cross-validated estimates for each input data point</option>
        <option value="cross_val_score">cross_val_score - Evaluate a score by cross-validation</option>
@@ -102,12 +146,28 @@
        <option value="permutation_test_score">permutation_test_score - Evaluate the significance of a cross-validated score with permutations</option>
        <option value="validation_curve">validation_curve - Validation curve</option>
      </param>
+      <when value="GridSearchCV">
+        <expand macro="estimator_input_no_fit" />
+        <param argument="param_grid" type="text" value="[{'feature_selector__k': [3, 5, 7, 9], 'estimator__C': [1, 10, 100, 1000]}]" label="param_grid" help="Dictionary with parameters names (string) as keys and lists of parameter settings to try as values, or a list of such dictionaries, in which case the grids spanned by each dictionary in the list are explored"/>
+        <section name="options" title="Other Options" expanded="false">
+          <expand macro="scoring"/>
+          <expand macro="model_validation_common_options"/>
+          <expand macro="pre_dispatch" value="2*n_jobs" help="Controls the number of jobs that get dispatched during parallel execution"/>
+          <param argument="iid" type="boolean" truevalue="booltrue" falsevalue="boolfalse" checked="true" label="iid" help="Data is identically distributed?"/>
+          <param argument="refit" type="boolean" truevalue="booltrue" falsevalue="boolfalse" checked="true" label="refit" help="Refit an estimator using the best found parameters on the whole dataset."/>
+          <!--error_score-->
+          <param argument="return_train_score" type="boolean" truevalue="booltrue" falsevalue="boolfalse" checked="false" label="return_train_score" help=""/>
+        </section>
+        <param name="return_type" type="select" label="Select a return type">
+          <option value="cv_results_" selected="true">cv_results_</option>
+          <option value="best_estimator_">best_estimator_</option>
+          <option value="best_score_">best_score_</option>
+          <option value="best_params_">best_params_</option>
+          <option value="best_index_">best_index_</option>
+        </param>
+      </when>
      <when value="cross_validate">
-        <expand macro="feature_selection_estimator" />
-        <conditional name="extra_estimator">
-          <expand macro="feature_selection_extra_estimator" />
-          <expand macro="feature_selection_estimator_choices" />
-        </conditional>
+        <expand macro="estimator_input_no_fit" />
        <section name="options" title="Other Options" expanded="false">
          <!--groups-->
          <expand macro="model_validation_common_options"/>
@@ -123,18 +183,12 @@
        </param>
      </when>
      <when value="cross_val_predict">
-        <expand macro="feature_selection_estimator" />
-        <conditional name="extra_estimator">
-          <expand macro="feature_selection_extra_estimator" />
-          <expand macro="feature_selection_estimator_choices" />
-        </conditional>
+        <expand macro="estimator_input_no_fit" />
        <section name="options" title="Other Options" expanded="false">
          <!--groups-->
-          <param argument="cv" type="integer" value="" optional="true" label="cv" help="The number of folds in a (Stratified)KFold" />
-          <expand macro="n_jobs"/>
-          <expand macro="verbose"/>
+          <expand macro="model_validation_common_options" />
          <!--fit_params-->
-          <param argument="pre_dispatch" type="integer" value="" optional="true" label="pre_dispatch" help="Controls the number of jobs that get dispatched during parallel execution" />
+          <expand macro="pre_dispatch" value="2*n_jobs" help="Controls the number of jobs that get dispatched during parallel execution"/>
          <param argument="method" type="select" label="Invokes the passed method name of the passed estimator">
            <option value="predict" selected="true">predict</option>
            <option value="predict_proba">predict_proba</option>
@@ -142,11 +196,7 @@
        </section>
      </when>
      <when value="cross_val_score">
-        <expand macro="feature_selection_estimator" />
-        <conditional name="extra_estimator">
-          <expand macro="feature_selection_extra_estimator" />
-          <expand macro="feature_selection_estimator_choices" />
-        </conditional>
+        <expand macro="estimator_input_no_fit" />
        <section name="options" title="Other Options" expanded="false">
          <!--groups-->
          <expand macro="model_validation_common_options"/>
@@ -156,11 +206,7 @@
        </section>
      </when>
      <when value="learning_curve">
-        <expand macro="feature_selection_estimator" />
-        <conditional name="extra_estimator">
-          <expand macro="feature_selection_extra_estimator" />
-          <expand macro="feature_selection_estimator_choices" />
-        </conditional>
+        <expand macro="estimator_input_no_fit" />
        <section name="options" title="Other Options" expanded="false">
          <!--groups-->
          <expand macro="model_validation_common_options"/>
@@ -178,11 +224,7 @@
        </param>
      </when>
      <when value="permutation_test_score">
-        <expand macro="feature_selection_estimator" />
-        <conditional name="extra_estimator">
-          <expand macro="feature_selection_extra_estimator" />
-          <expand macro="feature_selection_estimator_choices" />
-        </conditional>
+        <expand macro="estimator_input_no_fit" />
        <section name="options" title="Other Options" expanded="false">
          <!--groups-->
          <expand macro="model_validation_common_options"/>
@@ -197,11 +239,7 @@
        </param>
      </when>
      <when value="validation_curve">
-        <expand macro="feature_selection_estimator" />
-        <conditional name="extra_estimator">
-          <expand macro="feature_selection_extra_estimator" />
-          <expand macro="feature_selection_estimator_choices" />
-        </conditional>
+        <expand macro="estimator_input_no_fit" />
        <section name="options" title="Other Options" expanded="false">
          <param name="param_name" type="text" value="gamma" label="param_name" help="Name of the parameter that will be varied"/>
          <param name="param_range" type="text" value="np.logspace(-6, -1, 5)" label="param_range" help="The values of the parameter that will be evaluated."/>
@@ -287,6 +325,23 @@
      <param name="return_type" value="test_scores"/>
      <output name="outfile" file="mv_result06.tabular"/>
    </test>
+    <test>
+      <param name="do_feature_selection" value="Yes"/>
+      <param name="selected_algorithm" value="SelectKBest"/>
+      <param name="score_func" value="chi2"/>
+      <param name="selected_function" value="GridSearchCV"/>
+      <param name="estimator" value="svm.SVR(kernel=&quot;linear&quot;)"/>
+      <param name="has_estimator" value="yes"/>
+      <param name="param_grid" value="[{'feature_selector__k': [3, 7], 'estimator__C': [1, 100]}]"/>
+      <param name="return_type" value="best_score_"/>
+      <param name="infile1" value="regression_X.tabular" ftype="tabular"/>
+      <param name="header1" value="true" />
+      <param name="col1" value="1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17"/>
+      <param name="infile2" value="regression_y.tabular" ftype="tabular"/>
+      <param name="header2" value="true" />
+      <param name="col2" value="1"/>
+      <output name="outfile" file="mv_result07.tabular"/>
+    </test>
  </tests>
  <help>
    <![CDATA[