Mercurial > repos > bgruening > sklearn_searchcv
comparison search_model_validation.xml @ 3:f9fea8323bcb draft
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit d00173591e4a783a4c1cb2664e4bb192ab5414f7
author | bgruening |
---|---|
date | Fri, 17 Aug 2018 12:26:17 -0400 |
parents | 907bb0418c9f |
children | 2e6540c11251 |
comparison
equal
deleted
inserted
replaced
2:907bb0418c9f | 3:f9fea8323bcb |
---|---|
2 <description>using exhaustive or randomized search</description> | 2 <description>using exhaustive or randomized search</description> |
3 <macros> | 3 <macros> |
4 <import>main_macros.xml</import> | 4 <import>main_macros.xml</import> |
5 </macros> | 5 </macros> |
6 <expand macro="python_requirements"> | 6 <expand macro="python_requirements"> |
7 <requirement type="package" version="0.9.12">asteval</requirement> | 7 <requirement type="package" version="0.6">skrebate</requirement> |
8 </expand> | 8 </expand> |
9 <expand macro="macro_stdio"/> | 9 <expand macro="macro_stdio"/> |
10 <version_command>echo "@VERSION@"</version_command> | 10 <version_command>echo "@VERSION@"</version_command> |
11 <command> | 11 <command> |
12 <![CDATA[ | 12 <![CDATA[ |
16 <configfiles> | 16 <configfiles> |
17 <inputs name="inputs" /> | 17 <inputs name="inputs" /> |
18 <configfile name="sklearn_search_model_validation_script"> | 18 <configfile name="sklearn_search_model_validation_script"> |
19 <![CDATA[ | 19 <![CDATA[ |
20 import sys | 20 import sys |
21 import os | |
21 import json | 22 import json |
22 import pandas | 23 import pandas |
23 import re | |
24 import pickle | 24 import pickle |
25 import numpy as np | 25 from sklearn import model_selection |
26 import xgboost | 26 from sklearn.exceptions import FitFailedWarning |
27 import scipy | 27 |
28 from asteval import Interpreter, make_symbol_table | 28 execfile("$__tool_directory__/utils.py") |
29 from sklearn import metrics, preprocessing, model_selection, ensemble | 29 |
30 from sklearn.pipeline import Pipeline | 30 warnings.simplefilter('ignore') |
31 | |
32 @COLUMNS_FUNCTION@ | |
33 @GET_ESTIMATOR_FUNCTION@ | |
34 @SAFE_EVAL_FUNCTION@ | |
35 @GET_SEARCH_PARAMS_FUNCTION@ | |
36 @GET_CV_FUNCTION@ | |
37 | 31 |
38 input_json_path = sys.argv[1] | 32 input_json_path = sys.argv[1] |
39 with open(input_json_path, "r") as param_handler: | 33 with open(input_json_path, "r") as param_handler: |
40 params = json.load(param_handler) | 34 params = json.load(param_handler) |
41 | 35 |
86 optimizers = params["search_schemes"]["selected_search_scheme"] | 80 optimizers = params["search_schemes"]["selected_search_scheme"] |
87 optimizers = getattr(model_selection, optimizers) | 81 optimizers = getattr(model_selection, optimizers) |
88 | 82 |
89 options = params["search_schemes"]["options"] | 83 options = params["search_schemes"]["options"] |
90 options['cv'] = get_cv( options['cv'].strip() ) | 84 options['cv'] = get_cv( options['cv'].strip() ) |
91 if 'scoring' in options and options['scoring'] == '': | 85 options['n_jobs'] = N_JOBS |
92 options['scoring'] = None | 86 options['scoring'] = get_scoring(options['scoring']) |
87 if options['error_score']: | |
88 options['error_score'] = 'raise' | |
89 else: | |
90 options['error_score'] = 0 | |
91 if options['refit'] and isinstance(options['scoring'], dict): | |
92 options['refit'] = 'primary' | |
93 if 'pre_dispatch' in options and options['pre_dispatch'] == '': | 93 if 'pre_dispatch' in options and options['pre_dispatch'] == '': |
94 options['pre_dispatch'] = None | 94 options['pre_dispatch'] = None |
95 | 95 |
96 with open(infile_pipeline, 'rb') as pipeline_handler: | 96 with open(infile_pipeline, 'rb') as pipeline_handler: |
97 pipeline = pickle.load(pipeline_handler) | 97 pipeline = pickle.load(pipeline_handler) |
98 search_params = get_search_params(params_builder) | 98 search_params = get_search_params(params_builder) |
99 searcher = optimizers(pipeline, search_params, **options) | 99 searcher = optimizers(pipeline, search_params, **options) |
100 | 100 |
101 searcher.fit(X, y) | 101 warnings.simplefilter('always', FitFailedWarning) |
102 with warnings.catch_warnings(record=True) as w: | |
103 try: | |
104 searcher.fit(X, y) | |
105 except ValueError: | |
106 pass | |
107 for warning in w: | |
108 print(repr(warning.message)) | |
102 | 109 |
103 cv_result = pandas.DataFrame(searcher.cv_results_) | 110 cv_result = pandas.DataFrame(searcher.cv_results_) |
104 cv_result.to_csv(path_or_buf=outfile_result, sep='\t', header=True, index=False) | 111 cv_result.to_csv(path_or_buf=outfile_result, sep='\t', header=True, index=False) |
105 | 112 |
106 #if $save: | 113 #if $save: |
148 <conditional name="search_param_selector"> | 155 <conditional name="search_param_selector"> |
149 <param name="search_p" value="C: [1, 10, 100, 1000]"/> | 156 <param name="search_p" value="C: [1, 10, 100, 1000]"/> |
150 <param name="selected_param_type" value="final_estimator_p"/> | 157 <param name="selected_param_type" value="final_estimator_p"/> |
151 </conditional> | 158 </conditional> |
152 <conditional name="search_param_selector"> | 159 <conditional name="search_param_selector"> |
160 <param name="search_p" value="k: [-1, 3, 5, 7, 9]"/> | |
161 <param name="selected_param_type" value="prep_2_p"/> | |
162 </conditional> | |
163 <param name="error_score" value="false"/> | |
164 <param name="infile1" value="regression_X.tabular" ftype="tabular"/> | |
165 <param name="header1" value="true" /> | |
166 <param name="selected_column_selector_option" value="all_columns"/> | |
167 <param name="infile2" value="regression_y.tabular" ftype="tabular"/> | |
168 <param name="header2" value="true" /> | |
169 <param name="selected_column_selector_option2" value="all_columns"/> | |
170 <output name="outfile_result" > | |
171 <assert_contents> | |
172 <has_text_matching expression="[^/d]+0.7938837807353147[^/d]+{u'estimator__C': 1, u'preprocessing_2__k': 9}[^/d]+1" /> | |
173 <has_text text="0.0"/> | |
174 </assert_contents> | |
175 </output> | |
176 </test> | |
177 <test> | |
178 <param name="selected_search_scheme" value="RandomizedSearchCV"/> | |
179 <param name="infile_pipeline" value="pipeline01"/> | |
180 <conditional name="search_param_selector"> | |
181 <param name="search_p" value="C: [1, 10, 100, 1000]"/> | |
182 <param name="selected_param_type" value="final_estimator_p"/> | |
183 </conditional> | |
184 <conditional name="search_param_selector"> | |
185 <param name="search_p" value="kernel: ['linear', 'poly', 'rbf', 'sigmoid']"/> | |
186 <param name="selected_param_type" value="final_estimator_p"/> | |
187 </conditional> | |
188 <conditional name="search_param_selector"> | |
153 <param name="search_p" value="k: [3, 5, 7, 9]"/> | 189 <param name="search_p" value="k: [3, 5, 7, 9]"/> |
154 <param name="selected_param_type" value="prep_2_p"/> | 190 <param name="selected_param_type" value="prep_2_p"/> |
155 </conditional> | 191 </conditional> |
156 <param name="infile1" value="regression_X.tabular" ftype="tabular"/> | |
157 <param name="header1" value="true" /> | |
158 <param name="selected_column_selector_option" value="all_columns"/> | |
159 <param name="infile2" value="regression_y.tabular" ftype="tabular"/> | |
160 <param name="header2" value="true" /> | |
161 <param name="selected_column_selector_option2" value="all_columns"/> | |
162 <output name="outfile_result" > | |
163 <assert_contents> | |
164 <has_text_matching expression="[^/d]+0.7938837807353147[^/d]+{u'estimator__C': 1, u'preprocessing_2__k': 9}[^/d]+1" /> | |
165 </assert_contents> | |
166 </output> | |
167 </test> | |
168 <test> | |
169 <param name="selected_search_scheme" value="RandomizedSearchCV"/> | |
170 <param name="infile_pipeline" value="pipeline01"/> | |
171 <conditional name="search_param_selector"> | |
172 <param name="search_p" value="C: [1, 10, 100, 1000]"/> | |
173 <param name="selected_param_type" value="final_estimator_p"/> | |
174 </conditional> | |
175 <conditional name="search_param_selector"> | |
176 <param name="search_p" value="kernel: ['linear', 'poly', 'rbf', 'sigmoid']"/> | |
177 <param name="selected_param_type" value="final_estimator_p"/> | |
178 </conditional> | |
179 <conditional name="search_param_selector"> | |
180 <param name="search_p" value="k: [3, 5, 7, 9]"/> | |
181 <param name="selected_param_type" value="prep_2_p"/> | |
182 </conditional> | |
183 <conditional name="search_param_selector"> | 192 <conditional name="search_param_selector"> |
184 <param name="search_p" value="with_centering: [True, False]"/> | 193 <param name="search_p" value="with_centering: [True, False]"/> |
185 <param name="selected_param_type" value="prep_1_p"/> | 194 <param name="selected_param_type" value="prep_1_p"/> |
186 </conditional> | 195 </conditional> |
187 <param name="infile1" value="regression_X.tabular" ftype="tabular"/> | 196 <param name="infile1" value="regression_X.tabular" ftype="tabular"/> |
207 <conditional name="search_param_selector"> | 216 <conditional name="search_param_selector"> |
208 <param name="search_p" value="max_depth: scipy_stats_randint(1, 51)"/> | 217 <param name="search_p" value="max_depth: scipy_stats_randint(1, 51)"/> |
209 <param name="selected_param_type" value="final_estimator_p"/> | 218 <param name="selected_param_type" value="final_estimator_p"/> |
210 </conditional> | 219 </conditional> |
211 <conditional name="search_param_selector"> | 220 <conditional name="search_param_selector"> |
212 <param name="search_p" value="gamma: np_random_uniform(low=0., high=1., size=2)"/> | 221 <param name="search_p" value="gamma: scipy_stats_uniform(0., 1.)"/> |
213 <param name="selected_param_type" value="final_estimator_p"/> | 222 <param name="selected_param_type" value="final_estimator_p"/> |
214 </conditional> | 223 </conditional> |
215 <conditional name="search_param_selector"> | 224 <conditional name="search_param_selector"> |
216 <param name="search_p" value="random_state: [324089]"/> | 225 <param name="search_p" value="random_state: [324089]"/> |
217 <param name="selected_param_type" value="final_estimator_p"/> | 226 <param name="selected_param_type" value="final_estimator_p"/> |
360 <param name="selected_column_selector_option" value="all_columns"/> | 369 <param name="selected_column_selector_option" value="all_columns"/> |
361 <param name="infile2" value="regression_y.tabular" ftype="tabular"/> | 370 <param name="infile2" value="regression_y.tabular" ftype="tabular"/> |
362 <param name="header2" value="true" /> | 371 <param name="header2" value="true" /> |
363 <param name="selected_column_selector_option2" value="all_columns"/> | 372 <param name="selected_column_selector_option2" value="all_columns"/> |
364 <output name="outfile_estimator" file="searchCV01" compare="sim_size" delta="1"/> | 373 <output name="outfile_estimator" file="searchCV01" compare="sim_size" delta="1"/> |
374 </test> | |
375 <test> | |
376 <param name="selected_search_scheme" value="GridSearchCV"/> | |
377 <param name="infile_pipeline" value="pipeline03"/> | |
378 <conditional name="search_param_selector"> | |
379 <param name="search_p" value="n_estimators: [10, 50, 200, 1000]"/> | |
380 <param name="selected_param_type" value="final_estimator_p"/> | |
381 </conditional> | |
382 <conditional name="search_param_selector"> | |
383 <param name="search_p" value="random_state: [324089]"/> | |
384 <param name="selected_param_type" value="final_estimator_p"/> | |
385 </conditional> | |
386 <param name="primary_scoring" value="balanced_accuracy"/> | |
387 <param name="cv" value="StratifiedKFold(n_splits=3, shuffle=True, random_state=10)"/> | |
388 <param name="infile1" value="regression_X.tabular" ftype="tabular"/> | |
389 <param name="header1" value="true" /> | |
390 <param name="selected_column_selector_option" value="all_columns"/> | |
391 <param name="infile2" value="regression_y.tabular" ftype="tabular"/> | |
392 <param name="header2" value="true" /> | |
393 <param name="selected_column_selector_option2" value="all_columns"/> | |
394 <output name="outfile_result" > | |
395 <assert_contents> | |
396 <has_n_columns n="13" /> | |
397 <has_text text="0.05366527890058046"/> | |
398 </assert_contents> | |
399 </output> | |
400 </test> | |
401 <test> | |
402 <param name="selected_search_scheme" value="GridSearchCV"/> | |
403 <param name="infile_pipeline" value="pipeline09"/> | |
404 <conditional name="search_param_selector"> | |
405 <param name="search_p" value="n_neighbors: [50, 100, 150, 200]"/> | |
406 <param name="selected_param_type" value="prep_1_p"/> | |
407 </conditional> | |
408 <conditional name="search_param_selector"> | |
409 <param name="search_p" value="random_state: [324089]"/> | |
410 <param name="selected_param_type" value="final_estimator_p"/> | |
411 </conditional> | |
412 <param name="primary_scoring" value="explained_variance"/> | |
413 <param name="secondary_scoring" value="neg_mean_squared_error,r2"/> | |
414 <param name="cv" value="StratifiedKFold(n_splits=3, shuffle=True, random_state=10)"/> | |
415 <param name="infile1" value="regression_X.tabular" ftype="tabular"/> | |
416 <param name="header1" value="true" /> | |
417 <param name="selected_column_selector_option" value="all_columns"/> | |
418 <param name="infile2" value="regression_y.tabular" ftype="tabular"/> | |
419 <param name="header2" value="true" /> | |
420 <param name="selected_column_selector_option2" value="all_columns"/> | |
421 <output name="outfile_result" > | |
422 <assert_contents> | |
423 <has_n_columns n="25" /> | |
424 <has_text text="0.7881203921915186"/> | |
425 <has_text text="0.7880692034558879"/> | |
426 <has_text text="-29.381892762877825"/> | |
427 </assert_contents> | |
428 </output> | |
365 </test> | 429 </test> |
366 </tests> | 430 </tests> |
367 <help> | 431 <help> |
368 <![CDATA[ | 432 <![CDATA[ |
369 **What it does** | 433 **What it does** |
371 please refer to `Scikit-learn model_selection GridSearchCV`_, `Scikit-learn model_selection RandomizedSearchCV`_ and `Tuning hyper-parameters`_. | 435 please refer to `Scikit-learn model_selection GridSearchCV`_, `Scikit-learn model_selection RandomizedSearchCV`_ and `Tuning hyper-parameters`_. |
372 | 436 |
373 **How to choose search parameters?** | 437 **How to choose search parameters?** |
374 | 438 |
375 Please refer to `svm`_, `linear_model`_, `ensemble`_, `naive_bayes`_, `tree`_, `neighbors`_ and `xgboost`_ for estimator parameters. | 439 Please refer to `svm`_, `linear_model`_, `ensemble`_, `naive_bayes`_, `tree`_, `neighbors`_ and `xgboost`_ for estimator parameters. |
376 Refer to `sklearn.preprocessing`_, `feature_selection`_, `decomposition`_, `kernel_approximation`_ and `cluster.FeatureAgglomeration`_ for parameters in the pre-processing steps. | 440 Refer to `sklearn.preprocessing`_, `feature_selection`_, `decomposition`_, `kernel_approximation`_, `cluster.FeatureAgglomeration`_ and `skrebate`_ for parameters in the pre-processing steps. |
377 | 441 |
378 **Search parameter input** accepts a parameter and its setting as a key:value pair, one pair per input box. The setting can be a list, a numpy array, or a distribution. | 442 **Search parameter input** accepts a parameter and its setting as a key:value pair, one pair per input box. The setting can be a list, a numpy array, or a distribution. |
379 The evaluation of settings supports operations in Math, list comprehension, numpy.arange(np_arange), most numpy.random(e.g., np_random_uniform) and some scipy.stats(e.g., scipy_stats_zipf) classes or functions, and others. | 443 The evaluation of settings supports operations in Math, list comprehension, numpy.arange(np_arange), most numpy.random(e.g., np_random_uniform) and some scipy.stats(e.g., scipy_stats_zipf) classes or functions, and others. |
380 | 444 |
381 **Examples:** | 445 **Examples:** |
408 .. _`sklearn.preprocessing`: http://scikit-learn.org/stable/modules/classes.html#module-sklearn.preprocessing | 472 .. _`sklearn.preprocessing`: http://scikit-learn.org/stable/modules/classes.html#module-sklearn.preprocessing |
409 .. _`feature_selection`: http://scikit-learn.org/stable/modules/classes.html#module-sklearn.feature_selection | 473 .. _`feature_selection`: http://scikit-learn.org/stable/modules/classes.html#module-sklearn.feature_selection |
410 .. _`decomposition`: http://scikit-learn.org/stable/modules/classes.html#module-sklearn.decomposition | 474 .. _`decomposition`: http://scikit-learn.org/stable/modules/classes.html#module-sklearn.decomposition |
411 .. _`kernel_approximation`: http://scikit-learn.org/stable/modules/classes.html#module-sklearn.kernel_approximation | 475 .. _`kernel_approximation`: http://scikit-learn.org/stable/modules/classes.html#module-sklearn.kernel_approximation |
412 .. _`cluster.FeatureAgglomeration`: http://scikit-learn.org/stable/modules/generated/sklearn.cluster.FeatureAgglomeration.html | 476 .. _`cluster.FeatureAgglomeration`: http://scikit-learn.org/stable/modules/generated/sklearn.cluster.FeatureAgglomeration.html |
477 .. _`skrebate`: https://epistasislab.github.io/scikit-rebate/using/ | |
413 | 478 |
414 ]]> | 479 ]]> |
415 </help> | 480 </help> |
416 <expand macro="sklearn_citation"/> | 481 <expand macro="sklearn_citation"/> |
417 </tool> | 482 </tool> |