comparison search_model_validation.xml @ 3:f9fea8323bcb draft

planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit d00173591e4a783a4c1cb2664e4bb192ab5414f7
author bgruening
date Fri, 17 Aug 2018 12:26:17 -0400
parents 907bb0418c9f
children 2e6540c11251
comparison
equal deleted inserted replaced
2:907bb0418c9f 3:f9fea8323bcb
2 <description>using exhaustive or randomized search</description> 2 <description>using exhaustive or randomized search</description>
3 <macros> 3 <macros>
4 <import>main_macros.xml</import> 4 <import>main_macros.xml</import>
5 </macros> 5 </macros>
6 <expand macro="python_requirements"> 6 <expand macro="python_requirements">
7 <requirement type="package" version="0.9.12">asteval</requirement> 7 <requirement type="package" version="0.6">skrebate</requirement>
8 </expand> 8 </expand>
9 <expand macro="macro_stdio"/> 9 <expand macro="macro_stdio"/>
10 <version_command>echo "@VERSION@"</version_command> 10 <version_command>echo "@VERSION@"</version_command>
11 <command> 11 <command>
12 <![CDATA[ 12 <![CDATA[
16 <configfiles> 16 <configfiles>
17 <inputs name="inputs" /> 17 <inputs name="inputs" />
18 <configfile name="sklearn_search_model_validation_script"> 18 <configfile name="sklearn_search_model_validation_script">
19 <![CDATA[ 19 <![CDATA[
20 import sys 20 import sys
21 import os
21 import json 22 import json
22 import pandas 23 import pandas
23 import re
24 import pickle 24 import pickle
25 import numpy as np 25 from sklearn import model_selection
26 import xgboost 26 from sklearn.exceptions import FitFailedWarning
27 import scipy 27
28 from asteval import Interpreter, make_symbol_table 28 execfile("$__tool_directory__/utils.py")
29 from sklearn import metrics, preprocessing, model_selection, ensemble 29
30 from sklearn.pipeline import Pipeline 30 warnings.simplefilter('ignore')
31
32 @COLUMNS_FUNCTION@
33 @GET_ESTIMATOR_FUNCTION@
34 @SAFE_EVAL_FUNCTION@
35 @GET_SEARCH_PARAMS_FUNCTION@
36 @GET_CV_FUNCTION@
37 31
38 input_json_path = sys.argv[1] 32 input_json_path = sys.argv[1]
39 with open(input_json_path, "r") as param_handler: 33 with open(input_json_path, "r") as param_handler:
40 params = json.load(param_handler) 34 params = json.load(param_handler)
41 35
86 optimizers = params["search_schemes"]["selected_search_scheme"] 80 optimizers = params["search_schemes"]["selected_search_scheme"]
87 optimizers = getattr(model_selection, optimizers) 81 optimizers = getattr(model_selection, optimizers)
88 82
89 options = params["search_schemes"]["options"] 83 options = params["search_schemes"]["options"]
90 options['cv'] = get_cv( options['cv'].strip() ) 84 options['cv'] = get_cv( options['cv'].strip() )
91 if 'scoring' in options and options['scoring'] == '': 85 options['n_jobs'] = N_JOBS
92 options['scoring'] = None 86 options['scoring'] = get_scoring(options['scoring'])
87 if options['error_score']:
88 options['error_score'] = 'raise'
89 else:
90 options['error_score'] = 0
91 if options['refit'] and isinstance(options['scoring'], dict):
92 options['refit'] = 'primary'
93 if 'pre_dispatch' in options and options['pre_dispatch'] == '': 93 if 'pre_dispatch' in options and options['pre_dispatch'] == '':
94 options['pre_dispatch'] = None 94 options['pre_dispatch'] = None
95 95
96 with open(infile_pipeline, 'rb') as pipeline_handler: 96 with open(infile_pipeline, 'rb') as pipeline_handler:
97 pipeline = pickle.load(pipeline_handler) 97 pipeline = pickle.load(pipeline_handler)
98 search_params = get_search_params(params_builder) 98 search_params = get_search_params(params_builder)
99 searcher = optimizers(pipeline, search_params, **options) 99 searcher = optimizers(pipeline, search_params, **options)
100 100
101 searcher.fit(X, y) 101 warnings.simplefilter('always', FitFailedWarning)
102 with warnings.catch_warnings(record=True) as w:
103 try:
104 searcher.fit(X, y)
105 except ValueError:
106 pass
107 for warning in w:
108 print(repr(warning.message))
102 109
103 cv_result = pandas.DataFrame(searcher.cv_results_) 110 cv_result = pandas.DataFrame(searcher.cv_results_)
104 cv_result.to_csv(path_or_buf=outfile_result, sep='\t', header=True, index=False) 111 cv_result.to_csv(path_or_buf=outfile_result, sep='\t', header=True, index=False)
105 112
106 #if $save: 113 #if $save:
148 <conditional name="search_param_selector"> 155 <conditional name="search_param_selector">
149 <param name="search_p" value="C: [1, 10, 100, 1000]"/> 156 <param name="search_p" value="C: [1, 10, 100, 1000]"/>
150 <param name="selected_param_type" value="final_estimator_p"/> 157 <param name="selected_param_type" value="final_estimator_p"/>
151 </conditional> 158 </conditional>
152 <conditional name="search_param_selector"> 159 <conditional name="search_param_selector">
160 <param name="search_p" value="k: [-1, 3, 5, 7, 9]"/>
161 <param name="selected_param_type" value="prep_2_p"/>
162 </conditional>
163 <param name="error_score" value="false"/>
164 <param name="infile1" value="regression_X.tabular" ftype="tabular"/>
165 <param name="header1" value="true" />
166 <param name="selected_column_selector_option" value="all_columns"/>
167 <param name="infile2" value="regression_y.tabular" ftype="tabular"/>
168 <param name="header2" value="true" />
169 <param name="selected_column_selector_option2" value="all_columns"/>
170 <output name="outfile_result" >
171 <assert_contents>
172 <has_text_matching expression="[^/d]+0.7938837807353147[^/d]+{u'estimator__C': 1, u'preprocessing_2__k': 9}[^/d]+1" />
173 <has_text text="0.0"/>
174 </assert_contents>
175 </output>
176 </test>
177 <test>
178 <param name="selected_search_scheme" value="RandomizedSearchCV"/>
179 <param name="infile_pipeline" value="pipeline01"/>
180 <conditional name="search_param_selector">
181 <param name="search_p" value="C: [1, 10, 100, 1000]"/>
182 <param name="selected_param_type" value="final_estimator_p"/>
183 </conditional>
184 <conditional name="search_param_selector">
185 <param name="search_p" value="kernel: ['linear', 'poly', 'rbf', 'sigmoid']"/>
186 <param name="selected_param_type" value="final_estimator_p"/>
187 </conditional>
188 <conditional name="search_param_selector">
153 <param name="search_p" value="k: [3, 5, 7, 9]"/> 189 <param name="search_p" value="k: [3, 5, 7, 9]"/>
154 <param name="selected_param_type" value="prep_2_p"/> 190 <param name="selected_param_type" value="prep_2_p"/>
155 </conditional> 191 </conditional>
156 <param name="infile1" value="regression_X.tabular" ftype="tabular"/>
157 <param name="header1" value="true" />
158 <param name="selected_column_selector_option" value="all_columns"/>
159 <param name="infile2" value="regression_y.tabular" ftype="tabular"/>
160 <param name="header2" value="true" />
161 <param name="selected_column_selector_option2" value="all_columns"/>
162 <output name="outfile_result" >
163 <assert_contents>
164 <has_text_matching expression="[^/d]+0.7938837807353147[^/d]+{u'estimator__C': 1, u'preprocessing_2__k': 9}[^/d]+1" />
165 </assert_contents>
166 </output>
167 </test>
168 <test>
169 <param name="selected_search_scheme" value="RandomizedSearchCV"/>
170 <param name="infile_pipeline" value="pipeline01"/>
171 <conditional name="search_param_selector">
172 <param name="search_p" value="C: [1, 10, 100, 1000]"/>
173 <param name="selected_param_type" value="final_estimator_p"/>
174 </conditional>
175 <conditional name="search_param_selector">
176 <param name="search_p" value="kernel: ['linear', 'poly', 'rbf', 'sigmoid']"/>
177 <param name="selected_param_type" value="final_estimator_p"/>
178 </conditional>
179 <conditional name="search_param_selector">
180 <param name="search_p" value="k: [3, 5, 7, 9]"/>
181 <param name="selected_param_type" value="prep_2_p"/>
182 </conditional>
183 <conditional name="search_param_selector"> 192 <conditional name="search_param_selector">
184 <param name="search_p" value="with_centering: [True, False]"/> 193 <param name="search_p" value="with_centering: [True, False]"/>
185 <param name="selected_param_type" value="prep_1_p"/> 194 <param name="selected_param_type" value="prep_1_p"/>
186 </conditional> 195 </conditional>
187 <param name="infile1" value="regression_X.tabular" ftype="tabular"/> 196 <param name="infile1" value="regression_X.tabular" ftype="tabular"/>
207 <conditional name="search_param_selector"> 216 <conditional name="search_param_selector">
208 <param name="search_p" value="max_depth: scipy_stats_randint(1, 51)"/> 217 <param name="search_p" value="max_depth: scipy_stats_randint(1, 51)"/>
209 <param name="selected_param_type" value="final_estimator_p"/> 218 <param name="selected_param_type" value="final_estimator_p"/>
210 </conditional> 219 </conditional>
211 <conditional name="search_param_selector"> 220 <conditional name="search_param_selector">
212 <param name="search_p" value="gamma: np_random_uniform(low=0., high=1., size=2)"/> 221 <param name="search_p" value="gamma: scipy_stats_uniform(0., 1.)"/>
213 <param name="selected_param_type" value="final_estimator_p"/> 222 <param name="selected_param_type" value="final_estimator_p"/>
214 </conditional> 223 </conditional>
215 <conditional name="search_param_selector"> 224 <conditional name="search_param_selector">
216 <param name="search_p" value="random_state: [324089]"/> 225 <param name="search_p" value="random_state: [324089]"/>
217 <param name="selected_param_type" value="final_estimator_p"/> 226 <param name="selected_param_type" value="final_estimator_p"/>
360 <param name="selected_column_selector_option" value="all_columns"/> 369 <param name="selected_column_selector_option" value="all_columns"/>
361 <param name="infile2" value="regression_y.tabular" ftype="tabular"/> 370 <param name="infile2" value="regression_y.tabular" ftype="tabular"/>
362 <param name="header2" value="true" /> 371 <param name="header2" value="true" />
363 <param name="selected_column_selector_option2" value="all_columns"/> 372 <param name="selected_column_selector_option2" value="all_columns"/>
364 <output name="outfile_estimator" file="searchCV01" compare="sim_size" delta="1"/> 373 <output name="outfile_estimator" file="searchCV01" compare="sim_size" delta="1"/>
374 </test>
375 <test>
376 <param name="selected_search_scheme" value="GridSearchCV"/>
377 <param name="infile_pipeline" value="pipeline03"/>
378 <conditional name="search_param_selector">
379 <param name="search_p" value="n_estimators: [10, 50, 200, 1000]"/>
380 <param name="selected_param_type" value="final_estimator_p"/>
381 </conditional>
382 <conditional name="search_param_selector">
383 <param name="search_p" value="random_state: [324089]"/>
384 <param name="selected_param_type" value="final_estimator_p"/>
385 </conditional>
386 <param name="primary_scoring" value="balanced_accuracy"/>
387 <param name="cv" value="StratifiedKFold(n_splits=3, shuffle=True, random_state=10)"/>
388 <param name="infile1" value="regression_X.tabular" ftype="tabular"/>
389 <param name="header1" value="true" />
390 <param name="selected_column_selector_option" value="all_columns"/>
391 <param name="infile2" value="regression_y.tabular" ftype="tabular"/>
392 <param name="header2" value="true" />
393 <param name="selected_column_selector_option2" value="all_columns"/>
394 <output name="outfile_result" >
395 <assert_contents>
396 <has_n_columns n="13" />
397 <has_text text="0.05366527890058046"/>
398 </assert_contents>
399 </output>
400 </test>
401 <test>
402 <param name="selected_search_scheme" value="GridSearchCV"/>
403 <param name="infile_pipeline" value="pipeline09"/>
404 <conditional name="search_param_selector">
405 <param name="search_p" value="n_neighbors: [50, 100, 150, 200]"/>
406 <param name="selected_param_type" value="prep_1_p"/>
407 </conditional>
408 <conditional name="search_param_selector">
409 <param name="search_p" value="random_state: [324089]"/>
410 <param name="selected_param_type" value="final_estimator_p"/>
411 </conditional>
412 <param name="primary_scoring" value="explained_variance"/>
413 <param name="secondary_scoring" value="neg_mean_squared_error,r2"/>
414 <param name="cv" value="StratifiedKFold(n_splits=3, shuffle=True, random_state=10)"/>
415 <param name="infile1" value="regression_X.tabular" ftype="tabular"/>
416 <param name="header1" value="true" />
417 <param name="selected_column_selector_option" value="all_columns"/>
418 <param name="infile2" value="regression_y.tabular" ftype="tabular"/>
419 <param name="header2" value="true" />
420 <param name="selected_column_selector_option2" value="all_columns"/>
421 <output name="outfile_result" >
422 <assert_contents>
423 <has_n_columns n="25" />
424 <has_text text="0.7881203921915186"/>
425 <has_text text="0.7880692034558879"/>
426 <has_text text="-29.381892762877825"/>
427 </assert_contents>
428 </output>
365 </test> 429 </test>
366 </tests> 430 </tests>
367 <help> 431 <help>
368 <![CDATA[ 432 <![CDATA[
369 **What it does** 433 **What it does**
371 please refer to `Scikit-learn model_selection GridSearchCV`_, `Scikit-learn model_selection RandomizedSearchCV`_ and `Tuning hyper-parameters`_. 435 please refer to `Scikit-learn model_selection GridSearchCV`_, `Scikit-learn model_selection RandomizedSearchCV`_ and `Tuning hyper-parameters`_.
372 436
373 **How to choose search parameters?** 437 **How to choose search parameters?**
374 438
375 Please refer to `svm`_, `linear_model`_, `ensemble`_, `naive_bayes`_, `tree`_, `neighbors`_ and `xgboost`_ for estimator parameters. 439 Please refer to `svm`_, `linear_model`_, `ensemble`_, `naive_bayes`_, `tree`_, `neighbors`_ and `xgboost`_ for estimator parameters.
376 Refer to `sklearn.preprocessing`_, `feature_selection`_, `decomposition`_, `kernel_approximation`_ and `cluster.FeatureAgglomeration`_ for parameters in the pre-processing steps. 440 Refer to `sklearn.preprocessing`_, `feature_selection`_, `decomposition`_, `kernel_approximation`_, `cluster.FeatureAgglomeration`_ and `skrebate`_ for parameters in the pre-processing steps.
377 441
378 **Search parameter input** accepts a parameter and its setting as a key:value pair, one pair per input box. The setting can be a list, a numpy array, or a distribution. 442 **Search parameter input** accepts a parameter and its setting as a key:value pair, one pair per input box. The setting can be a list, a numpy array, or a distribution.
379 The evaluation of settings supports math operations, list comprehensions, numpy.arange (np_arange), most numpy.random functions (e.g., np_random_uniform), some scipy.stats classes or functions (e.g., scipy_stats_zipf), and others. 443 The evaluation of settings supports math operations, list comprehensions, numpy.arange (np_arange), most numpy.random functions (e.g., np_random_uniform), some scipy.stats classes or functions (e.g., scipy_stats_zipf), and others.
380 444
381 **Examples:** 445 **Examples:**
408 .. _`sklearn.preprocessing`: http://scikit-learn.org/stable/modules/classes.html#module-sklearn.preprocessing 472 .. _`sklearn.preprocessing`: http://scikit-learn.org/stable/modules/classes.html#module-sklearn.preprocessing
409 .. _`feature_selection`: http://scikit-learn.org/stable/modules/classes.html#module-sklearn.feature_selection 473 .. _`feature_selection`: http://scikit-learn.org/stable/modules/classes.html#module-sklearn.feature_selection
410 .. _`decomposition`: http://scikit-learn.org/stable/modules/classes.html#module-sklearn.decomposition 474 .. _`decomposition`: http://scikit-learn.org/stable/modules/classes.html#module-sklearn.decomposition
411 .. _`kernel_approximation`: http://scikit-learn.org/stable/modules/classes.html#module-sklearn.kernel_approximation 475 .. _`kernel_approximation`: http://scikit-learn.org/stable/modules/classes.html#module-sklearn.kernel_approximation
412 .. _`cluster.FeatureAgglomeration`: http://scikit-learn.org/stable/modules/generated/sklearn.cluster.FeatureAgglomeration.html 476 .. _`cluster.FeatureAgglomeration`: http://scikit-learn.org/stable/modules/generated/sklearn.cluster.FeatureAgglomeration.html
477 .. _`skrebate`: https://epistasislab.github.io/scikit-rebate/using/
413 478
414 ]]> 479 ]]>
415 </help> 480 </help>
416 <expand macro="sklearn_citation"/> 481 <expand macro="sklearn_citation"/>
417 </tool> 482 </tool>