Mercurial > repos > bgruening > sklearn_searchcv
comparison search_model_validation.xml @ 0:91bf3f0d7455 draft
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit 76583c1fcd9d06a4679cc46ffaee44117b9e22cd
author | bgruening |
---|---|
date | Sat, 04 Aug 2018 12:31:24 -0400 |
parents | |
children | 907bb0418c9f |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:91bf3f0d7455 |
---|---|
1 <tool id="sklearn_searchcv" name="Hyperparameter Search" version="@VERSION@"> | |
2 <description>using exhaustive or randomized search</description> | |
3 <macros> | |
4 <import>main_macros.xml</import> | |
5 </macros> | |
6 <expand macro="python_requirements"> | |
7 <requirement type="package" version="0.9.12">asteval</requirement> | |
8 </expand> | |
9 <expand macro="macro_stdio"/> | |
10 <version_command>echo "@VERSION@"</version_command> | |
11 <command> | |
12 <![CDATA[ | |
13 python "$sklearn_search_model_validation_script" '$inputs' | |
14 ]]> | |
15 </command> | |
16 <configfiles> | |
17 <inputs name="inputs" /> | |
18 <configfile name="sklearn_search_model_validation_script"> | |
19 <![CDATA[ | |
20 import sys | |
21 import json | |
22 import pandas | |
23 import pickle | |
24 import numpy as np | |
25 import xgboost | |
26 import scipy | |
27 from asteval import Interpreter, make_symbol_table | |
28 from sklearn import metrics, preprocessing, model_selection, ensemble | |
29 from sklearn.pipeline import Pipeline | |
30 | |
31 @COLUMNS_FUNCTION@ | |
32 @GET_ESTIMATOR_FUNCTION@ | |
33 @GET_SEARCH_PARAMS_FUNCTION@ | |
34 | |
35 input_json_path = sys.argv[1] | |
36 with open(input_json_path, "r") as param_handler: | |
37 params = json.load(param_handler) | |
38 | |
39 #handle cheetah | |
40 infile1 = "$input_options.infile1" | |
41 infile2 = "$input_options.infile2" | |
42 infile_pipeline = "$search_schemes.infile_pipeline" | |
43 outfile_result = "$outfile_result" | |
44 outfile_estimator = "$outfile_estimator" | |
45 #if $search_schemes.selected_search_scheme == "RandomizedSearchCV": | |
46 np.random.seed($search_schemes.random_seed) | |
47 #end if | |
48 | |
49 params_builder = params['search_schemes']['search_params_builder'] | |
50 | |
51 input_type = params["input_options"]["selected_input"] | |
52 if input_type=="tabular": | |
53 header = 'infer' if params["input_options"]["header1"] else None | |
54 column_option = params["input_options"]["column_selector_options_1"]["selected_column_selector_option"] | |
55 if column_option in ["by_index_number", "all_but_by_index_number", "by_header_name", "all_but_by_header_name"]: | |
56 c = params["input_options"]["column_selector_options_1"]["col1"] | |
57 else: | |
58 c = None | |
59 X = read_columns( | |
60 infile1, | |
61 c = c, | |
62 c_option = column_option, | |
63 sep='\t', | |
64 header=header, | |
65 parse_dates=True | |
66 ) | |
67 else: | |
68 X = mmread(open("$input_options.infile1", 'r')) | |
69 | |
70 header = 'infer' if params["input_options"]["header2"] else None | |
71 column_option = params["input_options"]["column_selector_options_2"]["selected_column_selector_option2"] | |
72 if column_option in ["by_index_number", "all_but_by_index_number", "by_header_name", "all_but_by_header_name"]: | |
73 c = params["input_options"]["column_selector_options_2"]["col2"] | |
74 else: | |
75 c = None | |
76 y = read_columns( | |
77 infile2, | |
78 c = c, | |
79 c_option = column_option, | |
80 sep='\t', | |
81 header=header, | |
82 parse_dates=True | |
83 ) | |
84 y=y.ravel() | |
85 | |
86 optimizers = params["search_schemes"]["selected_search_scheme"] | |
87 optimizers = getattr(model_selection, optimizers) | |
88 | |
89 options = params["search_schemes"]["options"] | |
90 if 'scoring' in options and options['scoring'] == '': | |
91 options['scoring'] = None | |
92 if 'pre_dispatch' in options and options['pre_dispatch'] == '': | |
93 options['pre_dispatch'] = None | |
94 | |
95 with open(infile_pipeline, 'rb') as pipeline_handler: | |
96 pipeline = pickle.load(pipeline_handler) | |
97 search_params = get_search_params(params_builder) | |
98 searcher = optimizers(pipeline, search_params, **options) | |
99 | |
100 searcher.fit(X, y) | |
101 | |
102 cv_result = pandas.DataFrame(searcher.cv_results_) | |
103 cv_result.to_csv(path_or_buf=outfile_result, sep='\t', header=True, index=False) | |
104 | |
105 #if $save: | |
106 with open(outfile_estimator, "wb") as output_handler: | |
107 pickle.dump(searcher.best_estimator_, output_handler, pickle.HIGHEST_PROTOCOL) | |
108 #end if | |
109 | |
110 ]]> | |
111 </configfile> | |
112 </configfiles> | |
113 <inputs> | |
114 <conditional name="search_schemes"> | |
115 <param name="selected_search_scheme" type="select" label="Select a model selection search scheme:"> | |
116 <option value="GridSearchCV" selected="true">GridSearchCV - Exhaustive search over specified parameter values for an estimator </option> | |
117 <option value="RandomizedSearchCV">RandomizedSearchCV - Randomized search on hyper parameters for an estimator</option> | |
118 </param> | |
119 <when value="GridSearchCV"> | |
120 <expand macro="search_cv_estimator"/> | |
121 <section name="options" title="Advanced Options for SearchCV" expanded="false"> | |
122 <expand macro="search_cv_options"/> | |
123 </section> | |
124 </when> | |
125 <when value="RandomizedSearchCV"> | |
126 <param name="random_seed" type="integer" value="65535" min="0" max="65535" label="Set up random seed:"/> | |
127 <expand macro="search_cv_estimator"/> | |
128 <section name="options" title="Advanced Options for SearchCV" expanded="false"> | |
129 <expand macro="search_cv_options"/> | |
130 <param argument="n_iter" type="integer" value="10" label="Number of parameter settings that are sampled"/> | |
131 <expand macro="random_state"/> | |
132 </section> | |
133 </when> | |
134 </conditional> | |
135 <param name="save" type="boolean" truevalue="booltrue" falsevalue="boolflase" checked="true" label="Save the best estimator/pipeline?"/> | |
136 <expand macro="sl_mixed_input"/> | |
137 </inputs> | |
138 <outputs> | |
139 <data format="tabular" name="outfile_result"/> | |
140 <data format="zip" name="outfile_estimator"> | |
141 <filter>save</filter> | |
142 </data> | |
143 </outputs> | |
144 <tests> | |
145 <test> | |
146 <param name="selected_search_scheme" value="GridSearchCV"/> | |
147 <param name="infile_pipeline" value="pipeline01"/> | |
148 <conditional name="search_param_selector"> | |
149 <param name="search_p" value="C: [1, 10, 100, 1000]"/> | |
150 <param name="selected_param_type" value="final_estimator_p"/> | |
151 </conditional> | |
152 <conditional name="search_param_selector"> | |
153 <param name="search_p" value="k: [3, 5, 7, 9]"/> | |
154 <param name="selected_param_type" value="prep_2_p"/> | |
155 </conditional> | |
156 <param name="infile1" value="regression_X.tabular" ftype="tabular"/> | |
157 <param name="header1" value="true" /> | |
158 <param name="selected_column_selector_option" value="all_columns"/> | |
159 <param name="infile2" value="regression_y.tabular" ftype="tabular"/> | |
160 <param name="header2" value="true" /> | |
161 <param name="selected_column_selector_option2" value="all_columns"/> | |
162 <output name="outfile_result" > | |
163 <assert_contents> | |
164 <has_text_matching expression="[^/d]+0.7938837807353147[^/d]+{u'estimator__C': 1, u'preprocessing_2__k': 9}[^/d]+1" /> | |
165 </assert_contents> | |
166 </output> | |
167 </test> | |
168 <test> | |
169 <param name="selected_search_scheme" value="RandomizedSearchCV"/> | |
170 <param name="infile_pipeline" value="pipeline01"/> | |
171 <conditional name="search_param_selector"> | |
172 <param name="search_p" value="C: [1, 10, 100, 1000]"/> | |
173 <param name="selected_param_type" value="final_estimator_p"/> | |
174 </conditional> | |
175 <conditional name="search_param_selector"> | |
176 <param name="search_p" value="kernel: ['linear', 'poly', 'rbf', 'sigmoid']"/> | |
177 <param name="selected_param_type" value="final_estimator_p"/> | |
178 </conditional> | |
179 <conditional name="search_param_selector"> | |
180 <param name="search_p" value="k: [3, 5, 7, 9]"/> | |
181 <param name="selected_param_type" value="prep_2_p"/> | |
182 </conditional> | |
183 <conditional name="search_param_selector"> | |
184 <param name="search_p" value="with_centering: [True, False]"/> | |
185 <param name="selected_param_type" value="prep_1_p"/> | |
186 </conditional> | |
187 <param name="infile1" value="regression_X.tabular" ftype="tabular"/> | |
188 <param name="header1" value="true" /> | |
189 <param name="selected_column_selector_option" value="all_columns"/> | |
190 <param name="infile2" value="regression_y.tabular" ftype="tabular"/> | |
191 <param name="header2" value="true" /> | |
192 <param name="selected_column_selector_option2" value="all_columns"/> | |
193 <output name="outfile_result" > | |
194 <assert_contents> | |
195 <has_n_columns n="15" /> | |
196 <has_text text="param_preprocessing_1__with_centering"/> | |
197 </assert_contents> | |
198 </output> | |
199 </test> | |
200 <test> | |
201 <param name="selected_search_scheme" value="RandomizedSearchCV"/> | |
202 <param name="infile_pipeline" value="pipeline03"/> | |
203 <conditional name="search_param_selector"> | |
204 <param name="search_p" value="n_estimators: np_arange(50, 1001, 50)"/> | |
205 <param name="selected_param_type" value="final_estimator_p"/> | |
206 </conditional> | |
207 <conditional name="search_param_selector"> | |
208 <param name="search_p" value="max_depth: scipy_stats_randint(1, 51)"/> | |
209 <param name="selected_param_type" value="final_estimator_p"/> | |
210 </conditional> | |
211 <conditional name="search_param_selector"> | |
212 <param name="search_p" value="gamma: np_random_uniform(low=0., high=1., size=2)"/> | |
213 <param name="selected_param_type" value="final_estimator_p"/> | |
214 </conditional> | |
215 <conditional name="search_param_selector"> | |
216 <param name="search_p" value="random_state: [324089]"/> | |
217 <param name="selected_param_type" value="final_estimator_p"/> | |
218 </conditional> | |
219 <param name="infile1" value="regression_X.tabular" ftype="tabular"/> | |
220 <param name="header1" value="true" /> | |
221 <param name="selected_column_selector_option" value="all_columns"/> | |
222 <param name="infile2" value="regression_y.tabular" ftype="tabular"/> | |
223 <param name="header2" value="true" /> | |
224 <param name="selected_column_selector_option2" value="all_columns"/> | |
225 <output name="outfile_result" > | |
226 <assert_contents> | |
227 <has_n_columns n="15" /> | |
228 <has_text text="param_estimator__max_depth"/> | |
229 </assert_contents> | |
230 </output> | |
231 </test> | |
232 <test> | |
233 <param name="selected_search_scheme" value="GridSearchCV"/> | |
234 <param name="infile_pipeline" value="pipeline04"/> | |
235 <conditional name="search_param_selector"> | |
236 <param name="search_p" value="random_state: list(range(100, 1001, 100))"/> | |
237 <param name="selected_param_type" value="final_estimator_p"/> | |
238 </conditional> | |
239 <conditional name="search_param_selector"> | |
240 <param name="search_p" value="estimator: [ensemble_ExtraTreesClassifier(n_estimators=100, random_state=324089)]"/> | |
241 <param name="selected_param_type" value="prep_1_p"/> | |
242 </conditional> | |
243 <param name="infile1" value="regression_X.tabular" ftype="tabular"/> | |
244 <param name="header1" value="true" /> | |
245 <param name="selected_column_selector_option" value="all_columns"/> | |
246 <param name="infile2" value="regression_y.tabular" ftype="tabular"/> | |
247 <param name="header2" value="true" /> | |
248 <param name="selected_column_selector_option2" value="all_columns"/> | |
249 <output name="outfile_result"> | |
250 <assert_contents> | |
251 <has_n_columns n="13"/> | |
252 <has_text text="0.05363984674329502"/> | |
253 </assert_contents> | |
254 </output> | |
255 </test> | |
256 <test> | |
257 <param name="selected_search_scheme" value="GridSearchCV"/> | |
258 <param name="infile_pipeline" value="pipeline01"/> | |
259 <conditional name="search_param_selector"> | |
260 <param name="search_p" value="C: [1, 10, 100, 1000]"/> | |
261 <param name="selected_param_type" value="final_estimator_p"/> | |
262 </conditional> | |
263 <param name="infile1" value="regression_X.tabular" ftype="tabular"/> | |
264 <param name="header1" value="true" /> | |
265 <param name="selected_column_selector_option" value="all_columns"/> | |
266 <param name="infile2" value="regression_y.tabular" ftype="tabular"/> | |
267 <param name="header2" value="true" /> | |
268 <param name="selected_column_selector_option2" value="all_columns"/> | |
269 <output name="outfile_estimator" file="searchCV01" compare="sim_size" delta="1"/> | |
270 </test> | |
271 <test> | |
272 <param name="selected_search_scheme" value="GridSearchCV"/> | |
273 <param name="infile_pipeline" value="pipeline06"/> | |
274 <conditional name="search_param_selector"> | |
275 <param name="search_p" value="n_estimators: [10, 50, 200, 1000]"/> | |
276 <param name="selected_param_type" value="final_estimator_p"/> | |
277 </conditional> | |
278 <conditional name="search_param_selector"> | |
279 <param name="search_p" value="random_state: [324089]"/> | |
280 <param name="selected_param_type" value="final_estimator_p"/> | |
281 </conditional> | |
282 <param name="infile1" value="regression_X.tabular" ftype="tabular"/> | |
283 <param name="header1" value="true" /> | |
284 <param name="selected_column_selector_option" value="all_columns"/> | |
285 <param name="infile2" value="regression_y.tabular" ftype="tabular"/> | |
286 <param name="header2" value="true" /> | |
287 <param name="selected_column_selector_option2" value="all_columns"/> | |
288 <output name="outfile_result"> | |
289 <assert_contents> | |
290 <has_n_columns n="13"/> | |
291 <has_text_matching expression=".+0.7772355090078996[^/w]+1000[^/d]" /> | |
292 </assert_contents> | |
293 </output> | |
294 </test> | |
295 <test> | |
296 <param name="selected_search_scheme" value="GridSearchCV"/> | |
297 <param name="infile_pipeline" value="pipeline07"/> | |
298 <conditional name="search_param_selector"> | |
299 <param name="search_p" value="n_estimators: [10, 50, 100, 200]"/> | |
300 <param name="selected_param_type" value="final_estimator_p"/> | |
301 </conditional> | |
302 <conditional name="search_param_selector"> | |
303 <param name="search_p" value="random_state: [324089]"/> | |
304 <param name="selected_param_type" value="final_estimator_p"/> | |
305 </conditional> | |
306 <conditional name="search_param_selector"> | |
307 <param name="search_p" value="gamma: [1.0, 2.0]"/> | |
308 <param name="selected_param_type" value="prep_1_p"/> | |
309 </conditional> | |
310 <param name="infile1" value="regression_X.tabular" ftype="tabular"/> | |
311 <param name="header1" value="true" /> | |
312 <param name="selected_column_selector_option" value="all_columns"/> | |
313 <param name="infile2" value="regression_y.tabular" ftype="tabular"/> | |
314 <param name="header2" value="true" /> | |
315 <param name="selected_column_selector_option2" value="all_columns"/> | |
316 <output name="outfile_result"> | |
317 <assert_contents> | |
318 <has_n_columns n="14"/> | |
319 <has_text_matching expression=".+0.05747126436781609[^/d]" /> | |
320 </assert_contents> | |
321 </output> | |
322 </test> | |
323 <test> | |
324 <param name="selected_search_scheme" value="GridSearchCV"/> | |
325 <param name="infile_pipeline" value="pipeline08"/> | |
326 <conditional name="search_param_selector"> | |
327 <param name="search_p" value="n_estimators: [10, 50, 100, 200]"/> | |
328 <param name="selected_param_type" value="final_estimator_p"/> | |
329 </conditional> | |
330 <conditional name="search_param_selector"> | |
331 <param name="search_p" value="random_state: [324089]"/> | |
332 <param name="selected_param_type" value="final_estimator_p"/> | |
333 </conditional> | |
334 <conditional name="search_param_selector"> | |
335 <param name="search_p" value="linkage: ['ward', 'complete', 'average']"/> | |
336 <param name="selected_param_type" value="prep_1_p"/> | |
337 </conditional> | |
338 <param name="infile1" value="regression_X.tabular" ftype="tabular"/> | |
339 <param name="header1" value="true" /> | |
340 <param name="selected_column_selector_option" value="all_columns"/> | |
341 <param name="infile2" value="regression_y.tabular" ftype="tabular"/> | |
342 <param name="header2" value="true" /> | |
343 <param name="selected_column_selector_option2" value="all_columns"/> | |
344 <output name="outfile_result"> | |
345 <assert_contents> | |
346 <has_text_matching expression=".+0.08045977011494253[^/w]+10[^/w]" /> | |
347 </assert_contents> | |
348 </output> | |
349 </test> | |
350 </tests> | |
351 <help> | |
352 <![CDATA[ | |
353 **What it does** | |
354 Searches for optimal parameter values for an estimator or pipeline through either an exhaustive grid cross-validation search or a randomized cross-validation search. | |
355 Please refer to `Scikit-learn model_selection GridSearchCV`_, `Scikit-learn model_selection RandomizedSearchCV`_ and `Tuning hyper-parameters`_. | |
356 | |
357 **How to choose search parameters?** | |
358 | |
359 Please refer to `svm`_, `linear_model`_, `ensemble`_, `naive_bayes`_, `tree`_, `neighbors`_ and `xgboost`_ for estimator parameters. | |
360 Refer to `sklearn.preprocessing`_, `feature_selection`_, `decomposition`_, `kernel_approximation`_ and `cluster.FeatureAgglomeration`_ for parameters in the pre-processing steps. | |
361 | |
362 **Search parameter input** accepts a parameter and its setting as a key:value pair, one pair per input box. A setting can be a list, a numpy array, or a distribution. | |
363 The evaluation of settings supports math operations, list comprehensions, numpy.arange (np_arange), most numpy.random functions (e.g., np_random_uniform), some scipy.stats classes or functions (e.g., scipy_stats_zipf), and others. | |
364 | |
365 **Examples:** | |
366 | |
367 - k: [3, 5, 7, 9] | |
368 | |
369 - n_estimators: list(range(50, 1001, 50)) | |
370 | |
371 - gamma: np_arange(0.01, 1, 0.1) | |
372 | |
373 - alpha: np_random_choice(list(range(1, 51)) + [None], size=20) | |
374 | |
375 - max_depth: scipy_stats_randint(1, 11) | |
376 | |
377 - estimator: [ensemble_ExtraTreesClassifier(n_estimators=100, random_state=324089)] | |
378 | |
379 | |
380 .. _`Scikit-learn model_selection GridSearchCV`: http://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html | |
381 .. _`Scikit-learn model_selection RandomizedSearchCV`: http://scikit-learn.org/stable/modules/generated/sklearn.model_selection.RandomizedSearchCV.html | |
382 .. _`Tuning hyper-parameters`: http://scikit-learn.org/stable/modules/grid_search.html | |
383 | |
384 .. _`svm`: http://scikit-learn.org/stable/modules/classes.html#module-sklearn.svm | |
385 .. _`linear_model`: http://scikit-learn.org/stable/modules/classes.html#module-sklearn.linear_model | |
386 .. _`ensemble`: http://scikit-learn.org/stable/modules/classes.html#module-sklearn.ensemble | |
387 .. _`naive_bayes`: http://scikit-learn.org/stable/modules/classes.html#module-sklearn.naive_bayes | |
388 .. _`tree`: http://scikit-learn.org/stable/modules/classes.html#module-sklearn.tree | |
389 .. _`neighbors`: http://scikit-learn.org/stable/modules/classes.html#module-sklearn.neighbors | |
390 .. _`xgboost`: https://xgboost.readthedocs.io/en/latest/python/python_api.html | |
391 | |
392 .. _`sklearn.preprocessing`: http://scikit-learn.org/stable/modules/classes.html#module-sklearn.preprocessing | |
393 .. _`feature_selection`: http://scikit-learn.org/stable/modules/classes.html#module-sklearn.feature_selection | |
394 .. _`decomposition`: http://scikit-learn.org/stable/modules/classes.html#module-sklearn.decomposition | |
395 .. _`kernel_approximation`: http://scikit-learn.org/stable/modules/classes.html#module-sklearn.kernel_approximation | |
396 .. _`cluster.FeatureAgglomeration`: http://scikit-learn.org/stable/modules/generated/sklearn.cluster.FeatureAgglomeration.html | |
397 | |
398 ]]> | |
399 </help> | |
400 <expand macro="sklearn_citation"/> | |
401 </tool> |