comparison search_model_validation.xml @ 0:91bf3f0d7455 draft

planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit 76583c1fcd9d06a4679cc46ffaee44117b9e22cd
author bgruening
date Sat, 04 Aug 2018 12:31:24 -0400
parents
children 907bb0418c9f
comparison
equal deleted inserted replaced
-1:000000000000 0:91bf3f0d7455
1 <tool id="sklearn_searchcv" name="Hyperparameter Search" version="@VERSION@">
2 <description>using exhaustive or randomized search</description>
3 <macros>
4 <import>main_macros.xml</import>
5 </macros>
6 <expand macro="python_requirements">
7 <requirement type="package" version="0.9.12">asteval</requirement>
8 </expand>
9 <expand macro="macro_stdio"/>
10 <version_command>echo "@VERSION@"</version_command>
11 <command>
12 <![CDATA[
13 python "$sklearn_search_model_validation_script" '$inputs'
14 ]]>
15 </command>
16 <configfiles>
17 <inputs name="inputs" />
18 <configfile name="sklearn_search_model_validation_script">
19 <![CDATA[
20 import sys
21 import json
22 import pandas
23 import pickle
24 import numpy as np
25 import xgboost
26 import scipy
27 from asteval import Interpreter, make_symbol_table
28 from sklearn import metrics, preprocessing, model_selection, ensemble
29 from sklearn.pipeline import Pipeline
30
31 @COLUMNS_FUNCTION@
32 @GET_ESTIMATOR_FUNCTION@
33 @GET_SEARCH_PARAMS_FUNCTION@
34
35 input_json_path = sys.argv[1]
36 with open(input_json_path, "r") as param_handler:
37 params = json.load(param_handler)
38
39 #handle cheetah
40 infile1 = "$input_options.infile1"
41 infile2 = "$input_options.infile2"
42 infile_pipeline = "$search_schemes.infile_pipeline"
43 outfile_result = "$outfile_result"
44 outfile_estimator = "$outfile_estimator"
45 #if $search_schemes.selected_search_scheme == "RandomizedSearchCV":
46 np.random.seed($search_schemes.random_seed)
47 #end if
48
49 params_builder = params['search_schemes']['search_params_builder']
50
51 input_type = params["input_options"]["selected_input"]
52 if input_type=="tabular":
53 header = 'infer' if params["input_options"]["header1"] else None
54 column_option = params["input_options"]["column_selector_options_1"]["selected_column_selector_option"]
55 if column_option in ["by_index_number", "all_but_by_index_number", "by_header_name", "all_but_by_header_name"]:
56 c = params["input_options"]["column_selector_options_1"]["col1"]
57 else:
58 c = None
59 X = read_columns(
60 infile1,
61 c = c,
62 c_option = column_option,
63 sep='\t',
64 header=header,
65 parse_dates=True
66 )
67 else:
68 X = mmread(open("$input_options.infile1", 'r'))
69
70 header = 'infer' if params["input_options"]["header2"] else None
71 column_option = params["input_options"]["column_selector_options_2"]["selected_column_selector_option2"]
72 if column_option in ["by_index_number", "all_but_by_index_number", "by_header_name", "all_but_by_header_name"]:
73 c = params["input_options"]["column_selector_options_2"]["col2"]
74 else:
75 c = None
76 y = read_columns(
77 infile2,
78 c = c,
79 c_option = column_option,
80 sep='\t',
81 header=header,
82 parse_dates=True
83 )
84 y=y.ravel()
85
86 optimizers = params["search_schemes"]["selected_search_scheme"]
87 optimizers = getattr(model_selection, optimizers)
88
89 options = params["search_schemes"]["options"]
90 if 'scoring' in options and options['scoring'] == '':
91 options['scoring'] = None
92 if 'pre_dispatch' in options and options['pre_dispatch'] == '':
93 options['pre_dispatch'] = None
94
95 with open(infile_pipeline, 'rb') as pipeline_handler:
96 pipeline = pickle.load(pipeline_handler)  # NOTE(review): unpickling can execute arbitrary code; infile_pipeline comes from the tool inputs, so only trusted pipelines should be loaded
97 search_params = get_search_params(params_builder)
98 searcher = optimizers(pipeline, search_params, **options)  # optimizers is the model_selection class looked up by name via getattr earlier in this script
99
100 searcher.fit(X, y)
101
102 cv_result = pandas.DataFrame(searcher.cv_results_)
103 cv_result.to_csv(path_or_buf=outfile_result, sep='\t', header=True, index=False)
104
105 #if $save:
106 with open(outfile_estimator, "wb") as output_handler:
107 pickle.dump(searcher.best_estimator_, output_handler, pickle.HIGHEST_PROTOCOL)
108 #end if
109
110 ]]>
111 </configfile>
112 </configfiles>
113 <inputs>
114 <conditional name="search_schemes">
115 <param name="selected_search_scheme" type="select" label="Select a model selection search scheme:">
116 <option value="GridSearchCV" selected="true">GridSearchCV - Exhaustive search over specified parameter values for an estimator </option>
117 <option value="RandomizedSearchCV">RandomizedSearchCV - Randomized search on hyper parameters for an estimator</option>
118 </param>
119 <when value="GridSearchCV">
120 <expand macro="search_cv_estimator"/>
121 <section name="options" title="Advanced Options for SearchCV" expanded="false">
122 <expand macro="search_cv_options"/>
123 </section>
124 </when>
125 <when value="RandomizedSearchCV">
126 <param name="random_seed" type="integer" value="65535" min="0" max="65535" label="Set up random seed:"/>
127 <expand macro="search_cv_estimator"/>
128 <section name="options" title="Advanced Options for SearchCV" expanded="false">
129 <expand macro="search_cv_options"/>
130 <param argument="n_iter" type="integer" value="10" label="Number of parameter settings that are sampled"/>
131 <expand macro="random_state"/>
132 </section>
133 </when>
134 </conditional>
135 <param name="save" type="boolean" truevalue="booltrue" falsevalue="boolflase" checked="true" label="Save the best estimator/pipeline?"/>
136 <expand macro="sl_mixed_input"/>
137 </inputs>
138 <outputs>
139 <data format="tabular" name="outfile_result"/>
140 <data format="zip" name="outfile_estimator">
141 <filter>save</filter>
142 </data>
143 </outputs>
144 <tests>
145 <test>
146 <param name="selected_search_scheme" value="GridSearchCV"/>
147 <param name="infile_pipeline" value="pipeline01"/>
148 <conditional name="search_param_selector">
149 <param name="search_p" value="C: [1, 10, 100, 1000]"/>
150 <param name="selected_param_type" value="final_estimator_p"/>
151 </conditional>
152 <conditional name="search_param_selector">
153 <param name="search_p" value="k: [3, 5, 7, 9]"/>
154 <param name="selected_param_type" value="prep_2_p"/>
155 </conditional>
156 <param name="infile1" value="regression_X.tabular" ftype="tabular"/>
157 <param name="header1" value="true" />
158 <param name="selected_column_selector_option" value="all_columns"/>
159 <param name="infile2" value="regression_y.tabular" ftype="tabular"/>
160 <param name="header2" value="true" />
161 <param name="selected_column_selector_option2" value="all_columns"/>
162 <output name="outfile_result" >
163 <assert_contents>
164 <has_text_matching expression="[^/d]+0.7938837807353147[^/d]+{u'estimator__C': 1, u'preprocessing_2__k': 9}[^/d]+1" />
165 </assert_contents>
166 </output>
167 </test>
168 <test>
169 <param name="selected_search_scheme" value="RandomizedSearchCV"/>
170 <param name="infile_pipeline" value="pipeline01"/>
171 <conditional name="search_param_selector">
172 <param name="search_p" value="C: [1, 10, 100, 1000]"/>
173 <param name="selected_param_type" value="final_estimator_p"/>
174 </conditional>
175 <conditional name="search_param_selector">
176 <param name="search_p" value="kernel: ['linear', 'poly', 'rbf', 'sigmoid']"/>
177 <param name="selected_param_type" value="final_estimator_p"/>
178 </conditional>
179 <conditional name="search_param_selector">
180 <param name="search_p" value="k: [3, 5, 7, 9]"/>
181 <param name="selected_param_type" value="prep_2_p"/>
182 </conditional>
183 <conditional name="search_param_selector">
184 <param name="search_p" value="with_centering: [True, False]"/>
185 <param name="selected_param_type" value="prep_1_p"/>
186 </conditional>
187 <param name="infile1" value="regression_X.tabular" ftype="tabular"/>
188 <param name="header1" value="true" />
189 <param name="selected_column_selector_option" value="all_columns"/>
190 <param name="infile2" value="regression_y.tabular" ftype="tabular"/>
191 <param name="header2" value="true" />
192 <param name="selected_column_selector_option2" value="all_columns"/>
193 <output name="outfile_result" >
194 <assert_contents>
195 <has_n_columns n="15" />
196 <has_text text="param_preprocessing_1__with_centering"/>
197 </assert_contents>
198 </output>
199 </test>
200 <test>
201 <param name="selected_search_scheme" value="RandomizedSearchCV"/>
202 <param name="infile_pipeline" value="pipeline03"/>
203 <conditional name="search_param_selector">
204 <param name="search_p" value="n_estimators: np_arange(50, 1001, 50)"/>
205 <param name="selected_param_type" value="final_estimator_p"/>
206 </conditional>
207 <conditional name="search_param_selector">
208 <param name="search_p" value="max_depth: scipy_stats_randint(1, 51)"/>
209 <param name="selected_param_type" value="final_estimator_p"/>
210 </conditional>
211 <conditional name="search_param_selector">
212 <param name="search_p" value="gamma: np_random_uniform(low=0., high=1., size=2)"/>
213 <param name="selected_param_type" value="final_estimator_p"/>
214 </conditional>
215 <conditional name="search_param_selector">
216 <param name="search_p" value="random_state: [324089]"/>
217 <param name="selected_param_type" value="final_estimator_p"/>
218 </conditional>
219 <param name="infile1" value="regression_X.tabular" ftype="tabular"/>
220 <param name="header1" value="true" />
221 <param name="selected_column_selector_option" value="all_columns"/>
222 <param name="infile2" value="regression_y.tabular" ftype="tabular"/>
223 <param name="header2" value="true" />
224 <param name="selected_column_selector_option2" value="all_columns"/>
225 <output name="outfile_result" >
226 <assert_contents>
227 <has_n_columns n="15" />
228 <has_text text="param_estimator__max_depth"/>
229 </assert_contents>
230 </output>
231 </test>
232 <test>
233 <param name="selected_search_scheme" value="GridSearchCV"/>
234 <param name="infile_pipeline" value="pipeline04"/>
235 <conditional name="search_param_selector">
236 <param name="search_p" value="random_state: list(range(100, 1001, 100))"/>
237 <param name="selected_param_type" value="final_estimator_p"/>
238 </conditional>
239 <conditional name="search_param_selector">
240 <param name="search_p" value="estimator: [ensemble_ExtraTreesClassifier(n_estimators=100, random_state=324089)]"/>
241 <param name="selected_param_type" value="prep_1_p"/>
242 </conditional>
243 <param name="infile1" value="regression_X.tabular" ftype="tabular"/>
244 <param name="header1" value="true" />
245 <param name="selected_column_selector_option" value="all_columns"/>
246 <param name="infile2" value="regression_y.tabular" ftype="tabular"/>
247 <param name="header2" value="true" />
248 <param name="selected_column_selector_option2" value="all_columns"/>
249 <output name="outfile_result">
250 <assert_contents>
251 <has_n_columns n="13"/>
252 <has_text text="0.05363984674329502"/>
253 </assert_contents>
254 </output>
255 </test>
256 <test>
257 <param name="selected_search_scheme" value="GridSearchCV"/>
258 <param name="infile_pipeline" value="pipeline01"/>
259 <conditional name="search_param_selector">
260 <param name="search_p" value="C: [1, 10, 100, 1000]"/>
261 <param name="selected_param_type" value="final_estimator_p"/>
262 </conditional>
263 <param name="infile1" value="regression_X.tabular" ftype="tabular"/>
264 <param name="header1" value="true" />
265 <param name="selected_column_selector_option" value="all_columns"/>
266 <param name="infile2" value="regression_y.tabular" ftype="tabular"/>
267 <param name="header2" value="true" />
268 <param name="selected_column_selector_option2" value="all_columns"/>
269 <output name="outfile_estimator" file="searchCV01" compare="sim_size" delta="1"/>
270 </test>
271 <test>
272 <param name="selected_search_scheme" value="GridSearchCV"/>
273 <param name="infile_pipeline" value="pipeline06"/>
274 <conditional name="search_param_selector">
275 <param name="search_p" value="n_estimators: [10, 50, 200, 1000]"/>
276 <param name="selected_param_type" value="final_estimator_p"/>
277 </conditional>
278 <conditional name="search_param_selector">
279 <param name="search_p" value="random_state: [324089]"/>
280 <param name="selected_param_type" value="final_estimator_p"/>
281 </conditional>
282 <param name="infile1" value="regression_X.tabular" ftype="tabular"/>
283 <param name="header1" value="true" />
284 <param name="selected_column_selector_option" value="all_columns"/>
285 <param name="infile2" value="regression_y.tabular" ftype="tabular"/>
286 <param name="header2" value="true" />
287 <param name="selected_column_selector_option2" value="all_columns"/>
288 <output name="outfile_result">
289 <assert_contents>
290 <has_n_columns n="13"/>
291 <has_text_matching expression=".+0.7772355090078996[^/w]+1000[^/d]" />
292 </assert_contents>
293 </output>
294 </test>
295 <test>
296 <param name="selected_search_scheme" value="GridSearchCV"/>
297 <param name="infile_pipeline" value="pipeline07"/>
298 <conditional name="search_param_selector">
299 <param name="search_p" value="n_estimators: [10, 50, 100, 200]"/>
300 <param name="selected_param_type" value="final_estimator_p"/>
301 </conditional>
302 <conditional name="search_param_selector">
303 <param name="search_p" value="random_state: [324089]"/>
304 <param name="selected_param_type" value="final_estimator_p"/>
305 </conditional>
306 <conditional name="search_param_selector">
307 <param name="search_p" value="gamma: [1.0, 2.0]"/>
308 <param name="selected_param_type" value="prep_1_p"/>
309 </conditional>
310 <param name="infile1" value="regression_X.tabular" ftype="tabular"/>
311 <param name="header1" value="true" />
312 <param name="selected_column_selector_option" value="all_columns"/>
313 <param name="infile2" value="regression_y.tabular" ftype="tabular"/>
314 <param name="header2" value="true" />
315 <param name="selected_column_selector_option2" value="all_columns"/>
316 <output name="outfile_result">
317 <assert_contents>
318 <has_n_columns n="14"/>
319 <has_text_matching expression=".+0.05747126436781609[^/d]" />
320 </assert_contents>
321 </output>
322 </test>
323 <test>
324 <param name="selected_search_scheme" value="GridSearchCV"/>
325 <param name="infile_pipeline" value="pipeline08"/>
326 <conditional name="search_param_selector">
327 <param name="search_p" value="n_estimators: [10, 50, 100, 200]"/>
328 <param name="selected_param_type" value="final_estimator_p"/>
329 </conditional>
330 <conditional name="search_param_selector">
331 <param name="search_p" value="random_state: [324089]"/>
332 <param name="selected_param_type" value="final_estimator_p"/>
333 </conditional>
334 <conditional name="search_param_selector">
335 <param name="search_p" value="linkage: ['ward', 'complete', 'average']"/>
336 <param name="selected_param_type" value="prep_1_p"/>
337 </conditional>
338 <param name="infile1" value="regression_X.tabular" ftype="tabular"/>
339 <param name="header1" value="true" />
340 <param name="selected_column_selector_option" value="all_columns"/>
341 <param name="infile2" value="regression_y.tabular" ftype="tabular"/>
342 <param name="header2" value="true" />
343 <param name="selected_column_selector_option2" value="all_columns"/>
344 <output name="outfile_result">
345 <assert_contents>
346 <has_text_matching expression=".+0.08045977011494253[^/w]+10[^/w]" />
347 </assert_contents>
348 </output>
349 </test>
350 </tests>
351 <help>
352 <![CDATA[
353 **What it does**
354 Searches for optimal parameter values for an estimator or pipeline through either an exhaustive grid search or a randomized search, both with cross-validation.
355 For details, please refer to `Scikit-learn model_selection GridSearchCV`_, `Scikit-learn model_selection RandomizedSearchCV`_ and `Tuning hyper-parameters`_.
356
357 **How to choose search parameters?**
358
359 Please refer to `svm`_, `linear_model`_, `ensemble`_, `naive_bayes`_, `tree`_, `neighbors`_ and `xgboost`_ for estimator parameters.
360 Refer to `sklearn.preprocessing`_, `feature_selection`_, `decomposition`_, `kernel_approximation`_ and `cluster.FeatureAgglomeration`_ for parameters in the pre-processing steps.
361
362 **Search parameter input** accepts a parameter and its setting as a key:value pair, one pair per input box. The setting can be a list, a numpy array, or a distribution.
363 The evaluation of settings supports math operations, list comprehensions, numpy.arange (np_arange), most numpy.random functions (e.g., np_random_uniform), some scipy.stats classes or functions (e.g., scipy_stats_zipf), and others.
364
365 **Examples:**
366
367 - k: [3, 5, 7, 9]
368
369 - n_estimators: list(range(50, 1001, 50))
370
371 - gamma: np_arange(0.01, 1, 0.1)
372
373 - alpha: np_random_choice(list(range(1, 51)) + [None], size=20)
374
375 - max_depth: scipy_stats_randint(1, 11)
376
377 - estimator: [ensemble_ExtraTreesClassifier(n_estimators=100, random_state=324089)]
378
379
380 .. _`Scikit-learn model_selection GridSearchCV`: http://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html
381 .. _`Scikit-learn model_selection RandomizedSearchCV`: http://scikit-learn.org/stable/modules/generated/sklearn.model_selection.RandomizedSearchCV.html
382 .. _`Tuning hyper-parameters`: http://scikit-learn.org/stable/modules/grid_search.html
383
384 .. _`svm`: http://scikit-learn.org/stable/modules/classes.html#module-sklearn.svm
385 .. _`linear_model`: http://scikit-learn.org/stable/modules/classes.html#module-sklearn.linear_model
386 .. _`ensemble`: http://scikit-learn.org/stable/modules/classes.html#module-sklearn.ensemble
387 .. _`naive_bayes`: http://scikit-learn.org/stable/modules/classes.html#module-sklearn.naive_bayes
388 .. _`tree`: http://scikit-learn.org/stable/modules/classes.html#module-sklearn.tree
389 .. _`neighbors`: http://scikit-learn.org/stable/modules/classes.html#module-sklearn.neighbors
390 .. _`xgboost`: https://xgboost.readthedocs.io/en/latest/python/python_api.html
391
392 .. _`sklearn.preprocessing`: http://scikit-learn.org/stable/modules/classes.html#module-sklearn.preprocessing
393 .. _`feature_selection`: http://scikit-learn.org/stable/modules/classes.html#module-sklearn.feature_selection
394 .. _`decomposition`: http://scikit-learn.org/stable/modules/classes.html#module-sklearn.decomposition
395 .. _`kernel_approximation`: http://scikit-learn.org/stable/modules/classes.html#module-sklearn.kernel_approximation
396 .. _`cluster.FeatureAgglomeration`: http://scikit-learn.org/stable/modules/generated/sklearn.cluster.FeatureAgglomeration.html
397
398 ]]>
399 </help>
400 <expand macro="sklearn_citation"/>
401 </tool>