Mercurial > repos > bgruening > sklearn_build_pipeline
comparison pipeline.xml @ 8:913ee94945f3 draft
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit c0a3a186966888e5787335a7628bf0a4382637e7
author | bgruening |
---|---|
date | Tue, 14 May 2019 18:06:37 -0400 |
parents | 99038af8deda |
children | 775b004b7920 |
comparison
equal
deleted
inserted
replaced
7:99038af8deda | 8:913ee94945f3 |
---|---|
1 <tool id="sklearn_build_pipeline" name="Pipeline Builder" version="@VERSION@"> | 1 <tool id="sklearn_build_pipeline" name="Pipeline Builder" version="@VERSION@"> |
2 <description>constructs a list of transforms and a final estimator</description> | 2 <description>constructs a list of transforms and a final estimator</description> |
3 <macros> | 3 <macros> |
4 <import>main_macros.xml</import> | 4 <import>main_macros.xml</import> |
5 </macros> | 5 </macros> |
6 <expand macro="python_requirements"> | 6 <expand macro="python_requirements"/> |
7 <requirement type="package" version="0.6">skrebate</requirement> | |
8 <requirement type="package" version="0.4.2">imbalanced-learn</requirement> | |
9 </expand> | |
10 <expand macro="macro_stdio"/> | 7 <expand macro="macro_stdio"/> |
11 <version_command>echo "@VERSION@"</version_command> | 8 <version_command>echo "@VERSION@"</version_command> |
12 <command> | 9 <command> |
13 <![CDATA[ | 10 <![CDATA[ |
14 python "$sklearn_pipeline_script" '$inputs' | 11 python "$sklearn_pipeline_script" '$inputs' |
16 </command> | 13 </command> |
17 <configfiles> | 14 <configfiles> |
18 <inputs name="inputs" /> | 15 <inputs name="inputs" /> |
19 <configfile name="sklearn_pipeline_script"> | 16 <configfile name="sklearn_pipeline_script"> |
20 <![CDATA[ | 17 <![CDATA[ |
18 import imblearn | |
21 import json | 19 import json |
20 import pickle | |
22 import pprint | 21 import pprint |
23 import skrebate | 22 import skrebate |
24 import imblearn | 23 import sys |
25 from imblearn import under_sampling, over_sampling, combine | 24 import warnings |
26 from imblearn.pipeline import Pipeline as imbPipeline | 25 from mlxtend import classifier, regressor |
27 from sklearn import (preprocessing, svm, linear_model, ensemble, naive_bayes, | 26 from sklearn import ( |
28 tree, neighbors, decomposition, kernel_approximation, cluster) | 27 cluster, compose, decomposition, ensemble, feature_extraction, |
29 from sklearn.pipeline import Pipeline | 28 feature_selection, gaussian_process, kernel_approximation, metrics, |
30 | 29 model_selection, naive_bayes, neighbors, pipeline, preprocessing, |
31 with open('$__tool_directory__/sk_whitelist.json', 'r') as f: | 30 svm, linear_model, tree, discriminant_analysis) |
32 sk_whitelist = json.load(f) | 31 from sklearn.pipeline import make_pipeline |
33 exec(open('$__tool_directory__/utils.py').read(), globals()) | 32 from imblearn.pipeline import make_pipeline as imb_make_pipeline |
33 | |
34 sys.path.insert(0, '$__tool_directory__') | |
35 | |
36 from utils import SafeEval, feature_selector, get_estimator, try_get_attr | |
37 from preprocessors import Z_RandomOverSampler | |
38 | |
39 N_JOBS = int(__import__('os').environ.get('GALAXY_SLOTS', 1)) | |
34 | 40 |
35 warnings.filterwarnings('ignore') | 41 warnings.filterwarnings('ignore') |
36 | 42 |
37 safe_eval = SafeEval() | 43 safe_eval = SafeEval() |
38 | 44 |
39 input_json_path = sys.argv[1] | 45 input_json_path = sys.argv[1] |
40 with open(input_json_path, 'r') as param_handler: | 46 with open(input_json_path, 'r') as param_handler: |
41 params = json.load(param_handler) | 47 params = json.load(param_handler) |
42 | 48 |
43 #if $final_estimator.estimator_selector.selected_module == 'customer_estimator': | 49 #if $final_estimator.estimator_selector.selected_module == 'custom_estimator': |
44 params['final_estimator']['estimator_selector']['c_estimator'] =\ | 50 params['final_estimator']['estimator_selector']['c_estimator'] =\ |
45 '$final_estimator.estimator_selector.c_estimator' | 51 '$final_estimator.estimator_selector.c_estimator' |
52 #end if | |
53 | |
54 #if $final_estimator.estimator_selector.selected_module == 'binarize_target': | |
55 params['final_estimator']['estimator_selector']['wrapped_estimator'] =\ | |
56 '$final_estimator.estimator_selector.wrapped_estimator' | |
46 #end if | 57 #end if |
47 | 58 |
48 pipeline_steps = [] | 59 pipeline_steps = [] |
49 | 60 |
50 def get_component(input_json, check_none=False): | 61 def get_component(input_json, check_none=False): |
51 is_imblearn = False | 62 is_imblearn = False |
52 if input_json['component_type'] == 'None': | 63 if input_json['component_type'] == 'None': |
53 if not check_none: | 64 if not check_none: |
54 return None, False | 65 return None, False |
55 else: | 66 else: |
56 sys.exit("The pre-processing component type can't be None when the number of components is greater than 1.") | 67 sys.exit("The pre-processing component type can't be None " |
68 "when the number of components is greater than 1.") | |
57 if input_json['component_type'] == 'pre_processor': | 69 if input_json['component_type'] == 'pre_processor': |
58 preprocessor = input_json['pre_processors']['selected_pre_processor'] | 70 preprocessor = input_json['pre_processors']['selected_pre_processor'] |
59 pre_processor_options = input_json['pre_processors']['options'] | 71 pre_processor_options = input_json['pre_processors']['options'] |
60 my_class = getattr(preprocessing, preprocessor) | 72 my_class = getattr(preprocessing, preprocessor) |
61 obj = my_class(**pre_processor_options) | 73 obj = my_class(**pre_processor_options) |
95 elif input_json['component_type'] == 'imblearn': | 107 elif input_json['component_type'] == 'imblearn': |
96 is_imblearn = True | 108 is_imblearn = True |
97 algorithm = input_json['imblearn_selector']['select_algorithm'] | 109 algorithm = input_json['imblearn_selector']['select_algorithm'] |
98 if algorithm == 'over_sampling.SMOTENC': | 110 if algorithm == 'over_sampling.SMOTENC': |
99 obj = over_sampling.SMOTENC(categorical_features=[]) | 111 obj = over_sampling.SMOTENC(categorical_features=[]) |
112 elif algorithm == 'Z_RandomOverSampler': | |
113 obj = Z_RandomOverSampler() | |
100 else: | 114 else: |
101 globals = algorithm.split('.') | 115 globals = algorithm.split('.') |
102 mod, klass = globals[0], globals[1] | 116 mod, klass = globals[0], globals[1] |
103 obj = getattr(getattr(imblearn, mod), klass)() | 117 obj = getattr(getattr(imblearn, mod), klass)() |
104 options = input_json['imblearn_selector']['text_params'].strip() | 118 options = input_json['imblearn_selector']['text_params'].strip() |
105 if options != '': | 119 if options != '': |
106 options = safe_eval( 'dict(' + options + ')' ) | 120 options = safe_eval( 'dict(' + options + ')' ) |
107 obj.set_params(**options) | 121 obj.set_params(**options) |
122 elif input_json['component_type'] == 'IRAPS': | |
123 iraps_core = try_get_attr('iraps_classifier','IRAPSCore')() | |
124 core_params = input_json['text_params'].strip() | |
125 if core_params != '': | |
126 try: | |
127 params = safe_eval('dict(' + core_params + ')') | |
128 except ValueError: | |
129 sys.exit("Unsupported parameter input: `%s`" % core_params) | |
130 iraps_core.set_params(**params) | |
131 options = {} | |
132 if input_json['p_thres'] is not None: | |
133 options['p_thres'] = input_json['p_thres'] | |
134 if input_json['fc_thres'] is not None: | |
135 options['fc_thres'] = input_json['fc_thres'] | |
136 if input_json['occurrence'] is not None: | |
137 options['occurrence'] = input_json['occurrence'] | |
138 if input_json['discretize'] is not None: | |
139 options['discretize'] = input_json['discretize'] | |
140 IRAPSClassifier = try_get_attr('iraps_classifier','IRAPSClassifier') | |
141 obj = IRAPSClassifier(iraps_core, **options) | |
108 if 'n_jobs' in obj.get_params(): | 142 if 'n_jobs' in obj.get_params(): |
109 obj.set_params( n_jobs=N_JOBS ) | 143 obj.set_params( n_jobs=N_JOBS ) |
110 return obj, is_imblearn | 144 return obj, is_imblearn |
111 | 145 |
112 has_imblearn = False | 146 has_imblearn = False |
113 if len(params['pipeline_component']) == 1: | 147 if len(params['pipeline_component']) == 1: |
114 step_obj, is_imblearn = get_component( params['pipeline_component'][0]['component_selector']) | 148 step_obj, is_imblearn = get_component( params['pipeline_component'][0]['component_selector']) |
115 if step_obj: | 149 if step_obj: |
116 pipeline_steps.append( ('preprocessing_1', step_obj) ) | 150 pipeline_steps.append( step_obj ) |
117 if is_imblearn: | 151 if is_imblearn: |
118 has_imblearn = True | 152 has_imblearn = True |
119 else: | 153 else: |
120 for i, c in enumerate(params['pipeline_component']): | 154 for i, c in enumerate(params['pipeline_component']): |
121 step_obj, is_imblearn = get_component( c['component_selector'], check_none=True ) | 155 step_obj, is_imblearn = get_component( c['component_selector'], check_none=True ) |
122 pipeline_steps.append( ('preprocessing_' + str(i+1), step_obj) ) | 156 pipeline_steps.append( step_obj ) |
123 if is_imblearn: | 157 if is_imblearn: |
124 has_imblearn = True | 158 has_imblearn = True |
125 | 159 |
126 # Set up final estimator and add to pipeline. | 160 ## Set up final estimator and add to pipeline. |
127 estimator_json = params['final_estimator']['estimator_selector'] | 161 estimator_json = params['final_estimator']['estimator_selector'] |
128 if estimator_json['selected_module'] == 'none': | 162 if estimator_json['selected_module'] == 'none': |
129 if len(pipeline_steps) == 0: | 163 if len(pipeline_steps) == 0: |
130 sys.exit("No pipeline steps specified!") | 164 sys.exit("No pipeline steps specified!") |
131 else: # turn the last pre-process component to final estimator | 165 ## else: turn the last pre-process component to final estimator |
132 pipeline_steps[-1] = ('estimator', pipeline_steps[-1][-1]) | |
133 else: | 166 else: |
134 estimator = get_estimator(estimator_json) | 167 estimator = get_estimator(estimator_json) |
135 pipeline_steps.append( ('estimator', estimator) ) | 168 pipeline_steps.append( estimator ) |
136 | 169 |
170 #if $output_type == 'Final_Estimator_Builder': | |
171 with open('$outfile', 'wb') as out_handler: | |
172 final_est = pipeline_steps[-1] | |
173 print(final_est) | |
174 pickle.dump(final_est, out_handler, pickle.HIGHEST_PROTOCOL) | |
175 #else: | |
137 if has_imblearn: | 176 if has_imblearn: |
138 pipeline = imbPipeline(pipeline_steps) | 177 pipeline = imb_make_pipeline(*pipeline_steps) |
139 else: | 178 else: |
140 pipeline = Pipeline(pipeline_steps) | 179 pipeline = make_pipeline(*pipeline_steps) |
141 pprint.pprint(pipeline.named_steps) | 180 pprint.pprint(pipeline.named_steps) |
142 | 181 |
143 with open('$outfile', 'wb') as out_handler: | 182 with open('$outfile', 'wb') as out_handler: |
144 pickle.dump(pipeline, out_handler, pickle.HIGHEST_PROTOCOL) | 183 pickle.dump(pipeline, out_handler, pickle.HIGHEST_PROTOCOL) |
145 | 184 #end if |
146 ]]> | 185 ]]> |
147 </configfile> | 186 </configfile> |
148 </configfiles> | 187 </configfiles> |
149 <inputs> | 188 <inputs> |
150 <repeat name="pipeline_component" min="1" max="5" title="Pre-processing step"> | 189 <repeat name="pipeline_component" min="1" max="5" title="Pre-processing step"> |
156 <option value="decomposition">Matrix Decomposition</option> | 195 <option value="decomposition">Matrix Decomposition</option> |
157 <option value="kernel_approximation">Kernel Approximation</option> | 196 <option value="kernel_approximation">Kernel Approximation</option> |
158 <option value="FeatureAgglomeration">Agglomerate Features</option> | 197 <option value="FeatureAgglomeration">Agglomerate Features</option> |
159 <option value="skrebate">SK-rebate feature selection</option> | 198 <option value="skrebate">SK-rebate feature selection</option> |
160 <option value="imblearn">imbalanced-learn sampling</option> | 199 <option value="imblearn">imbalanced-learn sampling</option> |
200 <option value="IRAPS">IRAPS -- feature selector and classifier</option> | |
161 </param> | 201 </param> |
162 <when value="None"/> | 202 <when value="None"/> |
163 <when value="pre_processor"> | 203 <when value="pre_processor"> |
164 <conditional name="pre_processors"> | 204 <conditional name="pre_processors"> |
165 <expand macro="sparse_preprocessors_ext" /> | 205 <expand macro="sparse_preprocessors_ext" /> |
182 <expand macro="skrebate"/> | 222 <expand macro="skrebate"/> |
183 </when> | 223 </when> |
184 <when value="imblearn"> | 224 <when value="imblearn"> |
185 <expand macro="imbalanced_learn_sampling"/> | 225 <expand macro="imbalanced_learn_sampling"/> |
186 </when> | 226 </when> |
227 <when value="IRAPS"> | |
228 <expand macro="estimator_params_text" | |
229 label="Type in parameter settings for IRAPSCore if different from default:" | |
230 help="Default(=blank): n_iter=1000, responsive_thres=-1, resistant_thres=0, random_state=None. No double quotes"/> | |
231 <param argument="p_thres" type="float" value="0.001" label="P value threshold" help="Float. default=0.001"/> | |
232 <param argument="fc_thres" type="float" value="0.1" label="fold change threshold" help="Float. default=0.1"/> | |
233 <param argument="occurrence" type="float" value="0.7" label="reservation factor" help="Float. default=0.7"/> | |
234 <param argument="discretize" type="float" value="-1" label="The z_score threshold to discretize target value" help="Float. default=-1"/> | |
235 </when> | |
187 </conditional> | 236 </conditional> |
188 </repeat> | 237 </repeat> |
189 <section name="final_estimator" title="Final Estimator" expanded="true"> | 238 <section name="final_estimator" title="Final Estimator" expanded="true"> |
190 <conditional name="estimator_selector"> | 239 <conditional name="estimator_selector"> |
191 <param name="selected_module" type="select" label="Choose the module that contains target estimator:" > | 240 <param name="selected_module" type="select" label="Choose the module that contains target estimator:" > |
192 <expand macro="estimator_module_options"> | 241 <expand macro="estimator_module_options"> |
193 <option value="customer_estimator">Load a customer estimator</option> | 242 <option value="binarize_target">Binarize Target Classifier or Regressor</option> |
243 <option value="custom_estimator">Load a custom estimator</option> | |
194 <option value="none">none -- The last component of pre-processing step will turn to a final estimator</option> | 244 <option value="none">none -- The last component of pre-processing step will turn to a final estimator</option> |
195 </expand> | 245 </expand> |
196 </param> | 246 </param> |
197 <expand macro="estimator_suboptions"> | 247 <expand macro="estimator_suboptions"> |
198 <when value="customer_estimator"> | 248 <when value="binarize_target"> |
199 <param name="c_estimator" type="data" format="zip" label="Choose the dataset containing the customer estimator or pipeline:"/> | 249 <param name="clf_or_regr" type="select" label="Classifier or Regressor:"> |
250 <option value="BinarizeTargetClassifier">BinarizeTargetClassifier</option> | |
251 <option value="BinarizeTargetRegressor">BinarizeTargetRegressor</option> | |
252 </param> | |
253 <param name="wrapped_estimator" type="data" format="zip" label="Choose the dataset containing the wrapped estimator or pipeline"/> | |
254 <param name='z_score' type="float" value="-1" optional="false" label="Discretize target values using z_score"/> | |
255 <param name='value' type="float" value="" optional="true" label="Discretize target values using a fixed value instead" help="Optional. default: None."/> | |
256 <param name="less_is_positive" type="boolean" truevalue="booltrue" falsevalue="boolfalse" checked="true" label="Are the detecting values smaller than others?"/> | |
257 </when> | |
258 <when value="custom_estimator"> | |
259 <param name="c_estimator" type="data" format="zip" label="Choose the dataset containing the custom estimator or pipeline"/> | |
200 </when> | 260 </when> |
201 <when value="none"/> | 261 <when value="none"/> |
202 </expand> | 262 </expand> |
203 </conditional> | 263 </conditional> |
204 </section> | 264 </section> |
265 <param name="output_type" type="select" label="Output the final estimator instead?"> | |
266 <option value="Pipeline_Builder" selected="true">Pipeline</option> | |
267 <option value="Final_Estimator_Builder">Final Estimator</option> | |
268 </param> | |
205 </inputs> | 269 </inputs> |
206 <outputs> | 270 <outputs> |
207 <data format="zip" name="outfile"/> | 271 <data format="zip" name="outfile" label="${output_type}"/> |
208 </outputs> | 272 </outputs> |
209 <tests> | 273 <tests> |
210 <test> | 274 <test> |
211 <repeat name="pipeline_component"> | 275 <repeat name="pipeline_component"> |
212 <conditional name="component_selector"> | 276 <conditional name="component_selector"> |
311 <param name="text_params" value="n_clusters=3, affinity='euclidean'"/> | 375 <param name="text_params" value="n_clusters=3, affinity='euclidean'"/> |
312 </conditional> | 376 </conditional> |
313 </conditional> | 377 </conditional> |
314 <param name="selected_module" value="ensemble"/> | 378 <param name="selected_module" value="ensemble"/> |
315 <param name="selected_estimator" value="AdaBoostClassifier"/> | 379 <param name="selected_estimator" value="AdaBoostClassifier"/> |
316 <output name="outfile" file="pipeline08" compare="sim_size" delta="5"/> | 380 <output name="outfile" file="pipeline08" compare="sim_size" delta="20"/> |
317 </test> | 381 </test> |
318 <test> | 382 <test> |
319 <conditional name="component_selector"> | 383 <conditional name="component_selector"> |
320 <param name="component_type" value="skrebate"/> | 384 <param name="component_type" value="skrebate"/> |
321 <conditional name="skrebate_selector"> | 385 <conditional name="skrebate_selector"> |
370 <conditional name="estimator_selector"> | 434 <conditional name="estimator_selector"> |
371 <param name="selected_module" value="none"/> | 435 <param name="selected_module" value="none"/> |
372 </conditional> | 436 </conditional> |
373 </section> | 437 </section> |
374 <output name="outfile" file="pipeline12" compare="sim_size" delta="5"/> | 438 <output name="outfile" file="pipeline12" compare="sim_size" delta="5"/> |
439 </test> | |
440 <test> | |
441 <conditional name="component_selector"> | |
442 <param name="component_type" value="None"/> | |
443 </conditional> | |
444 <param name="selected_module" value="ensemble"/> | |
445 <param name="selected_estimator" value="RandomForestClassifier"/> | |
446 <param name="output_type" value="Final_Estimator_Builder"/> | |
447 <output name="outfile" file="RandomForestClassifier.zip" compare="sim_size" delta="5"/> | |
448 </test> | |
449 <test> | |
450 <conditional name="component_selector"> | |
451 <param name="component_type" value="IRAPS"/> | |
452 </conditional> | |
453 <section name="final_estimator"> | |
454 <conditional name="estimator_selector"> | |
455 <param name="selected_module" value="none"/> | |
456 </conditional> | |
457 </section> | |
458 <param name="output_type" value="Final_Estimator_Builder"/> | |
459 <output name="outfile" file="pipeline14" compare="sim_size" delta="5"/> | |
460 </test> | |
461 <test> | |
462 <conditional name="component_selector"> | |
463 <param name="component_type" value="None"/> | |
464 </conditional> | |
465 <section name="final_estimator"> | |
466 <conditional name="estimator_selector"> | |
467 <param name="selected_module" value="binarize_target"/> | |
468 <param name="clf_or_regr" value="BinarizeTargetClassifier"/> | |
469 <param name="wrapped_estimator" value="RandomForestClassifier.zip" ftype="zip"/> | |
470 </conditional> | |
471 </section> | |
472 <param name="output_type" value="Final_Estimator_Builder"/> | |
473 <output name="outfile" file="pipeline15" compare="sim_size" delta="5"/> | |
375 </test> | 474 </test> |
376 </tests> | 475 </tests> |
377 <help> | 476 <help> |
378 <![CDATA[ | 477 <![CDATA[ |
379 **What it does** | 478 **What it does** |