comparison pipeline.xml @ 8:913ee94945f3 draft

planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit c0a3a186966888e5787335a7628bf0a4382637e7
author bgruening
date Tue, 14 May 2019 18:06:37 -0400
parents 99038af8deda
children 775b004b7920
comparison
equal deleted inserted replaced
7:99038af8deda 8:913ee94945f3
1 <tool id="sklearn_build_pipeline" name="Pipeline Builder" version="@VERSION@"> 1 <tool id="sklearn_build_pipeline" name="Pipeline Builder" version="@VERSION@">
2 <description>constructs a list of transforms and a final estimator</description> 2 <description>constructs a list of transforms and a final estimator</description>
3 <macros> 3 <macros>
4 <import>main_macros.xml</import> 4 <import>main_macros.xml</import>
5 </macros> 5 </macros>
6 <expand macro="python_requirements"> 6 <expand macro="python_requirements"/>
7 <requirement type="package" version="0.6">skrebate</requirement>
8 <requirement type="package" version="0.4.2">imbalanced-learn</requirement>
9 </expand>
10 <expand macro="macro_stdio"/> 7 <expand macro="macro_stdio"/>
11 <version_command>echo "@VERSION@"</version_command> 8 <version_command>echo "@VERSION@"</version_command>
12 <command> 9 <command>
13 <![CDATA[ 10 <![CDATA[
14 python "$sklearn_pipeline_script" '$inputs' 11 python "$sklearn_pipeline_script" '$inputs'
16 </command> 13 </command>
17 <configfiles> 14 <configfiles>
18 <inputs name="inputs" /> 15 <inputs name="inputs" />
19 <configfile name="sklearn_pipeline_script"> 16 <configfile name="sklearn_pipeline_script">
20 <![CDATA[ 17 <![CDATA[
18 import imblearn
21 import json 19 import json
20 import pickle
22 import pprint 21 import pprint
23 import skrebate 22 import skrebate
24 import imblearn 23 import sys
25 from imblearn import under_sampling, over_sampling, combine 24 import warnings
26 from imblearn.pipeline import Pipeline as imbPipeline 25 from mlxtend import classifier, regressor
27 from sklearn import (preprocessing, svm, linear_model, ensemble, naive_bayes, 26 from sklearn import (
28 tree, neighbors, decomposition, kernel_approximation, cluster) 27 cluster, compose, decomposition, ensemble, feature_extraction,
29 from sklearn.pipeline import Pipeline 28 feature_selection, gaussian_process, kernel_approximation, metrics,
30 29 model_selection, naive_bayes, neighbors, pipeline, preprocessing,
31 with open('$__tool_directory__/sk_whitelist.json', 'r') as f: 30 svm, linear_model, tree, discriminant_analysis)
32 sk_whitelist = json.load(f) 31 from sklearn.pipeline import make_pipeline
33 exec(open('$__tool_directory__/utils.py').read(), globals()) 32 from imblearn.pipeline import make_pipeline as imb_make_pipeline
33
34 sys.path.insert(0, '$__tool_directory__')
35
36 from utils import SafeEval, feature_selector, get_estimator, try_get_attr
37 from preprocessors import Z_RandomOverSampler
38
39 N_JOBS = int(__import__('os').environ.get('GALAXY_SLOTS', 1))
34 40
35 warnings.filterwarnings('ignore') 41 warnings.filterwarnings('ignore')
36 42
37 safe_eval = SafeEval() 43 safe_eval = SafeEval()
38 44
39 input_json_path = sys.argv[1] 45 input_json_path = sys.argv[1]
40 with open(input_json_path, 'r') as param_handler: 46 with open(input_json_path, 'r') as param_handler:
41 params = json.load(param_handler) 47 params = json.load(param_handler)
42 48
43 #if $final_estimator.estimator_selector.selected_module == 'customer_estimator': 49 #if $final_estimator.estimator_selector.selected_module == 'custom_estimator':
44 params['final_estimator']['estimator_selector']['c_estimator'] =\ 50 params['final_estimator']['estimator_selector']['c_estimator'] =\
45 '$final_estimator.estimator_selector.c_estimator' 51 '$final_estimator.estimator_selector.c_estimator'
52 #end if
53
54 #if $final_estimator.estimator_selector.selected_module == 'binarize_target':
55 params['final_estimator']['estimator_selector']['wrapped_estimator'] =\
56 '$final_estimator.estimator_selector.wrapped_estimator'
46 #end if 57 #end if
47 58
48 pipeline_steps = [] 59 pipeline_steps = []
49 60
50 def get_component(input_json, check_none=False): 61 def get_component(input_json, check_none=False):
51 is_imblearn = False 62 is_imblearn = False
52 if input_json['component_type'] == 'None': 63 if input_json['component_type'] == 'None':
53 if not check_none: 64 if not check_none:
54 return None, False 65 return None, False
55 else: 66 else:
56 sys.exit("The pre-processing component type can't be None when the number of components is greater than 1.") 67 sys.exit("The pre-processing component type can't be None "
68 "when the number of components is greater than 1.")
57 if input_json['component_type'] == 'pre_processor': 69 if input_json['component_type'] == 'pre_processor':
58 preprocessor = input_json['pre_processors']['selected_pre_processor'] 70 preprocessor = input_json['pre_processors']['selected_pre_processor']
59 pre_processor_options = input_json['pre_processors']['options'] 71 pre_processor_options = input_json['pre_processors']['options']
60 my_class = getattr(preprocessing, preprocessor) 72 my_class = getattr(preprocessing, preprocessor)
61 obj = my_class(**pre_processor_options) 73 obj = my_class(**pre_processor_options)
95 elif input_json['component_type'] == 'imblearn': 107 elif input_json['component_type'] == 'imblearn':
96 is_imblearn = True 108 is_imblearn = True
97 algorithm = input_json['imblearn_selector']['select_algorithm'] 109 algorithm = input_json['imblearn_selector']['select_algorithm']
98 if algorithm == 'over_sampling.SMOTENC': 110 if algorithm == 'over_sampling.SMOTENC':
99 obj = over_sampling.SMOTENC(categorical_features=[]) 111 obj = over_sampling.SMOTENC(categorical_features=[])
112 elif algorithm == 'Z_RandomOverSampler':
113 obj = Z_RandomOverSampler()
100 else: 114 else:
101 globals = algorithm.split('.') 115 globals = algorithm.split('.')
102 mod, klass = globals[0], globals[1] 116 mod, klass = globals[0], globals[1]
103 obj = getattr(getattr(imblearn, mod), klass)() 117 obj = getattr(getattr(imblearn, mod), klass)()
104 options = input_json['imblearn_selector']['text_params'].strip() 118 options = input_json['imblearn_selector']['text_params'].strip()
105 if options != '': 119 if options != '':
106 options = safe_eval( 'dict(' + options + ')' ) 120 options = safe_eval( 'dict(' + options + ')' )
107 obj.set_params(**options) 121 obj.set_params(**options)
122 elif input_json['component_type'] == 'IRAPS':
123 iraps_core = try_get_attr('iraps_classifier','IRAPSCore')()
124 core_params = input_json['text_params'].strip()
125 if core_params != '':
126 try:
127 params = safe_eval('dict(' + core_params + ')')
128 except ValueError:
129 sys.exit("Unsupported parameter input: `%s`" % core_params)
130 iraps_core.set_params(**params)
131 options = {}
132 if input_json['p_thres'] is not None:
133 options['p_thres'] = input_json['p_thres']
134 if input_json['fc_thres'] is not None:
135 options['fc_thres'] = input_json['fc_thres']
136 if input_json['occurrence'] is not None:
137 options['occurrence'] = input_json['occurrence']
138 if input_json['discretize'] is not None:
139 options['discretize'] = input_json['discretize']
140 IRAPSClassifier = try_get_attr('iraps_classifier','IRAPSClassifier')
141 obj = IRAPSClassifier(iraps_core, **options)
108 if 'n_jobs' in obj.get_params(): 142 if 'n_jobs' in obj.get_params():
109 obj.set_params( n_jobs=N_JOBS ) 143 obj.set_params( n_jobs=N_JOBS )
110 return obj, is_imblearn 144 return obj, is_imblearn
111 145
112 has_imblearn = False 146 has_imblearn = False
113 if len(params['pipeline_component']) == 1: 147 if len(params['pipeline_component']) == 1:
114 step_obj, is_imblearn = get_component( params['pipeline_component'][0]['component_selector']) 148 step_obj, is_imblearn = get_component( params['pipeline_component'][0]['component_selector'])
115 if step_obj: 149 if step_obj:
116 pipeline_steps.append( ('preprocessing_1', step_obj) ) 150 pipeline_steps.append( step_obj )
117 if is_imblearn: 151 if is_imblearn:
118 has_imblearn = True 152 has_imblearn = True
119 else: 153 else:
120 for i, c in enumerate(params['pipeline_component']): 154 for i, c in enumerate(params['pipeline_component']):
121 step_obj, is_imblearn = get_component( c['component_selector'], check_none=True ) 155 step_obj, is_imblearn = get_component( c['component_selector'], check_none=True )
122 pipeline_steps.append( ('preprocessing_' + str(i+1), step_obj) ) 156 pipeline_steps.append( step_obj )
123 if is_imblearn: 157 if is_imblearn:
124 has_imblearn = True 158 has_imblearn = True
125 159
126 # Set up final estimator and add to pipeline. 160 ## Set up final estimator and add to pipeline.
127 estimator_json = params['final_estimator']['estimator_selector'] 161 estimator_json = params['final_estimator']['estimator_selector']
128 if estimator_json['selected_module'] == 'none': 162 if estimator_json['selected_module'] == 'none':
129 if len(pipeline_steps) == 0: 163 if len(pipeline_steps) == 0:
130 sys.exit("No pipeline steps specified!") 164 sys.exit("No pipeline steps specified!")
131 else: # turn the last pre-process component to final estimator 165 ## else: turn the last pre-process component to final estimator
132 pipeline_steps[-1] = ('estimator', pipeline_steps[-1][-1])
133 else: 166 else:
134 estimator = get_estimator(estimator_json) 167 estimator = get_estimator(estimator_json)
135 pipeline_steps.append( ('estimator', estimator) ) 168 pipeline_steps.append( estimator )
136 169
170 #if $output_type == 'Final_Estimator_Builder':
171 with open('$outfile', 'wb') as out_handler:
172 final_est = pipeline_steps[-1]
173 print(final_est)
174 pickle.dump(final_est, out_handler, pickle.HIGHEST_PROTOCOL)
175 #else:
137 if has_imblearn: 176 if has_imblearn:
138 pipeline = imbPipeline(pipeline_steps) 177 pipeline = imb_make_pipeline(*pipeline_steps)
139 else: 178 else:
140 pipeline = Pipeline(pipeline_steps) 179 pipeline = make_pipeline(*pipeline_steps)
141 pprint.pprint(pipeline.named_steps) 180 pprint.pprint(pipeline.named_steps)
142 181
143 with open('$outfile', 'wb') as out_handler: 182 with open('$outfile', 'wb') as out_handler:
144 pickle.dump(pipeline, out_handler, pickle.HIGHEST_PROTOCOL) 183 pickle.dump(pipeline, out_handler, pickle.HIGHEST_PROTOCOL)
145 184 #end if
146 ]]> 185 ]]>
147 </configfile> 186 </configfile>
148 </configfiles> 187 </configfiles>
149 <inputs> 188 <inputs>
150 <repeat name="pipeline_component" min="1" max="5" title="Pre-processing step"> 189 <repeat name="pipeline_component" min="1" max="5" title="Pre-processing step">
156 <option value="decomposition">Matrix Decomposition</option> 195 <option value="decomposition">Matrix Decomposition</option>
157 <option value="kernel_approximation">Kernel Approximation</option> 196 <option value="kernel_approximation">Kernel Approximation</option>
158 <option value="FeatureAgglomeration">Agglomerate Features</option> 197 <option value="FeatureAgglomeration">Agglomerate Features</option>
159 <option value="skrebate">SK-rebate feature selection</option> 198 <option value="skrebate">SK-rebate feature selection</option>
160 <option value="imblearn">imbalanced-learn sampling</option> 199 <option value="imblearn">imbalanced-learn sampling</option>
200 <option value="IRAPS">IRAPS -- feature selector and classifier</option>
161 </param> 201 </param>
162 <when value="None"/> 202 <when value="None"/>
163 <when value="pre_processor"> 203 <when value="pre_processor">
164 <conditional name="pre_processors"> 204 <conditional name="pre_processors">
165 <expand macro="sparse_preprocessors_ext" /> 205 <expand macro="sparse_preprocessors_ext" />
182 <expand macro="skrebate"/> 222 <expand macro="skrebate"/>
183 </when> 223 </when>
184 <when value="imblearn"> 224 <when value="imblearn">
185 <expand macro="imbalanced_learn_sampling"/> 225 <expand macro="imbalanced_learn_sampling"/>
186 </when> 226 </when>
227 <when value="IRAPS">
228 <expand macro="estimator_params_text"
229 label="Type in parameter settings for IRAPSCore if different from default:"
230 help="Default(=blank): n_iter=1000, responsive_thres=-1, resistant_thres=0, random_state=None. No double quotes"/>
231 <param argument="p_thres" type="float" value="0.001" label="P value threshold" help="Float. default=0.001"/>
232 <param argument="fc_thres" type="float" value="0.1" label="fold change threshold" help="Float. default=0.1"/>
233 <param argument="occurrence" type="float" value="0.7" label="reservation factor" help="Float. default=0.7"/>
234 <param argument="discretize" type="float" value="-1" label="The z_score threshold to discretize target value" help="Float. default=-1"/>
235 </when>
187 </conditional> 236 </conditional>
188 </repeat> 237 </repeat>
189 <section name="final_estimator" title="Final Estimator" expanded="true"> 238 <section name="final_estimator" title="Final Estimator" expanded="true">
190 <conditional name="estimator_selector"> 239 <conditional name="estimator_selector">
191 <param name="selected_module" type="select" label="Choose the module that contains target estimator:" > 240 <param name="selected_module" type="select" label="Choose the module that contains target estimator:" >
192 <expand macro="estimator_module_options"> 241 <expand macro="estimator_module_options">
193 <option value="customer_estimator">Load a customer estimator</option> 242 <option value="binarize_target">Binarize Target Classifier or Regressor</option>
243 <option value="custom_estimator">Load a custom estimator</option>
194 <option value="none">none -- The last component of pre-processing step will turn to a final estimator</option> 244 <option value="none">none -- The last component of pre-processing step will turn to a final estimator</option>
195 </expand> 245 </expand>
196 </param> 246 </param>
197 <expand macro="estimator_suboptions"> 247 <expand macro="estimator_suboptions">
198 <when value="customer_estimator"> 248 <when value="binarize_target">
199 <param name="c_estimator" type="data" format="zip" label="Choose the dataset containing the customer estimator or pipeline:"/> 249 <param name="clf_or_regr" type="select" label="Classifier or Regressor:">
250 <option value="BinarizeTargetClassifier">BinarizeTargetClassifier</option>
251 <option value="BinarizeTargetRegressor">BinarizeTargetRegressor</option>
252 </param>
253 <param name="wrapped_estimator" type="data" format="zip" label="Choose the dataset containing the wrapped estimator or pipeline"/>
254 <param name='z_score' type="float" value="-1" optional="false" label="Discretize target values using z_score"/>
255 <param name='value' type="float" value="" optional="true" label="Discretize target values using a fixed value instead" help="Optional. default: None."/>
256 <param name="less_is_positive" type="boolean" truevalue="booltrue" falsevalue="boolfalse" checked="true" label="Are the detecting values smaller than others?"/>
257 </when>
258 <when value="custom_estimator">
259 <param name="c_estimator" type="data" format="zip" label="Choose the dataset containing the custom estimator or pipeline"/>
200 </when> 260 </when>
201 <when value="none"/> 261 <when value="none"/>
202 </expand> 262 </expand>
203 </conditional> 263 </conditional>
204 </section> 264 </section>
265 <param name="output_type" type="select" label="Output the final estimator instead?">
266 <option value="Pipeline_Builder" selected="true">Pipeline</option>
267 <option value="Final_Estimator_Builder">Final Estimator</option>
268 </param>
205 </inputs> 269 </inputs>
206 <outputs> 270 <outputs>
207 <data format="zip" name="outfile"/> 271 <data format="zip" name="outfile" label="${output_type}"/>
208 </outputs> 272 </outputs>
209 <tests> 273 <tests>
210 <test> 274 <test>
211 <repeat name="pipeline_component"> 275 <repeat name="pipeline_component">
212 <conditional name="component_selector"> 276 <conditional name="component_selector">
311 <param name="text_params" value="n_clusters=3, affinity='euclidean'"/> 375 <param name="text_params" value="n_clusters=3, affinity='euclidean'"/>
312 </conditional> 376 </conditional>
313 </conditional> 377 </conditional>
314 <param name="selected_module" value="ensemble"/> 378 <param name="selected_module" value="ensemble"/>
315 <param name="selected_estimator" value="AdaBoostClassifier"/> 379 <param name="selected_estimator" value="AdaBoostClassifier"/>
316 <output name="outfile" file="pipeline08" compare="sim_size" delta="5"/> 380 <output name="outfile" file="pipeline08" compare="sim_size" delta="20"/>
317 </test> 381 </test>
318 <test> 382 <test>
319 <conditional name="component_selector"> 383 <conditional name="component_selector">
320 <param name="component_type" value="skrebate"/> 384 <param name="component_type" value="skrebate"/>
321 <conditional name="skrebate_selector"> 385 <conditional name="skrebate_selector">
370 <conditional name="estimator_selector"> 434 <conditional name="estimator_selector">
371 <param name="selected_module" value="none"/> 435 <param name="selected_module" value="none"/>
372 </conditional> 436 </conditional>
373 </section> 437 </section>
374 <output name="outfile" file="pipeline12" compare="sim_size" delta="5"/> 438 <output name="outfile" file="pipeline12" compare="sim_size" delta="5"/>
439 </test>
440 <test>
441 <conditional name="component_selector">
442 <param name="component_type" value="None"/>
443 </conditional>
444 <param name="selected_module" value="ensemble"/>
445 <param name="selected_estimator" value="RandomForestClassifier"/>
446 <param name="output_type" value="Final_Estimator_Builder"/>
447 <output name="outfile" file="RandomForestClassifier.zip" compare="sim_size" delta="5"/>
448 </test>
449 <test>
450 <conditional name="component_selector">
451 <param name="component_type" value="IRAPS"/>
452 </conditional>
453 <section name="final_estimator">
454 <conditional name="estimator_selector">
455 <param name="selected_module" value="none"/>
456 </conditional>
457 </section>
458 <param name="output_type" value="Final_Estimator_Builder"/>
459 <output name="outfile" file="pipeline14" compare="sim_size" delta="5"/>
460 </test>
461 <test>
462 <conditional name="component_selector">
463 <param name="component_type" value="None"/>
464 </conditional>
465 <section name="final_estimator">
466 <conditional name="estimator_selector">
467 <param name="selected_module" value="binarize_target"/>
468 <param name="clf_or_regr" value="BinarizeTargetClassifier"/>
469 <param name="wrapped_estimator" value="RandomForestClassifier.zip" ftype="zip"/>
470 </conditional>
471 </section>
472 <param name="output_type" value="Final_Estimator_Builder"/>
473 <output name="outfile" file="pipeline15" compare="sim_size" delta="5"/>
375 </test> 474 </test>
376 </tests> 475 </tests>
377 <help> 476 <help>
378 <![CDATA[ 477 <![CDATA[
379 **What it does** 478 **What it does**