Mercurial > repos > bgruening > sklearn_build_pipeline
diff pipeline.xml @ 7:99038af8deda draft
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit 57f4407e278a615f47a377a3328782b1d8e0b54d
author | bgruening |
---|---|
date | Sun, 30 Dec 2018 01:52:15 -0500 |
parents | 52c4e0ef580a |
children | 913ee94945f3 |
line wrap: on
line diff
--- a/pipeline.xml Thu Oct 11 03:30:40 2018 -0400 +++ b/pipeline.xml Sun Dec 30 01:52:15 2018 -0500 @@ -5,6 +5,7 @@ </macros> <expand macro="python_requirements"> <requirement type="package" version="0.6">skrebate</requirement> + <requirement type="package" version="0.4.2">imbalanced-learn</requirement> </expand> <expand macro="macro_stdio"/> <version_command>echo "@VERSION@"</version_command> @@ -17,34 +18,45 @@ <inputs name="inputs" /> <configfile name="sklearn_pipeline_script"> <![CDATA[ -import sys -import os import json import pprint import skrebate +import imblearn +from imblearn import under_sampling, over_sampling, combine +from imblearn.pipeline import Pipeline as imbPipeline from sklearn import (preprocessing, svm, linear_model, ensemble, naive_bayes, tree, neighbors, decomposition, kernel_approximation, cluster) from sklearn.pipeline import Pipeline -exec(open("$__tool_directory__/utils.py").read(), globals()) +with open('$__tool_directory__/sk_whitelist.json', 'r') as f: + sk_whitelist = json.load(f) +exec(open('$__tool_directory__/utils.py').read(), globals()) + +warnings.filterwarnings('ignore') safe_eval = SafeEval() input_json_path = sys.argv[1] -with open(input_json_path, "r") as param_handler: +with open(input_json_path, 'r') as param_handler: params = json.load(param_handler) +#if $final_estimator.estimator_selector.selected_module == 'customer_estimator': +params['final_estimator']['estimator_selector']['c_estimator'] =\ + '$final_estimator.estimator_selector.c_estimator' +#end if + pipeline_steps = [] def get_component(input_json, check_none=False): + is_imblearn = False if input_json['component_type'] == 'None': if not check_none: - return + return None, False else: sys.exit("The pre-processing component type can't be None when the number of components is greater than 1.") if input_json['component_type'] == 'pre_processor': - preprocessor = input_json["pre_processors"]["selected_pre_processor"] - pre_processor_options = input_json["pre_processors"]["options"] + preprocessor = input_json['pre_processors']['selected_pre_processor'] + pre_processor_options = input_json['pre_processors']['options'] my_class = getattr(preprocessing, preprocessor) obj = my_class(**pre_processor_options) elif input_json['component_type'] == 'feature_selection': @@ -53,22 +65,22 @@ algorithm = input_json['matrix_decomposition_selector']['select_algorithm'] obj = getattr(decomposition, algorithm)() options = input_json['matrix_decomposition_selector']['text_params'].strip() - if options != "": - options = safe_eval('dict(' + options + ')') + if options != '': + options = safe_eval( 'dict(' + options + ')' ) obj.set_params(**options) elif input_json['component_type'] == 'kernel_approximation': algorithm = input_json['kernel_approximation_selector']['select_algorithm'] obj = getattr(kernel_approximation, algorithm)() options = input_json['kernel_approximation_selector']['text_params'].strip() - if options != "": - options = safe_eval('dict(' + options + ')') + if options != '': + options = safe_eval( 'dict(' + options + ')' ) obj.set_params(**options) elif input_json['component_type'] == 'FeatureAgglomeration': algorithm = input_json['FeatureAgglomeration_selector']['select_algorithm'] obj = getattr(cluster, algorithm)() options = input_json['FeatureAgglomeration_selector']['text_params'].strip() - if options != "": - options = safe_eval('dict(' + options + ')') + if options != '': + options = safe_eval( 'dict(' + options + ')' ) obj.set_params(**options) elif input_json['component_type'] == 'skrebate': algorithm = input_json['skrebate_selector']['select_algorithm'] @@ -77,32 +89,58 @@ else: obj = getattr(skrebate, algorithm)() options = input_json['skrebate_selector']['text_params'].strip() - if options != "": - options = safe_eval('dict(' + options + ')') + if options != '': + options = safe_eval( 'dict(' + options + ')' ) + obj.set_params(**options) + elif input_json['component_type'] == 'imblearn': + is_imblearn = True + algorithm = input_json['imblearn_selector']['select_algorithm'] + if algorithm == 'over_sampling.SMOTENC': + obj = over_sampling.SMOTENC(categorical_features=[]) + else: + globals = algorithm.split('.') + mod, klass = globals[0], globals[1] + obj = getattr(getattr(imblearn, mod), klass)() + options = input_json['imblearn_selector']['text_params'].strip() + if options != '': + options = safe_eval( 'dict(' + options + ')' ) obj.set_params(**options) if 'n_jobs' in obj.get_params(): obj.set_params( n_jobs=N_JOBS ) - return obj + return obj, is_imblearn +has_imblearn = False if len(params['pipeline_component']) == 1: - step_obj = get_component( params['pipeline_component'][0]['component_selector']) + step_obj, is_imblearn = get_component( params['pipeline_component'][0]['component_selector']) if step_obj: pipeline_steps.append( ('preprocessing_1', step_obj) ) + if is_imblearn: + has_imblearn = True else: for i, c in enumerate(params['pipeline_component']): - step_obj = get_component( c['component_selector'], check_none=True ) + step_obj, is_imblearn = get_component( c['component_selector'], check_none=True ) pipeline_steps.append( ('preprocessing_' + str(i+1), step_obj) ) + if is_imblearn: + has_imblearn = True # Set up final estimator and add to pipeline. -estimator_json = params["final_estimator"]['estimator_selector'] -estimator = get_estimator(estimator_json) +estimator_json = params['final_estimator']['estimator_selector'] +if estimator_json['selected_module'] == 'none': + if len(pipeline_steps) == 0: + sys.exit("No pipeline steps specified!") + else: # turn the last pre-process component to final estimator + pipeline_steps[-1] = ('estimator', pipeline_steps[-1][-1]) +else: + estimator = get_estimator(estimator_json) + pipeline_steps.append( ('estimator', estimator) ) -pipeline_steps.append( ('estimator', estimator) ) - -pipeline = Pipeline(pipeline_steps) +if has_imblearn: + pipeline = imbPipeline(pipeline_steps) +else: + pipeline = Pipeline(pipeline_steps) pprint.pprint(pipeline.named_steps) -with open("$outfile", 'wb') as out_handler: +with open('$outfile', 'wb') as out_handler: pickle.dump(pipeline, out_handler, pickle.HIGHEST_PROTOCOL) ]]> @@ -118,7 +156,8 @@ <option value="decomposition">Matrix Decomposition</option> <option value="kernel_approximation">Kernel Approximation</option> <option value="FeatureAgglomeration">Agglomerate Features</option> - <option value="skrebate">Skrebate algorithm</option> + <option value="skrebate">SK-rebate feature selection</option> + <option value="imblearn">imbalanced-learn sampling</option> </param> <when value="None"/> <when value="pre_processor"> @@ -128,9 +167,7 @@ </conditional> </when> <when value="feature_selection"> - <expand macro="feature_selection_all"> - <expand macro="fs_selectfrommodel_no_prefitted"/> - </expand> + <expand macro="feature_selection_pipeline"/> </when> <when value="decomposition"> <expand macro="matrix_decomposition_all"/> @@ -144,10 +181,26 @@ <when value="skrebate"> <expand macro="skrebate"/> </when> + <when value="imblearn"> + <expand macro="imbalanced_learn_sampling"/> + </when> </conditional> </repeat> <section name="final_estimator" title="Final Estimator" expanded="true"> - <expand macro="estimator_selector_all" /> + <conditional name="estimator_selector"> + <param name="selected_module" type="select" label="Choose the module that contains target estimator:" > + <expand macro="estimator_module_options"> + <option value="customer_estimator">Load a customer estimator</option> + <option value="none">none -- The last component of pre-processing step will turn to a final estimator</option> + </expand> + </param> + <expand macro="estimator_suboptions"> + <when value="customer_estimator"> + <param name="c_estimator" type="data" format="zip" label="Choose the dataset containing the customer estimator or pipeline:"/> + </when> + <when value="none"/> + </expand> + </conditional> </section> </inputs> <outputs> @@ -175,7 +228,7 @@ <param name="selected_module" value="svm"/> <param name="selected_estimator" value="SVR"/> <param name="text_params" value="kernel='linear'"/> - <output name="outfile" file="pipeline01" compare="sim_size" delta="1"/> + <output name="outfile" file="pipeline01" compare="sim_size" delta="5"/> </test> <test> <conditional name="component_selector"> @@ -186,7 +239,7 @@ </conditional> <param name="selected_module" value="linear_model"/> <param name="selected_estimator" value="LassoCV"/> - <output name="outfile" file="pipeline02" compare="sim_size" delta="1"/> + <output name="outfile" file="pipeline02" compare="sim_size" delta="5"/> </test> <test> <conditional name="component_selector"> @@ -197,7 +250,7 @@ </conditional> <param name="selected_module" value="xgboost"/> <param name="selected_estimator" value="XGBClassifier"/> - <output name="outfile" file="pipeline03" compare="sim_size" delta="1"/> + <output name="outfile" file="pipeline03" compare="sim_size" delta="5"/> </test> <test> <conditional name="component_selector"> @@ -216,7 +269,7 @@ <param name="selected_module" value="svm"/> <param name="selected_estimator" value="LinearSVC"/> </section> - <output name="outfile" file="pipeline04" compare="sim_size" delta="1"/> + <output name="outfile" file="pipeline04" compare="sim_size" delta="5"/> </test> <test> <conditional name="component_selector"> @@ -225,7 +278,7 @@ <param name="selected_module" value="ensemble"/> <param name="selected_estimator" value="RandomForestRegressor"/> <param name="text_params" value="n_estimators=100, random_state=42"/> - <output name="outfile" file="pipeline05" compare="sim_size" delta="1"/> + <output name="outfile" file="pipeline05" compare="sim_size" delta="5"/> </test> <test> <conditional name="component_selector"> @@ -236,7 +289,7 @@ </conditional> <param name="selected_module" value="ensemble"/> <param name="selected_estimator" value="AdaBoostRegressor"/> - <output name="outfile" file="pipeline06" compare="sim_size" delta="1"/> + <output name="outfile" file="pipeline06" compare="sim_size" delta="5"/> </test> <test> <conditional name="component_selector"> @@ -248,7 +301,7 @@ </conditional> <param name="selected_module" value="ensemble"/> <param name="selected_estimator" value="AdaBoostClassifier"/> - <output name="outfile" file="pipeline07" compare="sim_size" delta="1"/> + <output name="outfile" file="pipeline07" compare="sim_size" delta="5"/> </test> <test> <conditional name="component_selector"> @@ -260,7 +313,7 @@ </conditional> <param name="selected_module" value="ensemble"/> <param name="selected_estimator" value="AdaBoostClassifier"/> - <output name="outfile" file="pipeline08" compare="sim_size" delta="1"/> + <output name="outfile" file="pipeline08" compare="sim_size" delta="5"/> </test> <test> <conditional name="component_selector"> @@ -272,19 +325,26 @@ </conditional> <param name="selected_module" value="ensemble"/> <param name="selected_estimator" value="RandomForestRegressor"/> - <output name="outfile" file="pipeline09" compare="sim_size" delta="1"/> + <output name="outfile" file="pipeline09" compare="sim_size" delta="5"/> </test> <test> <conditional name="component_selector"> - <param name="component_type" value="skrebate"/> - <conditional name="skrebate_selector"> - <param name="select_algorithm" value="TuRF"/> - <param name="text_params" value=""/> - </conditional> + <param name="component_type" value="None"/> </conditional> <param name="selected_module" value="ensemble"/> - <param name="selected_estimator" value="RandomForestRegressor"/> - <output name="outfile" file="pipeline10" compare="sim_size" delta="1"/> + <param name="selected_estimator" value="AdaBoostRegressor"/> + <output name="outfile" file="pipeline10" compare="sim_size" delta="5"/> + </test> + <test> + <conditional name="component_selector"> + <param name="component_type" value="imblearn"/> + <conditional name="imblearn_selector"> + <param name="select_algorithm" value="under_sampling.EditedNearestNeighbours"/> + </conditional> + </conditional> + <param name="selected_module" value="ensemble"/> + <param name="selected_estimator" value="RandomForestClassifier"/> + <output name="outfile" file="pipeline11" compare="sim_size" delta="5"/> </test> <test expect_failure="true"> <conditional name="component_selector"> @@ -294,6 +354,25 @@ <param name="selected_estimator" value="RandomForestRegressor"/> <param name="text_params" value="n_estimators=__import__('os').system('ls ~')"/> </test> + <test> + <conditional name="component_selector"> + <param name="component_type" value="feature_selection"/> + <conditional name="fs_algorithm_selector"> + <param name="selected_algorithm" value="RFE"/> + <conditional name="estimator_selector"> + <param name="selected_module" value="xgboost"/> + <param name="selected_estimator" value="XGBRegressor"/> + <param name="text_params" value="random_state=0"/> + </conditional> + </conditional> + </conditional> + <section name="final_estimator"> + <conditional name="estimator_selector"> + <param name="selected_module" value="none"/> + </conditional> + </section> + <output name="outfile" file="pipeline12" compare="sim_size" delta="5"/> + </test> </tests> <help> <![CDATA[ @@ -328,5 +407,6 @@ <expand macro="sklearn_citation"> <expand macro="skrebate_citation"/> <expand macro="xgboost_citation"/> + <expand macro="imblearn_citation"/> </expand> </tool>