Mercurial > repos > bgruening > sklearn_build_pipeline
view pipeline.xml @ 29:4c4ec859c31a draft default tip
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit 57a0433defa3cbc37ab34fbb0ebcfaeb680db8d5
author | bgruening |
---|---|
date | Sun, 05 Nov 2023 15:54:36 +0000 |
parents | 118e230e85ce |
children |
line wrap: on
line source
<tool id="sklearn_build_pipeline" name="Pipeline Builder" version="@VERSION@" profile="@PROFILE@">
    <description>an all-in-one platform to build pipeline, single estimator, preprocessor and custom wrappers</description>
    <macros>
        <import>main_macros.xml</import>
    </macros>
    <expand macro="python_requirements" />
    <expand macro="macro_stdio" />
    <version_command>echo "@VERSION@"</version_command>
    <command>
        <![CDATA[
        python "$sklearn_pipeline_script" '$inputs'
        ]]>
    </command>
    <configfiles>
        <inputs name="inputs" />
        <configfile name="sklearn_pipeline_script">
            <![CDATA[
import imblearn
import json
import pandas as pd
import pprint
import skrebate
import sys
import warnings
from sklearn import (
    cluster, compose, decomposition, ensemble, feature_extraction,
    feature_selection, gaussian_process, kernel_approximation, metrics,
    model_selection, naive_bayes, neighbors, pipeline, preprocessing,
    svm, linear_model, tree, discriminant_analysis)
from sklearn.pipeline import make_pipeline
from imblearn.pipeline import make_pipeline as imb_make_pipeline
from galaxy_ml.model_persist import dump_model_to_h5, load_model_from_h5
from galaxy_ml.utils import (SafeEval, feature_selector, get_estimator,
                             try_get_attr, get_search_params)

## Number of parallel jobs, taken from GALAXY_SLOTS (1 when unset).
N_JOBS = int(__import__('os').environ.get('GALAXY_SLOTS', 1))

warnings.filterwarnings('ignore')

safe_eval = SafeEval()

## The first CLI argument is the JSON dump of the tool parameters.
input_json_path = sys.argv[1]
with open(input_json_path, 'r') as param_handler:
    params = json.load(param_handler)

## Dataset parameters are injected by the template as file paths.
#if $final_estimator.estimator_selector.selected_module == 'custom_estimator':
params['final_estimator']['estimator_selector']['c_estimator'] =\
        '$final_estimator.estimator_selector.c_estimator'
#end if

#if $final_estimator.estimator_selector.selected_module == 'binarize_target':
params['final_estimator']['estimator_selector']['wrapped_estimator'] =\
        '$final_estimator.estimator_selector.wrapped_estimator'
#end if

pipeline_steps = []


def _apply_text_params(obj, text_params):
    """Evaluate a free-text `key=value, ...` string and set it on `obj`.

    A blank string leaves `obj` untouched. The text is wrapped in `dict(...)`
    and evaluated with the module-level `safe_eval` before being passed to
    `obj.set_params`.
    """
    text_params = text_params.strip()
    if text_params != '':
        obj.set_params(**safe_eval('dict(' + text_params + ')'))


def get_component(input_json, check_none=False):
    """Build one pre-processing component from its JSON description.

    Parameters
    ----------
    input_json : dict
        The `component_selector` section of one pipeline component.
    check_none : bool, default=False
        When True, a component type of 'None' terminates the script
        instead of being silently skipped.

    Returns
    -------
    tuple
        `(obj, is_imblearn)`; `obj` is the built component (None when the
        type is 'None' and `check_none` is False) and `is_imblearn` tells
        whether the component is an imbalanced-learn sampler.
    """
    is_imblearn = False
    component_type = input_json['component_type']

    if component_type == 'None':
        if not check_none:
            return None, False
        else:
            sys.exit("The pre-processing component type can't be None "
                     "when the number of components is greater than 1.")

    if component_type == 'pre_processor':
        preprocessor = input_json['pre_processors']['selected_pre_processor']
        pre_processor_options = input_json['pre_processors']['options']
        if 'feature_range' in pre_processor_options:
            feature_range = safe_eval(pre_processor_options['feature_range'].strip())
            ## An empty evaluation result falls back to (0, 1).
            if not feature_range:
                feature_range = (0, 1)
            pre_processor_options['feature_range'] = feature_range
        my_class = getattr(preprocessing, preprocessor)
        obj = my_class(**pre_processor_options)
    elif component_type == 'feature_selection':
        obj = feature_selector(input_json['fs_algorithm_selector'])
    elif component_type == 'decomposition':
        selector = input_json['matrix_decomposition_selector']
        obj = getattr(decomposition, selector['select_algorithm'])()
        _apply_text_params(obj, selector['text_params'])
    elif component_type == 'kernel_approximation':
        selector = input_json['kernel_approximation_selector']
        obj = getattr(kernel_approximation, selector['select_algorithm'])()
        _apply_text_params(obj, selector['text_params'])
    elif component_type == 'FeatureAgglomeration':
        selector = input_json['FeatureAgglomeration_selector']
        obj = getattr(cluster, selector['select_algorithm'])()
        _apply_text_params(obj, selector['text_params'])
    elif component_type == 'skrebate':
        selector = input_json['skrebate_selector']
        algorithm = selector['select_algorithm']
        if algorithm == 'TuRF':
            obj = getattr(skrebate, algorithm)(core_algorithm='ReliefF')
        else:
            obj = getattr(skrebate, algorithm)()
        _apply_text_params(obj, selector['text_params'])
    elif component_type == 'imblearn':
        is_imblearn = True
        selector = input_json['imblearn_selector']
        algorithm = selector['select_algorithm']
        if algorithm == 'over_sampling.SMOTENC':
            ## Resolve through the imported imblearn package; a bare
            ## over_sampling name is not imported in this script.
            obj = imblearn.over_sampling.SMOTENC(categorical_features=[])
        elif algorithm == 'Z_RandomOverSampler':
            Z_RandomOverSampler = try_get_attr('galaxy_ml.preprocessors',
                                               'Z_RandomOverSampler')
            obj = Z_RandomOverSampler()
        else:
            ## The option value has the form <submodule>.<class name>.
            mod, klass = algorithm.split('.')[:2]
            obj = getattr(getattr(imblearn, mod), klass)()
        _apply_text_params(obj, selector['text_params'])
    elif component_type == 'IRAPS':
        iraps_core = try_get_attr('galaxy_ml.iraps_classifier', 'IRAPSCore')()
        core_params = input_json['text_params'].strip()
        if core_params != '':
            try:
                core_kwargs = safe_eval('dict(' + core_params + ')')
            except ValueError:
                sys.exit("Unsupported parameter input: `%s`" % core_params)
            iraps_core.set_params(**core_kwargs)
        ## Forward only the thresholds that were actually provided.
        options = {}
        for key in ('p_thres', 'fc_thres', 'occurrence', 'discretize'):
            if input_json[key] is not None:
                options[key] = input_json[key]
        IRAPSClassifier = try_get_attr('galaxy_ml.iraps_classifier',
                                       'IRAPSClassifier')
        obj = IRAPSClassifier(iraps_core, **options)
    elif component_type == 'preprocessors':
        encoder_selection = input_json['encoder_selection']
        encoder_type = encoder_selection.pop('encoder_type')
        klass = try_get_attr('galaxy_ml.preprocessors', encoder_type)
        obj = klass(**encoder_selection)
    else:
        sys.exit("Unsupported component type: `%s`" % component_type)

    if 'n_jobs' in obj.get_params():
        obj.set_params(n_jobs=N_JOBS)
    return obj, is_imblearn


has_imblearn = False
if len(params['pipeline_component']) == 1:
    ## A single component may be 'None' (estimator-only output).
    step_obj, is_imblearn = get_component(
        params['pipeline_component'][0]['component_selector'])
    if step_obj is not None:
        pipeline_steps.append(step_obj)
    if is_imblearn:
        has_imblearn = True
else:
    for c in params['pipeline_component']:
        step_obj, is_imblearn = get_component(
            c['component_selector'], check_none=True)
        pipeline_steps.append(step_obj)
        if is_imblearn:
            has_imblearn = True

## Set up final estimator and add to pipeline.
estimator_json = params['final_estimator']['estimator_selector']
if estimator_json['selected_module'] == 'none':
    if not pipeline_steps:
        sys.exit("No pipeline steps specified!")
    ## else: turn the last pre-process component to final estimator
elif estimator_json['selected_module'] == 'sklearn.compose':
#if $final_estimator.estimator_selector.selected_module == 'sklearn.compose':
    regressor_path = '$final_estimator.estimator_selector.regressor'
    transformer_path = '$final_estimator.estimator_selector.transformer'
#end if
    regressor = load_model_from_h5(regressor_path)
    transformer = load_model_from_h5(transformer_path)
    estimator = compose.TransformedTargetRegressor(regressor=regressor,
                                                   transformer=transformer)
    pipeline_steps.append(estimator)
else:
    estimator = get_estimator(estimator_json)
    pipeline_steps.append(estimator)

## A single step is written out as-is; several steps are chained into a
## pipeline (the imblearn flavour when any step is a sampler).
if len(pipeline_steps) == 1:
    out_obj = pipeline_steps[-1]
    print(out_obj)
else:
    if has_imblearn:
        out_obj = imb_make_pipeline(*pipeline_steps)
    else:
        out_obj = make_pipeline(*pipeline_steps)
    pprint.pprint(out_obj.named_steps)

dump_model_to_h5(out_obj, '$outfile', verbose=0)
            ]]>
        </configfile>
    </configfiles>
    <inputs>
        <repeat name="pipeline_component" min="1" max="5" title="Pre-processing step">
            <conditional name="component_selector">
                <param name="component_type" type="select" label="Choose the type of transformation:">
                    <option value="None" selected="true">None</option>
                    <option
value="pre_processor">Sklearn Preprocessor</option> <option value="feature_selection">Feature Selection</option> <option value="decomposition">Matrix Decomposition</option> <option value="kernel_approximation">Kernel Approximation</option> <option value="FeatureAgglomeration">Agglomerate Features</option> <option value="skrebate">SK-rebate Feature Selection</option> <option value="imblearn">Imbalanced-learn Sampling</option> <option value="IRAPS">IRAPS -- feature selector and classifier</option> <option value="preprocessors">Bio-sequence Encoders</option> </param> <when value="None" /> <when value="pre_processor"> <conditional name="pre_processors"> <expand macro="sparse_preprocessors_ext" /> <expand macro="sparse_preprocessor_options_ext" /> </conditional> </when> <when value="feature_selection"> <expand macro="feature_selection_pipeline" /> </when> <when value="decomposition"> <expand macro="matrix_decomposition_all" /> </when> <when value="kernel_approximation"> <expand macro="kernel_approximation_all" /> </when> <when value="FeatureAgglomeration"> <expand macro="FeatureAgglomeration" /> </when> <when value="skrebate"> <expand macro="skrebate" /> </when> <when value="imblearn"> <expand macro="imbalanced_learn_sampling" /> </when> <when value="IRAPS"> <expand macro="estimator_params_text" label="Type in parameter settings for IRAPSCore if different from default:" help="Default(=blank): n_iter=1000, responsive_thres=-1, resistant_thres=0, random_state=None. No double quotes" /> <param argument="p_thres" type="float" value="0.001" label="P value threshold" help="Float. default=0.001" /> <param argument="fc_thres" type="float" value="0.1" label="fold change threshold" help="Float. default=0.1" /> <param argument="occurrence" type="float" value="0.7" label="reservation factor" help="Float. default=0.7" /> <param argument="discretize" type="float" value="-1" label="The z_score threshold to discretize target value" help="Float. 
default=-1" /> </when> <when value="preprocessors"> <expand macro="preprocessors_sequence_encoders" /> </when> </conditional> </repeat> <section name="final_estimator" title="Final Estimator" expanded="true"> <conditional name="estimator_selector"> <param name="selected_module" type="select" label="Choose the module that contains target estimator:" > <expand macro="estimator_module_options"> <option value="sklearn.compose">sklearn.compose</option> <option value="binarize_target">Binarize Target Classifier or Regressor</option> <option value="custom_estimator">Load a custom estimator</option> <option value="none">none -- The last component of pre-processing step will turn to a final estimator</option> </expand> </param> <expand macro="estimator_suboptions"> <when value="sklearn.compose"> <param name="selected_estimator" type="select" label="Choose estimator class:"> <option value="TransformedTargetRegressor" selected="true">TransformedTargetRegressor</option> </param> <param name="regressor" type="data" format="h5mlm" label="Choose the dataset containing the wrapped regressor" /> <param name="transformer" type="data" format="h5mlm" label="Choose the dataset containing transformer" /> </when> <when value="binarize_target"> <param name="clf_or_regr" type="select" label="Classifier or Regressor:"> <option value="BinarizeTargetClassifier">BinarizeTargetClassifier</option> <option value="BinarizeTargetRegressor">BinarizeTargetRegressor</option> </param> <param name="wrapped_estimator" type="data" format="h5mlm" label="Choose the dataset containing the wrapped estimator or pipeline" /> <param name='z_score' type="float" value="-1" optional="false" label="Discretize target values using z_score" /> <param name='value' type="float" value="" optional="true" label="Discretize target values using a fixed value instead" help="Optional. default: None." 
/> <param name="less_is_positive" type="boolean" truevalue="booltrue" falsevalue="boolfalse" checked="true" label="Are the detecting values smaller than others?" /> </when> <when value="custom_estimator"> <param name="c_estimator" type="data" format="h5mlm" label="Choose the dataset containing the custom estimator or pipeline" /> </when> <when value="none" /> </expand> </conditional> </section> <!--param name="output_type" type="select" label="Output the final estimator instead?"> <option value="Pipeline_Builder" selected="true">Pipeline</option> <option value="Final_Estimator_Builder">Final Estimator</option> </param>--> </inputs> <outputs> <data format="h5mlm" name="outfile" label="New Pipleline/Estimator" /> </outputs> <tests> <test> <conditional name="component_selector"> <param name="component_type" value="pre_processor" /> <conditional name="pre_processors"> <param name="selected_pre_processor" value="QuantileTransformer" /> <section name="options"> <param name="random_state" value="10" /> </section> </conditional> </conditional> <section name="final_estimator"> <conditional name="estimator_selector"> <param name="selected_module" value="none" /> </conditional> </section> <output name="outfile" file="pipeline17" compare="sim_size" delta="5" /> </test> <test> <conditional name="component_selector"> <param name="component_type" value="pre_processor" /> <conditional name="pre_processors"> <param name="selected_pre_processor" value="PowerTransformer" /> </conditional> </conditional> <section name="final_estimator"> <conditional name="estimator_selector"> <param name="selected_module" value="sklearn.compose" /> <param name="regressor" value="RandomForestRegressor01.h5mlm" ftype="h5mlm" /> <param name="transformer" value="pipeline17" ftype="h5mlm" /> </conditional> </section> <output name="outfile" file="pipeline18" compare="sim_size" delta="5" /> </test> <test> <repeat name="pipeline_component"> <conditional name="component_selector"> <param name="component_type" 
value="pre_processor" /> <conditional name="pre_processors"> <param name="selected_pre_processor" value="RobustScaler" /> </conditional> </conditional> </repeat> <repeat name="pipeline_component"> <conditional name="component_selector"> <param name="component_type" value="feature_selection" /> <conditional name="fs_algorithm_selector"> <param name="selected_algorithm" value="SelectKBest" /> <param name="score_func" value="f_classif" /> </conditional> </conditional> </repeat> <param name="selected_module" value="svm" /> <param name="selected_estimator" value="SVR" /> <param name="text_params" value="kernel='linear'" /> <output name="outfile" file="pipeline01" compare="sim_size" delta="5" /> </test> <test> <conditional name="component_selector"> <param name="component_type" value="pre_processor" /> <conditional name="pre_processors"> <param name="selected_pre_processor" value="RobustScaler" /> </conditional> </conditional> <param name="selected_module" value="linear_model" /> <param name="selected_estimator" value="LassoCV" /> <output name="outfile" file="pipeline02" compare="sim_size" delta="5" /> </test> <test> <conditional name="component_selector"> <param name="component_type" value="pre_processor" /> <conditional name="pre_processors"> <param name="selected_pre_processor" value="RobustScaler" /> </conditional> </conditional> <param name="selected_module" value="xgboost" /> <param name="selected_estimator" value="XGBClassifier" /> <output name="outfile" file="pipeline03" compare="sim_size" delta="5" /> </test> <test> <conditional name="component_selector"> <param name="component_type" value="feature_selection" /> <conditional name="fs_algorithm_selector"> <param name="selected_algorithm" value="SelectFromModel" /> <conditional name="model_inputter"> <conditional name="estimator_selector"> <param name="selected_module" value="ensemble" /> <param name="selected_estimator" value="AdaBoostClassifier" /> </conditional> </conditional> </conditional> </conditional> 
<section name="final_estimator"> <param name="selected_module" value="svm" /> <param name="selected_estimator" value="LinearSVC" /> </section> <output name="outfile" file="pipeline04" compare="sim_size" delta="5" /> </test> <test> <conditional name="component_selector"> <param name="component_type" value="None" /> </conditional> <param name="selected_module" value="ensemble" /> <param name="selected_estimator" value="RandomForestRegressor" /> <param name="text_params" value="n_estimators=100, random_state=42" /> <output name="outfile" file="pipeline05" compare="sim_size" delta="5" /> </test> <test> <conditional name="component_selector"> <param name="component_type" value="decomposition" /> <conditional name="matrix_decomposition_selector"> <param name="select_algorithm" value="PCA" /> </conditional> </conditional> <param name="selected_module" value="ensemble" /> <param name="selected_estimator" value="AdaBoostRegressor" /> <output name="outfile" file="pipeline06" compare="sim_size" delta="5" /> </test> <test> <conditional name="component_selector"> <param name="component_type" value="kernel_approximation" /> <conditional name="kernel_approximation_selector"> <param name="select_algorithm" value="RBFSampler" /> <param name="text_params" value="n_components=10, gamma=2.0" /> </conditional> </conditional> <param name="selected_module" value="ensemble" /> <param name="selected_estimator" value="AdaBoostClassifier" /> <output name="outfile" file="pipeline07" compare="sim_size" delta="5" /> </test> <test> <conditional name="component_selector"> <param name="component_type" value="FeatureAgglomeration" /> <conditional name="FeatureAgglomeration_selector"> <param name="select_algorithm" value="FeatureAgglomeration" /> <param name="text_params" value="n_clusters=3, affinity='euclidean'" /> </conditional> </conditional> <param name="selected_module" value="ensemble" /> <param name="selected_estimator" value="AdaBoostClassifier" /> <output name="outfile" file="pipeline08" 
compare="sim_size" delta="20" /> </test> <test> <conditional name="component_selector"> <param name="component_type" value="skrebate" /> <conditional name="skrebate_selector"> <param name="select_algorithm" value="ReliefF" /> <param name="text_params" value="n_features_to_select=3, n_neighbors=100" /> </conditional> </conditional> <param name="selected_module" value="ensemble" /> <param name="selected_estimator" value="RandomForestRegressor" /> <output name="outfile" file="pipeline09" compare="sim_size" delta="5" /> </test> <test> <conditional name="component_selector"> <param name="component_type" value="imblearn" /> <conditional name="imblearn_selector"> <param name="select_algorithm" value="under_sampling.EditedNearestNeighbours" /> </conditional> </conditional> <param name="selected_module" value="ensemble" /> <param name="selected_estimator" value="RandomForestClassifier" /> <output name="outfile" file="pipeline11" compare="sim_size" delta="5" /> </test> <test expect_failure="true"> <conditional name="component_selector"> <param name="component_type" value="None" /> </conditional> <param name="selected_module" value="ensemble" /> <param name="selected_estimator" value="RandomForestRegressor" /> <param name="text_params" value="n_estimators=__import__('os').system('ls ~')" /> </test> <test> <conditional name="component_selector"> <param name="component_type" value="feature_selection" /> <conditional name="fs_algorithm_selector"> <param name="selected_algorithm" value="RFE" /> <conditional name="estimator_selector"> <param name="selected_module" value="xgboost" /> <param name="selected_estimator" value="XGBRegressor" /> <param name="text_params" value="random_state=0" /> </conditional> </conditional> </conditional> <section name="final_estimator"> <conditional name="estimator_selector"> <param name="selected_module" value="none" /> </conditional> </section> <output name="outfile" file="pipeline12" compare="sim_size" delta="5" /> </test> <test> <conditional 
name="component_selector"> <param name="component_type" value="None" /> </conditional> <param name="selected_module" value="ensemble" /> <param name="selected_estimator" value="RandomForestClassifier" /> <output name="outfile" file="RandomForestClassifier.h5mlm" compare="sim_size" delta="5" /> </test> <test> <conditional name="component_selector"> <param name="component_type" value="IRAPS" /> </conditional> <section name="final_estimator"> <conditional name="estimator_selector"> <param name="selected_module" value="none" /> </conditional> </section> <output name="outfile" file="pipeline14" compare="sim_size" delta="5" /> </test> <test> <conditional name="component_selector"> <param name="component_type" value="None" /> </conditional> <section name="final_estimator"> <conditional name="estimator_selector"> <param name="selected_module" value="binarize_target" /> <param name="clf_or_regr" value="BinarizeTargetClassifier" /> <param name="wrapped_estimator" value="RandomForestClassifier.h5mlm" ftype="h5mlm" /> </conditional> </section> <output name="outfile" file="pipeline15" compare="sim_size" delta="5" /> </test> <test> <conditional name="component_selector"> <param name="component_type" value="preprocessors" /> <conditional name="encoder_selection"> <param name="encoder_type" value="GenomeOneHotEncoder" /> <param name="seq_length" value="1000" /> <param name="padding" value="True" /> </conditional> </conditional> <section name="final_estimator"> <conditional name="estimator_selector"> <param name="selected_module" value="custom_estimator" /> <param name="c_estimator" value="keras_model02" ftype="h5mlm" /> </conditional> </section> <output name="outfile" file="pipeline16" compare="sim_size" delta="5" /> </test> </tests> <help> <![CDATA[ **What it does** This tool not only builds sklearn pipeline object, but also builds single main estimator or single preprocessing component. The output object type is based on the length of pipeline steps. 
When there is only one step (choose `None` for others), either a main estimator or preprocessor, the component is output directly instead of wrapping in a pipeline object. A typical pipeline chains one or more preprocessing steps plus a final main estimator, for example, [VarianceThreshold, StandardScaler, SGDClassifier] which is composed of a feature selector, a preprocessing scaler and a main estimator together. For more information, please refer to `Scikit-learn pipeline Pipeline`_. **Pre-processing components** come from `sklearn.preprocessing`_, `feature_selection`_, `decomposition`_, `kernel_approximation`_, `cluster.FeatureAgglomeration`_, `skrebate`_ and more. **Final Estimator** supports estimators from `xgboost`_ and many scikit-learn modules, including `svm`_, `linear_model`_, `ensemble`_, `naive_bayes`_, `tree`_, `neighbors`_ and so on. **Custom estimators** - `GenomeOneHotEncoder`_ - `ProteinOnehotEncoder`_ - `IRAPSClassifier`_ - `BinarizeTargetClassifier`_ - `BinarizeTargetRegressor`_ **Output** - Pipeline/estimator object - Hyperparameter of the object (optional) .. _`Scikit-learn pipeline Pipeline`: http://scikit-learn.org/stable/modules/generated/sklearn.pipeline.Pipeline.html .. _`svm`: http://scikit-learn.org/stable/modules/classes.html#module-sklearn.svm .. _`linear_model`: http://scikit-learn.org/stable/modules/classes.html#module-sklearn.linear_model .. _`ensemble`: http://scikit-learn.org/stable/modules/classes.html#module-sklearn.ensemble .. _`naive_bayes`: http://scikit-learn.org/stable/modules/classes.html#module-sklearn.naive_bayes .. _`tree`: http://scikit-learn.org/stable/modules/classes.html#module-sklearn.tree .. _`neighbors`: http://scikit-learn.org/stable/modules/classes.html#module-sklearn.neighbors .. _`xgboost`: https://xgboost.readthedocs.io/en/latest/python/python_api.html .. _`sklearn.preprocessing`: http://scikit-learn.org/stable/modules/classes.html#module-sklearn.preprocessing .. 
_`feature_selection`: http://scikit-learn.org/stable/modules/classes.html#module-sklearn.feature_selection .. _`decomposition`: http://scikit-learn.org/stable/modules/classes.html#module-sklearn.decomposition .. _`kernel_approximation`: http://scikit-learn.org/stable/modules/classes.html#module-sklearn.kernel_approximation .. _`cluster.FeatureAgglomeration`: http://scikit-learn.org/stable/modules/generated/sklearn.cluster.FeatureAgglomeration.html .. _`skrebate`: https://epistasislab.github.io/scikit-rebate/using/ .. _`GenomeOneHotEncoder`: https://goeckslab.github.io/Galaxy-ML/APIs/preprocessors/#genomeonehotencoder .. _`ProteinOnehotEncoder`: https://goeckslab.github.io/Galaxy-ML/APIs/preprocessors/#proteinonehotencoder .. _`IRAPSClassifier`: https://goeckslab.github.io/Galaxy-ML/APIs/iraps-classifier/#irapsclassifier .. _`BinarizeTargetClassifier`: https://goeckslab.github.io/Galaxy-ML/APIs/binarize-target/#binarizetargetclassifier .. _`BinarizeTargetRegressor`: https://goeckslab.github.io/Galaxy-ML/APIs/binarize-target/#binarizetargetregressor ]]> </help> <expand macro="sklearn_citation"> <expand macro="skrebate_citation" /> <expand macro="xgboost_citation" /> <expand macro="imblearn_citation" /> </expand> </tool>