Mercurial > repos > bgruening > sklearn_build_pipeline

<tool id="sklearn_build_pipeline" name="Pipeline Builder" version="@VERSION@">
    <description>constructs a list of transforms and a final estimator</description>
    <macros>
        <import>main_macros.xml</import>
    </macros>
    <expand macro="python_requirements"/>
    <expand macro="macro_stdio"/>
    <version_command>echo "@VERSION@"</version_command>
    <!-- Runs the rendered configfile script with Python; its single argument is the path of the JSON file declared by the "inputs" configfile entry. -->
    <command>
        <![CDATA[
        python "$sklearn_pipeline_script" '$inputs'
        ]]>
    </command>
    <configfiles>
        <inputs name="inputs" />
        <configfile name="sklearn_pipeline_script">
            <![CDATA[
import imblearn
import json
import pandas as pd
import pickle
import pprint
import skrebate
import sys
import warnings
from sklearn import (
    cluster, compose, decomposition, ensemble, feature_extraction,
    feature_selection, gaussian_process, kernel_approximation, metrics,
    model_selection, naive_bayes, neighbors, pipeline, preprocessing,
    svm, linear_model, tree, discriminant_analysis)
from sklearn.pipeline import make_pipeline
from imblearn.pipeline import make_pipeline as imb_make_pipeline
from galaxy_ml.utils import (SafeEval, feature_selector, get_estimator,
                             try_get_attr, get_search_params)


## Worker count taken from the GALAXY_SLOTS environment variable, 1 if unset.
N_JOBS = int(__import__('os').environ.get('GALAXY_SLOTS', 1))

## Silence every Python warning for the rest of the run.
warnings.filterwarnings('ignore')

## Shared evaluator used by get_component to parse user-typed parameter text.
safe_eval = SafeEval()

## The first command-line argument is the path of the JSON parameter file.
input_json_path = sys.argv[1]
with open(input_json_path, 'r') as param_handler:
    params = json.load(param_handler)

## The two directive blocks below are template conditionals: each assignment
## only reaches the rendered script when the matching estimator module was
## selected. The quoted value is filled in by the template engine
## (presumably with the path of the chosen dataset; confirm against Galaxy).
#if $final_estimator.estimator_selector.selected_module == 'custom_estimator':
params['final_estimator']['estimator_selector']['c_estimator'] =\
        '$final_estimator.estimator_selector.c_estimator'
#end if

#if $final_estimator.estimator_selector.selected_module == 'binarize_target':
params['final_estimator']['estimator_selector']['wrapped_estimator'] =\
        '$final_estimator.estimator_selector.wrapped_estimator'
#end if

## Ordered list of step objects that will make up the pipeline.
pipeline_steps = []

def _set_text_params(obj, text_params):
    """Apply user-typed keyword arguments to an estimator-like object.

    Parameters
    ----------
    obj : object
        Any object exposing ``set_params``.
    text_params : str
        Comma separated keyword arguments such as ``n_components=10``.
        Blank text leaves the object untouched.
    """
    text_params = text_params.strip()
    if text_params:
        ## The text goes through the shared SafeEval instance, never eval.
        obj.set_params(**safe_eval('dict(' + text_params + ')'))


def get_component(input_json, check_none=False):
    """Build one pipeline step from its tool-form parameters.

    Parameters
    ----------
    input_json : dict
        The ``component_selector`` section of one pipeline component, as
        loaded from the inputs JSON file.
    check_none : bool, default=False
        If True, a component type of ``'None'`` aborts the script instead
        of being skipped.

    Returns
    -------
    tuple
        ``(obj, is_imblearn)``: the instantiated step (None when the
        component type is ``'None'``) and a flag that is True only for
        the ``imblearn`` component type.

    Raises
    ------
    SystemExit
        When the type is ``'None'`` while ``check_none`` is set, when the
        IRAPSCore parameter text is rejected with a ValueError, or when
        the component type is not recognised.
    """
    component_type = input_json['component_type']
    is_imblearn = False

    if component_type == 'None':
        if check_none:
            sys.exit("The pre-processing component type can't be None "
                     "when the number of components is greater than 1.")
        return None, False

    if component_type == 'pre_processor':
        selector = input_json['pre_processors']
        ## Work on a copy so the caller's parameter dict is not modified.
        pre_processor_options = dict(selector['options'])
        if 'feature_range' in pre_processor_options:
            feature_range = safe_eval(
                pre_processor_options['feature_range'].strip())
            ## A falsy evaluation result falls back to (0, 1).
            pre_processor_options['feature_range'] = feature_range or (0, 1)
        my_class = getattr(preprocessing, selector['selected_pre_processor'])
        obj = my_class(**pre_processor_options)
    elif component_type == 'feature_selection':
        obj = feature_selector(input_json['fs_algorithm_selector'])
    elif component_type == 'decomposition':
        selector = input_json['matrix_decomposition_selector']
        obj = getattr(decomposition, selector['select_algorithm'])()
        _set_text_params(obj, selector['text_params'])
    elif component_type == 'kernel_approximation':
        selector = input_json['kernel_approximation_selector']
        obj = getattr(kernel_approximation, selector['select_algorithm'])()
        _set_text_params(obj, selector['text_params'])
    elif component_type == 'FeatureAgglomeration':
        selector = input_json['FeatureAgglomeration_selector']
        obj = getattr(cluster, selector['select_algorithm'])()
        _set_text_params(obj, selector['text_params'])
    elif component_type == 'skrebate':
        selector = input_json['skrebate_selector']
        algorithm = selector['select_algorithm']
        klass = getattr(skrebate, algorithm)
        ## TuRF is constructed with ReliefF as its core algorithm.
        obj = klass(core_algorithm='ReliefF') if algorithm == 'TuRF' else klass()
        _set_text_params(obj, selector['text_params'])
    elif component_type == 'imblearn':
        is_imblearn = True
        selector = input_json['imblearn_selector']
        algorithm = selector['select_algorithm']
        if algorithm == 'over_sampling.SMOTENC':
            ## Looked up through the imblearn package because the bare
            ## name over_sampling is not imported by this script. The
            ## empty categorical_features list can be overridden by the
            ## text parameters applied below.
            obj = imblearn.over_sampling.SMOTENC(categorical_features=[])
        elif algorithm == 'Z_RandomOverSampler':
            obj = try_get_attr('galaxy_ml.preprocessors',
                               'Z_RandomOverSampler')()
        else:
            ## Other values are spelled as submodule.ClassName.
            parts = algorithm.split('.')
            mod, klass = parts[0], parts[1]
            obj = getattr(getattr(imblearn, mod), klass)()
        _set_text_params(obj, selector['text_params'])
    elif component_type == 'IRAPS':
        iraps_core = try_get_attr('galaxy_ml.iraps_classifier', 'IRAPSCore')()
        core_params = input_json['text_params'].strip()
        if core_params:
            try:
                core_kwargs = safe_eval('dict(' + core_params + ')')
            except ValueError:
                sys.exit("Unsupported parameter input: `%s`" % core_params)
            iraps_core.set_params(**core_kwargs)
        ## Forward only the thresholds that were actually provided.
        options = {name: input_json[name]
                   for name in ('p_thres', 'fc_thres', 'occurrence',
                                'discretize')
                   if input_json[name] is not None}
        IRAPSClassifier = try_get_attr('galaxy_ml.iraps_classifier',
                                       'IRAPSClassifier')
        obj = IRAPSClassifier(iraps_core, **options)
    elif component_type == 'preprocessors':
        ## Copy before popping so the caller's parameter dict stays intact.
        encoder_selection = dict(input_json['encoder_selection'])
        encoder_type = encoder_selection.pop('encoder_type')
        klass = try_get_attr('galaxy_ml.preprocessors', encoder_type)
        obj = klass(**encoder_selection)
    else:
        ## Fail with a readable message instead of an unbound local error.
        sys.exit("Unsupported pipeline component type: `%s`" % component_type)

    ## Steps that expose an n_jobs parameter get the module-level N_JOBS.
    if 'n_jobs' in obj.get_params():
        obj.set_params(n_jobs=N_JOBS)
    return obj, is_imblearn

## Instantiate the requested pre-processing steps in form order.
has_imblearn = False
components = params['pipeline_component']
## A lone component may be left as None and is then skipped; with any
## other count get_component is told to abort on a None type.
reject_none = len(components) != 1
for entry in components:
    step_obj, is_imblearn = get_component(entry['component_selector'],
                                          check_none=reject_none)
    if reject_none or step_obj:
        pipeline_steps.append(step_obj)
        if is_imblearn:
            has_imblearn = True

## Resolve the final estimator. With the 'none' module nothing is added
## and the last pre-processing step acts as the final estimator, so at
## least one step has to exist already.
estimator_json = params['final_estimator']['estimator_selector']
if estimator_json['selected_module'] != 'none':
    pipeline_steps.append(get_estimator(estimator_json))
elif not pipeline_steps:
    sys.exit("No pipeline steps specified!")

## Output stage. The directive lines are template conditionals, so only one
## of the two branches below ends up in the rendered script.
#if $output_type == 'Final_Estimator_Builder':
## Final-estimator output: print and pickle only the last collected step.
with open('$outfile', 'wb') as out_handler:
    final_est = pipeline_steps[-1]
    print(final_est)
    pickle.dump(final_est, out_handler, pickle.HIGHEST_PROTOCOL)
out_obj = final_est
#else:
## Pipeline output: the imbalanced-learn factory is used as soon as one
## step came from the imblearn component type, otherwise scikit-learn's.
if has_imblearn:
    pipeline = imb_make_pipeline(*pipeline_steps)
else:
    pipeline = make_pipeline(*pipeline_steps)
pprint.pprint(pipeline.named_steps)

with open('$outfile', 'wb') as out_handler:
    pickle.dump(pipeline, out_handler, pickle.HIGHEST_PROTOCOL)
out_obj = pipeline
#end if

## Optional: write what get_search_params returns for the saved object as
## a tab-separated table with an unnamed first column.
#if $get_params
results = get_search_params(out_obj)
df = pd.DataFrame(results, columns=['', 'Parameter', 'Value'])
df.to_csv('$outfile_params', sep='\t', index=False)
#end if
            ]]>
        </configfile>
    </configfiles>
    <inputs>
        <repeat name="pipeline_component" min="1" max="5" title="Pre-processing step">
            <conditional name="component_selector">
                <param name="component_type" type="select" label="Choose the type of transformation:">
                    <option value="None" selected="true">None</option>
                    <option value="pre_processor">Sklearn Preprocessor</option>
                    <option value="feature_selection">Feature Selection</option>
                    <option value="decomposition">Matrix Decomposition</option>
                    <option value="kernel_approximation">Kernel Approximation</option>
                    <option value="FeatureAgglomeration">Agglomerate Features</option>
                    <option value="skrebate">SK-rebate Feature Selection</option>
                    <option value="imblearn">Imbalanced-learn Sampling</option>
                    <option value="IRAPS">IRAPS -- feature selector and classifier</option>
                    <option value="preprocessors">Bio-sequence Encoders</option>
                </param>
                <when value="None"/>
                <when value="pre_processor">
                    <conditional name="pre_processors">
                        <expand macro="sparse_preprocessors_ext" />
                        <expand macro="sparse_preprocessor_options_ext" />
                    </conditional>
                </when>
                <when value="feature_selection">
                    <expand macro="feature_selection_pipeline"/>
                </when>
                <when value="decomposition">
                    <expand macro="matrix_decomposition_all"/>
                </when>
                <when value="kernel_approximation">
                    <expand macro="kernel_approximation_all"/>
                </when>
                <when value="FeatureAgglomeration">
                    <expand macro="FeatureAgglomeration"/>
                </when>
                <when value="skrebate">
                    <expand macro="skrebate"/>
                </when>
                <when value="imblearn">
                    <expand macro="imbalanced_learn_sampling"/>
                </when>
                <when value="IRAPS">
                    <expand macro="estimator_params_text"
                        label="Type in parameter settings for IRAPSCore if different from default:"
                        help="Default(=blank): n_iter=1000, responsive_thres=-1, resistant_thres=0, random_state=None. No double quotes"/>
                    <param argument="p_thres" type="float" value="0.001" label="P value threshold" help="Float. default=0.001"/>
                    <param argument="fc_thres" type="float" value="0.1" label="fold change threshold" help="Float. default=0.1"/>
                    <param argument="occurrence" type="float" value="0.7" label="reservation factor" help="Float. default=0.7"/>
                    <param argument="discretize" type="float" value="-1" label="The z_score threshold to discretize target value" help="Float. default=-1"/>
                </when>
                <when value="preprocessors">
                    <expand macro="preprocessors_sequence_encoders"/>
                </when>
            </conditional>
        </repeat>
        <section name="final_estimator" title="Final Estimator" expanded="true">
            <conditional name="estimator_selector">
                <param name="selected_module" type="select" label="Choose the module that contains target estimator:" >
                    <expand macro="estimator_module_options">
                        <option value="binarize_target">Binarize Target Classifier or Regressor</option>
                        <option value="custom_estimator">Load a custom estimator</option>
                        <option value="none">none -- The last component of pre-processing step will turn to a final estimator</option>
                    </expand>
                </param>
                <expand macro="estimator_suboptions">
                    <when value="binarize_target">
                        <param name="clf_or_regr" type="select" label="Classifier or Regressor:">
                            <option value="BinarizeTargetClassifier">BinarizeTargetClassifier</option>
                            <option value="BinarizeTargetRegressor">BinarizeTargetRegressor</option>
                        </param>
                        <param name="wrapped_estimator" type="data" format="zip" label="Choose the dataset containing the wrapped estimator or pipeline"/>
                        <param name='z_score' type="float" value="-1" optional="false" label="Discretize target values using z_score"/>
                        <param name='value' type="float" value="" optional="true" label="Discretize target values using a fixed value instead" help="Optional. default: None."/>
                        <param name="less_is_positive" type="boolean" truevalue="booltrue" falsevalue="boolfalse" checked="true" label="Are the detecting values smaller than others?"/>
                    </when>
                    <when value="custom_estimator">
                        <param name="c_estimator" type="data" format="zip" label="Choose the dataset containing the custom estimator or pipeline"/>
                    </when>
                    <when value="none"/>
                </expand>
            </conditional>
        </section>
        <param name="output_type" type="select" label="Output the final estimator instead?">
            <option value="Pipeline_Builder" selected="true">Pipeline</option>
            <option value="Final_Estimator_Builder">Final Estimator</option>
        </param>
        <param name="get_params" type="boolean" truevalue="booltrue" falsevalue="boolfalse" checked="false" label="Output parameters for searchCV?"
                help="Optional. Tunable parameters could be obtained through `estimator_attributes` tool."/>
    </inputs>
    <!-- Datasets produced by the script: the pickled pipeline or final estimator, plus an optional parameter table. -->
    <outputs>
        <!-- Pickle written by the script; the dataset label is the selected output_type value. -->
        <data format="zip" name="outfile" label="${output_type}"/>
        <!-- Tab-separated parameter table; the filter keeps it only when get_params is enabled. -->
        <data format="tabular" name="outfile_params" label="get_params for ${output_type}">
            <filter>get_params</filter>
        </data>
    </outputs>
    <tests>
        <test>
            <repeat name="pipeline_component">
                <conditional name="component_selector">
                    <param name="component_type" value="pre_processor"/>
                    <conditional name="pre_processors">
                        <param name="selected_pre_processor" value="RobustScaler"/>
                    </conditional>
                </conditional>
            </repeat>
            <repeat name="pipeline_component">
                <conditional name="component_selector">
                    <param name="component_type" value="feature_selection"/>
                    <conditional name="fs_algorithm_selector">
                        <param name="selected_algorithm" value="SelectKBest"/>
                        <param name="score_func" value="f_classif"/>
                    </conditional>
                </conditional>
            </repeat>
            <param name="selected_module" value="svm"/>
            <param name="selected_estimator" value="SVR"/>
            <param name="text_params" value="kernel='linear'"/>
            <output name="outfile" file="pipeline01" compare="sim_size" delta="5"/>
        </test>
        <test>
            <conditional name="component_selector">
                <param name="component_type" value="pre_processor"/>
                <conditional name="pre_processors">
                    <param name="selected_pre_processor" value="RobustScaler"/>
                </conditional>
            </conditional>
            <param name="selected_module" value="linear_model"/>
            <param name="selected_estimator" value="LassoCV"/>
            <output name="outfile" file="pipeline02" compare="sim_size" delta="5"/>
        </test>
        <test>
            <conditional name="component_selector">
                <param name="component_type" value="pre_processor"/>
                <conditional name="pre_processors">
                    <param name="selected_pre_processor" value="RobustScaler"/>
                </conditional>
            </conditional>
            <param name="selected_module" value="xgboost"/>
            <param name="selected_estimator" value="XGBClassifier"/>
            <output name="outfile" file="pipeline03" compare="sim_size" delta="5"/>
        </test>
        <test>
            <conditional name="component_selector">
                <param name="component_type" value="feature_selection"/>
                <conditional name="fs_algorithm_selector">
                    <param name="selected_algorithm" value="SelectFromModel"/>
                    <conditional name="model_inputter">
                        <conditional name="estimator_selector">
                            <param name="selected_module" value="ensemble"/>
                            <param name="selected_estimator" value="AdaBoostClassifier"/>
                        </conditional>
                    </conditional>
                </conditional>
            </conditional>
            <section name="final_estimator">
                <param name="selected_module" value="svm"/>
                <param name="selected_estimator" value="LinearSVC"/>
            </section>
            <output name="outfile" file="pipeline04" compare="sim_size" delta="5"/>
        </test>
        <test>
            <conditional name="component_selector">
                <param name="component_type" value="None"/>
            </conditional>
            <param name="selected_module" value="ensemble"/>
            <param name="selected_estimator" value="RandomForestRegressor"/>
            <param name="text_params" value="n_estimators=100, random_state=42"/>
            <output name="outfile" file="pipeline05" compare="sim_size" delta="5"/>
        </test>
        <test>
            <conditional name="component_selector">
                <param name="component_type" value="decomposition"/>
                    <conditional name="matrix_decomposition_selector">
                        <param name="select_algorithm" value="PCA"/>
                    </conditional>
            </conditional>
            <param name="selected_module" value="ensemble"/>
            <param name="selected_estimator" value="AdaBoostRegressor"/>
            <output name="outfile" file="pipeline06" compare="sim_size" delta="5"/>
        </test>
        <test>
            <conditional name="component_selector">
                <param name="component_type" value="kernel_approximation"/>
                    <conditional name="kernel_approximation_selector">
                        <param name="select_algorithm" value="RBFSampler"/>
                        <param name="text_params" value="n_components=10, gamma=2.0"/>
                    </conditional>
            </conditional>
            <param name="selected_module" value="ensemble"/>
            <param name="selected_estimator" value="AdaBoostClassifier"/>
            <output name="outfile" file="pipeline07" compare="sim_size" delta="5"/>
        </test>
        <test>
            <conditional name="component_selector">
                <param name="component_type" value="FeatureAgglomeration"/>
                    <conditional name="FeatureAgglomeration_selector">
                        <param name="select_algorithm" value="FeatureAgglomeration"/>
                        <param name="text_params" value="n_clusters=3, affinity='euclidean'"/>
                    </conditional>
            </conditional>
            <param name="selected_module" value="ensemble"/>
            <param name="selected_estimator" value="AdaBoostClassifier"/>
            <output name="outfile" file="pipeline08" compare="sim_size" delta="20"/>
        </test>
        <test>
            <conditional name="component_selector">
                <param name="component_type" value="skrebate"/>
                    <conditional name="skrebate_selector">
                        <param name="select_algorithm" value="ReliefF"/>
                        <param name="text_params" value="n_features_to_select=3, n_neighbors=100"/>
                    </conditional>
            </conditional>
            <param name="selected_module" value="ensemble"/>
            <param name="selected_estimator" value="RandomForestRegressor"/>
            <output name="outfile" file="pipeline09" compare="sim_size" delta="5"/>
        </test>
        <test>
            <conditional name="component_selector">
                <param name="component_type" value="None"/>
            </conditional>
            <param name="selected_module" value="ensemble"/>
            <param name="selected_estimator" value="AdaBoostRegressor"/>
            <output name="outfile" file="pipeline10" compare="sim_size" delta="5"/>
        </test>
        <test>
            <conditional name="component_selector">
                <param name="component_type" value="imblearn"/>
                <conditional name="imblearn_selector">
                    <param name="select_algorithm" value="under_sampling.EditedNearestNeighbours"/>
                </conditional>
            </conditional>
            <param name="selected_module" value="ensemble"/>
            <param name="selected_estimator" value="RandomForestClassifier"/>
            <output name="outfile" file="pipeline11" compare="sim_size" delta="5"/>
        </test>
        <test expect_failure="true">
            <conditional name="component_selector">
                <param name="component_type" value="None"/>
            </conditional>
            <param name="selected_module" value="ensemble"/>
            <param name="selected_estimator" value="RandomForestRegressor"/>
            <param name="text_params" value="n_estimators=__import__('os').system('ls ~')"/>
        </test>
        <test>
            <conditional name="component_selector">
                <param name="component_type" value="feature_selection"/>
                <conditional name="fs_algorithm_selector">
                    <param name="selected_algorithm" value="RFE"/>
                    <conditional name="estimator_selector">
                        <param name="selected_module" value="xgboost"/>
                        <param name="selected_estimator" value="XGBRegressor"/>
                        <param name="text_params" value="random_state=0"/>
                    </conditional>
                </conditional>
            </conditional>
            <section name="final_estimator">
                <conditional name="estimator_selector">
                    <param name="selected_module" value="none"/>
                </conditional>
            </section>
            <output name="outfile" file="pipeline12" compare="sim_size" delta="5"/>
        </test>
        <test>
            <conditional name="component_selector">
                <param name="component_type" value="None"/>
            </conditional>
            <param name="selected_module" value="ensemble"/>
            <param name="selected_estimator" value="RandomForestClassifier"/>
            <param name="output_type" value="Final_Estimator_Builder"/>
            <output name="outfile" file="RandomForestClassifier.zip" compare="sim_size" delta="5"/>
        </test>
        <test>
            <conditional name="component_selector">
                <param name="component_type" value="IRAPS"/>
            </conditional>
            <section name="final_estimator">
                <conditional name="estimator_selector">
                    <param name="selected_module" value="none"/>
                </conditional>
            </section>
            <param name="output_type" value="Final_Estimator_Builder"/>
            <output name="outfile" file="pipeline14" compare="sim_size" delta="5"/>
        </test>
        <test>
            <conditional name="component_selector">
                <param name="component_type" value="None"/>
            </conditional>
            <section name="final_estimator">
                <conditional name="estimator_selector">
                    <param name="selected_module" value="binarize_target"/>
                    <param name="clf_or_regr" value="BinarizeTargetClassifier"/>
                    <param name="wrapped_estimator" value="RandomForestClassifier.zip" ftype="zip"/>
                </conditional>
            </section>
            <param name="output_type" value="Final_Estimator_Builder"/>
            <output name="outfile" file="pipeline15" compare="sim_size" delta="5"/>
        </test>
        <test>
            <conditional name="component_selector">
                <param name="component_type" value="preprocessors"/>
                <conditional name="encoder_selection">
                    <param name="encoder_type" value="GenomeOneHotEncoder"/>
                    <param name="seq_length" value="1000"/>
                    <param name="padding" value="True"/>
                </conditional>
            </conditional>
            <section name="final_estimator">
                <conditional name="estimator_selector">
                    <param name="selected_module" value="custom_estimator"/>
                    <param name="c_estimator" value="keras_model02" ftype="zip"/>
                </conditional>
            </section>
            <output name="outfile" file="pipeline16" compare="sim_size" delta="5"/>
        </test>
    </tests>
    <help>
        <![CDATA[
**What it does**
Constructs a pipeline that contains a list of transforms and a final estimator. The pipeline assembles several steps
that can be cross-validated together while setting different parameters.
Please refer to `Scikit-learn pipeline Pipeline`_.

**Pre-processing components** allow None, one or a combination of up to 5 transformations from `sklearn.preprocessing`_, `feature_selection`_, `decomposition`_, `kernel_approximation`_, `cluster.FeatureAgglomeration`_ and/or `skrebate`_.

**Estimator** selector supports estimators from `xgboost`_ and many scikit-learn modules, including `svm`_, `linear_model`_, `ensemble`_, `naive_bayes`_, `tree`_ and `neighbors`_.


.. _`Scikit-learn pipeline Pipeline`: http://scikit-learn.org/stable/modules/generated/sklearn.pipeline.Pipeline.html
.. _`svm`: http://scikit-learn.org/stable/modules/classes.html#module-sklearn.svm
.. _`linear_model`: http://scikit-learn.org/stable/modules/classes.html#module-sklearn.linear_model
.. _`ensemble`: http://scikit-learn.org/stable/modules/classes.html#module-sklearn.ensemble
.. _`naive_bayes`: http://scikit-learn.org/stable/modules/classes.html#module-sklearn.naive_bayes
.. _`tree`: http://scikit-learn.org/stable/modules/classes.html#module-sklearn.tree
.. _`neighbors`: http://scikit-learn.org/stable/modules/classes.html#module-sklearn.neighbors
.. _`xgboost`: https://xgboost.readthedocs.io/en/latest/python/python_api.html

.. _`sklearn.preprocessing`: http://scikit-learn.org/stable/modules/classes.html#module-sklearn.preprocessing
.. _`feature_selection`: http://scikit-learn.org/stable/modules/classes.html#module-sklearn.feature_selection
.. _`decomposition`: http://scikit-learn.org/stable/modules/classes.html#module-sklearn.decomposition
.. _`kernel_approximation`: http://scikit-learn.org/stable/modules/classes.html#module-sklearn.kernel_approximation
.. _`cluster.FeatureAgglomeration`: http://scikit-learn.org/stable/modules/generated/sklearn.cluster.FeatureAgglomeration.html
.. _`skrebate`: https://epistasislab.github.io/scikit-rebate/using/

        ]]>
    </help>
    <expand macro="sklearn_citation">
        <expand macro="skrebate_citation"/>
        <expand macro="xgboost_citation"/>
        <expand macro="imblearn_citation"/>
    </expand>
</tool>
author	bgruening
date	Fri, 01 Nov 2019 17:29:44 -0400
parents	775b004b7920
children	3f3c6dc38f3e