view search_model_validation.xml @ 19:cb5635e30842 draft

"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit e2a5eade6d0e5ddf3a47630381a0ad90d80e8a04"
author bgruening
date Tue, 13 Apr 2021 18:41:36 +0000
parents c1ca24a1509d
children 301e07345c93
line wrap: on
line source

<tool id="sklearn_searchcv" name="Hyperparameter Search" version="@VERSION@" profile="20.05">
    <description>performs hyperparameter optimization using various SearchCVs</description>
    <macros>
        <import>main_macros.xml</import>
        <!-- Inputs expanded into each search scheme branch: the estimator to tune and the parameter search space. -->
        <macro name="search_cv_estimator">
            <param name="infile_estimator" type="data" format="zip" label="Choose the dataset containing pipeline/estimator object"/>
            <param name="is_deep_learning" type="boolean" truevalue="booltrue" falsevalue="boolfalse" checked="false" label="Is the estimator a deep learning model?"/>
            <section name="search_params_builder" title="Search parameters Builder" expanded="true">
                <param name="infile_params" type="data" format="tabular" optional="true" label="Choose the dataset containing parameter names" help="This dataset could be the output of `get_params` in the `Estimator Attributes` tool."/>
                <!-- One repeat per searched parameter: a name chosen from infile_params plus its candidate values. -->
                <repeat name="param_set" min="1" max="30" title="Parameter settings for search:">
                    <param name="sp_name" type="select" optional="true" label="Choose a parameter name (with current value)">
                        <!-- Options come from the infile_params dataset, restricted by the startswith="@" setting. -->
                        <options from_dataset="infile_params" startswith="@">
                            <column name="name" index="2"/>
                            <column name="value" index="1"/>
                            <filter type="unique_value" name="unique_param" column="1"/>
                        </options>
                    </param>
                    <param name="sp_list" type="text" value="" optional="true" label="Search list" help="list or array-like, for example: [1, 10, 100, 1000], [True, False] and ['auto', 'sqrt', None]. See `help` section for more examples">
                        <!-- Adds both quote characters and square brackets to the default set of valid characters. -->
                        <sanitizer>
                            <valid initial="default">
                                <add value="&apos;"/>
                                <add value="&quot;"/>
                                <add value="["/>
                                <add value="]"/>
                            </valid>
                        </sanitizer>
                    </param>
                </repeat>
            </section>
        </macro>
    </macros>
    <!-- Both macros are defined outside this file (presumably in main_macros.xml; confirm). -->
    <expand macro="python_requirements"/>
    <expand macro="macro_stdio"/>
    <!-- Echoes the @VERSION@ token, the same token used for the tool's version attribute. -->
    <version_command>echo "@VERSION@"</version_command>
    <command><![CDATA[
        ## Cheetah template producing the job command line.
        #import re
        export HDF5_USE_FILE_LOCKING='FALSE';
        #if $input_options.selected_input == 'refseq_and_interval'
            ## Name of the compressed copy of the target file in the working directory.
            ## Every character other than word characters, '-' and '.' is replaced by '_'
            ## so the dataset's element identifier cannot break shell quoting or paths.
            #set $target_gz = re.sub('[^\w\-\.]', '_', str($input_options.target_file.element_identifier)) + '.gz'
            bgzip -c '$input_options.target_file' > '$target_gz' &&
            tabix -p bed '$target_gz' &&
        #end if
        python '$__tool_directory__/search_model_validation.py'
            --inputs '$inputs'
            --estimator '$search_schemes.infile_estimator'
            ## Exactly one input flavour is forwarded, depending on the selected input type.
            #if $input_options.selected_input == 'seq_fasta'
                --fasta_path '$input_options.fasta_path'
            #elif $input_options.selected_input == 'refseq_and_interval'
                --ref_seq '$input_options.ref_genome_file'
                --interval '$input_options.interval_file'
                --targets "`pwd`/$target_gz"
            #else
                --infile1 '$input_options.infile1'
            #end if
                --infile2 '$input_options.infile2'
            #if $save != 'save_no_fit'
                --outfile_result '$outfile_result'
            #end if
            #if $save == 'save_estimator'
                --outfile_object '$outfile_object'
            #end if
            #if $save == 'save_no_fit'
                --outfile_object '$outfile_object_no_fit'
            #end if
            ## Weights are only written without nested CV; this mirrors the filter on the
            ## outfile_weights output, which exists only when split_mode is 'no'.
            #if $search_schemes.is_deep_learning == 'booltrue' and $save == 'save_estimator' and $outer_split.split_mode == 'no'
                --outfile_weights '$outfile_weights'
            #end if
            ## Group-based splitters need the extra groups dataset.
            #if $search_schemes.options.cv_selector.selected_cv in ['GroupKFold', 'GroupShuffleSplit', 'LeaveOneGroupOut', 'LeavePGroupsOut']
                --groups '$search_schemes.options.cv_selector.groups_selector.infile_g'
            #end if
    ]]>
    </command>
    <configfiles>
        <!-- Referenced as $inputs in the command template and handed to the script through its inputs option. -->
        <inputs name="inputs" />
    </configfiles>
    <inputs>
        <!-- Search scheme selection; both branches share the estimator inputs and the SearchCV options. -->
        <conditional name="search_schemes">
            <param name="selected_search_scheme" type="select" label="Select a model selection search scheme">
                <option value="GridSearchCV" selected="true">GridSearchCV - Exhaustive search over specified parameter values for an estimator </option>
                <option value="RandomizedSearchCV">RandomizedSearchCV - Randomized search on hyper parameters for an estimator</option>
            </param>
            <when value="GridSearchCV">
                <expand macro="search_cv_estimator"/>
                <section name="options" title="Advanced Options for SearchCV" expanded="false">
                    <expand macro="search_cv_options"/>
                </section>
            </when>
            <!-- The randomized scheme additionally exposes n_iter and a random state. -->
            <when value="RandomizedSearchCV">
                <expand macro="search_cv_estimator"/>
                <section name="options" title="Advanced Options for SearchCV" expanded="false">
                    <expand macro="search_cv_options"/>
                    <param argument="n_iter" type="integer" value="10" label="Number of parameter settings that are sampled"/>
                    <expand macro="random_state"/>
                </section>
            </when>
        </conditional>
        <!-- Training data inputs; the macro is defined outside this file. -->
        <expand macro="sl_mixed_input_plus_sequence"/>
        <!-- Optional outer splitter for nested cross-validation. -->
        <conditional name="outer_split">
            <param name="split_mode" type="select" label="Whether to hold a portion of samples for test exclusively?" help="Nested CV or train_test_split">
                <option value="no" selected="true">Nope</option>
                <option value="nested_cv">Yes - do nested CV</option>
            </param>
            <when value="no"/>
            <when value="nested_cv">
                <expand macro="cv_reduced" label="Select the outer cv splitter"/>
            </when>
        </conditional>
        <!-- NOTE(review): save_estimator is the default here, yet the tests that check outfile_object set it explicitly; confirm the intended default. -->
        <param name="save" type="select" label="Save best estimator?" help="For a non-deep learning model, save will output fitted best_estimator_ (refit must be true) or a list of cv_results_ from each outer split in nested CV mode. For a deep learning model, by checking the boolean option below the model input, the outputs are two parts, model skeleton and weights. Save Deep learning model for nested CV is not supported.">
            <option value="nope">Nope, save is unnecessary</option>
            <option value="save_estimator" selected="true">Fitted best estimator or Detailed cv_results_ from nested CV</option>
            <option value="save_no_fit">SearchCV object without fitting</option>
        </param>
    </inputs>
    <outputs>
        <!-- Tabular search results; omitted only when the unfitted SearchCV object is requested. -->
        <data name="outfile_result" format="tabular">
            <filter>save != 'save_no_fit'</filter>
        </data>
        <!-- Saved estimator object, produced only without nested CV. -->
        <data name="outfile_object" format="zip" label="Fitted estimator or estimator skeleton on ${on_string}">
            <filter>save == 'save_estimator' and outer_split['split_mode'] == 'no'</filter>
        </data>
        <!-- Weights file for deep learning estimators, produced only without nested CV. -->
        <data name="outfile_weights" format="h5" label="Weights trained on ${on_string}">
            <filter>search_schemes['is_deep_learning'] and save == 'save_estimator' and outer_split['split_mode'] == 'no'</filter>
        </data>
        <!-- Nested CV for non deep learning estimators: files discovered in the cv_results_in_folds directory. -->
        <collection name="outfile_in_splits" type="list" label="cv_results_ from splits on ${on_string}">
            <filter>not search_schemes['is_deep_learning'] and save == 'save_estimator' and outer_split['split_mode'] == 'nested_cv'</filter>
            <discover_datasets format="tabular" pattern="__name__" directory="cv_results_in_folds"/>
        </collection>
        <!-- Unfitted SearchCV object. -->
        <data name="outfile_object_no_fit" format="zip" label="Unfitted SearchCV on ${on_string}">
            <filter>save == 'save_no_fit'</filter>
        </data>
    </outputs>
    <tests>
        <!-- GridSearchCV on pipeline01 over svr__C and selectkbest__k with error_score set to false; checks a score and a parameter combination in the result table. -->
        <test>
            <param name="selected_search_scheme" value="GridSearchCV"/>
            <param name="infile_estimator" value="pipeline01" ftype="zip"/>
            <param name="infile_params" value="get_params01.tabular" ftype="tabular"/>
            <repeat name="param_set">
                <param name="sp_name" value="svr__C"/>
                <param name="sp_list" value="[1, 10, 100, 1000]"/>
            </repeat>
            <repeat name="param_set">
                <param name="sp_name" value="selectkbest__k"/>
                <param name="sp_list" value="[-1, 3, 5, 7, 9]"/>
            </repeat>
            <param name="error_score" value="false"/>
            <param name="infile1" value="regression_X.tabular" ftype="tabular"/>
            <param name="header1" value="true"/>
            <param name="selected_column_selector_option" value="all_columns"/>
            <param name="infile2" value="regression_y.tabular" ftype="tabular"/>
            <param name="header2" value="true"/>
            <param name="selected_column_selector_option2" value="all_columns"/>
            <output name="outfile_result">
                <assert_contents>
                    <has_n_columns n="13"/>
                    <has_text text="0.7938837807353147"/>
                    <has_text text="{'selectkbest__k': 9, 'svr__C': 1}"/>
                </assert_contents>
            </output>
        </test>
        <!-- Same grid as the previous test but with error_score set to true; the job is expected to fail. -->
        <test expect_failure="true">
            <param name="selected_search_scheme" value="GridSearchCV"/>
            <param name="infile_estimator" value="pipeline01" ftype="zip"/>
            <param name="infile_params" value="get_params01.tabular" ftype="tabular"/>
            <repeat name="param_set">
                <param name="sp_name" value="svr__C"/>
                <param name="sp_list" value="[1, 10, 100, 1000]"/>
            </repeat>
            <repeat name="param_set">
                <param name="sp_name" value="selectkbest__k"/>
                <param name="sp_list" value="[-1, 3, 5, 7, 9]"/>
            </repeat>
            <param name="error_score" value="true"/>
            <param name="infile1" value="regression_X.tabular" ftype="tabular"/>
            <param name="header1" value="true"/>
            <param name="selected_column_selector_option" value="all_columns"/>
            <param name="infile2" value="regression_y.tabular" ftype="tabular"/>
            <param name="header2" value="true"/>
            <param name="selected_column_selector_option2" value="all_columns"/>
        </test>
        <!-- RandomizedSearchCV on pipeline01 with four searched parameters (numeric, string and boolean lists). -->
        <test>
            <param name="selected_search_scheme" value="RandomizedSearchCV"/>
            <param name="infile_estimator" value="pipeline01" ftype="zip"/>
            <param name="infile_params" value="get_params01.tabular" ftype="tabular"/>
            <repeat name="param_set">
                <param name="sp_name" value="svr__C"/>
                <param name="sp_list" value="[1, 10, 100, 1000]"/>
            </repeat>
            <repeat name="param_set">
                <param name="sp_name" value="svr__kernel"/>
                <param name="sp_list" value="['linear', 'poly', 'rbf', 'sigmoid']"/>
            </repeat>
            <repeat name="param_set">
                <param name="sp_name" value="selectkbest__k"/>
                <param name="sp_list" value="[3, 5, 7, 9]"/>
            </repeat>
            <repeat name="param_set">
                <param name="sp_name" value="robustscaler__with_centering"/>
                <param name="sp_list" value="[True, False]"/>
            </repeat>
            <param name="infile1" value="regression_X.tabular" ftype="tabular"/>
            <param name="header1" value="true"/>
            <param name="selected_column_selector_option" value="all_columns"/>
            <param name="infile2" value="regression_y.tabular" ftype="tabular"/>
            <param name="header2" value="true"/>
            <param name="selected_column_selector_option2" value="all_columns"/>
            <output name="outfile_result">
                <assert_contents>
                    <has_n_columns n="15"/>
                    <has_text text="param_robustscaler__with_centering"/>
                </assert_contents>
            </output>
        </test>
        <!-- RandomizedSearchCV on pipeline03 whose search spaces use np_arange, scipy_stats distributions and a fixed list. -->
        <test>
            <param name="selected_search_scheme" value="RandomizedSearchCV"/>
            <param name="infile_estimator" value="pipeline03" ftype="zip"/>
            <param name="infile_params" value="get_params03.tabular" ftype="tabular"/>
            <repeat name="param_set">
                <param name="sp_name" value="xgbclassifier__n_estimators"/>
                <param name="sp_list" value="np_arange(50, 1001, 50)"/>
            </repeat>
            <repeat name="param_set">
                <param name="sp_name" value="xgbclassifier__max_depth"/>
                <param name="sp_list" value="scipy_stats_randint(1, 51)"/>
            </repeat>
            <repeat name="param_set">
                <param name="sp_name" value="xgbclassifier__gamma"/>
                <param name="sp_list" value="scipy_stats_uniform(0., 1.)"/>
            </repeat>
            <repeat name="param_set">
                <param name="sp_name" value="xgbclassifier__random_state"/>
                <param name="sp_list" value="[324089]"/>
            </repeat>
            <param name="infile1" value="regression_X.tabular" ftype="tabular"/>
            <param name="header1" value="true"/>
            <param name="selected_column_selector_option" value="all_columns"/>
            <param name="infile2" value="regression_y.tabular" ftype="tabular"/>
            <param name="header2" value="true"/>
            <param name="selected_column_selector_option2" value="all_columns"/>
            <output name="outfile_result">
                <assert_contents>
                    <has_n_columns n="15"/>
                    <has_text text="param_xgbclassifier__max_depth"/>
                </assert_contents>
            </output>
        </test>
        <!-- GridSearchCV on pipeline04 where one search list is built with list(range(...)) and another holds an estimator object. -->
        <test>
            <param name="selected_search_scheme" value="GridSearchCV"/>
            <param name="infile_estimator" value="pipeline04" ftype="zip"/>
            <param name="infile_params" value="get_params04.tabular" ftype="tabular"/>
            <repeat name="param_set">
                <param name="sp_name" value="linearsvc__random_state"/>
                <param name="sp_list" value="list(range(100, 1001, 100))"/>
            </repeat>
            <repeat name="param_set">
                <param name="sp_name" value="selectfrommodel__estimator"/>
                <param name="sp_list" value=": [sklearn_ensemble.ExtraTreesClassifier(n_estimators=100, random_state=324089)]"/>
            </repeat>
            <param name="infile1" value="regression_X.tabular" ftype="tabular"/>
            <param name="header1" value="true"/>
            <param name="selected_column_selector_option" value="all_columns"/>
            <param name="infile2" value="regression_y.tabular" ftype="tabular"/>
            <param name="header2" value="true"/>
            <param name="selected_column_selector_option2" value="all_columns"/>
            <output name="outfile_result">
                <assert_contents>
                    <has_n_columns n="13"/>
                    <has_text text="0.05363984674329502"/>
                </assert_contents>
            </output>
        </test>
        <!-- GridSearchCV on pipeline01 with save set to save_estimator; the saved object is compared to searchCV01 by size. -->
        <test>
            <param name="selected_search_scheme" value="GridSearchCV"/>
            <param name="infile_estimator" value="pipeline01" ftype="zip"/>
            <param name="infile_params" value="get_params01.tabular" ftype="tabular"/>
            <repeat name="param_set">
                <param name="sp_name" value="svr__C"/>
                <param name="sp_list" value="[1, 10, 100, 1000]"/>
            </repeat>
            <param name="infile1" value="regression_X.tabular" ftype="tabular"/>
            <param name="header1" value="true"/>
            <param name="selected_column_selector_option" value="all_columns"/>
            <param name="infile2" value="regression_y.tabular" ftype="tabular"/>
            <param name="header2" value="true"/>
            <param name="selected_column_selector_option2" value="all_columns"/>
            <param name="save" value="save_estimator"/>
            <output name="outfile_object" file="searchCV01" compare="sim_size" delta="10"/>
        </test>
        <!-- GridSearchCV on pipeline06 over adaboostregressor__n_estimators with a fixed random state. -->
        <test>
            <param name="selected_search_scheme" value="GridSearchCV"/>
            <param name="infile_estimator" value="pipeline06" ftype="zip"/>
            <param name="infile_params" value="get_params06.tabular" ftype="tabular"/>
            <repeat name="param_set">
                <param name="sp_name" value="adaboostregressor__n_estimators"/>
                <param name="sp_list" value="[10, 50, 200, 1000]"/>
            </repeat>
            <repeat name="param_set">
                <param name="sp_name" value="adaboostregressor__random_state"/>
                <param name="sp_list" value="[324089]"/>
            </repeat>
            <param name="infile1" value="regression_X.tabular" ftype="tabular"/>
            <param name="header1" value="true"/>
            <param name="selected_column_selector_option" value="all_columns"/>
            <param name="infile2" value="regression_y.tabular" ftype="tabular"/>
            <param name="header2" value="true"/>
            <param name="selected_column_selector_option2" value="all_columns"/>
            <output name="outfile_result">
                <assert_contents>
                    <has_n_columns n="13"/>
                    <has_text_matching expression=".+0.7772355090078996"/>
                </assert_contents>
            </output>
        </test>
        <!-- GridSearchCV on pipeline07 with the default splitter and n_splits set to 3. The expression requires the score to be followed by a non-digit, so a longer number cannot match. -->
        <test>
            <param name="selected_search_scheme" value="GridSearchCV"/>
            <param name="infile_estimator" value="pipeline07" ftype="zip"/>
            <param name="infile_params" value="get_params07.tabular" ftype="tabular"/>
            <repeat name="param_set">
                <param name="sp_list" value="[10, 50, 100, 200]"/>
                <param name="sp_name" value="adaboostclassifier__n_estimators"/>
            </repeat>
            <repeat name="param_set">
                <param name="sp_list" value="[324089]"/>
                <param name="sp_name" value="adaboostclassifier__random_state"/>
            </repeat>
            <repeat name="param_set">
                <param name="sp_list" value="[1.0, 2.0]"/>
                <param name="sp_name" value="rbfsampler__gamma"/>
            </repeat>
            <param name="selected_cv" value="default"/>
            <param name="n_splits" value="3"/>
            <param name="infile1" value="regression_X.tabular" ftype="tabular"/>
            <param name="header1" value="true" />
            <param name="selected_column_selector_option" value="all_columns"/>
            <param name="infile2" value="regression_y.tabular" ftype="tabular"/>
            <param name="header2" value="true" />
            <param name="selected_column_selector_option2" value="all_columns"/>
            <output name="outfile_result">
                <assert_contents>
                    <has_n_columns n="14"/>
                    <has_text_matching expression=".+0\.05747126436781609[^\d]" />
                </assert_contents>
            </output>
        </test>
        <!-- GridSearchCV on pipeline08 over AdaBoost estimators and feature agglomeration linkage. The expression expects the score, then only non-word separators, then the value 10 delimited by non-word characters. -->
        <test>
            <param name="selected_search_scheme" value="GridSearchCV"/>
            <param name="infile_estimator" value="pipeline08" ftype="zip"/>
            <param name="infile_params" value="get_params08.tabular" ftype="tabular"/>
            <repeat name="param_set">
                <param name="sp_list" value="[10, 50, 100, 200]"/>
                <param name="sp_name" value="adaboostclassifier__n_estimators"/>
            </repeat>
            <repeat name="param_set">
                <param name="sp_list" value="[324089]"/>
                <param name="sp_name" value="adaboostclassifier__random_state"/>
            </repeat>
            <repeat name="param_set">
                <param name="sp_list" value="['ward', 'complete', 'average']"/>
                <param name="sp_name" value="featureagglomeration__linkage"/>
            </repeat>
            <param name="infile1" value="regression_X.tabular" ftype="tabular"/>
            <param name="header1" value="true" />
            <param name="selected_column_selector_option" value="all_columns"/>
            <param name="infile2" value="regression_y.tabular" ftype="tabular"/>
            <param name="header2" value="true" />
            <param name="selected_column_selector_option2" value="all_columns"/>
            <output name="outfile_result">
                <assert_contents>
                    <has_text_matching expression=".+0\.08045977011494253[^\w]+10[^\w]" />
                </assert_contents>
            </output>
        </test>
        <!-- GridSearchCV on pipeline01 with a shuffled 3-split StratifiedKFold; the saved object is compared to searchCV02 by size. -->
        <test>
            <param name="selected_search_scheme" value="GridSearchCV"/>
            <param name="infile_estimator" value="pipeline01" ftype="zip"/>
            <param name="infile_params" value="get_params01.tabular" ftype="tabular"/>
            <repeat name="param_set">
                <param name="sp_name" value="svr__C"/>
                <param name="sp_list" value="[1, 10, 100, 1000]"/>
            </repeat>
            <param name="selected_cv" value="StratifiedKFold"/>
            <param name="n_splits" value="3"/>
            <param name="shuffle" value="true"/>
            <param name="random_state" value="10"/>
            <param name="infile1" value="regression_X.tabular" ftype="tabular"/>
            <param name="header1" value="true"/>
            <param name="selected_column_selector_option" value="all_columns"/>
            <param name="infile2" value="regression_y.tabular" ftype="tabular"/>
            <param name="header2" value="true"/>
            <param name="selected_column_selector_option2" value="all_columns"/>
            <param name="save" value="save_estimator"/>
            <output name="outfile_object" file="searchCV02" compare="sim_size" delta="10"/>
        </test>
        <!-- GridSearchCV on pipeline03 scored with balanced_accuracy on a shuffled 3-split StratifiedKFold. -->
        <test>
            <param name="selected_search_scheme" value="GridSearchCV"/>
            <param name="infile_estimator" value="pipeline03" ftype="zip"/>
            <param name="infile_params" value="get_params03.tabular" ftype="tabular"/>
            <repeat name="param_set">
                <param name="sp_name" value="xgbclassifier__n_estimators"/>
                <param name="sp_list" value="[10, 50, 200, 1000]"/>
            </repeat>
            <repeat name="param_set">
                <param name="sp_name" value="xgbclassifier__random_state"/>
                <param name="sp_list" value="[324089]"/>
            </repeat>
            <param name="primary_scoring" value="balanced_accuracy"/>
            <param name="selected_cv" value="StratifiedKFold"/>
            <param name="n_splits" value="3"/>
            <param name="shuffle" value="true"/>
            <param name="random_state" value="10"/>
            <param name="infile1" value="regression_X.tabular" ftype="tabular"/>
            <param name="header1" value="true"/>
            <param name="selected_column_selector_option" value="all_columns"/>
            <param name="infile2" value="regression_y.tabular" ftype="tabular"/>
            <param name="header2" value="true"/>
            <param name="selected_column_selector_option2" value="all_columns"/>
            <output name="outfile_result">
                <assert_contents>
                    <has_n_columns n="13"/>
                    <has_text text="0.08719866399898475"/>
                </assert_contents>
            </output>
        </test>
        <!-- GridSearchCV on pipeline02 over a single parameter, lassocv__eps. -->
        <test>
            <param name="selected_search_scheme" value="GridSearchCV"/>
            <param name="infile_estimator" value="pipeline02" ftype="zip"/>
            <param name="infile_params" value="get_params02.tabular" ftype="tabular"/>
            <repeat name="param_set">
                <param name="sp_name" value="lassocv__eps"/>
                <param name="sp_list" value="[0.01, 0.001]"/>
            </repeat>
            <param name="infile1" value="regression_X.tabular" ftype="tabular"/>
            <param name="header1" value="true"/>
            <param name="selected_column_selector_option" value="all_columns"/>
            <param name="infile2" value="regression_y.tabular" ftype="tabular"/>
            <param name="header2" value="true"/>
            <param name="selected_column_selector_option2" value="all_columns"/>
            <output name="outfile_result">
                <assert_contents>
                    <has_n_columns n="12"/>
                    <has_text text="0.776296816136668"/>
                </assert_contents>
            </output>
        </test>
        <!-- GridSearchCV on pipeline05 over n_estimators, a parameter name without a step prefix. -->
        <test>
            <param name="selected_search_scheme" value="GridSearchCV"/>
            <param name="infile_estimator" value="pipeline05" ftype="zip"/>
            <param name="infile_params" value="get_params05.tabular" ftype="tabular"/>
            <repeat name="param_set">
                <param name="sp_name" value="n_estimators"/>
                <param name="sp_list" value="[10, 50, 100, 300]"/>
            </repeat>
            <param name="infile1" value="regression_X.tabular" ftype="tabular"/>
            <param name="header1" value="true"/>
            <param name="selected_column_selector_option" value="all_columns"/>
            <param name="infile2" value="regression_y.tabular" ftype="tabular"/>
            <param name="header2" value="true"/>
            <param name="selected_column_selector_option2" value="all_columns"/>
            <output name="outfile_result">
                <assert_contents>
                    <has_n_columns n="12"/>
                    <has_text text="0.8176576686816003"/>
                </assert_contents>
            </output>
        </test>
        <!-- Safety check: a search list that calls open() on a file must make the job fail. -->
        <test expect_failure="true">
            <param name="selected_search_scheme" value="GridSearchCV"/>
            <param name="infile_estimator" value="pipeline01" ftype="zip"/>
            <param name="infile_params" value="get_params01.tabular" ftype="tabular"/>
            <repeat name="param_set">
                <param name="sp_name" value="svr__C"/>
                <param name="sp_list" value="open('~/.ssh/authorized_keys', 'r').read()"/>
            </repeat>
            <param name="infile1" value="regression_X.tabular" ftype="tabular"/>
            <param name="header1" value="true"/>
            <param name="selected_column_selector_option" value="all_columns"/>
            <param name="infile2" value="regression_y.tabular" ftype="tabular"/>
            <param name="header2" value="true"/>
            <param name="selected_column_selector_option2" value="all_columns"/>
        </test>
        <!-- GridSearchCV on pipeline10 where the searched values for adaboostregressor__base_estimator are estimator objects. -->
        <test>
            <param name="selected_search_scheme" value="GridSearchCV"/>
            <param name="infile_estimator" value="pipeline10" ftype="zip"/>
            <param name="infile_params" value="get_params10.tabular" ftype="tabular"/>
            <repeat name="param_set">
                <param name="sp_name" value="adaboostregressor__base_estimator"/>
                <param name="sp_list" value=": [sklearn_tree.DecisionTreeRegressor(random_state=0), sklearn_tree.ExtraTreeRegressor(random_state=0)]"/>
            </repeat>
            <repeat name="param_set">
                <param name="sp_name" value="adaboostregressor__random_state"/>
                <param name="sp_list" value="[10]"/>
            </repeat>
            <param name="infile1" value="regression_X.tabular" ftype="tabular"/>
            <param name="header1" value="true"/>
            <param name="selected_column_selector_option" value="all_columns"/>
            <param name="infile2" value="regression_y.tabular" ftype="tabular"/>
            <param name="header2" value="true"/>
            <param name="selected_column_selector_option2" value="all_columns"/>
            <output name="outfile_result">
                <assert_contents>
                    <has_n_columns n="13"/>
                    <has_text text="0.8165699136618538"/>
                </assert_contents>
            </output>
        </test>
        <!-- GridSearchCV on pipeline09 where the values searched for the relieff parameter are transformer objects. -->
        <test>
            <param name="selected_search_scheme" value="GridSearchCV"/>
            <param name="infile_estimator" value="pipeline09" ftype="zip"/>
            <param name="infile_params" value="get_params09.tabular" ftype="tabular"/>
            <repeat name="param_set">
                <param name="sp_name" value="relieff"/>
                <param name="sp_list" value=": [sklearn_feature_selection.SelectKBest(),
                        sklearn_feature_selection.VarianceThreshold(), skrebate_ReliefF(), sklearn_preprocessing.RobustScaler()]"/>
            </repeat>
            <repeat name="param_set">
                <param name="sp_name" value="randomforestregressor__random_state"/>
                <param name="sp_list" value="[10]"/>
            </repeat>
            <param name="infile1" value="regression_X.tabular" ftype="tabular"/>
            <param name="header1" value="true"/>
            <param name="selected_column_selector_option" value="all_columns"/>
            <param name="infile2" value="regression_y.tabular" ftype="tabular"/>
            <param name="header2" value="true"/>
            <param name="selected_column_selector_option2" value="all_columns"/>
            <output name="outfile_result">
                <assert_contents>
                    <has_n_columns n="13"/>
                    <has_text text="0.8151250518677202"/>
                </assert_contents>
            </output>
        </test>
        <!-- GridSearchCV on pipeline09 where the relieff search list mixes None, a string, integers and a ReliefF object. -->
        <test>
            <param name="selected_search_scheme" value="GridSearchCV"/>
            <param name="infile_estimator" value="pipeline09" ftype="zip"/>
            <param name="infile_params" value="get_params09.tabular" ftype="tabular"/>
            <repeat name="param_set">
                <param name="sp_name" value="relieff"/>
                <param name="sp_list" value=": [None,'sk_prep_all', 7, 13, skrebate_ReliefF(n_features_to_select=12)]"/>
            </repeat>
            <repeat name="param_set">
                <param name="sp_name" value="randomforestregressor__random_state"/>
                <param name="sp_list" value="[10]"/>
            </repeat>
            <param name="infile1" value="regression_X.tabular" ftype="tabular"/>
            <param name="header1" value="true"/>
            <param name="selected_column_selector_option" value="all_columns"/>
            <param name="infile2" value="regression_y.tabular" ftype="tabular"/>
            <param name="header2" value="true"/>
            <param name="selected_column_selector_option2" value="all_columns"/>
            <output name="outfile_result">
                <assert_contents>
                    <has_n_columns n="13"/>
                    <has_text text="0.8151250518677202"/>
                </assert_contents>
            </output>
        </test>
        <!-- GridSearchCV on pipeline11 with the imblearn data, f1_macro as primary scoring, two secondary scorers and n_splits set to 5. -->
        <test>
            <param name="selected_search_scheme" value="GridSearchCV"/>
            <param name="infile_estimator" value="pipeline11" ftype="zip"/>
            <param name="infile_params" value="get_params11.tabular" ftype="tabular"/>
            <repeat name="param_set">
                <param name="sp_name" value="editednearestneighbours__n_neighbors"/>
                <param name="sp_list" value="[3,4,5]"/>
            </repeat>
            <repeat name="param_set">
                <param name="sp_name" value="editednearestneighbours__random_state"/>
                <param name="sp_list" value="[10]"/>
            </repeat>
            <repeat name="param_set">
                <param name="sp_name" value="randomforestclassifier__n_estimators"/>
                <param name="sp_list" value="[10, 50, 100, 500]"/>
            </repeat>
            <repeat name="param_set">
                <param name="sp_name" value="randomforestclassifier__random_state"/>
                <param name="sp_list" value="[10]"/>
            </repeat>
            <param name="primary_scoring" value="f1_macro"/>
            <param name="secondary_scoring" value="balanced_accuracy,accuracy"/>
            <param name="n_splits" value="5"/>
            <param name="infile1" value="imblearn_X.tabular" ftype="tabular"/>
            <param name="header1" value="true"/>
            <param name="selected_column_selector_option" value="all_columns"/>
            <param name="infile2" value="imblearn_y.tabular" ftype="tabular"/>
            <param name="header2" value="true"/>
            <param name="selected_column_selector_option2" value="all_columns"/>
            <output name="outfile_result">
                <assert_contents>
                    <has_n_columns n="33"/>
                    <has_text text="0.9945648481554453"/>
                    <has_text text="0.9988888888888889"/>
                    <has_text text="0.998"/>
                </assert_contents>
            </output>
        </test>
        <test>
            <!-- GridSearchCV over pipeline12 on the regression data: searches
                 estimator__n_estimators and n_features_to_select, scored with r2. -->
            <param name="selected_search_scheme" value="GridSearchCV"/>
            <param name="infile_estimator" value="pipeline12" ftype="zip"/>
            <param name="infile_params" value="get_params12.tabular" ftype="tabular"/>
            <repeat name="param_set">
                <param name="sp_name" value="estimator__n_estimators"/>
                <param name="sp_list" value="[10, 100, 200]"/>
            </repeat>
            <repeat name="param_set">
                <param name="sp_name" value="n_features_to_select"/>
                <param name="sp_list" value="[10, None]"/>
            </repeat>
            <param name="primary_scoring" value="r2"/>
            <param name="infile1" value="regression_X.tabular" ftype="tabular"/>
            <param name="header1" value="true"/>
            <param name="selected_column_selector_option" value="all_columns"/>
            <param name="infile2" value="regression_y.tabular" ftype="tabular"/>
            <param name="header2" value="true"/>
            <param name="selected_column_selector_option2" value="all_columns"/>
            <!-- The result table must have 13 columns and contain the expected value. -->
            <output name="outfile_result">
                <assert_contents>
                    <has_n_columns n="13"/>
                    <has_text text="0.8149439619875293"/>
                </assert_contents>
            </output>
        </test>
        <test>
            <!-- GridSearchCV over pipeline05 with the outer split set to nested_cv
                 (KFold, 3 splits, shuffled, random_state 123). Unlike the tests above,
                 the search inputs here are addressed through their enclosing
                 conditional and section. -->
            <conditional name="search_schemes">
                <param name="selected_search_scheme" value="GridSearchCV"/>
                <param name="infile_estimator" value="pipeline05" ftype="zip"/>
                <section name="search_params_builder">
                    <param name="infile_params" value="get_params05.tabular" ftype="tabular"/>
                    <repeat name="param_set">
                        <param name="sp_name" value="n_estimators"/>
                        <param name="sp_list" value="[10, 50, 100, 300]"/>
                    </repeat>
                </section>
            </conditional>
            <param name="infile1" value="regression_X.tabular" ftype="tabular"/>
            <param name="header1" value="true"/>
            <param name="selected_column_selector_option" value="all_columns"/>
            <param name="infile2" value="regression_y.tabular" ftype="tabular"/>
            <param name="header2" value="true"/>
            <param name="selected_column_selector_option2" value="all_columns"/>
            <conditional name="outer_split">
                <param name="split_mode" value="nested_cv"/>
                <conditional name="cv_selector">
                    <param name="selected_cv" value="KFold"/>
                    <param name="n_splits" value="3"/>
                    <param name="shuffle" value="true"/>
                    <param name="random_state" value="123"/>
                </conditional>
            </conditional>
            <!-- The result table must have 4 columns and contain the expected value. -->
            <output name="outfile_result">
                <assert_contents>
                    <has_n_columns n="4"/>
                    <has_text text="0.8044418936007722"/>
                </assert_contents>
            </output>
        </test>
    </tests>
    <help>
        <![CDATA[
**What it does**

Searches for optimal parameter settings for an estimator or pipeline using either an exhaustive grid search or a randomized search, both with cross-validation.
Please refer to `Scikit-learn model_selection GridSearchCV`_, `Scikit-learn model_selection RandomizedSearchCV`_ and `Tuning hyper-parameters`_.

**Return**

Outputs `cv_results_` from the SearchCV in a tabular dataset if no train_test_split is performed; otherwise outputs the test score(s). Optionally, the SearchCV object itself can also be output.

**How to choose the search parameter grid?**

Please refer to `svm`_, `linear_model`_, `ensemble`_, `naive_bayes`_, `tree`_, `neighbors`_ and `xgboost`_ for estimator parameters.
Refer to `sklearn.preprocessing`_, `feature_selection`_, `decomposition`_, `kernel_approximation`_, `cluster.FeatureAgglomeration`_
and `skrebate`_ for parameters in the pre-processing steps.

**Search parameter list** can be a list, a numpy array, or a distribution. The evaluation of settings supports operations in Math,
list comprehension, numpy.arange (np_arange), most numpy.random functions (e.g., np_random_uniform), some scipy.stats classes or functions (e.g., scipy_stats_zipf), and others.

Examples:

- [3, 5, 7, 9]

- list(range(50, 1001, 50))

- np_arange(0.01, 1, 0.1)

- np_random_choice(list(range(1, 51)) + [None], size=20)

- scipy_stats_randint(1, 11)

**Estimator / Preprocessor search (additional `:` in the front)**::

     : [sklearn_tree.DecisionTreeRegressor(), sklearn_tree.ExtraTreeRegressor()]

     : [sklearn_feature_selection.SelectKBest(), sklearn_feature_selection.VarianceThreshold(),
        skrebate_ReliefF(), sklearn_preprocessing.RobustScaler()]

**Hot number/keyword for preprocessors**::

    0   sklearn_preprocessing.StandardScaler(copy=True, with_mean=True, with_std=True)
    1   sklearn_preprocessing.Binarizer(copy=True, threshold=0.0)
    2   sklearn_preprocessing.MaxAbsScaler(copy=True)
    3   sklearn_preprocessing.Normalizer(copy=True, norm='l2')
    4   sklearn_preprocessing.MinMaxScaler(copy=True, feature_range=(0, 1))
    5   sklearn_preprocessing.PolynomialFeatures(degree=2, include_bias=True, interaction_only=False)
    6   sklearn_preprocessing.RobustScaler(copy=True, quantile_range=(25.0, 75.0), with_centering=True, with_scaling=True)
    7   sklearn_feature_selection.SelectKBest(k=10, score_func=<function f_classif at 0x113806d90>)
    8   sklearn_feature_selection.GenericUnivariateSelect(mode='percentile', param=1e-05, score_func=<function f_classif at 0x113806d90>)
    9   sklearn_feature_selection.SelectPercentile(percentile=10, score_func=<function f_classif at 0x113806d90>)
    10  sklearn_feature_selection.SelectFpr(alpha=0.05, score_func=<function f_classif at 0x113806d90>)
    11  sklearn_feature_selection.SelectFdr(alpha=0.05, score_func=<function f_classif at 0x113806d90>)
    12  sklearn_feature_selection.SelectFwe(alpha=0.05, score_func=<function f_classif at 0x113806d90>)
    13  sklearn_feature_selection.VarianceThreshold(threshold=0.0)
    14  sklearn_decomposition.FactorAnalysis(copy=True, iterated_power=3, max_iter=1000, n_components=None,
        noise_variance_init=None, random_state=0, svd_method='randomized', tol=0.01)
    15  sklearn_decomposition.FastICA(algorithm='parallel', fun='logcosh', fun_args=None,
        max_iter=200, n_components=None, random_state=0, tol=0.0001, w_init=None, whiten=True)
    16  sklearn_decomposition.IncrementalPCA(batch_size=None, copy=True, n_components=None, whiten=False)
    17  sklearn_decomposition.KernelPCA(alpha=1.0, coef0=1, copy_X=True, degree=3, eigen_solver='auto',
        fit_inverse_transform=False, gamma=None, kernel='linear', kernel_params=None, max_iter=None,
        n_components=None, random_state=0, remove_zero_eig=False, tol=0)
    18  sklearn_decomposition.LatentDirichletAllocation(batch_size=128, doc_topic_prior=None, evaluate_every=-1, learning_decay=0.7,
        learning_method=None, learning_offset=10.0, max_doc_update_iter=100, max_iter=10, mean_change_tol=0.001, n_components=10,
        n_topics=None, perp_tol=0.1, random_state=0, topic_word_prior=None, total_samples=1000000.0, verbose=0)
    19  sklearn_decomposition.MiniBatchDictionaryLearning(alpha=1, batch_size=3, dict_init=None, fit_algorithm='lars',
        n_components=None, n_iter=1000, random_state=0, shuffle=True, split_sign=False, transform_algorithm='omp',
        transform_alpha=None, transform_n_nonzero_coefs=None, verbose=False)
    20  sklearn_decomposition.MiniBatchSparsePCA(alpha=1, batch_size=3, callback=None, method='lars', n_components=None,
        n_iter=100, random_state=0, ridge_alpha=0.01, shuffle=True, verbose=False)
    21  sklearn_decomposition.NMF(alpha=0.0, beta_loss='frobenius', init=None, l1_ratio=0.0, max_iter=200,
        n_components=None, random_state=0, shuffle=False, solver='cd', tol=0.0001, verbose=0)
    22  sklearn_decomposition.PCA(copy=True, iterated_power='auto', n_components=None, random_state=0, svd_solver='auto', tol=0.0, whiten=False)
    23  sklearn_decomposition.SparsePCA(U_init=None, V_init=None, alpha=1, max_iter=1000, method='lars',
        n_components=None, random_state=0, ridge_alpha=0.01, tol=1e-08, verbose=False)
    24  sklearn_decomposition.TruncatedSVD(algorithm='randomized', n_components=2, n_iter=5, random_state=0, tol=0.0)
    25  sklearn_kernel_approximation.Nystroem(coef0=None, degree=None, gamma=None, kernel='rbf',
        kernel_params=None, n_components=100, random_state=0)
    26  sklearn_kernel_approximation.RBFSampler(gamma=1.0, n_components=100, random_state=0)
    27  sklearn_kernel_approximation.AdditiveChi2Sampler(sample_interval=None, sample_steps=2)
    28  sklearn_kernel_approximation.SkewedChi2Sampler(n_components=100, random_state=0, skewedness=1.0)
    29  sklearn_cluster.FeatureAgglomeration(affinity='euclidean', compute_full_tree='auto', connectivity=None,
        linkage='ward', memory=None, n_clusters=2, pooling_func=<function mean at 0x113078ae8>)
    30  skrebate_ReliefF(discrete_threshold=10, n_features_to_select=10, n_neighbors=100, verbose=False)
    31  skrebate_SURF(discrete_threshold=10, n_features_to_select=10, verbose=False)
    32  skrebate_SURFstar(discrete_threshold=10, n_features_to_select=10, verbose=False)
    33  skrebate_MultiSURF(discrete_threshold=10, n_features_to_select=10, verbose=False)
    34  skrebate_MultiSURFstar(discrete_threshold=10, n_features_to_select=10, verbose=False)
    'sk_prep_all':   All sklearn preprocessing estimators, i.e., 0-6
    'fs_all':        All feature_selection estimators, i.e., 7-13
    'decomp_all':    All decomposition estimators, i.e., 14-24
    'k_appr_all':    All kernel_approximation estimators, i.e., 25-28
    'reb_all':       All skrebate estimators, i.e., 30-34
    'all_0':         All except the imbalanced-learn samplers, i.e., 0-34
    'imb_all':       All imbalanced-learn sampling methods, i.e., 35-53.
                     **CAUTION**: Mix of imblearn and other preprocessors may not work.
     None:           opt out of preprocessor

Mixing these forms is supported (CAUTION: Mix of imblearn and other preprocessors may not work), e.g.::

     : [None, 'sk_prep_all', 21, 'k_appr_all', sklearn_feature_selection.SelectKBest(k=50)]



**Whether to do train_test_split?**

Please refer to `https://scikit-learn.org/stable/modules/cross_validation.html#cross-validation`_


.. image:: https://scikit-learn.org/stable/_images/grid_search_cross_validation.png
    :height: 300
    :width: 400


.. _`Scikit-learn model_selection GridSearchCV`: http://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html
.. _`Scikit-learn model_selection RandomizedSearchCV`: http://scikit-learn.org/stable/modules/generated/sklearn.model_selection.RandomizedSearchCV.html
.. _`Tuning hyper-parameters`: http://scikit-learn.org/stable/modules/grid_search.html

.. _`svm`: http://scikit-learn.org/stable/modules/classes.html#module-sklearn.svm
.. _`linear_model`: http://scikit-learn.org/stable/modules/classes.html#module-sklearn.linear_model
.. _`ensemble`: http://scikit-learn.org/stable/modules/classes.html#module-sklearn.ensemble
.. _`naive_bayes`: http://scikit-learn.org/stable/modules/classes.html#module-sklearn.naive_bayes
.. _`tree`: http://scikit-learn.org/stable/modules/classes.html#module-sklearn.tree
.. _`neighbors`: http://scikit-learn.org/stable/modules/classes.html#module-sklearn.neighbors
.. _`xgboost`: https://xgboost.readthedocs.io/en/latest/python/python_api.html

.. _`sklearn.preprocessing`: http://scikit-learn.org/stable/modules/classes.html#module-sklearn.preprocessing
.. _`feature_selection`: http://scikit-learn.org/stable/modules/classes.html#module-sklearn.feature_selection
.. _`decomposition`: http://scikit-learn.org/stable/modules/classes.html#module-sklearn.decomposition
.. _`kernel_approximation`: http://scikit-learn.org/stable/modules/classes.html#module-sklearn.kernel_approximation
.. _`cluster.FeatureAgglomeration`: http://scikit-learn.org/stable/modules/generated/sklearn.cluster.FeatureAgglomeration.html
.. _`skrebate`: https://epistasislab.github.io/scikit-rebate/using/
.. _`https://scikit-learn.org/stable/modules/cross_validation.html#cross-validation`: https://scikit-learn.org/stable/modules/cross_validation.html#cross-validation

        ]]>
    </help>
    <!-- Citations: expands the `sklearn_citation` macro and passes the skrebate,
         xgboost and imblearn citation macros as nested content.
         NOTE(review): the macros are presumably defined in main_macros.xml and the
         nested expansions presumably land inside the same citations element; confirm
         against main_macros.xml. -->
    <expand macro="sklearn_citation">
        <expand macro="skrebate_citation"/>
        <expand macro="xgboost_citation"/>
        <expand macro="imblearn_citation"/>
    </expand>
</tool>