Mercurial > repos > bgruening > sklearn_searchcv
view search_model_validation.xml @ 12:103aaea17119 draft
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit 02087ce2966cf8b4aac9197a41171e7f986c11d1-dirty"
author | bgruening |
---|---|
date | Wed, 02 Oct 2019 03:51:08 -0400 |
parents | 68753d45815f |
children | c1ca24a1509d |
line wrap: on
line source
<tool id="sklearn_searchcv" name="Hyperparameter Search" version="@VERSION@">
    <description>using exhaustive or randomized search</description>
    <macros>
        <import>main_macros.xml</import>
    </macros>
    <expand macro="python_requirements"/>
    <expand macro="macro_stdio"/>
    <version_command>echo "@VERSION@"</version_command>
    <!--
        Command template (Cheetah). Each directive sits on its own line.
        Steps:
          1. HDF5 file locking is switched off through the environment.
          2. For 'refseq_and_interval' input only, the target file is compressed
             with bgzip and indexed with tabix (bed preset) in the working directory.
          3. search_model_validation.py is run with the rendered 'inputs' config file.
             It is told to write its result to tmp_outfile_result in the working
             directory; that file is then copied onto the outfile_result dataset.
        The target file is always addressed through its enclosing 'input_options'
        conditional, the same way the bgzip input path addresses it.
    -->
    <command detect_errors="aggressive">
        <![CDATA[
        export HDF5_USE_FILE_LOCKING='FALSE';
        #if $input_options.selected_input == 'refseq_and_interval'
        bgzip -c '$input_options.target_file' > '${input_options.target_file.element_identifier}.gz' &&
        tabix -p bed '${input_options.target_file.element_identifier}.gz' &&
        #end if
        python '$__tool_directory__/search_model_validation.py'
            --inputs '$inputs'
            --estimator '$search_schemes.infile_estimator'
            #if $input_options.selected_input == 'seq_fasta'
            --fasta_path '$input_options.fasta_path'
            #elif $input_options.selected_input == 'refseq_and_interval'
            --ref_seq '$input_options.ref_genome_file'
            --interval '$input_options.interval_file'
            --targets "`pwd`/${input_options.target_file.element_identifier}.gz"
            #else
            --infile1 '$input_options.infile1'
            #end if
            --infile2 '$input_options.infile2'
            --outfile_result "`pwd`/tmp_outfile_result"
            #if $save != 'nope'
            --outfile_object '$outfile_object'
            #end if
            #if $save == 'save_weights'
            --outfile_weights '$outfile_weights'
            #end if
            #if $search_schemes.options.cv_selector.selected_cv in ['GroupKFold', 'GroupShuffleSplit', 'LeaveOneGroupOut', 'LeavePGroupsOut']
            --groups '$search_schemes.options.cv_selector.groups_selector.infile_g'
            #end if
            >'$outfile_result' && cp tmp_outfile_result '$outfile_result';
        ]]>
    </command>
    <configfiles>
        <inputs name="inputs" />
    </configfiles>
    <inputs>
        <conditional name="search_schemes">
            <param name="selected_search_scheme" type="select" label="Select a model selection search scheme">
                <option value="GridSearchCV" selected="true">GridSearchCV - Exhaustive search over specified parameter values for an estimator </option>
                <option value="RandomizedSearchCV">RandomizedSearchCV - Randomized search on hyper parameters 
for an estimator</option>
            </param>
            <when value="GridSearchCV">
                <expand macro="search_cv_estimator"/>
                <section name="options" title="Advanced Options for SearchCV" expanded="false">
                    <expand macro="search_cv_options"/>
                </section>
            </when>
            <!-- Randomized search exposes two extra options: n_iter and the random_state macro. -->
            <when value="RandomizedSearchCV">
                <expand macro="search_cv_estimator"/>
                <section name="options" title="Advanced Options for SearchCV" expanded="false">
                    <expand macro="search_cv_options"/>
                    <param argument="n_iter" type="integer" value="10" label="Number of parameter settings that are sampled"/>
                    <expand macro="random_state"/>
                </section>
            </when>
        </conditional>
        <expand macro="sl_mixed_input_plus_sequence"/>
        <!-- Optional hold-out strategy: none, a single train/test split, or nested CV. -->
        <conditional name="outer_split">
            <param name="split_mode" type="select" label="Whether to hold a portion of samples for test exclusively?" help="Nested CV or train_test_split">
                <option value="no" selected="true">Nope</option>
                <option value="train_test_split">Yes - do a single train test split</option>
                <option value="nested_cv">Yes - do nested CV</option>
            </param>
            <when value="no"/>
            <when value="train_test_split">
                <param argument="test_size" type="float" optional="True" value="0.25" label="Test size:"/>
                <!-- train_size is kept disabled, as in the original wrapper. -->
                <!--param argument="train_size" type="float" optional="True" value="" label="Train size:"/>-->
                <param argument="random_state" type="integer" optional="True" value="" label="Random seed number:"/>
                <param argument="shuffle" type="select">
                    <option value="None">None - No shuffle</option>
                    <option value="simple">Shuffle -- for regression problems</option>
                    <option value="stratified">StratifiedShuffle -- will use the target values as class labels</option>
                    <option value="group">GroupShuffle -- make sure group CV option is chosen</option>
                </param>
            </when>
            <when value="nested_cv">
                <expand macro="cv_reduced" label="Select the outer cv splitter"/>
            </when>
        </conditional>
        <param name="save" type="select" label="Save best estimator?" help="For security reasons, deep learning models will be saved into two datasets, model skeleton and weights. 
Caution: Save estimator doesn't work for nestCV or when refit is False."> <option value="nope" selected="true">Nope, save is unnecessary</option> <option value="save_estimator">Fitted estimator (excluding deep learning)</option> <option value="save_weights">Model skeleton and weights, for deep learning exclusively</option> </param> </inputs> <outputs> <data format="tabular" name="outfile_result"/> <data format="zip" name="outfile_object" label="Fitted estimator or estimator skeleton on ${on_string}"> <filter>save != 'nope'</filter> </data> <data format="h5" name="outfile_weights" label="Weights trained on ${on_string}"> <filter>save == 'save_weights'</filter> </data> </outputs> <tests> <test> <param name="selected_search_scheme" value="GridSearchCV"/> <param name="infile_estimator" value="pipeline01" ftype="zip"/> <param name="infile_params" value="get_params01.tabular" ftype="tabular"/> <repeat name="param_set"> <param name="sp_list" value="[1, 10, 100, 1000]"/> <param name="sp_name" value="svr__C"/> </repeat> <repeat name="param_set"> <param name="sp_list" value="[-1, 3, 5, 7, 9]"/> <param name="sp_name" value="selectkbest__k"/> </repeat> <param name="error_score" value="false"/> <param name="infile1" value="regression_X.tabular" ftype="tabular"/> <param name="header1" value="true" /> <param name="selected_column_selector_option" value="all_columns"/> <param name="infile2" value="regression_y.tabular" ftype="tabular"/> <param name="header2" value="true" /> <param name="selected_column_selector_option2" value="all_columns"/> <output name="outfile_result"> <assert_contents> <has_n_columns n="13"/> <has_text text="0.7938837807353147"/> <has_text text="{'selectkbest__k': 9, 'svr__C': 1}"/> </assert_contents> </output> </test> <test expect_failure="true"> <param name="selected_search_scheme" value="GridSearchCV"/> <param name="infile_estimator" value="pipeline01" ftype="zip"/> <param name="infile_params" value="get_params01.tabular" ftype="tabular"/> <repeat 
name="param_set"> <param name="sp_list" value="[1, 10, 100, 1000]"/> <param name="sp_name" value="svr__C"/> </repeat> <repeat name="param_set"> <param name="sp_list" value="[-1, 3, 5, 7, 9]"/> <param name="sp_name" value="selectkbest__k"/> </repeat> <param name="error_score" value="true"/> <param name="infile1" value="regression_X.tabular" ftype="tabular"/> <param name="header1" value="true" /> <param name="selected_column_selector_option" value="all_columns"/> <param name="infile2" value="regression_y.tabular" ftype="tabular"/> <param name="header2" value="true" /> <param name="selected_column_selector_option2" value="all_columns"/> </test> <test> <param name="selected_search_scheme" value="RandomizedSearchCV"/> <param name="infile_estimator" value="pipeline01" ftype="zip"/> <param name="infile_params" value="get_params01.tabular" ftype="tabular"/> <repeat name="param_set"> <param name="sp_list" value="[1, 10, 100, 1000]"/> <param name="sp_name" value="svr__C"/> </repeat> <repeat name="param_set"> <param name="sp_list" value="['linear', 'poly', 'rbf', 'sigmoid']"/> <param name="sp_name" value="svr__kernel"/> </repeat> <repeat name="param_set"> <param name="sp_list" value="[3, 5, 7, 9]"/> <param name="sp_name" value="selectkbest__k"/> </repeat> <repeat name="param_set"> <param name="sp_list" value="[True, False]"/> <param name="sp_name" value="robustscaler__with_centering"/> </repeat> <param name="infile1" value="regression_X.tabular" ftype="tabular"/> <param name="header1" value="true" /> <param name="selected_column_selector_option" value="all_columns"/> <param name="infile2" value="regression_y.tabular" ftype="tabular"/> <param name="header2" value="true" /> <param name="selected_column_selector_option2" value="all_columns"/> <output name="outfile_result" > <assert_contents> <has_n_columns n="15" /> <has_text text="param_robustscaler__with_centering"/> </assert_contents> </output> </test> <test> <param name="selected_search_scheme" value="RandomizedSearchCV"/> 
<param name="infile_estimator" value="pipeline03" ftype="zip"/> <param name="infile_params" value="get_params03.tabular" ftype="tabular"/> <repeat name="param_set"> <param name="sp_list" value="np_arange(50, 1001, 50)"/> <param name="sp_name" value="xgbclassifier__n_estimators"/> </repeat> <repeat name="param_set"> <param name="sp_list" value="scipy_stats_randint(1, 51)"/> <param name="sp_name" value="xgbclassifier__max_depth"/> </repeat> <repeat name="param_set"> <param name="sp_list" value="scipy_stats_uniform(0., 1.)"/> <param name="sp_name" value="xgbclassifier__gamma"/> </repeat> <repeat name="param_set"> <param name="sp_list" value="[324089]"/> <param name="sp_name" value="xgbclassifier__random_state"/> </repeat> <param name="infile1" value="regression_X.tabular" ftype="tabular"/> <param name="header1" value="true" /> <param name="selected_column_selector_option" value="all_columns"/> <param name="infile2" value="regression_y.tabular" ftype="tabular"/> <param name="header2" value="true" /> <param name="selected_column_selector_option2" value="all_columns"/> <output name="outfile_result" > <assert_contents> <has_n_columns n="15" /> <has_text text="param_xgbclassifier__max_depth"/> </assert_contents> </output> </test> <test> <param name="selected_search_scheme" value="GridSearchCV"/> <param name="infile_estimator" value="pipeline04" ftype="zip"/> <param name="infile_params" value="get_params04.tabular" ftype="tabular"/> <repeat name="param_set"> <param name="sp_list" value="list(range(100, 1001, 100))"/> <param name="sp_name" value="linearsvc__random_state"/> </repeat> <repeat name="param_set"> <param name="sp_list" value=": [sklearn_ensemble.ExtraTreesClassifier(n_estimators=100, random_state=324089)]"/> <param name="sp_name" value="selectfrommodel__estimator"/> </repeat> <param name="infile1" value="regression_X.tabular" ftype="tabular"/> <param name="header1" value="true" /> <param name="selected_column_selector_option" value="all_columns"/> <param 
name="infile2" value="regression_y.tabular" ftype="tabular"/> <param name="header2" value="true" /> <param name="selected_column_selector_option2" value="all_columns"/> <output name="outfile_result"> <assert_contents> <has_n_columns n="13"/> <has_text text="0.05363984674329502"/> </assert_contents> </output> </test> <test> <param name="selected_search_scheme" value="GridSearchCV"/> <param name="infile_estimator" value="pipeline01" ftype="zip"/> <param name="infile_params" value="get_params01.tabular" ftype="tabular"/> <repeat name="param_set"> <param name="sp_list" value="[1, 10, 100, 1000]"/> <param name="sp_name" value="svr__C"/> </repeat> <param name="infile1" value="regression_X.tabular" ftype="tabular"/> <param name="header1" value="true" /> <param name="selected_column_selector_option" value="all_columns"/> <param name="infile2" value="regression_y.tabular" ftype="tabular"/> <param name="header2" value="true" /> <param name="selected_column_selector_option2" value="all_columns"/> <param name="save" value="save_estimator"/> <output name="outfile_object" file="searchCV01" compare="sim_size" delta="10"/> </test> <test> <param name="selected_search_scheme" value="GridSearchCV"/> <param name="infile_estimator" value="pipeline06" ftype="zip"/> <param name="infile_params" value="get_params06.tabular" ftype="tabular"/> <repeat name="param_set"> <param name="sp_list" value="[10, 50, 200, 1000]"/> <param name="sp_name" value="adaboostregressor__n_estimators"/> </repeat> <repeat name="param_set"> <param name="sp_list" value="[324089]"/> <param name="sp_name" value="adaboostregressor__random_state"/> </repeat> <param name="infile1" value="regression_X.tabular" ftype="tabular"/> <param name="header1" value="true" /> <param name="selected_column_selector_option" value="all_columns"/> <param name="infile2" value="regression_y.tabular" ftype="tabular"/> <param name="header2" value="true" /> <param name="selected_column_selector_option2" value="all_columns"/> <output 
name="outfile_result"> <assert_contents> <has_n_columns n="13"/> <has_text_matching expression=".+0.7772355090078996" /> </assert_contents> </output> </test> <!-- In the next test the expected score must be followed by a non-digit character, so a longer number sharing the same prefix cannot satisfy the assertion. --> <test> <param name="selected_search_scheme" value="GridSearchCV"/> <param name="infile_estimator" value="pipeline07" ftype="zip"/> <param name="infile_params" value="get_params07.tabular" ftype="tabular"/> <repeat name="param_set"> <param name="sp_list" value="[10, 50, 100, 200]"/> <param name="sp_name" value="adaboostclassifier__n_estimators"/> </repeat> <repeat name="param_set"> <param name="sp_list" value="[324089]"/> <param name="sp_name" value="adaboostclassifier__random_state"/> </repeat> <repeat name="param_set"> <param name="sp_list" value="[1.0, 2.0]"/> <param name="sp_name" value="rbfsampler__gamma"/> </repeat> <param name='selected_cv' value="default"/> <param name="n_splits" value="3"/> <param name="infile1" value="regression_X.tabular" ftype="tabular"/> <param name="header1" value="true" /> <param name="selected_column_selector_option" value="all_columns"/> <param name="infile2" value="regression_y.tabular" ftype="tabular"/> <param name="header2" value="true" /> <param name="selected_column_selector_option2" value="all_columns"/> <output name="outfile_result"> <assert_contents> <has_n_columns n="14"/> <has_text_matching expression=".+0.05747126436781609[^\d]" /> </assert_contents> </output> </test> <test> <param name="selected_search_scheme" value="GridSearchCV"/> <param name="infile_estimator" value="pipeline08" ftype="zip"/> <param name="infile_params" value="get_params08.tabular" ftype="tabular"/> <repeat name="param_set"> <param name="sp_list" value="[10, 50, 100, 200]"/> <param name="sp_name" value="adaboostclassifier__n_estimators"/> </repeat> <repeat name="param_set"> <param name="sp_list" value="[324089]"/> <param name="sp_name" value="adaboostclassifier__random_state"/> </repeat> <repeat name="param_set"> <param name="sp_list" value="['ward', 'complete', 'average']"/> <param 
name="sp_name" value="featureagglomeration__linkage"/> </repeat> <param name="infile1" value="regression_X.tabular" ftype="tabular"/> <param name="header1" value="true" /> <param name="selected_column_selector_option" value="all_columns"/> <param name="infile2" value="regression_y.tabular" ftype="tabular"/> <param name="header2" value="true" /> <param name="selected_column_selector_option2" value="all_columns"/> <output name="outfile_result"> <assert_contents> <has_text_matching expression=".+0.08045977011494253[^/w]+10[^/w]" /> </assert_contents> </output> </test> <test> <param name="selected_search_scheme" value="GridSearchCV"/> <param name="infile_estimator" value="pipeline01" ftype="zip"/> <param name="infile_params" value="get_params01.tabular" ftype="tabular"/> <repeat name="param_set"> <param name="sp_list" value="[1, 10, 100, 1000]"/> <param name="sp_name" value="svr__C"/> </repeat> <param name='selected_cv' value="StratifiedKFold"/> <param name="n_splits" value="3"/> <param name="shuffle" value="true" /> <param name="random_state" value="10"/> <param name="infile1" value="regression_X.tabular" ftype="tabular"/> <param name="header1" value="true" /> <param name="selected_column_selector_option" value="all_columns"/> <param name="infile2" value="regression_y.tabular" ftype="tabular"/> <param name="header2" value="true" /> <param name="selected_column_selector_option2" value="all_columns"/> <param name="save" value="save_estimator"/> <output name="outfile_object" file="searchCV02" compare="sim_size" delta="10"/> </test> <test> <param name="selected_search_scheme" value="GridSearchCV"/> <param name="infile_estimator" value="pipeline03" ftype="zip"/> <param name="infile_params" value="get_params03.tabular" ftype="tabular"/> <repeat name="param_set"> <param name="sp_list" value="[10, 50, 200, 1000]"/> <param name="sp_name" value="xgbclassifier__n_estimators"/> </repeat> <repeat name="param_set"> <param name="sp_list" value="[324089]"/> <param name="sp_name" 
value="xgbclassifier__random_state"/> </repeat> <param name="primary_scoring" value="balanced_accuracy"/> <param name='selected_cv' value="StratifiedKFold"/> <param name="n_splits" value="3"/> <param name="shuffle" value="true" /> <param name="random_state" value="10"/> <param name="infile1" value="regression_X.tabular" ftype="tabular"/> <param name="header1" value="true" /> <param name="selected_column_selector_option" value="all_columns"/> <param name="infile2" value="regression_y.tabular" ftype="tabular"/> <param name="header2" value="true" /> <param name="selected_column_selector_option2" value="all_columns"/> <output name="outfile_result" > <assert_contents> <has_n_columns n="13" /> <has_text text="0.09003449195911103"/> </assert_contents> </output> </test> <test> <param name="selected_search_scheme" value="GridSearchCV"/> <param name="infile_estimator" value="pipeline09" ftype="zip"/> <param name="infile_params" value="get_params09.tabular" ftype="tabular"/> <repeat name="param_set"> <param name="sp_list" value="[50, 100, 150, 200]"/> <param name="sp_name" value="relieff__n_neighbors"/> </repeat> <repeat name="param_set"> <param name="sp_list" value="[324089]"/> <param name="sp_name" value="randomforestregressor__random_state"/> </repeat> <param name="primary_scoring" value="explained_variance"/> <param name="secondary_scoring" value="neg_mean_squared_error,r2"/> <param name='selected_cv' value="StratifiedKFold"/> <param name="n_splits" value="3"/> <param name="shuffle" value="true" /> <param name="random_state" value="10"/> <param name="infile1" value="regression_X.tabular" ftype="tabular"/> <param name="header1" value="true" /> <param name="selected_column_selector_option" value="all_columns"/> <param name="infile2" value="regression_y.tabular" ftype="tabular"/> <param name="header2" value="true" /> <param name="selected_column_selector_option2" value="all_columns"/> <output name="outfile_result" > <assert_contents> <has_n_columns n="25" /> <has_text 
text="0.7879267424165166"/> <has_text text="0.787865425577799"/> <has_text text="-29.40436189868029"/> </assert_contents> </output> </test> <test> <param name="selected_search_scheme" value="GridSearchCV"/> <param name="infile_estimator" value="pipeline02" ftype="zip"/> <param name="infile_params" value="get_params02.tabular" ftype="tabular"/> <repeat name="param_set"> <param name="sp_list" value="[0.01, 0.001]"/> <param name="sp_name" value="lassocv__eps"/> </repeat> <param name="infile1" value="regression_X.tabular" ftype="tabular"/> <param name="header1" value="true" /> <param name="selected_column_selector_option" value="all_columns"/> <param name="infile2" value="regression_y.tabular" ftype="tabular"/> <param name="header2" value="true" /> <param name="selected_column_selector_option2" value="all_columns"/> <output name="outfile_result"> <assert_contents> <has_n_columns n="12"/> <has_text text="0.776296816136668" /> </assert_contents> </output> </test> <test> <param name="selected_search_scheme" value="GridSearchCV"/> <param name="infile_estimator" value="pipeline05" ftype="zip"/> <param name="infile_params" value="get_params05.tabular" ftype="tabular"/> <repeat name="param_set"> <param name="sp_list" value="[10, 50, 100, 300]"/> <param name="sp_name" value="randomforestregressor__n_estimators"/> </repeat> <param name="infile1" value="regression_X.tabular" ftype="tabular"/> <param name="header1" value="true" /> <param name="selected_column_selector_option" value="all_columns"/> <param name="infile2" value="regression_y.tabular" ftype="tabular"/> <param name="header2" value="true" /> <param name="selected_column_selector_option2" value="all_columns"/> <output name="outfile_result"> <assert_contents> <has_n_columns n="12"/> <has_text text="0.8176576686816003" /> </assert_contents> </output> </test> <test expect_failure="true"> <param name="selected_search_scheme" value="GridSearchCV"/> <param name="infile_estimator" value="pipeline01" ftype="zip"/> <param 
name="infile_params" value="get_params01.tabular" ftype="tabular"/> <repeat name="param_set"> <param name="sp_list" value="open('~/.ssh/authorized_keys', 'r').read()"/> <param name="sp_name" value="svr__C"/> </repeat> <param name="infile1" value="regression_X.tabular" ftype="tabular"/> <param name="header1" value="true" /> <param name="selected_column_selector_option" value="all_columns"/> <param name="infile2" value="regression_y.tabular" ftype="tabular"/> <param name="header2" value="true" /> <param name="selected_column_selector_option2" value="all_columns"/> </test> <test> <param name="selected_search_scheme" value="GridSearchCV"/> <param name="infile_estimator" value="pipeline10" ftype="zip"/> <param name="infile_params" value="get_params10.tabular" ftype="tabular"/> <repeat name="param_set"> <param name="sp_list" value=": [sklearn_tree.DecisionTreeRegressor(random_state=0), sklearn_tree.ExtraTreeRegressor(random_state=0)]"/> <param name="sp_name" value="adaboostregressor__base_estimator"/> </repeat> <repeat name="param_set"> <param name="sp_list" value="[10]"/> <param name="sp_name" value="adaboostregressor__random_state"/> </repeat> <param name="infile1" value="regression_X.tabular" ftype="tabular"/> <param name="header1" value="true" /> <param name="selected_column_selector_option" value="all_columns"/> <param name="infile2" value="regression_y.tabular" ftype="tabular"/> <param name="header2" value="true" /> <param name="selected_column_selector_option2" value="all_columns"/> <output name="outfile_result"> <assert_contents> <has_n_columns n="13"/> <has_text text="0.8165699136618538"/> </assert_contents> </output> </test> <test> <param name="selected_search_scheme" value="GridSearchCV"/> <param name="infile_estimator" value="pipeline09" ftype="zip"/> <param name="infile_params" value="get_params09.tabular" ftype="tabular"/> <repeat name="param_set"> <param name="sp_list" value=": [sklearn_feature_selection.SelectKBest(), 
sklearn_feature_selection.VarianceThreshold(), skrebate_ReliefF(), sklearn_preprocessing.RobustScaler()]"/> <param name="sp_name" value="relieff"/> </repeat> <repeat name="param_set"> <param name="sp_list" value="[10]"/> <param name="sp_name" value="randomforestregressor__random_state"/> </repeat> <param name="infile1" value="regression_X.tabular" ftype="tabular"/> <param name="header1" value="true" /> <param name="selected_column_selector_option" value="all_columns"/> <param name="infile2" value="regression_y.tabular" ftype="tabular"/> <param name="header2" value="true" /> <param name="selected_column_selector_option2" value="all_columns"/> <output name="outfile_result"> <assert_contents> <has_n_columns n="13"/> <has_text text="0.8151250518677202"/> </assert_contents> </output> </test> <test> <param name="selected_search_scheme" value="GridSearchCV"/> <param name="infile_estimator" value="pipeline09" ftype="zip"/> <param name="infile_params" value="get_params09.tabular" ftype="tabular"/> <repeat name="param_set"> <param name="sp_list" value=": [None,'sk_prep_all', 7, 13, skrebate_ReliefF(n_features_to_select=12)]"/> <param name="sp_name" value="relieff"/> </repeat> <repeat name="param_set"> <param name="sp_list" value="[10]"/> <param name="sp_name" value="randomforestregressor__random_state"/> </repeat> <param name="infile1" value="regression_X.tabular" ftype="tabular"/> <param name="header1" value="true" /> <param name="selected_column_selector_option" value="all_columns"/> <param name="infile2" value="regression_y.tabular" ftype="tabular"/> <param name="header2" value="true" /> <param name="selected_column_selector_option2" value="all_columns"/> <output name="outfile_result"> <assert_contents> <has_n_columns n="13"/> <has_text text="0.8151250518677202"/> </assert_contents> </output> </test> <test> <param name="selected_search_scheme" value="GridSearchCV"/> <param name="infile_estimator" value="pipeline11" ftype="zip"/> <param name="infile_params" 
value="get_params11.tabular" ftype="tabular"/> <repeat name="param_set"> <param name="sp_list" value="[3,4,5]"/> <param name="sp_name" value="editednearestneighbours__n_neighbors"/> </repeat> <repeat name="param_set"> <param name="sp_list" value="[10]"/> <param name="sp_name" value="editednearestneighbours__random_state"/> </repeat> <repeat name="param_set"> <param name="sp_list" value="[10, 50, 100, 500]"/> <param name="sp_name" value="randomforestclassifier__n_estimators"/> </repeat> <repeat name="param_set"> <param name="sp_list" value="[10]"/> <param name="sp_name" value="randomforestclassifier__random_state"/> </repeat> <param name="primary_scoring" value="f1_macro"/> <param name="secondary_scoring" value="balanced_accuracy,accuracy"/> <param name="n_splits" value="5"/> <param name="infile1" value="imblearn_X.tabular" ftype="tabular"/> <param name="header1" value="true" /> <param name="selected_column_selector_option" value="all_columns"/> <param name="infile2" value="imblearn_y.tabular" ftype="tabular"/> <param name="header2" value="true" /> <param name="selected_column_selector_option2" value="all_columns"/> <output name="outfile_result"> <assert_contents> <has_n_columns n="33"/> <has_text text="0.9945648481554453"/> <has_text text="0.9988888888888889"/> <has_text text="0.998"/> </assert_contents> </output> </test> <test> <param name="selected_search_scheme" value="GridSearchCV"/> <param name="infile_estimator" value="pipeline12" ftype="zip"/> <param name="infile_params" value="get_params12.tabular" ftype="tabular"/> <repeat name="param_set"> <param name="sp_list" value="[10, 100, 200]"/> <param name="sp_name" value="rfe__estimator__n_estimators"/> </repeat> <repeat name="param_set"> <param name="sp_list" value="[10, None]"/> <param name="sp_name" value="rfe__n_features_to_select"/> </repeat> <param name="primary_scoring" value="r2"/> <param name="infile1" value="regression_X.tabular" ftype="tabular"/> <param name="header1" value="true" /> <param 
name="selected_column_selector_option" value="all_columns"/> <param name="infile2" value="regression_y.tabular" ftype="tabular"/> <param name="header2" value="true" /> <param name="selected_column_selector_option2" value="all_columns"/> <output name="outfile_result"> <assert_contents> <has_n_columns n="13"/> <has_text text="0.8149439619875293"/> </assert_contents> </output> </test> <test> <conditional name="search_schemes"> <param name="selected_search_scheme" value="GridSearchCV"/> <param name="infile_estimator" value="pipeline05" ftype="zip"/> <section name="search_params_builder"> <param name="infile_params" value="get_params05.tabular" ftype="tabular"/> <repeat name="param_set"> <param name="sp_list" value="[10, 50, 100, 300]"/> <param name="sp_name" value="randomforestregressor__n_estimators"/> </repeat> </section> </conditional> <param name="infile1" value="regression_X.tabular" ftype="tabular"/> <param name="header1" value="true" /> <param name="selected_column_selector_option" value="all_columns"/> <param name="infile2" value="regression_y.tabular" ftype="tabular"/> <param name="header2" value="true" /> <param name="selected_column_selector_option2" value="all_columns"/> <conditional name="outer_split"> <param name="split_mode" value="train_test_split"/> <param name="shuffle" value="simple"/> <param name="random_state" value="123"/> </conditional> <output name="outfile_result"> <assert_contents> <has_n_columns n="1"/> <has_text text="0.8124083594523798"/> </assert_contents> </output> </test> <test> <conditional name="search_schemes"> <param name="selected_search_scheme" value="GridSearchCV"/> <param name="infile_estimator" value="pipeline05" ftype="zip"/> <section name="search_params_builder"> <param name="infile_params" value="get_params05.tabular" ftype="tabular"/> <repeat name="param_set"> <param name="sp_list" value="[10, 50, 100, 300]"/> <param name="sp_name" value="randomforestregressor__n_estimators"/> </repeat> </section> </conditional> <param 
name="infile1" value="regression_X.tabular" ftype="tabular"/> <param name="header1" value="true" /> <param name="selected_column_selector_option" value="all_columns"/> <param name="infile2" value="regression_y.tabular" ftype="tabular"/> <param name="header2" value="true" /> <param name="selected_column_selector_option2" value="all_columns"/> <conditional name="outer_split"> <param name="split_mode" value="nested_cv"/> <conditional name="cv_selector"> <param name='selected_cv' value="KFold"/> <param name="n_splits" value="3"/> <param name="shuffle" value="true" /> <param name="random_state" value="123"/> </conditional> </conditional> <output name="outfile_result"> <assert_contents> <has_n_columns n="4"/> <has_text text="0.8044418936007722" /> </assert_contents> </output> </test> </tests> <help> <![CDATA[ **What it does** Searches optimized parameter settings for an estimator or pipeline through either exhaustive grid cross validation search or randomized cross validation search. Please refer to `Scikit-learn model_selection GridSearchCV`_, `Scikit-learn model_selection RandomizedSearchCV`_ and `Tuning hyper-parameters`_. **Return** Outputs `cv_results_` from SearchCV in a tabular dataset if no train_test_split, otherwise the test score(s). In addition, saving the fitted SearchCV object is optional. **How to choose the search parameter grid?** Please refer to `svm`_, `linear_model`_, `ensemble`_, `naive_bayes`_, `tree`_, `neighbors`_ and `xgboost`_ for estimator parameters. Refer to `sklearn.preprocessing`_, `feature_selection`_, `decomposition`_, `kernel_approximation`_, `cluster.FeatureAgglomeration`_ and `skrebate`_ for parameters in the pre-processing steps. **Search parameter list** can be a list, a numpy array, or a distribution. The evaluation of settings supports operations in Math, list comprehension, numpy.arange(np_arange), most numpy.random(e.g., np_random_uniform) and some scipy.stats(e.g., scipy_stats_zipf) classes or functions, and others. 
Examples: - [3, 5, 7, 9] - list(range(50, 1001, 50)) - np_arange(0.01, 1, 0.1) - np_random_choice(list(range(1, 51)) + [None], size=20) - scipy_stats_randint(1, 11) **Estimator / Preprocessor search (additional `:` in the front)**:: : [sklearn_tree.DecisionTreeRegressor(), sklearn_tree.ExtraTreeRegressor()] : [sklearn_feature_selection.SelectKBest(), sklearn_feature_selection.VarianceThreshold(), skrebate_ReliefF(), sklearn_preprocessing.RobustScaler()] **Hot number/keyword for preprocessors**:: 0 sklearn_preprocessing.StandardScaler(copy=True, with_mean=True, with_std=True) 1 sklearn_preprocessing.Binarizer(copy=True, threshold=0.0) 2 sklearn_preprocessing.MaxAbsScaler(copy=True) 3 sklearn_preprocessing.Normalizer(copy=True, norm='l2') 4 sklearn_preprocessing.MinMaxScaler(copy=True, feature_range=(0, 1)) 5 sklearn_preprocessing.PolynomialFeatures(degree=2, include_bias=True, interaction_only=False) 6 sklearn_preprocessing.RobustScaler(copy=True, quantile_range=(25.0, 75.0), with_centering=True, with_scaling=True) 7 sklearn_feature_selection.SelectKBest(k=10, score_func=<function f_classif at 0x113806d90>) 8 sklearn_feature_selection.GenericUnivariateSelect(mode='percentile', param=1e-05, score_func=<function f_classif at 0x113806d90>) 9 sklearn_feature_selection.SelectPercentile(percentile=10, score_func=<function f_classif at 0x113806d90>) 10 sklearn_feature_selection.SelectFpr(alpha=0.05, score_func=<function f_classif at 0x113806d90>) 11 sklearn_feature_selection.SelectFdr(alpha=0.05, score_func=<function f_classif at 0x113806d90>) 12 sklearn_feature_selection.SelectFwe(alpha=0.05, score_func=<function f_classif at 0x113806d90>) 13 sklearn_feature_selection.VarianceThreshold(threshold=0.0) 14 sklearn_decomposition.FactorAnalysis(copy=True, iterated_power=3, max_iter=1000, n_components=None, noise_variance_init=None, random_state=0, svd_method='randomized', tol=0.01) 15 sklearn_decomposition.FastICA(algorithm='parallel', fun='logcosh', fun_args=None, 
max_iter=200, n_components=None, random_state=0, tol=0.0001, w_init=None, whiten=True) 16 sklearn_decomposition.IncrementalPCA(batch_size=None, copy=True, n_components=None, whiten=False) 17 sklearn_decomposition.KernelPCA(alpha=1.0, coef0=1, copy_X=True, degree=3, eigen_solver='auto', fit_inverse_transform=False, gamma=None, kernel='linear', kernel_params=None, max_iter=None, n_components=None, random_state=0, remove_zero_eig=False, tol=0) 18 sklearn_decomposition.LatentDirichletAllocation(batch_size=128, doc_topic_prior=None, evaluate_every=-1, learning_decay=0.7, learning_method=None, learning_offset=10.0, max_doc_update_iter=100, max_iter=10, mean_change_tol=0.001, n_components=10, n_topics=None, perp_tol=0.1, random_state=0, topic_word_prior=None, total_samples=1000000.0, verbose=0) 19 sklearn_decomposition.MiniBatchDictionaryLearning(alpha=1, batch_size=3, dict_init=None, fit_algorithm='lars', n_components=None, n_iter=1000, random_state=0, shuffle=True, split_sign=False, transform_algorithm='omp', transform_alpha=None, transform_n_nonzero_coefs=None, verbose=False) 20 sklearn_decomposition.MiniBatchSparsePCA(alpha=1, batch_size=3, callback=None, method='lars', n_components=None, n_iter=100, random_state=0, ridge_alpha=0.01, shuffle=True, verbose=False) 21 sklearn_decomposition.NMF(alpha=0.0, beta_loss='frobenius', init=None, l1_ratio=0.0, max_iter=200, n_components=None, random_state=0, shuffle=False, solver='cd', tol=0.0001, verbose=0) 22 sklearn_decomposition.PCA(copy=True, iterated_power='auto', n_components=None, random_state=0, svd_solver='auto', tol=0.0, whiten=False) 23 sklearn_decomposition.SparsePCA(U_init=None, V_init=None, alpha=1, max_iter=1000, method='lars', n_components=None, random_state=0, ridge_alpha=0.01, tol=1e-08, verbose=False) 24 sklearn_decomposition.TruncatedSVD(algorithm='randomized', n_components=2, n_iter=5, random_state=0, tol=0.0) 25 sklearn_kernel_approximation.Nystroem(coef0=None, degree=None, gamma=None, kernel='rbf', 
kernel_params=None, n_components=100, random_state=0) 26 sklearn_kernel_approximation.RBFSampler(gamma=1.0, n_components=100, random_state=0) 27 sklearn_kernel_approximation.AdditiveChi2Sampler(sample_interval=None, sample_steps=2) 28 sklearn_kernel_approximation.SkewedChi2Sampler(n_components=100, random_state=0, skewedness=1.0) 29 sklearn_cluster.FeatureAgglomeration(affinity='euclidean', compute_full_tree='auto', connectivity=None, linkage='ward', memory=None, n_clusters=2, pooling_func=<function mean at 0x113078ae8>) 30 skrebate_ReliefF(discrete_threshold=10, n_features_to_select=10, n_neighbors=100, verbose=False) 31 skrebate_SURF(discrete_threshold=10, n_features_to_select=10, verbose=False) 32 skrebate_SURFstar(discrete_threshold=10, n_features_to_select=10, verbose=False) 33 skrebate_MultiSURF(discrete_threshold=10, n_features_to_select=10, verbose=False) 34 skrebate_MultiSURFstar(discrete_threshold=10, n_features_to_select=10, verbose=False) 'sk_prep_all': All sklearn preprocessing estimators, i.e., 0-6 'fs_all': All feature_selection estimators, i.e., 7-13 'decomp_all': All decomposition estimators, i.e., 14-24 'k_appr_all': All kernel_approximation estimators, i.e., 25-28 'reb_all': All skrebate estimators, i.e., 30-34 'all_0': All except the imbalanced-learn samplers, i.e., 0-34 'imb_all': All imbalanced-learn sampling methods, i.e., 35-53. **CAUTION**: Mix of imblearn and other preprocessors may not work. None: opt out of preprocessor Support mix (CAUTION: Mix of imblearn and other preprocessors may not work), e.g.:: : [None, 'sk_prep_all', 21, 'k_appr_all', sklearn_feature_selection.SelectKBest(k=50)] **Whether to do train_test_split?** Please refer to `https://scikit-learn.org/stable/modules/cross_validation.html#cross-validation`_ .. image:: https://scikit-learn.org/stable/_images/grid_search_cross_validation.png :height: 300 :width: 400 .. 
_`Scikit-learn model_selection GridSearchCV`: http://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html .. _`Scikit-learn model_selection RandomizedSearchCV`: http://scikit-learn.org/stable/modules/generated/sklearn.model_selection.RandomizedSearchCV.html .. _`Tuning hyper-parameters`: http://scikit-learn.org/stable/modules/grid_search.html .. _`svm`: http://scikit-learn.org/stable/modules/classes.html#module-sklearn.svm .. _`linear_model`: http://scikit-learn.org/stable/modules/classes.html#module-sklearn.linear_model .. _`ensemble`: http://scikit-learn.org/stable/modules/classes.html#module-sklearn.ensemble .. _`naive_bayes`: http://scikit-learn.org/stable/modules/classes.html#module-sklearn.naive_bayes .. _`tree`: http://scikit-learn.org/stable/modules/classes.html#module-sklearn.tree .. _`neighbors`: http://scikit-learn.org/stable/modules/classes.html#module-sklearn.neighbors .. _`xgboost`: https://xgboost.readthedocs.io/en/latest/python/python_api.html .. _`sklearn.preprocessing`: http://scikit-learn.org/stable/modules/classes.html#module-sklearn.preprocessing .. _`feature_selection`: http://scikit-learn.org/stable/modules/classes.html#module-sklearn.feature_selection .. _`decomposition`: http://scikit-learn.org/stable/modules/classes.html#module-sklearn.decomposition .. _`kernel_approximation`: http://scikit-learn.org/stable/modules/classes.html#module-sklearn.kernel_approximation .. _`cluster.FeatureAgglomeration`: http://scikit-learn.org/stable/modules/generated/sklearn.cluster.FeatureAgglomeration.html .. _`skrebate`: https://epistasislab.github.io/scikit-rebate/using/ .. _`https://scikit-learn.org/stable/modules/cross_validation.html#cross-validation`: https://scikit-learn.org/stable/modules/cross_validation.html#cross-validation ]]> </help> <expand macro="sklearn_citation"> <expand macro="skrebate_citation"/> <expand macro="xgboost_citation"/> <expand macro="imblearn_citation"/> </expand> </tool>