comparison search_model_validation.xml @ 15:c1ca24a1509d draft

"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit 5b2ac730ec6d3b762faa9034eddd19ad1b347476"
author bgruening
date Mon, 16 Dec 2019 05:41:39 -0500
parents 103aaea17119
children cb5635e30842
comparison
equal deleted inserted replaced
14:4af699d766e4 15:c1ca24a1509d
1 <tool id="sklearn_searchcv" name="Hyperparameter Search" version="@VERSION@"> 1 <tool id="sklearn_searchcv" name="Hyperparameter Search" version="@VERSION@">
2 <description>using exhausitive or randomized search</description> 2 <description>performs hyperparameter optimization using various SearchCVs</description>
3 <macros> 3 <macros>
4 <import>main_macros.xml</import> 4 <import>main_macros.xml</import>
5 <macro name="search_cv_estimator">
6 <param name="infile_estimator" type="data" format="zip" label="Choose the dataset containing pipeline/estimator object"/>
7 <param name="is_deep_learning" type="boolean" truevalue="booltrue" falsevalue="boolfalse" checked="false" label="Is the estimator a deep learning model?"/>
8 <section name="search_params_builder" title="Search parameters Builder" expanded="true">
9 <param name="infile_params" type="data" format="tabular" optional="true" label="Choose the dataset containing parameter names" help="This dataset could be the output of `get_params` in the `Estimator Attributes` tool."/>
10 <repeat name="param_set" min="1" max="30" title="Parameter settings for search:">
11 <param name="sp_name" type="select" optional="true" label="Choose a parameter name (with current value)">
12 <options from_dataset="infile_params" startswith="@">
13 <column name="name" index="2"/>
14 <column name="value" index="1"/>
15 <filter type="unique_value" name="unique_param" column="1"/>
16 </options>
17 </param>
18 <param name="sp_list" type="text" value="" optional="true" label="Search list" help="list or array-like, for example: [1, 10, 100, 1000], [True, False] and ['auto', 'sqrt', None]. See `help` section for more examples">
19 <sanitizer>
20 <valid initial="default">
21 <add value="&apos;"/>
22 <add value="&quot;"/>
23 <add value="["/>
24 <add value="]"/>
25 </valid>
26 </sanitizer>
27 </param>
28 </repeat>
29 </section>
30 </macro>
5 </macros> 31 </macros>
6 <expand macro="python_requirements"/> 32 <expand macro="python_requirements"/>
7 <expand macro="macro_stdio"/> 33 <expand macro="macro_stdio"/>
8 <version_command>echo "@VERSION@"</version_command> 34 <version_command>echo "@VERSION@"</version_command>
9 <command detect_errors="aggressive"> 35 <command>
10 <![CDATA[ 36 <![CDATA[
11 export HDF5_USE_FILE_LOCKING='FALSE'; 37 export HDF5_USE_FILE_LOCKING='FALSE';
12 #if $input_options.selected_input == 'refseq_and_interval' 38 #if $input_options.selected_input == 'refseq_and_interval'
13 bgzip -c '$input_options.target_file' > '${target_file.element_identifier}.gz' && 39 bgzip -c '$input_options.target_file' > '${target_file.element_identifier}.gz' &&
14 tabix -p bed '${target_file.element_identifier}.gz' && 40 tabix -p bed '${target_file.element_identifier}.gz' &&
24 --targets "`pwd`/${target_file.element_identifier}.gz" 50 --targets "`pwd`/${target_file.element_identifier}.gz"
25 #else 51 #else
26 --infile1 '$input_options.infile1' 52 --infile1 '$input_options.infile1'
27 #end if 53 #end if
28 --infile2 '$input_options.infile2' 54 --infile2 '$input_options.infile2'
55 #if $save != 'save_no_fit'
29 --outfile_result "`pwd`/tmp_outfile_result" 56 --outfile_result "`pwd`/tmp_outfile_result"
30 #if $save != 'nope' 57 #end if
58 #if $save == 'save_estimator'
31 --outfile_object '$outfile_object' 59 --outfile_object '$outfile_object'
32 #end if 60 #end if
33 #if $save == 'save_weights' 61 #if $save == 'save_no_fit'
62 --outfile_object '$outfile_object_no_fit'
63 #end if
64 #if $search_schemes.is_deep_learning == 'booltrue' and $save == 'save_estimator' and $outer_split.split_mode == 'nested_cv'
34 --outfile_weights '$outfile_weights' 65 --outfile_weights '$outfile_weights'
35 #end if 66 #end if
36 #if $search_schemes.options.cv_selector.selected_cv in ['GroupKFold', 'GroupShuffleSplit', 'LeaveOneGroupOut', 'LeavePGroupsOut'] 67 #if $search_schemes.options.cv_selector.selected_cv in ['GroupKFold', 'GroupShuffleSplit', 'LeaveOneGroupOut', 'LeavePGroupsOut']
37 --groups '$search_schemes.options.cv_selector.groups_selector.infile_g' 68 --groups '$search_schemes.options.cv_selector.groups_selector.infile_g'
38 #end if 69 #end if
39 >'$outfile_result' && cp tmp_outfile_result '$outfile_result'; 70 #if $save != 'save_no_fit'
71 >'$outfile_result' && cp '$outfile_result' "`pwd`/../tool_stdout"
72 && cp "`pwd`/tmp_outfile_result" '$outfile_result';
73 #end if
40 74
41 ]]> 75 ]]>
42 </command> 76 </command>
43 <configfiles> 77 <configfiles>
44 <inputs name="inputs" /> 78 <inputs name="inputs" />
66 </conditional> 100 </conditional>
67 <expand macro="sl_mixed_input_plus_sequence"/> 101 <expand macro="sl_mixed_input_plus_sequence"/>
68 <conditional name="outer_split"> 102 <conditional name="outer_split">
69 <param name="split_mode" type="select" label="Whether to hold a portion of samples for test exclusively?" help="Nested CV or train_test_split"> 103 <param name="split_mode" type="select" label="Whether to hold a portion of samples for test exclusively?" help="Nested CV or train_test_split">
70 <option value="no" selected="true">Nope</option> 104 <option value="no" selected="true">Nope</option>
71 <option value="train_test_split">Yes - do a single train test split</option>
72 <option value="nested_cv">Yes - do nested CV</option> 105 <option value="nested_cv">Yes - do nested CV</option>
73 </param> 106 </param>
74 <when value='no'/> 107 <when value='no'/>
75 <when value='train_test_split'>
76 <param argument="test_size" type="float" optional="True" value="0.25" label="Test size:"/>
77 <!--param argument="train_size" type="float" optional="True" value="" label="Train size:"/>-->
78 <param argument="random_state" type="integer" optional="True" value="" label="Random seed number:"/>
79 <param argument="shuffle" type="select">
80 <option value="None">None - No shuffle</option>
81 <option value="simple">Shuffle -- for regression problems</option>
82 <option value="stratified">StratifiedShuffle -- will use the target values as class labels</option>
83 <option value="group">GroupShuffle -- make sure group CV option is choosen</option>
84 </param>
85 </when>
86 <when value="nested_cv"> 108 <when value="nested_cv">
87 <expand macro="cv_reduced" label="Select the outer cv splitter"/> 109 <expand macro="cv_reduced" label="Select the outer cv splitter"/>
88 </when> 110 </when>
89 </conditional> 111 </conditional>
90 <param name="save" type="select" label="Save best estimator?" help="For security reason, deep learning models will be saved into two datasets, model skeleton and weights. Caution: Save estimator doesn't work for nestCV or when refit is False."> 112 <param name="save" type="select" label="Save best estimator?" help="For a non-deep learning model, save will output fitted best_estimator_ (refit must be true) or a list of cv_results_ from each outer split in nested CV mode. For a deep learning model, by checking the boolean option below the model input, the outputs are two parts, model skeleton and weights. Save Deep learning model for nested CV is not supported.">
91 <option value="nope" selected="true">Nope, save is unnecessary</option> 113 <option value="nope">Nope, save is unnecessary</option>
92 <option value="save_estimator">Fitted estimator (excluding deep learning)</option> 114 <option value="save_estimator" selected="true">Fitted best estimator or Detailed cv_results_ from nested CV</option>
93 <option value="save_weights">Model skeleton and weights, for deep learning exclusively</option> 115 <option value="save_no_fit">SearchCV object without fitting</option>
94 </param> 116 </param>
95 </inputs> 117 </inputs>
96 <outputs> 118 <outputs>
97 <data format="tabular" name="outfile_result"/> 119 <data format="tabular" name="outfile_result">
120 <filter>save != 'save_no_fit'</filter>
121 </data>
98 <data format="zip" name="outfile_object" label="Fitted estimator or estimator skeleton on ${on_string}"> 122 <data format="zip" name="outfile_object" label="Fitted estimator or estimator skeleton on ${on_string}">
99 <filter>save != 'nope'</filter> 123 <filter>save == 'save_estimator' and outer_split['split_mode'] == 'no'</filter>
100 </data> 124 </data>
101 <data format="h5" name="outfile_weights" label="Weights trained on ${on_string}"> 125 <data format="h5" name="outfile_weights" label="Weights trained on ${on_string}">
102 <filter>save == 'save_weights'</filter> 126 <filter>search_schemes['is_deep_learning'] and save == 'save_estimator' and outer_split['split_mode'] == 'no'</filter>
127 </data>
128 <collection type="list" name="outfile_in_splits" label="cv_results_ from splits on ${on_string}">
129 <filter>not search_schemes['is_deep_learning'] and save == 'save_estimator' and outer_split['split_mode'] == 'nested_cv'</filter>
130 <discover_datasets format="tabular" pattern="__name__" directory="cv_results_in_folds"/>
131 </collection>
132 <data format="zip" name="outfile_object_no_fit" label="Unfitted SearchCV on ${on_string}">
133 <filter>save == 'save_no_fit'</filter>
103 </data> 134 </data>
104 </outputs> 135 </outputs>
105 <tests> 136 <tests>
106 <test> 137 <test>
107 <param name="selected_search_scheme" value="GridSearchCV"/> 138 <param name="selected_search_scheme" value="GridSearchCV"/>
387 <param name="header2" value="true" /> 418 <param name="header2" value="true" />
388 <param name="selected_column_selector_option2" value="all_columns"/> 419 <param name="selected_column_selector_option2" value="all_columns"/>
389 <output name="outfile_result" > 420 <output name="outfile_result" >
390 <assert_contents> 421 <assert_contents>
391 <has_n_columns n="13" /> 422 <has_n_columns n="13" />
392 <has_text text="0.09003449195911103"/> 423 <has_text text="0.08719866399898475"/>
393 </assert_contents>
394 </output>
395 </test>
396 <test>
397 <param name="selected_search_scheme" value="GridSearchCV"/>
398 <param name="infile_estimator" value="pipeline09" ftype="zip"/>
399 <param name="infile_params" value="get_params09.tabular" ftype="tabular"/>
400 <repeat name="param_set">
401 <param name="sp_list" value="[50, 100, 150, 200]"/>
402 <param name="sp_name" value="relieff__n_neighbors"/>
403 </repeat>
404 <repeat name="param_set">
405 <param name="sp_list" value="[324089]"/>
406 <param name="sp_name" value="randomforestregressor__random_state"/>
407 </repeat>
408 <param name="primary_scoring" value="explained_variance"/>
409 <param name="secondary_scoring" value="neg_mean_squared_error,r2"/>
410 <param name='selected_cv' value="StratifiedKFold"/>
411 <param name="n_splits" value="3"/>
412 <param name="shuffle" value="true" />
413 <param name="random_state" value="10"/>
414 <param name="infile1" value="regression_X.tabular" ftype="tabular"/>
415 <param name="header1" value="true" />
416 <param name="selected_column_selector_option" value="all_columns"/>
417 <param name="infile2" value="regression_y.tabular" ftype="tabular"/>
418 <param name="header2" value="true" />
419 <param name="selected_column_selector_option2" value="all_columns"/>
420 <output name="outfile_result" >
421 <assert_contents>
422 <has_n_columns n="25" />
423 <has_text text="0.7879267424165166"/>
424 <has_text text="0.787865425577799"/>
425 <has_text text="-29.40436189868029"/>
426 </assert_contents> 424 </assert_contents>
427 </output> 425 </output>
428 </test> 426 </test>
429 <test> 427 <test>
430 <param name="selected_search_scheme" value="GridSearchCV"/> 428 <param name="selected_search_scheme" value="GridSearchCV"/>
451 <param name="selected_search_scheme" value="GridSearchCV"/> 449 <param name="selected_search_scheme" value="GridSearchCV"/>
452 <param name="infile_estimator" value="pipeline05" ftype="zip"/> 450 <param name="infile_estimator" value="pipeline05" ftype="zip"/>
453 <param name="infile_params" value="get_params05.tabular" ftype="tabular"/> 451 <param name="infile_params" value="get_params05.tabular" ftype="tabular"/>
454 <repeat name="param_set"> 452 <repeat name="param_set">
455 <param name="sp_list" value="[10, 50, 100, 300]"/> 453 <param name="sp_list" value="[10, 50, 100, 300]"/>
456 <param name="sp_name" value="randomforestregressor__n_estimators"/> 454 <param name="sp_name" value="n_estimators"/>
457 </repeat> 455 </repeat>
458 <param name="infile1" value="regression_X.tabular" ftype="tabular"/> 456 <param name="infile1" value="regression_X.tabular" ftype="tabular"/>
459 <param name="header1" value="true" /> 457 <param name="header1" value="true" />
460 <param name="selected_column_selector_option" value="all_columns"/> 458 <param name="selected_column_selector_option" value="all_columns"/>
461 <param name="infile2" value="regression_y.tabular" ftype="tabular"/> 459 <param name="infile2" value="regression_y.tabular" ftype="tabular"/>
601 <param name="selected_search_scheme" value="GridSearchCV"/> 599 <param name="selected_search_scheme" value="GridSearchCV"/>
602 <param name="infile_estimator" value="pipeline12" ftype="zip"/> 600 <param name="infile_estimator" value="pipeline12" ftype="zip"/>
603 <param name="infile_params" value="get_params12.tabular" ftype="tabular"/> 601 <param name="infile_params" value="get_params12.tabular" ftype="tabular"/>
604 <repeat name="param_set"> 602 <repeat name="param_set">
605 <param name="sp_list" value="[10, 100, 200]"/> 603 <param name="sp_list" value="[10, 100, 200]"/>
606 <param name="sp_name" value="rfe__estimator__n_estimators"/> 604 <param name="sp_name" value="estimator__n_estimators"/>
607 </repeat> 605 </repeat>
608 <repeat name="param_set"> 606 <repeat name="param_set">
609 <param name="sp_list" value="[10, None]"/> 607 <param name="sp_list" value="[10, None]"/>
610 <param name="sp_name" value="rfe__n_features_to_select"/> 608 <param name="sp_name" value="n_features_to_select"/>
611 </repeat> 609 </repeat>
612 <param name="primary_scoring" value="r2"/> 610 <param name="primary_scoring" value="r2"/>
613 <param name="infile1" value="regression_X.tabular" ftype="tabular"/> 611 <param name="infile1" value="regression_X.tabular" ftype="tabular"/>
614 <param name="header1" value="true" /> 612 <param name="header1" value="true" />
615 <param name="selected_column_selector_option" value="all_columns"/> 613 <param name="selected_column_selector_option" value="all_columns"/>
629 <param name="infile_estimator" value="pipeline05" ftype="zip"/> 627 <param name="infile_estimator" value="pipeline05" ftype="zip"/>
630 <section name="search_params_builder"> 628 <section name="search_params_builder">
631 <param name="infile_params" value="get_params05.tabular" ftype="tabular"/> 629 <param name="infile_params" value="get_params05.tabular" ftype="tabular"/>
632 <repeat name="param_set"> 630 <repeat name="param_set">
633 <param name="sp_list" value="[10, 50, 100, 300]"/> 631 <param name="sp_list" value="[10, 50, 100, 300]"/>
634 <param name="sp_name" value="randomforestregressor__n_estimators"/> 632 <param name="sp_name" value="n_estimators"/>
635 </repeat>
636 </section>
637 </conditional>
638 <param name="infile1" value="regression_X.tabular" ftype="tabular"/>
639 <param name="header1" value="true" />
640 <param name="selected_column_selector_option" value="all_columns"/>
641 <param name="infile2" value="regression_y.tabular" ftype="tabular"/>
642 <param name="header2" value="true" />
643 <param name="selected_column_selector_option2" value="all_columns"/>
644 <conditional name="outer_split">
645 <param name="split_mode" value="train_test_split"/>
646 <param name="shuffle" value="simple"/>
647 <param name="random_state" value="123"/>
648 </conditional>
649 <output name="outfile_result">
650 <assert_contents>
651 <has_n_columns n="1"/>
652 <has_text text="0.8124083594523798"/>
653 </assert_contents>
654 </output>
655 </test>
656 <test>
657 <conditional name="search_schemes">
658 <param name="selected_search_scheme" value="GridSearchCV"/>
659 <param name="infile_estimator" value="pipeline05" ftype="zip"/>
660 <section name="search_params_builder">
661 <param name="infile_params" value="get_params05.tabular" ftype="tabular"/>
662 <repeat name="param_set">
663 <param name="sp_list" value="[10, 50, 100, 300]"/>
664 <param name="sp_name" value="randomforestregressor__n_estimators"/>
665 </repeat> 633 </repeat>
666 </section> 634 </section>
667 </conditional> 635 </conditional>
668 <param name="infile1" value="regression_X.tabular" ftype="tabular"/> 636 <param name="infile1" value="regression_X.tabular" ftype="tabular"/>
669 <param name="header1" value="true" /> 637 <param name="header1" value="true" />