sklearn_searchcv: search_model_validation.xml comparison

comparison search_model_validation.xml @ 10:82b6104d4682 draft

planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit 60f0fbc0eafd7c11bc60fb6c77f2937782efd8a9-dirty

author	bgruening
date	Fri, 09 Aug 2019 07:12:16 -0400
parents	1c4a241bef5c
children	68753d45815f

comparison

equal deleted inserted replaced

-:21d3e08b1a48
+:82b6104d4682
 <import>main_macros.xml</import>
 </macros>
 <expand macro="python_requirements"/>
 <expand macro="macro_stdio"/>
 <version_command>echo "@VERSION@"</version_command>
-<command>
+<command detect_errors="aggressive">
 <![CDATA[
+export HDF5_USE_FILE_LOCKING='FALSE';
+#if $input_options.selected_input == 'refseq_and_interval'
+bgzip -c '$input_options.target_file' > '${target_file.element_identifier}.gz' &&
+tabix -p bed '${target_file.element_identifier}.gz' &&
+#end if
 python '$__tool_directory__/search_model_validation.py'
 --inputs '$inputs'
 --estimator '$search_schemes.infile_estimator'
+#if $input_options.selected_input == 'seq_fasta'
+--fasta_path '$input_options.fasta_path'
+#elif $input_options.selected_input == 'refseq_and_interval'
+--ref_seq '$input_options.ref_genome_file'
+--interval '$input_options.interval_file'
+--targets "`pwd`/${target_file.element_identifier}.gz"
+#else
 --infile1 '$input_options.infile1'
+#end if
 --infile2 '$input_options.infile2'
---outfile_result '$outfile_result'
+--outfile_result "`pwd`/tmp_outfile_result"
-#if $save
+#if $save != 'nope'
 --outfile_object '$outfile_object'
 #end if
+#if $save == 'save_weights'
+--outfile_weights '$outfile_weights'
+#end if
 #if $search_schemes.options.cv_selector.selected_cv in ['GroupKFold', 'GroupShuffleSplit', 'LeaveOneGroupOut', 'LeavePGroupsOut']
---groups '$inputs,$search_schemes.options.cv_selector.groups_selector.infile_g'
+--groups '$search_schemes.options.cv_selector.groups_selector.infile_g'
 #end if
+>'$outfile_result' && cp tmp_outfile_result '$outfile_result';
 ]]>
 </command>
 <configfiles>
 <inputs name="inputs" />
 <param argument="n_iter" type="integer" value="10" label="Number of parameter settings that are sampled"/>
 <expand macro="random_state"/>
 </section>
 </when>
 </conditional>
-<param name="save" type="boolean" truevalue="booltrue" falsevalue="boolfalse" checked="true" label="Save the searchCV object"/>
 <expand macro="sl_mixed_input"/>
-<conditional name="train_test_split">
+<conditional name="outer_split">
-<param name="do_split" type="select" label="Whether to hold a portion of samples for test exclusively?" help="train_test_split">
+<param name="split_mode" type="select" label="Whether to hold a portion of samples for test exclusively?" help="Nested CV or train_test_split">
-<option value="no">Nope</option>
+<option value="no" selected="true">Nope</option>
-<option value="yes">Yes - I do</option>
+<option value="train_test_split">Yes - do a single train test split</option>
+<option value="nested_cv">Yes - do nested CV</option>
 </param>
 <when value='no'/>
-<when value='yes'>
+<when value='train_test_split'>
 <param argument="test_size" type="float" optional="True" value="0.25" label="Test size:"/>
-<param argument="train_size" type="float" optional="True" value="" label="Train size:"/>
+<!--param argument="train_size" type="float" optional="True" value="" label="Train size:"/>-->
 <param argument="random_state" type="integer" optional="True" value="" label="Random seed number:"/>
 <param argument="shuffle" type="select">
 <option value="None">None - No shuffle</option>
 <option value="simple">Shuffle -- for regression problems</option>
 <option value="stratified">StratifiedShuffle -- will use the target values as class labels</option>
 <option value="group">GroupShuffle -- make sure group CV option is choosen</option>
 </param>
 </when>
+<when value="nested_cv">
+<expand macro="cv_reduced" label="Select the outer cv splitter"/>
+</when>
 </conditional>
+<param name="save" type="select" label="Save best estimator?" help="For security reason, deep learning models will be saved into two datasets, model skeleton and weights.">
+<option value="nope" selected="true">Nope, save is unnecessary</option>
+<option value="save_estimator">Fitted estimator (excluding deep learning)</option>
+<option value="save_weights">Model skeleton and weights, for deep learning exclusively</option>
+</param>
 </inputs>
 <outputs>
 <data format="tabular" name="outfile_result"/>
-<data format="zip" name="outfile_object" label="${search_schemes.selected_search_scheme} on ${on_string}">
+<data format="zip" name="outfile_object" label="Fitted estimator or estimator skeleton on ${on_string}">
-<filter>save</filter>
+<filter>save != 'nope'</filter>
+</data>
+<data format="h5" name="outfile_weights" label="Weights trained on ${on_string}">
+<filter>save == 'save_weights'</filter>
 </data>
 </outputs>
 <tests>
 <test>
 <param name="selected_search_scheme" value="GridSearchCV"/>
 <param name="header1" value="true" />
 <param name="selected_column_selector_option" value="all_columns"/>
 <param name="infile2" value="regression_y.tabular" ftype="tabular"/>
 <param name="header2" value="true" />
 <param name="selected_column_selector_option2" value="all_columns"/>
+<param name="save" value="save_estimator"/>
 <output name="outfile_object" file="searchCV01" compare="sim_size" delta="10"/>
 </test>
 <test>
 <param name="selected_search_scheme" value="GridSearchCV"/>
 <param name="infile_estimator" value="pipeline06" ftype="zip"/>
 <param name="header1" value="true" />
 <param name="selected_column_selector_option" value="all_columns"/>
 <param name="infile2" value="regression_y.tabular" ftype="tabular"/>
 <param name="header2" value="true" />
 <param name="selected_column_selector_option2" value="all_columns"/>
+<param name="save" value="save_estimator"/>
 <output name="outfile_object" file="searchCV02" compare="sim_size" delta="10"/>
 </test>
 <test>
 <param name="selected_search_scheme" value="GridSearchCV"/>
 <param name="infile_estimator" value="pipeline03" ftype="zip"/>
 <test>
 <param name="selected_search_scheme" value="GridSearchCV"/>
 <param name="infile_estimator" value="pipeline09" ftype="zip"/>
 <param name="infile_params" value="get_params09.tabular" ftype="tabular"/>
 <repeat name="param_set">
-<param name="sp_list" value=": [None,'sk_prep_all', 8, 14, skrebate_ReliefF(n_features_to_select=12)]"/>
+<param name="sp_list" value=": [None,'sk_prep_all', 7, 13, skrebate_ReliefF(n_features_to_select=12)]"/>
 <param name="sp_name" value="relieff"/>
 </repeat>
 <repeat name="param_set">
 <param name="sp_list" value="[10]"/>
 <param name="sp_name" value="randomforestregressor__random_state"/>
 <has_n_columns n="13"/>
 <has_text text="0.8149439619875293"/>
 </assert_contents>
 </output>
 </test>
-<!--test>
+<test>
 <conditional name="search_schemes">
 <param name="selected_search_scheme" value="GridSearchCV"/>
 <param name="infile_estimator" value="pipeline05" ftype="zip"/>
 <section name="search_params_builder">
 <param name="infile_params" value="get_params05.tabular" ftype="tabular"/>
 <param name="header1" value="true" />
 <param name="selected_column_selector_option" value="all_columns"/>
 <param name="infile2" value="regression_y.tabular" ftype="tabular"/>
 <param name="header2" value="true" />
 <param name="selected_column_selector_option2" value="all_columns"/>
+<conditional name="outer_split">
+<param name="split_mode" value="train_test_split"/>
+<param name="shuffle" value="simple"/>
+<param name="random_state" value="123"/>
+</conditional>
 <output name="outfile_result">
 <assert_contents>
 <has_n_columns n="1"/>
-<has_text text="0.7986842219788204" />
+<has_text text="0.8124083594523798"/>
 </assert_contents>
 </output>
-</test-->
+</test>
+<test>
+<conditional name="search_schemes">
+<param name="selected_search_scheme" value="GridSearchCV"/>
+<param name="infile_estimator" value="pipeline05" ftype="zip"/>
+<section name="search_params_builder">
+<param name="infile_params" value="get_params05.tabular" ftype="tabular"/>
+<repeat name="param_set">
+<param name="sp_list" value="[10, 50, 100, 300]"/>
+<param name="sp_name" value="randomforestregressor__n_estimators"/>
+</repeat>
+</section>
+</conditional>
+<param name="infile1" value="regression_X.tabular" ftype="tabular"/>
+<param name="header1" value="true" />
+<param name="selected_column_selector_option" value="all_columns"/>
+<param name="infile2" value="regression_y.tabular" ftype="tabular"/>
+<param name="header2" value="true" />
+<param name="selected_column_selector_option2" value="all_columns"/>
+<conditional name="outer_split">
+<param name="split_mode" value="nested_cv"/>
+<conditional name="cv_selector">
+<param name='selected_cv' value="KFold"/>
+<param name="n_splits" value="3"/>
+<param name="shuffle" value="true" />
+<param name="random_state" value="123"/>
+</conditional>
+</conditional>
+<output name="outfile_result">
+<assert_contents>
+<has_n_columns n="4"/>
+<has_text text="0.8044418936007722" />
+</assert_contents>
+</output>
+</test>
 </tests>
 <help>
 <![CDATA[
 **What it does**
 Searches for the optimal parameter settings of an estimator or pipeline through either an exhaustive grid cross-validation search or a randomized cross-validation search.
 **Hot number/keyword for preprocessors**::
 0   sklearn_preprocessing.StandardScaler(copy=True, with_mean=True, with_std=True)
 1   sklearn_preprocessing.Binarizer(copy=True, threshold=0.0)
-2   sklearn_preprocessing.Imputer(axis=0, copy=True, missing_values='NaN', strategy='mean', verbose=0)
+2   sklearn_preprocessing.MaxAbsScaler(copy=True)
-3   sklearn_preprocessing.MaxAbsScaler(copy=True)
+3   sklearn_preprocessing.Normalizer(copy=True, norm='l2')
-4   sklearn_preprocessing.Normalizer(copy=True, norm='l2')
+4   sklearn_preprocessing.MinMaxScaler(copy=True, feature_range=(0, 1))
-5   sklearn_preprocessing.MinMaxScaler(copy=True, feature_range=(0, 1))
+5   sklearn_preprocessing.PolynomialFeatures(degree=2, include_bias=True, interaction_only=False)
-6   sklearn_preprocessing.PolynomialFeatures(degree=2, include_bias=True, interaction_only=False)
+6   sklearn_preprocessing.RobustScaler(copy=True, quantile_range=(25.0, 75.0), with_centering=True, with_scaling=True)
-7   sklearn_preprocessing.RobustScaler(copy=True, quantile_range=(25.0, 75.0), with_centering=True, with_scaling=True)
+7   sklearn_feature_selection.SelectKBest(k=10, score_func=<function f_classif at 0x113806d90>)
-8   sklearn_feature_selection.SelectKBest(k=10, score_func=<function f_classif at 0x113806d90>)
+8   sklearn_feature_selection.GenericUnivariateSelect(mode='percentile', param=1e-05, score_func=<function f_classif at 0x113806d90>)
-9   sklearn_feature_selection.GenericUnivariateSelect(mode='percentile', param=1e-05, score_func=<function f_classif at 0x113806d90>)
+9   sklearn_feature_selection.SelectPercentile(percentile=10, score_func=<function f_classif at 0x113806d90>)
-10  sklearn_feature_selection.SelectPercentile(percentile=10, score_func=<function f_classif at 0x113806d90>)
+10  sklearn_feature_selection.SelectFpr(alpha=0.05, score_func=<function f_classif at 0x113806d90>)
-11  sklearn_feature_selection.SelectFpr(alpha=0.05, score_func=<function f_classif at 0x113806d90>)
+11  sklearn_feature_selection.SelectFdr(alpha=0.05, score_func=<function f_classif at 0x113806d90>)
-12  sklearn_feature_selection.SelectFdr(alpha=0.05, score_func=<function f_classif at 0x113806d90>)
+12  sklearn_feature_selection.SelectFwe(alpha=0.05, score_func=<function f_classif at 0x113806d90>)
-13  sklearn_feature_selection.SelectFwe(alpha=0.05, score_func=<function f_classif at 0x113806d90>)
+13  sklearn_feature_selection.VarianceThreshold(threshold=0.0)
-14  sklearn_feature_selection.VarianceThreshold(threshold=0.0)
+14  sklearn_decomposition.FactorAnalysis(copy=True, iterated_power=3, max_iter=1000, n_components=None,
-15  sklearn_decomposition.FactorAnalysis(copy=True, iterated_power=3, max_iter=1000, n_components=None,
 noise_variance_init=None, random_state=0, svd_method='randomized', tol=0.01)
-16  sklearn_decomposition.FastICA(algorithm='parallel', fun='logcosh', fun_args=None,
+15  sklearn_decomposition.FastICA(algorithm='parallel', fun='logcosh', fun_args=None,
 max_iter=200, n_components=None, random_state=0, tol=0.0001, w_init=None, whiten=True)
-17  sklearn_decomposition.IncrementalPCA(batch_size=None, copy=True, n_components=None, whiten=False)
+16  sklearn_decomposition.IncrementalPCA(batch_size=None, copy=True, n_components=None, whiten=False)
-18  sklearn_decomposition.KernelPCA(alpha=1.0, coef0=1, copy_X=True, degree=3, eigen_solver='auto',
+17  sklearn_decomposition.KernelPCA(alpha=1.0, coef0=1, copy_X=True, degree=3, eigen_solver='auto',
 fit_inverse_transform=False, gamma=None, kernel='linear', kernel_params=None, max_iter=None,
 n_components=None, random_state=0, remove_zero_eig=False, tol=0)
-19  sklearn_decomposition.LatentDirichletAllocation(batch_size=128, doc_topic_prior=None, evaluate_every=-1, learning_decay=0.7,
+18  sklearn_decomposition.LatentDirichletAllocation(batch_size=128, doc_topic_prior=None, evaluate_every=-1, learning_decay=0.7,
 learning_method=None, learning_offset=10.0, max_doc_update_iter=100, max_iter=10, mean_change_tol=0.001, n_components=10,
 n_topics=None, perp_tol=0.1, random_state=0, topic_word_prior=None, total_samples=1000000.0, verbose=0)
-20  sklearn_decomposition.MiniBatchDictionaryLearning(alpha=1, batch_size=3, dict_init=None, fit_algorithm='lars',
+19  sklearn_decomposition.MiniBatchDictionaryLearning(alpha=1, batch_size=3, dict_init=None, fit_algorithm='lars',
 n_components=None, n_iter=1000, random_state=0, shuffle=True, split_sign=False, transform_algorithm='omp',
 transform_alpha=None, transform_n_nonzero_coefs=None, verbose=False)
-21  sklearn_decomposition.MiniBatchSparsePCA(alpha=1, batch_size=3, callback=None, method='lars', n_components=None,
+20  sklearn_decomposition.MiniBatchSparsePCA(alpha=1, batch_size=3, callback=None, method='lars', n_components=None,
 n_iter=100, random_state=0, ridge_alpha=0.01, shuffle=True, verbose=False)
-22  sklearn_decomposition.NMF(alpha=0.0, beta_loss='frobenius', init=None, l1_ratio=0.0, max_iter=200,
+21  sklearn_decomposition.NMF(alpha=0.0, beta_loss='frobenius', init=None, l1_ratio=0.0, max_iter=200,
 n_components=None, random_state=0, shuffle=False, solver='cd', tol=0.0001, verbose=0)
-23  sklearn_decomposition.PCA(copy=True, iterated_power='auto', n_components=None, random_state=0, svd_solver='auto', tol=0.0, whiten=False)
+22  sklearn_decomposition.PCA(copy=True, iterated_power='auto', n_components=None, random_state=0, svd_solver='auto', tol=0.0, whiten=False)
-24  sklearn_decomposition.SparsePCA(U_init=None, V_init=None, alpha=1, max_iter=1000, method='lars',
+23  sklearn_decomposition.SparsePCA(U_init=None, V_init=None, alpha=1, max_iter=1000, method='lars',
 n_components=None, random_state=0, ridge_alpha=0.01, tol=1e-08, verbose=False)
-25  sklearn_decomposition.TruncatedSVD(algorithm='randomized', n_components=2, n_iter=5, random_state=0, tol=0.0)
+24  sklearn_decomposition.TruncatedSVD(algorithm='randomized', n_components=2, n_iter=5, random_state=0, tol=0.0)
-26  sklearn_kernel_approximation.Nystroem(coef0=None, degree=None, gamma=None, kernel='rbf',
+25  sklearn_kernel_approximation.Nystroem(coef0=None, degree=None, gamma=None, kernel='rbf',
 kernel_params=None, n_components=100, random_state=0)
-27  sklearn_kernel_approximation.RBFSampler(gamma=1.0, n_components=100, random_state=0)
+26  sklearn_kernel_approximation.RBFSampler(gamma=1.0, n_components=100, random_state=0)
-28  sklearn_kernel_approximation.AdditiveChi2Sampler(sample_interval=None, sample_steps=2)
+27  sklearn_kernel_approximation.AdditiveChi2Sampler(sample_interval=None, sample_steps=2)
-29  sklearn_kernel_approximation.SkewedChi2Sampler(n_components=100, random_state=0, skewedness=1.0)
+28  sklearn_kernel_approximation.SkewedChi2Sampler(n_components=100, random_state=0, skewedness=1.0)
-30  sklearn_cluster.FeatureAgglomeration(affinity='euclidean', compute_full_tree='auto', connectivity=None,
+29  sklearn_cluster.FeatureAgglomeration(affinity='euclidean', compute_full_tree='auto', connectivity=None,
 linkage='ward', memory=None, n_clusters=2, pooling_func=<function mean at 0x113078ae8>)
-31  skrebate_ReliefF(discrete_threshold=10, n_features_to_select=10, n_neighbors=100, verbose=False)
+30  skrebate_ReliefF(discrete_threshold=10, n_features_to_select=10, n_neighbors=100, verbose=False)
-32  skrebate_SURF(discrete_threshold=10, n_features_to_select=10, verbose=False)
+31  skrebate_SURF(discrete_threshold=10, n_features_to_select=10, verbose=False)
-33  skrebate_SURFstar(discrete_threshold=10, n_features_to_select=10, verbose=False)
+32  skrebate_SURFstar(discrete_threshold=10, n_features_to_select=10, verbose=False)
-34  skrebate_MultiSURF(discrete_threshold=10, n_features_to_select=10, verbose=False)
+33  skrebate_MultiSURF(discrete_threshold=10, n_features_to_select=10, verbose=False)
-35  skrebate_MultiSURFstar(discrete_threshold=10, n_features_to_select=10, verbose=False)
+34  skrebate_MultiSURFstar(discrete_threshold=10, n_features_to_select=10, verbose=False)
-'sk_prep_all':   All sklearn preprocessing estimators, i.e., 0-7
+'sk_prep_all':   All sklearn preprocessing estimators, i.e., 0-6
-'fs_all':        All feature_selection estimators, i.e., 8-14
+'fs_all':        All feature_selection estimators, i.e., 7-13
-'decomp_all':    All decomposition estimators, i.e., 15-25
+'decomp_all':    All decomposition estimators, i.e., 14-24
-'k_appr_all':    All kernel_approximation estimators, i.e., 26-29
+'k_appr_all':    All kernel_approximation estimators, i.e., 25-28
-'reb_all':       All skrebate estimators, i.e., 31-35
+'reb_all':       All skrebate estimators, i.e., 30-34
-'all_0':         All except the imbalanced-learn samplers, i.e., 0-35
+'all_0':         All except the imbalanced-learn samplers, i.e., 0-34
-'imb_all':       All imbalanced-learn sampling methods, i.e., 36-54.
+'imb_all':       All imbalanced-learn sampling methods, i.e., 35-53.
 **CAUTION**: Mixing imblearn samplers with other preprocessors may not work.
 None:           opt out of preprocessor
 Mixed lists are supported (CAUTION: mixing imblearn samplers with other preprocessors may not work), e.g.::
-: [None, 'sk_prep_all', 22, 'k_appr_all', sklearn_feature_selection.SelectKBest(k=50)]
+: [None, 'sk_prep_all', 21, 'k_appr_all', sklearn_feature_selection.SelectKBest(k=50)]
 **Whether to do train_test_split?**

Mercurial > repos > bgruening > sklearn_searchcv

comparison search_model_validation.xml @ 10:82b6104d4682 draft