Mercurial > repos > bgruening > sklearn_searchcv
diff search_model_validation.xml @ 10:82b6104d4682 draft
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit 60f0fbc0eafd7c11bc60fb6c77f2937782efd8a9-dirty
author | bgruening |
---|---|
date | Fri, 09 Aug 2019 07:12:16 -0400 |
parents | 1c4a241bef5c |
children | 68753d45815f |
line wrap: on
line diff
--- a/search_model_validation.xml Tue Jul 09 19:26:54 2019 -0400 +++ b/search_model_validation.xml Fri Aug 09 07:12:16 2019 -0400 @@ -6,20 +6,37 @@ <expand macro="python_requirements"/> <expand macro="macro_stdio"/> <version_command>echo "@VERSION@"</version_command> - <command> + <command detect_errors="aggressive"> <![CDATA[ + export HDF5_USE_FILE_LOCKING='FALSE'; + #if $input_options.selected_input == 'refseq_and_interval' + bgzip -c '$input_options.target_file' > '${target_file.element_identifier}.gz' && + tabix -p bed '${target_file.element_identifier}.gz' && + #end if python '$__tool_directory__/search_model_validation.py' --inputs '$inputs' --estimator '$search_schemes.infile_estimator' + #if $input_options.selected_input == 'seq_fasta' + --fasta_path '$input_options.fasta_path' + #elif $input_options.selected_input == 'refseq_and_interval' + --ref_seq '$input_options.ref_genome_file' + --interval '$input_options.interval_file' + --targets "`pwd`/${target_file.element_identifier}.gz" + #else --infile1 '$input_options.infile1' + #end if --infile2 '$input_options.infile2' - --outfile_result '$outfile_result' - #if $save + --outfile_result "`pwd`/tmp_outfile_result" + #if $save != 'nope' --outfile_object '$outfile_object' #end if + #if $save == 'save_weights' + --outfile_weights '$outfile_weights' + #end if #if $search_schemes.options.cv_selector.selected_cv in ['GroupKFold', 'GroupShuffleSplit', 'LeaveOneGroupOut', 'LeavePGroupsOut'] - --groups '$inputs,$search_schemes.options.cv_selector.groups_selector.infile_g' + --groups '$search_schemes.options.cv_selector.groups_selector.infile_g' #end if + >'$outfile_result' && cp tmp_outfile_result '$outfile_result'; ]]> </command> @@ -47,17 +64,17 @@ </section> </when> </conditional> - <param name="save" type="boolean" truevalue="booltrue" falsevalue="boolfalse" checked="true" label="Save the searchCV object"/> <expand macro="sl_mixed_input"/> - <conditional name="train_test_split"> - <param name="do_split" type="select" label="Whether to hold a portion of samples for test exclusively?" help="train_test_split"> - <option value="no">Nope</option> - <option value="yes">Yes - I do</option> + <conditional name="outer_split"> + <param name="split_mode" type="select" label="Whether to hold a portion of samples for test exclusively?" help="Nested CV or train_test_split"> + <option value="no" selected="true">Nope</option> + <option value="train_test_split">Yes - do a single train test split</option> + <option value="nested_cv">Yes - do nested CV</option> </param> <when value='no'/> - <when value='yes'> + <when value='train_test_split'> <param argument="test_size" type="float" optional="True" value="0.25" label="Test size:"/> - <param argument="train_size" type="float" optional="True" value="" label="Train size:"/> + <!--param argument="train_size" type="float" optional="True" value="" label="Train size:"/>--> <param argument="random_state" type="integer" optional="True" value="" label="Random seed number:"/> <param argument="shuffle" type="select"> <option value="None">None - No shuffle</option> @@ -66,12 +83,23 @@ <option value="group">GroupShuffle -- make sure group CV option is choosen</option> </param> </when> + <when value="nested_cv"> + <expand macro="cv_reduced" label="Select the outer cv splitter"/> + </when> </conditional> + <param name="save" type="select" label="Save best estimator?" help="For security reason, deep learning models will be saved into two datasets, model skeleton and weights."> + <option value="nope" selected="true">Nope, save is unnecessary</option> + <option value="save_estimator">Fitted estimator (excluding deep learning)</option> + <option value="save_weights">Model skeleton and weights, for deep learning exclusively</option> + </param> </inputs> <outputs> <data format="tabular" name="outfile_result"/> - <data format="zip" name="outfile_object" label="${search_schemes.selected_search_scheme} on ${on_string}"> - <filter>save</filter> + <data format="zip" name="outfile_object" label="Fitted estimator or estimator skeleton on ${on_string}"> + <filter>save != 'nope'</filter> + </data> + <data format="h5" name="outfile_weights" label="Weights trained on ${on_string}"> + <filter>save == 'save_weights'</filter> </data> </outputs> <tests> @@ -227,6 +255,7 @@ <param name="infile2" value="regression_y.tabular" ftype="tabular"/> <param name="header2" value="true" /> <param name="selected_column_selector_option2" value="all_columns"/> + <param name="save" value="save_estimator"/> <output name="outfile_object" file="searchCV01" compare="sim_size" delta="10"/> </test> <test> @@ -331,6 +360,7 @@ <param name="infile2" value="regression_y.tabular" ftype="tabular"/> <param name="header2" value="true" /> <param name="selected_column_selector_option2" value="all_columns"/> + <param name="save" value="save_estimator"/> <output name="outfile_object" file="searchCV02" compare="sim_size" delta="10"/> </test> <test> @@ -509,7 +539,7 @@ <param name="infile_estimator" value="pipeline09" ftype="zip"/> <param name="infile_params" value="get_params09.tabular" ftype="tabular"/> <repeat name="param_set"> - <param name="sp_list" value=": [None,'sk_prep_all', 8, 14, skrebate_ReliefF(n_features_to_select=12)]"/> + <param name="sp_list" value=": [None,'sk_prep_all', 7, 13, skrebate_ReliefF(n_features_to_select=12)]"/> <param name="sp_name" value="relieff"/> </repeat> <repeat name="param_set"> @@ -593,7 +623,7 @@ </assert_contents> </output> </test> - <!--test> + <test> <conditional name="search_schemes"> <param name="selected_search_scheme" value="GridSearchCV"/> <param name="infile_estimator" value="pipeline05" ftype="zip"/> @@ -611,13 +641,52 @@ <param name="infile2" value="regression_y.tabular" ftype="tabular"/> <param name="header2" value="true" /> <param name="selected_column_selector_option2" value="all_columns"/> + <conditional name="outer_split"> + <param name="split_mode" value="train_test_split"/> + <param name="shuffle" value="simple"/> + <param name="random_state" value="123"/> + </conditional> <output name="outfile_result"> <assert_contents> <has_n_columns n="1"/> - <has_text text="0.7986842219788204" /> + <has_text text="0.8124083594523798"/> </assert_contents> </output> - </test--> + </test> + <test> + <conditional name="search_schemes"> + <param name="selected_search_scheme" value="GridSearchCV"/> + <param name="infile_estimator" value="pipeline05" ftype="zip"/> + <section name="search_params_builder"> + <param name="infile_params" value="get_params05.tabular" ftype="tabular"/> + <repeat name="param_set"> + <param name="sp_list" value="[10, 50, 100, 300]"/> + <param name="sp_name" value="randomforestregressor__n_estimators"/> + </repeat> + </section> + </conditional> + <param name="infile1" value="regression_X.tabular" ftype="tabular"/> + <param name="header1" value="true" /> + <param name="selected_column_selector_option" value="all_columns"/> + <param name="infile2" value="regression_y.tabular" ftype="tabular"/> + <param name="header2" value="true" /> + <param name="selected_column_selector_option2" value="all_columns"/> + <conditional name="outer_split"> + <param name="split_mode" value="nested_cv"/> + <conditional name="cv_selector"> + <param name='selected_cv' value="KFold"/> + <param name="n_splits" value="3"/> + <param name="shuffle" value="true" /> + <param name="random_state" value="123"/> + </conditional> + </conditional> + <output name="outfile_result"> + <assert_contents> + <has_n_columns n="4"/> + <has_text text="0.8044418936007722" /> + </assert_contents> + </output> + </test> </tests> <help> <![CDATA[ @@ -661,66 +730,65 @@ 0 sklearn_preprocessing.StandardScaler(copy=True, with_mean=True, with_std=True) 1 sklearn_preprocessing.Binarizer(copy=True, threshold=0.0) - 2 sklearn_preprocessing.Imputer(axis=0, copy=True, missing_values='NaN', strategy='mean', verbose=0) - 3 sklearn_preprocessing.MaxAbsScaler(copy=True) - 4 sklearn_preprocessing.Normalizer(copy=True, norm='l2') - 5 sklearn_preprocessing.MinMaxScaler(copy=True, feature_range=(0, 1)) - 6 sklearn_preprocessing.PolynomialFeatures(degree=2, include_bias=True, interaction_only=False) - 7 sklearn_preprocessing.RobustScaler(copy=True, quantile_range=(25.0, 75.0), with_centering=True, with_scaling=True) - 8 sklearn_feature_selection.SelectKBest(k=10, score_func=<function f_classif at 0x113806d90>) - 9 sklearn_feature_selection.GenericUnivariateSelect(mode='percentile', param=1e-05, score_func=<function f_classif at 0x113806d90>) - 10 sklearn_feature_selection.SelectPercentile(percentile=10, score_func=<function f_classif at 0x113806d90>) - 11 sklearn_feature_selection.SelectFpr(alpha=0.05, score_func=<function f_classif at 0x113806d90>) - 12 sklearn_feature_selection.SelectFdr(alpha=0.05, score_func=<function f_classif at 0x113806d90>) - 13 sklearn_feature_selection.SelectFwe(alpha=0.05, score_func=<function f_classif at 0x113806d90>) - 14 sklearn_feature_selection.VarianceThreshold(threshold=0.0) - 15 sklearn_decomposition.FactorAnalysis(copy=True, iterated_power=3, max_iter=1000, n_components=None, + 2 sklearn_preprocessing.MaxAbsScaler(copy=True) + 3 sklearn_preprocessing.Normalizer(copy=True, norm='l2') + 4 sklearn_preprocessing.MinMaxScaler(copy=True, feature_range=(0, 1)) + 5 sklearn_preprocessing.PolynomialFeatures(degree=2, include_bias=True, interaction_only=False) + 6 sklearn_preprocessing.RobustScaler(copy=True, quantile_range=(25.0, 75.0), with_centering=True, with_scaling=True) + 7 sklearn_feature_selection.SelectKBest(k=10, score_func=<function f_classif at 0x113806d90>) + 8 sklearn_feature_selection.GenericUnivariateSelect(mode='percentile', param=1e-05, score_func=<function f_classif at 0x113806d90>) + 9 sklearn_feature_selection.SelectPercentile(percentile=10, score_func=<function f_classif at 0x113806d90>) + 10 sklearn_feature_selection.SelectFpr(alpha=0.05, score_func=<function f_classif at 0x113806d90>) + 11 sklearn_feature_selection.SelectFdr(alpha=0.05, score_func=<function f_classif at 0x113806d90>) + 12 sklearn_feature_selection.SelectFwe(alpha=0.05, score_func=<function f_classif at 0x113806d90>) + 13 sklearn_feature_selection.VarianceThreshold(threshold=0.0) + 14 sklearn_decomposition.FactorAnalysis(copy=True, iterated_power=3, max_iter=1000, n_components=None, noise_variance_init=None, random_state=0, svd_method='randomized', tol=0.01) - 16 sklearn_decomposition.FastICA(algorithm='parallel', fun='logcosh', fun_args=None, + 15 sklearn_decomposition.FastICA(algorithm='parallel', fun='logcosh', fun_args=None, max_iter=200, n_components=None, random_state=0, tol=0.0001, w_init=None, whiten=True) - 17 sklearn_decomposition.IncrementalPCA(batch_size=None, copy=True, n_components=None, whiten=False) - 18 sklearn_decomposition.KernelPCA(alpha=1.0, coef0=1, copy_X=True, degree=3, eigen_solver='auto', + 16 sklearn_decomposition.IncrementalPCA(batch_size=None, copy=True, n_components=None, whiten=False) + 17 sklearn_decomposition.KernelPCA(alpha=1.0, coef0=1, copy_X=True, degree=3, eigen_solver='auto', fit_inverse_transform=False, gamma=None, kernel='linear', kernel_params=None, max_iter=None, n_components=None, random_state=0, remove_zero_eig=False, tol=0) - 19 sklearn_decomposition.LatentDirichletAllocation(batch_size=128, doc_topic_prior=None, evaluate_every=-1, learning_decay=0.7, + 18 sklearn_decomposition.LatentDirichletAllocation(batch_size=128, doc_topic_prior=None, evaluate_every=-1, learning_decay=0.7, learning_method=None, learning_offset=10.0, max_doc_update_iter=100, max_iter=10, mean_change_tol=0.001, n_components=10, n_topics=None, perp_tol=0.1, random_state=0, topic_word_prior=None, total_samples=1000000.0, verbose=0) - 20 sklearn_decomposition.MiniBatchDictionaryLearning(alpha=1, batch_size=3, dict_init=None, fit_algorithm='lars', + 19 sklearn_decomposition.MiniBatchDictionaryLearning(alpha=1, batch_size=3, dict_init=None, fit_algorithm='lars', n_components=None, n_iter=1000, random_state=0, shuffle=True, split_sign=False, transform_algorithm='omp', transform_alpha=None, transform_n_nonzero_coefs=None, verbose=False) - 21 sklearn_decomposition.MiniBatchSparsePCA(alpha=1, batch_size=3, callback=None, method='lars', n_components=None, + 20 sklearn_decomposition.MiniBatchSparsePCA(alpha=1, batch_size=3, callback=None, method='lars', n_components=None, n_iter=100, random_state=0, ridge_alpha=0.01, shuffle=True, verbose=False) - 22 sklearn_decomposition.NMF(alpha=0.0, beta_loss='frobenius', init=None, l1_ratio=0.0, max_iter=200, + 21 sklearn_decomposition.NMF(alpha=0.0, beta_loss='frobenius', init=None, l1_ratio=0.0, max_iter=200, n_components=None, random_state=0, shuffle=False, solver='cd', tol=0.0001, verbose=0) - 23 sklearn_decomposition.PCA(copy=True, iterated_power='auto', n_components=None, random_state=0, svd_solver='auto', tol=0.0, whiten=False) - 24 sklearn_decomposition.SparsePCA(U_init=None, V_init=None, alpha=1, max_iter=1000, method='lars', + 22 sklearn_decomposition.PCA(copy=True, iterated_power='auto', n_components=None, random_state=0, svd_solver='auto', tol=0.0, whiten=False) + 23 sklearn_decomposition.SparsePCA(U_init=None, V_init=None, alpha=1, max_iter=1000, method='lars', n_components=None, random_state=0, ridge_alpha=0.01, tol=1e-08, verbose=False) - 25 sklearn_decomposition.TruncatedSVD(algorithm='randomized', n_components=2, n_iter=5, random_state=0, tol=0.0) - 26 sklearn_kernel_approximation.Nystroem(coef0=None, degree=None, gamma=None, kernel='rbf', + 24 sklearn_decomposition.TruncatedSVD(algorithm='randomized', n_components=2, n_iter=5, random_state=0, tol=0.0) + 25 sklearn_kernel_approximation.Nystroem(coef0=None, degree=None, gamma=None, kernel='rbf', kernel_params=None, n_components=100, random_state=0) - 27 sklearn_kernel_approximation.RBFSampler(gamma=1.0, n_components=100, random_state=0) - 28 sklearn_kernel_approximation.AdditiveChi2Sampler(sample_interval=None, sample_steps=2) - 29 sklearn_kernel_approximation.SkewedChi2Sampler(n_components=100, random_state=0, skewedness=1.0) - 30 sklearn_cluster.FeatureAgglomeration(affinity='euclidean', compute_full_tree='auto', connectivity=None, + 26 sklearn_kernel_approximation.RBFSampler(gamma=1.0, n_components=100, random_state=0) + 27 sklearn_kernel_approximation.AdditiveChi2Sampler(sample_interval=None, sample_steps=2) + 28 sklearn_kernel_approximation.SkewedChi2Sampler(n_components=100, random_state=0, skewedness=1.0) + 29 sklearn_cluster.FeatureAgglomeration(affinity='euclidean', compute_full_tree='auto', connectivity=None, linkage='ward', memory=None, n_clusters=2, pooling_func=<function mean at 0x113078ae8>) - 31 skrebate_ReliefF(discrete_threshold=10, n_features_to_select=10, n_neighbors=100, verbose=False) - 32 skrebate_SURF(discrete_threshold=10, n_features_to_select=10, verbose=False) - 33 skrebate_SURFstar(discrete_threshold=10, n_features_to_select=10, verbose=False) - 34 skrebate_MultiSURF(discrete_threshold=10, n_features_to_select=10, verbose=False) - 35 skrebate_MultiSURFstar(discrete_threshold=10, n_features_to_select=10, verbose=False) - 'sk_prep_all': All sklearn preprocessing estimators, i.e., 0-7 - 'fs_all': All feature_selection estimators, i.e., 8-14 - 'decomp_all': All decomposition estimators, i.e., 15-25 - 'k_appr_all': All kernel_approximation estimators, i.e., 26-29 - 'reb_all': All skrebate estimators, i.e., 31-35 - 'all_0': All except the imbalanced-learn samplers, i.e., 0-35 - 'imb_all': All imbalanced-learn sampling methods, i.e., 36-54. + 30 skrebate_ReliefF(discrete_threshold=10, n_features_to_select=10, n_neighbors=100, verbose=False) + 31 skrebate_SURF(discrete_threshold=10, n_features_to_select=10, verbose=False) + 32 skrebate_SURFstar(discrete_threshold=10, n_features_to_select=10, verbose=False) + 33 skrebate_MultiSURF(discrete_threshold=10, n_features_to_select=10, verbose=False) + 34 skrebate_MultiSURFstar(discrete_threshold=10, n_features_to_select=10, verbose=False) + 'sk_prep_all': All sklearn preprocessing estimators, i.e., 0-6 + 'fs_all': All feature_selection estimators, i.e., 7-13 + 'decomp_all': All decomposition estimators, i.e., 14-24 + 'k_appr_all': All kernel_approximation estimators, i.e., 25-28 + 'reb_all': All skrebate estimators, i.e., 30-34 + 'all_0': All except the imbalanced-learn samplers, i.e., 0-34 + 'imb_all': All imbalanced-learn sampling methods, i.e., 35-53. **CAUTION**: Mix of imblearn and other preprocessors may not work. None: opt out of preprocessor Support mix (CAUTION: Mix of imblearn and other preprocessors may not work), e.g.:: - : [None, 'sk_prep_all', 22, 'k_appr_all', sklearn_feature_selection.SelectKBest(k=50)] + : [None, 'sk_prep_all', 21, 'k_appr_all', sklearn_feature_selection.SelectKBest(k=50)]