diff search_model_validation.xml @ 3:f9fea8323bcb draft

planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit d00173591e4a783a4c1cb2664e4bb192ab5414f7
author bgruening
date Fri, 17 Aug 2018 12:26:17 -0400
parents 907bb0418c9f
children 2e6540c11251
--- a/search_model_validation.xml	Tue Aug 07 05:45:28 2018 -0400
+++ b/search_model_validation.xml	Fri Aug 17 12:26:17 2018 -0400
@@ -4,7 +4,7 @@
         <import>main_macros.xml</import>
     </macros>
     <expand macro="python_requirements">
-        <requirement type="package" version="0.9.12">asteval</requirement>
+        <requirement type="package" version="0.6">skrebate</requirement>
     </expand>
     <expand macro="macro_stdio"/>
     <version_command>echo "@VERSION@"</version_command>
@@ -18,22 +18,16 @@
         <configfile name="sklearn_search_model_validation_script">
             <![CDATA[
 import sys
+import os
 import json
 import pandas
-import re
 import pickle
-import numpy as np
-import xgboost
-import scipy
-from asteval import Interpreter, make_symbol_table
-from sklearn import metrics, preprocessing, model_selection, ensemble
-from sklearn.pipeline import Pipeline
+from sklearn import model_selection
+from sklearn.exceptions import FitFailedWarning
 
-@COLUMNS_FUNCTION@
-@GET_ESTIMATOR_FUNCTION@
-@SAFE_EVAL_FUNCTION@
-@GET_SEARCH_PARAMS_FUNCTION@
-@GET_CV_FUNCTION@
+execfile("$__tool_directory__/utils.py")
+
+warnings.simplefilter('ignore')
 
 input_json_path = sys.argv[1]
 with open(input_json_path, "r") as param_handler:
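Note: execfile() exists only under Python 2. A minimal sketch of an equivalent that also runs under Python 3, assuming utils.py sits in the tool directory (tool_directory below is a hypothetical stand-in for Galaxy's $__tool_directory__):

    import os

    # Hypothetical stand-in for Galaxy's $__tool_directory__.
    tool_directory = os.path.dirname(os.path.abspath(__file__))
    utils_path = os.path.join(tool_directory, "utils.py")

    # Execute utils.py in the current namespace so that the helpers used
    # below (get_cv, get_scoring, get_search_params) and its warnings
    # import become available, mirroring what execfile() does above.
    with open(utils_path) as handle:
        exec(compile(handle.read(), utils_path, "exec"))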
@@ -88,8 +82,14 @@
 
 options = params["search_schemes"]["options"]
 options['cv'] = get_cv( options['cv'].strip() )
-if 'scoring' in options and options['scoring'] == '':
-    options['scoring'] = None
+options['n_jobs'] = N_JOBS
+options['scoring'] = get_scoring(options['scoring'])
+if options['error_score']:
+    options['error_score'] = 'raise'
+else:
+    options['error_score'] = 0
+if options['refit'] and isinstance(options['scoring'], dict):
+    options['refit'] = 'primary'
 if 'pre_dispatch' in options and options['pre_dispatch'] == '':
     options['pre_dispatch'] = None
 
@@ -98,7 +98,14 @@
 search_params = get_search_params(params_builder)
 searcher = optimizers(pipeline, search_params, **options)
 
-searcher.fit(X, y)
+warnings.simplefilter('always', FitFailedWarning)
+with warnings.catch_warnings(record=True) as w:
+    try:
+        searcher.fit(X, y)
+    except ValueError:
+        pass
+    for warning in w:
+        print(repr(warning.message))
 
 cv_result = pandas.DataFrame(searcher.cv_results_)
 cv_result.to_csv(path_or_buf=outfile_result, sep='\t', header=True, index=False)
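A minimal standalone sketch of the fit-and-capture-warnings pattern used above, assuming a deliberately invalid grid value so that error_score=0 and FitFailedWarning actually come into play (the pipeline and data here are hypothetical, not the tool's test fixtures):

    import warnings
    import numpy as np
    from sklearn.exceptions import FitFailedWarning
    from sklearn.feature_selection import SelectKBest, f_regression
    from sklearn.model_selection import GridSearchCV
    from sklearn.pipeline import Pipeline
    from sklearn.svm import SVR

    X = np.random.RandomState(0).rand(40, 5)
    y = np.random.RandomState(1).rand(40)

    pipe = Pipeline([('select', SelectKBest(score_func=f_regression)),
                     ('est', SVR())])
    # k=-1 is invalid, so those fits fail; with error_score=0 they are
    # recorded as 0.0 in cv_results_ instead of aborting the whole search.
    searcher = GridSearchCV(pipe, {'select__k': [-1, 2, 4]},
                            error_score=0, cv=3)

    warnings.simplefilter('always', FitFailedWarning)
    with warnings.catch_warnings(record=True) as caught:
        try:
            searcher.fit(X, y)
        except ValueError:
            # refit can still fail if the best-scoring candidate is invalid
            pass
        for warning in caught:
            print(repr(warning.message))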
@@ -150,9 +157,10 @@
                 <param name="selected_param_type" value="final_estimator_p"/>
             </conditional>
             <conditional name="search_param_selector">
-                <param name="search_p" value="k: [3, 5, 7, 9]"/>
+                <param name="search_p" value="k: [-1, 3, 5, 7, 9]"/>
                 <param name="selected_param_type" value="prep_2_p"/>
             </conditional>
+            <param name="error_score" value="false"/>
             <param name="infile1" value="regression_X.tabular" ftype="tabular"/>
             <param name="header1" value="true" />
             <param name="selected_column_selector_option" value="all_columns"/>
@@ -162,6 +170,7 @@
             <output name="outfile_result" >
                 <assert_contents>
                     <has_text_matching expression="[^/d]+0.7938837807353147[^/d]+{u'estimator__C': 1, u'preprocessing_2__k': 9}[^/d]+1" />
+                    <has_text text="0.0"/>
                 </assert_contents>
             </output>
         </test>
@@ -209,7 +218,7 @@
                 <param name="selected_param_type" value="final_estimator_p"/>
             </conditional>
             <conditional name="search_param_selector">
-                <param name="search_p" value="gamma: np_random_uniform(low=0., high=1., size=2)"/>
+                <param name="search_p" value="gamma: scipy_stats_uniform(0., 1.)"/>
                 <param name="selected_param_type" value="final_estimator_p"/>
             </conditional>
             <conditional name="search_param_selector">
@@ -363,6 +372,61 @@
             <param name="selected_column_selector_option2" value="all_columns"/>
             <output name="outfile_estimator" file="searchCV01" compare="sim_size" delta="1"/>
         </test>
+        <test>
+            <param name="selected_search_scheme" value="GridSearchCV"/>
+            <param name="infile_pipeline" value="pipeline03"/>
+            <conditional name="search_param_selector">
+                <param name="search_p" value="n_estimators: [10, 50, 200, 1000]"/>
+                <param name="selected_param_type" value="final_estimator_p"/>
+            </conditional>
+            <conditional name="search_param_selector">
+                <param name="search_p" value="random_state: [324089]"/>
+                <param name="selected_param_type" value="final_estimator_p"/>
+            </conditional>
+            <param name="primary_scoring" value="balanced_accuracy"/>
+            <param name="cv" value="StratifiedKFold(n_splits=3, shuffle=True, random_state=10)"/>
+            <param name="infile1" value="regression_X.tabular" ftype="tabular"/>
+            <param name="header1" value="true" />
+            <param name="selected_column_selector_option" value="all_columns"/>
+            <param name="infile2" value="regression_y.tabular" ftype="tabular"/>
+            <param name="header2" value="true" />
+            <param name="selected_column_selector_option2" value="all_columns"/>
+            <output name="outfile_result" >
+                <assert_contents>
+                    <has_n_columns n="13" />
+                    <has_text text="0.05366527890058046"/>
+                </assert_contents>
+            </output>
+        </test>
+        <test>
+            <param name="selected_search_scheme" value="GridSearchCV"/>
+            <param name="infile_pipeline" value="pipeline09"/>
+            <conditional name="search_param_selector">
+                <param name="search_p" value="n_neighbors: [50, 100, 150, 200]"/>
+                <param name="selected_param_type" value="prep_1_p"/>
+            </conditional>
+            <conditional name="search_param_selector">
+                <param name="search_p" value="random_state: [324089]"/>
+                <param name="selected_param_type" value="final_estimator_p"/>
+            </conditional>
+            <param name="primary_scoring" value="explained_variance"/>
+            <param name="secondary_scoring" value="neg_mean_squared_error,r2"/>
+            <param name="cv" value="StratifiedKFold(n_splits=3, shuffle=True, random_state=10)"/>
+            <param name="infile1" value="regression_X.tabular" ftype="tabular"/>
+            <param name="header1" value="true" />
+            <param name="selected_column_selector_option" value="all_columns"/>
+            <param name="infile2" value="regression_y.tabular" ftype="tabular"/>
+            <param name="header2" value="true" />
+            <param name="selected_column_selector_option2" value="all_columns"/>
+            <output name="outfile_result" >
+                <assert_contents>
+                    <has_n_columns n="25" />
+                    <has_text text="0.7881203921915186"/>
+                    <has_text text="0.7880692034558879"/>
+                    <has_text text="-29.381892762877825"/>
+                </assert_contents>
+            </output>
+        </test>
     </tests>
     <help>
         <![CDATA[
@@ -373,7 +437,7 @@
 **How to choose search parameters?**
 
 Please refer to `svm`_, `linear_model`_, `ensemble`_, `naive_bayes`_, `tree`_, `neighbors`_ and `xgboost`_ for estimator parameters.
-Refer to `sklearn.preprocessing`_, `feature_selection`_, `decomposition`_, `kernel_approximation`_ and `cluster.FeatureAgglomeration`_ for parameters in the pre-processing steps.
+Refer to `sklearn.preprocessing`_, `feature_selection`_, `decomposition`_, `kernel_approximation`_, `cluster.FeatureAgglomeration`_ and `skrebate`_ for parameters in the pre-processing steps.
 
 **Search parameter input** accepts parameters and settings as key:value pairs, one pair per input box. A setting can be a list, a numpy array, or a distribution.
 The evaluation of settings supports math operations, list comprehensions, numpy.arange (as np_arange), most numpy.random classes and functions (e.g., np_random_uniform), some scipy.stats classes and functions (e.g., scipy_stats_zipf), and more.
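 For example, the tests in this changeset pass entries such as the following, one per input box:

    k: [3, 5, 7, 9]
    gamma: scipy_stats_uniform(0., 1.)
    n_estimators: [10, 50, 200, 1000]
    random_state: [324089]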
@@ -410,6 +474,7 @@
 .. _`decomposition`: http://scikit-learn.org/stable/modules/classes.html#module-sklearn.decomposition
 .. _`kernel_approximation`: http://scikit-learn.org/stable/modules/classes.html#module-sklearn.kernel_approximation
 .. _`cluster.FeatureAgglomeration`: http://scikit-learn.org/stable/modules/generated/sklearn.cluster.FeatureAgglomeration.html
+.. _`skrebate`: https://epistasislab.github.io/scikit-rebate/using/
 
         ]]>
     </help>