Mercurial > repos > bgruening > sklearn_model_validation
changeset 2:dd502cb0d567 draft
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit 4ed8c4f6ef9ece81797a398b17a99bbaf49a6978
author      bgruening
date        Wed, 30 May 2018 08:27:01 -0400
parents     02eadaaa4bf7
children    424d8d21744d
files       main_macros.xml model_validation.xml test-data/mv_result07.tabular
diffstat    3 files changed, 164 insertions(+), 60 deletions(-)
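At a glance, the changeset below makes the model_validation tool assemble a scikit-learn Pipeline (an optional feature-selection step followed by the estimator) and pass that pipeline, rather than the bare estimator, to the selected validation function; it also adds a GridSearchCV option whose param_grid addresses the pipeline steps through the feature_selector__ and estimator__ prefixes. A minimal standalone sketch of that pattern follows, with made-up data and parameter values rather than the tool's actual Galaxy inputs:

# Standalone sketch of the pipeline + GridSearchCV pattern introduced by this
# changeset; the data, estimator, and grid values below are illustrative assumptions.
import numpy as np
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.model_selection import GridSearchCV, cross_validate
from sklearn.pipeline import Pipeline
from sklearn.svm import SVR

X = np.random.rand(100, 17)
y = np.random.rand(100)

# Optional feature selector followed by the estimator, mirroring
# pipeline_steps = [('feature_selector', ...), ('estimator', ...)].
pipeline = Pipeline([
    ('feature_selector', SelectKBest(f_regression)),
    ('estimator', SVR(kernel='linear')),
])

# The whole pipeline is handed to a validator such as cross_validate ...
scores = cross_validate(pipeline, X, y, cv=5)

# ... or to GridSearchCV, where step-name prefixes route each parameter
# to the matching pipeline step (as in the tool's new param_grid default).
param_grid = [{'feature_selector__k': [3, 5, 7, 9], 'estimator__C': [1, 10, 100, 1000]}]
grid = GridSearchCV(pipeline, param_grid, cv=5)
grid.fit(X, y)
print(grid.best_score_, grid.best_params_)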
--- a/main_macros.xml	Tue May 22 19:33:14 2018 -0400
+++ b/main_macros.xml	Wed May 30 08:27:01 2018 -0400
@@ -16,6 +16,47 @@
     return y
 </token>
 
+## generate an instance for one of sklearn.feature_selection classes
+## must call "@COLUMNS_FUNCTION@"
+  <token name="@FEATURE_SELECTOR_FUNCTION@">
+def feature_selector(inputs):
+    selector = inputs["selected_algorithm"]
+    selector = getattr(sklearn.feature_selection, selector)
+    options = inputs["options"]
+
+    if inputs['selected_algorithm'] == 'SelectFromModel':
+        if not options['threshold'] or options['threshold'] == 'None':
+            options['threshold'] = None
+        if 'extra_estimator' in inputs and inputs['extra_estimator']['has_estimator'] == 'no_load':
+            fitted_estimator = pickle.load(open("inputs['extra_estimator']['fitted_estimator']", 'r'))
+            new_selector = selector(fitted_estimator, prefit=True, **options)
+        else:
+            estimator=inputs["estimator"]
+            if inputs["extra_estimator"]["has_estimator"]=='no':
+                estimator=inputs["extra_estimator"]["new_estimator"]
+            estimator=eval(estimator.replace('__dq__', '"').replace("__sq__","'"))
+            new_selector = selector(estimator, **options)
+
+    elif inputs['selected_algorithm'] in ['RFE', 'RFECV']:
+        if 'scoring' in options and (not options['scoring'] or options['scoring'] == 'None'):
+            options['scoring'] = None
+        estimator=inputs["estimator"]
+        if inputs["extra_estimator"]["has_estimator"]=='no':
+            estimator=inputs["extra_estimator"]["new_estimator"]
+        estimator=eval(estimator.replace('__dq__', '"').replace("__sq__","'"))
+        new_selector = selector(estimator, **options)
+
+    elif inputs['selected_algorithm'] == "VarianceThreshold":
+        new_selector = selector(**options)
+
+    else:
+        score_func = inputs["score_func"]
+        score_func = getattr(sklearn.feature_selection, score_func)
+        new_selector = selector(score_func, **options)
+
+    return new_selector
+  </token>
+
   <xml name="python_requirements">
     <requirements>
       <requirement type="package" version="2.7">python</requirement>
@@ -794,6 +835,13 @@
     </when>
     <yield/>
   </xml>
+  <xml name="estimator_input_no_fit">
+    <expand macro="feature_selection_estimator" />
+    <conditional name="extra_estimator">
+      <expand macro="feature_selection_extra_estimator" />
+      <expand macro="feature_selection_estimator_choices" />
+    </conditional>
+  </xml>
   <xml name="feature_selection_all">
     <conditional name="feature_selection_algorithms">
       <param name="selected_algorithm" type="select" label="Select a feature selection algorithm">
@@ -975,8 +1023,8 @@
     <param argument="scoring" type="text" value="" optional="true" label="scoring" help="A metric used to evaluate the estimator"/>
   </xml>
 
-  <xml name="pre_dispatch">
-    <param argument="pre_dispatch" type="text" value="all" optional="true" label="pre_dispatch" help="Number of predispatched jobs for parallel execution"/>
+  <xml name="pre_dispatch" token_type="text" token_default_value="all" token_help="Number of predispatched jobs for parallel execution">
+    <param argument="pre_dispatch" type="@TYPE@" value="@DEFAULT_VALUE@" optional="true" label="pre_dispatch" help="@HELP@"/>
   </xml>
 
   <!-- Outputs -->
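The @FEATURE_SELECTOR_FUNCTION@ token added above resolves the selected class name against sklearn.feature_selection via getattr and instantiates it with the user-supplied options, falling back to a score-function based selector such as SelectKBest. A hedged sketch of that dispatch, using a made-up inputs dict rather than the tool's exact JSON structure:

# Sketch of the getattr-based dispatch performed by feature_selector();
# the 'inputs' dict is an illustrative stand-in for the Galaxy JSON params.
import sklearn.feature_selection

inputs = {
    "selected_algorithm": "SelectKBest",
    "score_func": "chi2",
    "options": {"k": 3},
}

selector_cls = getattr(sklearn.feature_selection, inputs["selected_algorithm"])
score_func = getattr(sklearn.feature_selection, inputs["score_func"])
new_selector = selector_cls(score_func, **inputs["options"])  # equivalent to SelectKBest(chi2, k=3)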
--- a/model_validation.xml	Tue May 22 19:33:14 2018 -0400
+++ b/model_validation.xml	Wed May 30 08:27:01 2018 -0400
@@ -18,13 +18,17 @@
 import sys
 import json
 import pandas
+import ast
 import pickle
 import numpy as np
 import sklearn.model_selection
 from sklearn import svm, linear_model, ensemble
+from sklearn.pipeline import Pipeline
 
 @COLUMNS_FUNCTION@
 
+@FEATURE_SELECTOR_FUNCTION@
+
 input_json_path = sys.argv[1]
 params = json.load(open(input_json_path, "r"))
 
@@ -51,50 +55,90 @@
 )
 y=y.ravel()
 
-validator = params["model_validation_functions"]["selected_function"]
-validator = getattr(sklearn.model_selection, validator)
 options = params["model_validation_functions"]["options"]
 if 'scoring' in options and options['scoring'] == '':
     options['scoring'] = None
+if 'pre_dispatch' in options and options['pre_dispatch'] == '':
+    options['pre_dispatch'] = None
+
+pipeline_steps = []
+
+## Set up feature selector and add to pipeline steps.
+if params['feature_selection']['do_feature_selection'] == 'Yes':
+    feature_selector = feature_selector(params['feature_selection']['feature_selection_algorithms'])
+    pipeline_steps.append( ('feature_selector', feature_selector) )
+
+## Set up estimator and add to pipeline.
 estimator=params["model_validation_functions"]["estimator"]
 if params["model_validation_functions"]["extra_estimator"]["has_estimator"] == 'no':
     estimator = params["model_validation_functions"]["extra_estimator"]["new_estimator"]
 estimator = eval(estimator.replace('__dq__', '"').replace("__sq__","'"))
 
-#if $model_validation_functions.selected_function == 'cross_validate':
-res = validator(estimator, X, y, **options)
-rval = res["$model_validation_functions.return_type"]
+pipeline_steps.append( ('estimator', estimator) )
+
+pipeline = Pipeline(pipeline_steps)
+
+## Set up validator, run pipeline through validator and return results.
-#elif $model_validation_functions.selected_function == 'learning_curve':
-options['train_sizes'] = eval(options['train_sizes'])
-train_sizes_abs, train_scores, test_scores = validator(estimator, X, y, **options)
-rval = eval("$model_validation_functions.return_type")
+validator = params["model_validation_functions"]["selected_function"]
+validator = getattr(sklearn.model_selection, validator)
+
+selected_function = params["model_validation_functions"]["selected_function"]
+rval_type = params["model_validation_functions"].get("return_type", None)
 
-#elif $model_validation_functions.selected_function == 'permutation_test_score':
-score, permutation_scores, pvalue = validator(estimator, X, y, **options)
-rval = eval("$model_validation_functions.return_type")
-if "$model_validation_functions.return_type" in ["score", "pvalue"]:
-    rval = [rval]
-
-#elif $model_validation_functions.selected_function == 'validation_curve':
-options['param_range'] = eval(options['param_range'])
-train_scores, test_scores = validator(estimator, X, y, **options)
-rval = eval("$model_validation_functions.return_type")
-
-#else:
-rval = validator(estimator, X, y, **options)
-#end if
+if selected_function == 'cross_validate':
+    res = validator(pipeline, X, y, **options)
+    rval = res[rval_type]
+elif selected_function == 'learning_curve':
+    options['train_sizes'] = eval(options['train_sizes'])
+    train_sizes_abs, train_scores, test_scores = validator(pipeline, X, y, **options)
+    rval = eval(rval_type)
+elif selected_function == 'permutation_test_score':
+    score, permutation_scores, pvalue = validator(pipeline, X, y, **options)
+    rval = eval(rval_type)
+    if rval_type in ["score", "pvalue"]:
+        rval = [rval]
+elif selected_function == 'validation_curve':
+    options['param_name'] = 'estimator__' + options['param_name']
+    options['param_range'] = eval(options['param_range'])
+    train_scores, test_scores = validator(pipeline, X, y, **options)
+    rval = eval(rval_type)
+elif selected_function == 'GridSearchCV':
+    param_grid = params["model_validation_functions"]["param_grid"].replace("__sq__","'")\
+        .replace('__dq__','"').replace("__oc__", "{").replace("__cc__", "}")\
+        .replace("__ob__", "[").replace("__cb__", "]")
+    param_grid = ast.literal_eval(param_grid)
+    grid = validator(pipeline, param_grid, **options)
+    grid.fit(X, y)
+    rval = getattr(grid, rval_type)
+    if rval_type in ["best_estimator_", "best_score_", "best_index_"]:
+        rval = [rval]
+else:
+    rval = validator(pipeline, X, y, **options)
 
 rval = pandas.DataFrame(rval)
-rval.to_csv(path_or_buf="$outfile", sep='\t', header=False, index=False)
+if rval_type and rval_type == "cv_results_":
+    rval.to_csv(path_or_buf="$outfile", sep='\t', header=True, index=False)
+else:
+    rval.to_csv(path_or_buf="$outfile", sep='\t', header=False, index=False)
 ]]>
     </configfile>
   </configfiles>
 
   <inputs>
+    <conditional name="feature_selection">
+      <param name="do_feature_selection" type="select" label="Do feature selection?">
+        <option value="No" selected="true"/>
+        <option value="Yes"/>
+      </param>
+      <when value="No"/>
+      <when value="Yes">
+        <expand macro="feature_selection_all"/>
+      </when>
+    </conditional>
    <conditional name="model_validation_functions">
       <param name="selected_function" type="select" label="Select a model validation function">
+        <option value="GridSearchCV">GridSearchCV - Exhaustive search over specified parameter values for an estimator</option>
        <option value="cross_validate">cross_validate - Evaluate metric(s) by cross-validation and also record fit/score times</option>
        <option value="cross_val_predict">cross_val_predict - Generate cross-validated estimates for each input data point</option>
        <option value="cross_val_score">cross_val_score - Evaluate a score by cross-validation</option>
@@ -102,12 +146,28 @@
        <option value="permutation_test_score">permutation_test_score - Evaluate the significance of a cross-validated score with permutations</option>
        <option value="validation_curve">validation_curve - Validation curve</option>
      </param>
+      <when value="GridSearchCV">
+        <expand macro="estimator_input_no_fit" />
+        <param argument="param_grid" type="text" value="[{'feature_selector__k': [3, 5, 7, 9], 'estimator__C': [1, 10, 100, 1000]}]" label="param_grid" help="Dictionary with parameters names (string) as keys and lists of parameter settings to try as values, or a list of such dictionaries, in which case the grids spanned by each dictionary in the list are explored"/>
+        <section name="options" title="Other Options" expanded="false">
+          <expand macro="scoring"/>
+          <expand macro="model_validation_common_options"/>
+          <expand macro="pre_dispatch" value="2*n_jobs" help="Controls the number of jobs that get dispatched during parallel execution"/>
+          <param argument="iid" type="boolean" truevalue="booltrue" falsevalue="boolfalse" checked="true" label="iid" help="Data is identically distributed?"/>
+          <param argument="refit" type="boolean" truevalue="booltrue" falsevalue="boolfalse" checked="true" label="refit" help="Refit an estimator using the best found parameters on the whole dataset."/>
+          <!--error_score-->
+          <param argument="return_train_score" type="boolean" truevalue="booltrue" falsevalue="boolfalse" checked="false" label="return_train_score" help=""/>
+        </section>
+        <param name="return_type" type="select" label="Select a return type">
+          <option value="cv_results_" selected="true">cv_results_</option>
+          <option value="best_estimator_">best_estimator_</option>
+          <option value="best_score_">best_score_</option>
+          <option value="best_params_">best_params_</option>
+          <option value="best_index_">best_index_</option>
+        </param>
+      </when>
      <when value="cross_validate">
-        <expand macro="feature_selection_estimator" />
-        <conditional name="extra_estimator">
-          <expand macro="feature_selection_extra_estimator" />
-          <expand macro="feature_selection_estimator_choices" />
-        </conditional>
+        <expand macro="estimator_input_no_fit" />
        <section name="options" title="Other Options" expanded="false">
          <!--groups-->
          <expand macro="model_validation_common_options"/>
@@ -123,18 +183,12 @@
        </param>
      </when>
      <when value="cross_val_predict">
-        <expand macro="feature_selection_estimator" />
-        <conditional name="extra_estimator">
-          <expand macro="feature_selection_extra_estimator" />
-          <expand macro="feature_selection_estimator_choices" />
-        </conditional>
+        <expand macro="estimator_input_no_fit" />
        <section name="options" title="Other Options" expanded="false">
          <!--groups-->
-          <param argument="cv" type="integer" value="" optional="true" label="cv" help="The number of folds in a (Stratified)KFold" />
-          <expand macro="n_jobs"/>
-          <expand macro="verbose"/>
+          <expand macro="model_validation_common_options" />
          <!--fit_params-->
-          <param argument="pre_dispatch" type="integer" value="" optional="true" label="pre_dispatch" help="Controls the number of jobs that get dispatched during parallel execution" />
+          <expand macro="pre_dispatch" value="2*n_jobs" help="Controls the number of jobs that get dispatched during parallel execution"/>
          <param argument="method" type="select" label="Invokes the passed method name of the passed estimator">
            <option value="predict" selected="true">predict</option>
            <option value="predict_proba">predict_proba</option>
@@ -142,11 +196,7 @@
        </section>
      </when>
      <when value="cross_val_score">
-        <expand macro="feature_selection_estimator" />
-        <conditional name="extra_estimator">
-          <expand macro="feature_selection_extra_estimator" />
-          <expand macro="feature_selection_estimator_choices" />
-        </conditional>
+        <expand macro="estimator_input_no_fit" />
        <section name="options" title="Other Options" expanded="false">
          <!--groups-->
          <expand macro="model_validation_common_options"/>
@@ -156,11 +206,7 @@
        </section>
      </when>
      <when value="learning_curve">
-        <expand macro="feature_selection_estimator" />
-        <conditional name="extra_estimator">
-          <expand macro="feature_selection_extra_estimator" />
-          <expand macro="feature_selection_estimator_choices" />
-        </conditional>
+        <expand macro="estimator_input_no_fit" />
        <section name="options" title="Other Options" expanded="false">
          <!--groups-->
          <expand macro="model_validation_common_options"/>
@@ -178,11 +224,7 @@
        </param>
      </when>
      <when value="permutation_test_score">
-        <expand macro="feature_selection_estimator" />
-        <conditional name="extra_estimator">
-          <expand macro="feature_selection_extra_estimator" />
-          <expand macro="feature_selection_estimator_choices" />
-        </conditional>
+        <expand macro="estimator_input_no_fit" />
        <section name="options" title="Other Options" expanded="false">
          <!--groups-->
          <expand macro="model_validation_common_options"/>
@@ -197,11 +239,7 @@
        </param>
      </when>
      <when value="validation_curve">
-        <expand macro="feature_selection_estimator" />
-        <conditional name="extra_estimator">
-          <expand macro="feature_selection_extra_estimator" />
-          <expand macro="feature_selection_estimator_choices" />
-        </conditional>
+        <expand macro="estimator_input_no_fit" />
        <section name="options" title="Other Options" expanded="false">
          <param name="param_name" type="text" value="gamma" label="param_name" help="Name of the parameter that will be varied"/>
          <param name="param_range" type="text" value="np.logspace(-6, -1, 5)" label="param_range" help="The values of the parameter that will be evaluated."/>
@@ -287,6 +325,23 @@
      <param name="return_type" value="test_scores"/>
      <output name="outfile" file="mv_result06.tabular"/>
    </test>
+    <test>
+      <param name="do_feature_selection" value="Yes"/>
+      <param name="selected_algorithm" value="SelectKBest"/>
+      <param name="score_func" value="chi2"/>
+      <param name="selected_function" value="GridSearchCV"/>
+      <param name="estimator" value="svm.SVR(kernel=&quot;linear&quot;)"/>
+      <param name="has_estimator" value="yes"/>
+      <param name="param_grid" value="[{'feature_selector__k': [3, 7], 'estimator__C': [1, 100]}]"/>
+      <param name="return_type" value="best_score_"/>
+      <param name="infile1" value="regression_X.tabular" ftype="tabular"/>
+      <param name="header1" value="true" />
+      <param name="col1" value="1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17"/>
+      <param name="infile2" value="regression_y.tabular" ftype="tabular"/>
+      <param name="header2" value="true" />
+      <param name="col2" value="1"/>
+      <output name="outfile" file="mv_result07.tabular"/>
+    </test>
  </tests>
  <help>
    <![CDATA[