Repository 'sklearn_model_validation'
hg clone https://toolshed.g2.bx.psu.edu/repos/bgruening/sklearn_model_validation

Changeset 2:dd502cb0d567 (2018-05-30)
Previous changeset 1:02eadaaa4bf7 (2018-05-22) Next changeset 3:424d8d21744d (2018-06-05)
Commit message:
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit 4ed8c4f6ef9ece81797a398b17a99bbaf49a6978
modified:
main_macros.xml
model_validation.xml
added:
test-data/mv_result07.tabular
diff -r 02eadaaa4bf7 -r dd502cb0d567 main_macros.xml
--- a/main_macros.xml Tue May 22 19:33:14 2018 -0400
+++ b/main_macros.xml Wed May 30 08:27:01 2018 -0400
@@ -16,6 +16,47 @@
   return y
   </token>
 
+## generate an instance of one of the sklearn.feature_selection classes
+## must call "@COLUMNS_FUNCTION@"
+  <token name="@FEATURE_SELECTOR_FUNCTION@">
+def feature_selector(inputs):
+  selector = inputs["selected_algorithm"]
+  selector = getattr(sklearn.feature_selection, selector)
+  options = inputs["options"]
+
+  if inputs['selected_algorithm'] == 'SelectFromModel':
+    if not options['threshold'] or options['threshold'] == 'None':
+      options['threshold'] = None
+    if 'extra_estimator' in inputs and inputs['extra_estimator']['has_estimator'] == 'no_load':
+      fitted_estimator = pickle.load(open(inputs['extra_estimator']['fitted_estimator'], 'rb'))
+      new_selector = selector(fitted_estimator, prefit=True, **options)
+    else:
+      estimator = inputs["estimator"]
+      if inputs["extra_estimator"]["has_estimator"] == 'no':
+        estimator = inputs["extra_estimator"]["new_estimator"]
+      estimator = eval(estimator.replace('__dq__', '"').replace("__sq__", "'"))
+      new_selector = selector(estimator, **options)
+
+  elif inputs['selected_algorithm'] in ['RFE', 'RFECV']:
+    if 'scoring' in options and (not options['scoring'] or options['scoring'] == 'None'):
+      options['scoring'] = None
+    estimator = inputs["estimator"]
+    if inputs["extra_estimator"]["has_estimator"] == 'no':
+      estimator = inputs["extra_estimator"]["new_estimator"]
+    estimator = eval(estimator.replace('__dq__', '"').replace("__sq__", "'"))
+    new_selector = selector(estimator, **options)
+
+  elif inputs['selected_algorithm'] == "VarianceThreshold":
+    new_selector = selector(**options)
+
+  else:
+    score_func = inputs["score_func"]
+    score_func = getattr(sklearn.feature_selection, score_func)
+    new_selector = selector(score_func, **options)
+
+  return new_selector
+  </token>
+
   <xml name="python_requirements">
       <requirements>
           <requirement type="package" version="2.7">python</requirement>
@@ -794,6 +835,13 @@
     </when>
     <yield/>
   </xml>
+  <xml name="estimator_input_no_fit">
+    <expand macro="feature_selection_estimator" />
+    <conditional name="extra_estimator">
+      <expand macro="feature_selection_extra_estimator" />
+      <expand macro="feature_selection_estimator_choices" />
+    </conditional>
+  </xml>
   <xml name="feature_selection_all">
     <conditional name="feature_selection_algorithms">
       <param name="selected_algorithm" type="select" label="Select a feature selection algorithm">
@@ -975,8 +1023,8 @@
     <param argument="scoring" type="text" value="" optional="true" label="scoring" help="A metric used to evaluate the estimator"/>
   </xml>
 
-  <xml name="pre_dispatch">
-    <param argument="pre_dispatch" type="text" value="all" optional="true" label="pre_dispatch" help="Number of predispatched jobs for parallel execution"/>
+  <xml name="pre_dispatch" token_type="text" token_default_value="all" token_help="Number of predispatched jobs for parallel execution">
+    <param argument="pre_dispatch" type="@TYPE@" value="@DEFAULT_VALUE@" optional="true" label="pre_dispatch" help="@HELP@"/>
   </xml>
 
   <!-- Outputs -->
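
Note: the new @FEATURE_SELECTOR_FUNCTION@ token resolves a feature-selection class (and, for the score-function algorithms, a scoring function) by name from the tool's JSON parameters and instantiates it. The following is a minimal standalone sketch of that behaviour for SelectKBest; the inputs dict is a hypothetical illustration, not part of this changeset.

import sklearn.feature_selection

# Hypothetical slice of the JSON params Galaxy passes to the tool script.
inputs = {
    "selected_algorithm": "SelectKBest",
    "score_func": "chi2",
    "options": {"k": 7},
}

# Resolve the selector class and the score function by name, as the token does.
selector = getattr(sklearn.feature_selection, inputs["selected_algorithm"])
score_func = getattr(sklearn.feature_selection, inputs["score_func"])
new_selector = selector(score_func, **inputs["options"])
print(new_selector)  # SelectKBest(k=7, score_func=<function chi2 ...>)
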
diff -r 02eadaaa4bf7 -r dd502cb0d567 model_validation.xml
--- a/model_validation.xml Tue May 22 19:33:14 2018 -0400
+++ b/model_validation.xml Wed May 30 08:27:01 2018 -0400
@@ -18,13 +18,17 @@
 import sys
 import json
 import pandas
+import ast
 import pickle
 import numpy as np
 import sklearn.model_selection
 from sklearn import svm, linear_model, ensemble
+from sklearn.pipeline import Pipeline
 
 @COLUMNS_FUNCTION@
 
+@FEATURE_SELECTOR_FUNCTION@
+
 input_json_path = sys.argv[1]
 params = json.load(open(input_json_path, "r"))
 
@@ -51,50 +55,90 @@
 )
 y=y.ravel()
 
-validator = params["model_validation_functions"]["selected_function"]
-validator = getattr(sklearn.model_selection, validator)
 options = params["model_validation_functions"]["options"]
 if 'scoring' in options and options['scoring'] == '':
     options['scoring'] = None
+if 'pre_dispatch' in options and options['pre_dispatch'] == '':
+    options['pre_dispatch'] = None
 
+pipeline_steps = []
+
+## Set up feature selector and add to pipeline steps.
+if params['feature_selection']['do_feature_selection'] == 'Yes':
+    feature_selector = feature_selector(params['feature_selection']['feature_selection_algorithms'])
+    pipeline_steps.append( ('feature_selector', feature_selector))
+
+## Set up estimator and add to pipeline.
 estimator=params["model_validation_functions"]["estimator"]
 if params["model_validation_functions"]["extra_estimator"]["has_estimator"] == 'no':
     estimator = params["model_validation_functions"]["extra_estimator"]["new_estimator"]
 estimator = eval(estimator.replace('__dq__', '"').replace("__sq__","'"))
 
-#if $model_validation_functions.selected_function == 'cross_validate':
-res = validator(estimator, X, y, **options)
-rval = res["$model_validation_functions.return_type"]
+pipeline_steps.append( ('estimator', estimator) )
+
+pipeline = Pipeline(pipeline_steps)
+
+## Set up validator, run pipeline through validator and return results.
 
-#elif $model_validation_functions.selected_function == 'learning_curve':
-options['train_sizes'] = eval(options['train_sizes'])
-train_sizes_abs, train_scores, test_scores = validator(estimator, X, y, **options)
-rval = eval("$model_validation_functions.return_type")
+validator = params["model_validation_functions"]["selected_function"]
+validator = getattr(sklearn.model_selection, validator)
+
+selected_function = params["model_validation_functions"]["selected_function"]
+rval_type = params["model_validation_functions"].get("return_type", None)
 
-#elif $model_validation_functions.selected_function == 'permutation_test_score':
-score, permutation_scores, pvalue = validator(estimator, X, y, **options)
-rval = eval("$model_validation_functions.return_type")
-if "$model_validation_functions.return_type" in ["score", "pvalue"]:
-    rval = [rval]
-
-#elif $model_validation_functions.selected_function == 'validation_curve':
-options['param_range'] = eval(options['param_range'])
-train_scores, test_scores = validator(estimator, X, y, **options)
-rval = eval("$model_validation_functions.return_type")
-
-#else:
-rval = validator(estimator, X, y, **options)
-#end if
+if selected_function == 'cross_validate':
+    res = validator(pipeline, X, y, **options)
+    rval = res[rval_type]
+elif selected_function == 'learning_curve':
+    options['train_sizes'] = eval(options['train_sizes'])
+    train_sizes_abs, train_scores, test_scores = validator(pipeline, X, y, **options)
+    rval = eval(rval_type)
+elif selected_function == 'permutation_test_score':
+    score, permutation_scores, pvalue = validator(pipeline, X, y, **options)
+    rval = eval(rval_type)
+    if rval_type in ["score", "pvalue"]:
+        rval = [rval]
+elif selected_function == 'validation_curve':
+    options['param_name'] = 'estimator__' + options['param_name']
+    options['param_range'] = eval(options['param_range'])
+    train_scores, test_scores = validator(pipeline, X, y, **options)
+    rval = eval(rval_type)
+elif selected_function == 'GridSearchCV':
+    param_grid = params["model_validation_functions"]["param_grid"].replace("__sq__","'")\
+        .replace('__dq__','"').replace("__oc__", "{").replace("__c
[...]
re">
-                <expand macro="feature_selection_estimator" />
-                <conditional name="extra_estimator">
-                    <expand macro="feature_selection_extra_estimator" />
-                    <expand macro="feature_selection_estimator_choices" />
-                </conditional>
+                <expand macro="estimator_input_no_fit" />
                 <section name="options" title="Other Options" expanded="false">
                     <!--groups-->
                     <expand macro="model_validation_common_options"/>
@@ -156,11 +206,7 @@
                 </section>
             </when>
             <when value="learning_curve">
-                <expand macro="feature_selection_estimator" />
-                <conditional name="extra_estimator">
-                    <expand macro="feature_selection_extra_estimator" />
-                    <expand macro="feature_selection_estimator_choices" />
-                </conditional>
+                <expand macro="estimator_input_no_fit" />
                 <section name="options" title="Other Options" expanded="false">
                     <!--groups-->
                     <expand macro="model_validation_common_options"/>
@@ -178,11 +224,7 @@
                 </param>
             </when>
             <when value="permutation_test_score">
-                <expand macro="feature_selection_estimator" />
-                <conditional name="extra_estimator">
-                    <expand macro="feature_selection_extra_estimator" />
-                    <expand macro="feature_selection_estimator_choices" />
-                </conditional>
+                <expand macro="estimator_input_no_fit" />
                 <section name="options" title="Other Options" expanded="false">
                     <!--groups-->
                     <expand macro="model_validation_common_options"/>
@@ -197,11 +239,7 @@
                 </param>
             </when>
             <when value="validation_curve">
-                <expand macro="feature_selection_estimator" />
-                <conditional name="extra_estimator">
-                    <expand macro="feature_selection_extra_estimator" />
-                    <expand macro="feature_selection_estimator_choices" />
-                </conditional>
+                <expand macro="estimator_input_no_fit" />
                 <section name="options" title="Other Options" expanded="false">
                     <param name="param_name" type="text" value="gamma" label="param_name" help="Name of the parameter that will be varied"/>
                     <param name="param_range" type="text" value="np.logspace(-6, -1, 5)" label="param_range" help="The values of the parameter that will be evaluated."/>
@@ -287,6 +325,23 @@
             <param name="return_type" value="test_scores"/>
             <output name="outfile" file="mv_result06.tabular"/>
         </test>
+        <test>
+            <param name="do_feature_selection" value="Yes"/>
+            <param name="selected_algorithm" value="SelectKBest"/>
+            <param name="score_func" value="chi2"/>
+            <param name="selected_function" value="GridSearchCV"/>
+            <param name="estimator" value="svm.SVR(kernel=&quot;linear&quot;)"/>
+            <param name="has_estimator" value="yes"/>
+            <param name="param_grid" value="[{'feature_selector__k': [3, 7], 'estimator__C': [1, 100]}]"/>
+            <param name="return_type" value="best_score_"/>
+            <param name="infile1" value="regression_X.tabular" ftype="tabular"/>
+            <param name="header1" value="true" />
+            <param name="col1" value="1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17"/>
+            <param name="infile2" value="regression_y.tabular" ftype="tabular"/>
+            <param name="header2" value="true" />
+            <param name="col2" value="1"/>
+            <output name="outfile" file="mv_result07.tabular"/>
+        </test>
     </tests>
     <help>
         <![CDATA[
diff -r 02eadaaa4bf7 -r dd502cb0d567 test-data/mv_result07.tabular
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/mv_result07.tabular Wed May 30 08:27:01 2018 -0400
@@ -0,0 +1,1 @@
+0.7824428015300172