changeset 19:4570575d060c draft

planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit d00173591e4a783a4c1cb2664e4bb192ab5414f7
author bgruening
date Fri, 17 Aug 2018 12:28:21 -0400
parents 0b3144c0b4ee
children 038cecaa9e7c
files ensemble.xml main_macros.xml test-data/pipeline09 test-data/pipeline10 utils.py
diffstat 5 files changed, 448 insertions(+), 271 deletions(-) [+]
line wrap: on
line diff
--- a/ensemble.xml	Tue Aug 07 05:47:03 2018 -0400
+++ b/ensemble.xml	Fri Aug 17 12:28:21 2018 -0400
@@ -15,6 +15,7 @@
         <configfile name="ensemble_script">
 <![CDATA[
 import sys
+import os
 import json
 import numpy as np
 import sklearn.ensemble
@@ -22,8 +23,7 @@
 import pickle
 from scipy.io import mmread
 
-@COLUMNS_FUNCTION@
-@GET_X_y_FUNCTION@
+execfile("$__tool_directory__/utils.py")
 
 # Get inputs, outputs.
 input_json_path = sys.argv[1]
@@ -47,6 +47,8 @@
 if params["selected_tasks"]["selected_task"] == "train":
     algorithm = params["selected_tasks"]["selected_algorithms"]["selected_algorithm"]
     options = params["selected_tasks"]["selected_algorithms"]["options"]
+    if algorithm in ['RandomForestClassifier', 'RandomForestRegressor']:
+        options['n_jobs'] = N_JOBS
     if "select_max_features" in options:
         if options["select_max_features"]["max_features"] == "number_input":
             options["select_max_features"]["max_features"] = options["select_max_features"]["num_max_features"]
@@ -107,7 +109,6 @@
                     <expand macro="max_leaf_nodes"/>
                     <expand macro="bootstrap"/>
                     <expand macro="warm_start" checked="false"/>
-                    <expand macro="n_jobs"/>
                     <expand macro="random_state"/>
                     <expand macro="oob_score"/>
                     <!--class_weight=None-->
@@ -167,7 +168,6 @@
                     <expand macro="min_impurity_decrease"/>
                     <expand macro="bootstrap"/>
                     <expand macro="oob_score"/>
-                    <expand macro="n_jobs"/>
                     <expand macro="random_state"/>
                     <expand macro="verbose"/>
                     <expand macro="warm_start" checked="false"/>
--- a/main_macros.xml	Tue Aug 07 05:47:03 2018 -0400
+++ b/main_macros.xml	Fri Aug 17 12:28:21 2018 -0400
@@ -1,216 +1,13 @@
 <macros>
   <token name="@VERSION@">0.9</token>
 
-  <token name="@COLUMNS_FUNCTION@">
-def read_columns(f, c=None, c_option='by_index_number', return_df=False, **args):
-  data = pandas.read_csv(f, **args)
-  if c_option == 'by_index_number':
-    cols = list(map(lambda x: x - 1, c))
-    data = data.iloc[:,cols]
-  if c_option == 'all_but_by_index_number':
-    cols = list(map(lambda x: x - 1, c))
-    data.drop(data.columns[cols], axis=1, inplace=True)
-  if c_option == 'by_header_name':
-    cols = [e.strip() for e in c.split(',')]
-    data = data[cols]
-  if c_option == 'all_but_by_header_name':
-    cols = [e.strip() for e in c.split(',')]
-    data.drop(cols, axis=1, inplace=True)
-  y = data.values
-  if return_df:
-    return y, data
-  else:
-    return y
-  return y
-  </token>
-
-## generate an instance for one of sklearn.feature_selection classes
-  <token name="@FEATURE_SELECTOR_FUNCTION@">
-def feature_selector(inputs):
-  selector = inputs["selected_algorithm"]
-  selector = getattr(sklearn.feature_selection, selector)
-  options = inputs["options"]
-
-  if inputs['selected_algorithm'] == 'SelectFromModel':
-    if not options['threshold'] or options['threshold'] == 'None':
-      options['threshold'] = None
-    if inputs['model_inputter']['input_mode'] == 'prefitted':
-      model_file = inputs['model_inputter']['fitted_estimator']
-      with open(model_file, 'rb') as model_handler:
-        fitted_estimator = pickle.load(model_handler)
-      new_selector = selector(fitted_estimator, prefit=True, **options)
-    else:
-      estimator_json = inputs['model_inputter']["estimator_selector"]
-      estimator = get_estimator(estimator_json)
-      new_selector = selector(estimator, **options)
-
-  elif inputs['selected_algorithm'] in ['RFE', 'RFECV']:
-    if 'scoring' in options and (not options['scoring'] or options['scoring'] == 'None'):
-      options['scoring'] = None
-    estimator=get_estimator(inputs["estimator_selector"])
-    new_selector = selector(estimator, **options)
-
-  elif inputs['selected_algorithm'] == "VarianceThreshold":
-    new_selector = selector(**options)
-
-  else:
-    score_func = inputs["score_func"]
-    score_func = getattr(sklearn.feature_selection, score_func)
-    new_selector = selector(score_func, **options)
-
-  return new_selector
-  </token>
-
-  <token name="@GET_X_y_FUNCTION@">
-def get_X_y(params, file1, file2):
-  input_type = params["selected_tasks"]["selected_algorithms"]["input_options"]["selected_input"]
-  if input_type=="tabular":
-    header = 'infer' if params["selected_tasks"]["selected_algorithms"]["input_options"]["header1"] else None
-    column_option = params["selected_tasks"]["selected_algorithms"]["input_options"]["column_selector_options_1"]["selected_column_selector_option"]
-    if column_option in ["by_index_number", "all_but_by_index_number", "by_header_name", "all_but_by_header_name"]:
-      c = params["selected_tasks"]["selected_algorithms"]["input_options"]["column_selector_options_1"]["col1"]
-    else:
-      c = None
-    X = read_columns(
-      file1,
-      c = c,
-      c_option = column_option,
-      sep='\t',
-      header=header,
-      parse_dates=True
-    )
-  else:
-    X = mmread(file1)
-
-  header = 'infer' if params["selected_tasks"]["selected_algorithms"]["input_options"]["header2"] else None
-  column_option = params["selected_tasks"]["selected_algorithms"]["input_options"]["column_selector_options_2"]["selected_column_selector_option2"]
-  if column_option in ["by_index_number", "all_but_by_index_number", "by_header_name", "all_but_by_header_name"]:
-    c = params["selected_tasks"]["selected_algorithms"]["input_options"]["column_selector_options_2"]["col2"]
-  else:
-    c = None
-  y = read_columns(
-    file2,
-    c = c,
-    c_option = column_option,
-    sep='\t',
-    header=header,
-    parse_dates=True
-  )
-  y=y.ravel()
-  return X, y
-  </token>
-
-  <token name="@SAFE_EVAL_FUNCTION@">
-def safe_eval(literal):
-
-  FROM_SCIPY_STATS = [  'bernoulli', 'binom', 'boltzmann', 'dlaplace', 'geom', 'hypergeom',
-                        'logser', 'nbinom', 'planck', 'poisson', 'randint', 'skellam', 'zipf' ]
-
-  FROM_NUMPY_RANDOM = [ 'beta', 'binomial', 'bytes', 'chisquare', 'choice', 'dirichlet', 'division',
-                        'exponential', 'f', 'gamma', 'geometric', 'gumbel', 'hypergeometric',
-                        'laplace', 'logistic', 'lognormal', 'logseries', 'mtrand', 'multinomial',
-                        'multivariate_normal', 'negative_binomial', 'noncentral_chisquare', 'noncentral_f',
-                        'normal', 'pareto', 'permutation', 'poisson', 'power', 'rand', 'randint',
-                        'randn', 'random', 'random_integers', 'random_sample', 'ranf', 'rayleigh',
-                        'sample', 'seed', 'set_state', 'shuffle', 'standard_cauchy', 'standard_exponential',
-                        'standard_gamma', 'standard_normal', 'standard_t', 'triangular', 'uniform',
-                        'vonmises', 'wald', 'weibull', 'zipf' ]
-
-  # File opening and other unneeded functions could be dropped
-  UNWANTED = ['open', 'type', 'dir', 'id', 'str', 'repr']
-
-  # Allowed symbol table. Add more if needed.
-  new_syms = {
-    'np_arange': getattr(np, 'arange'),
-    'ensemble_ExtraTreesClassifier': getattr(ensemble, 'ExtraTreesClassifier')
-  }
-
-  syms = make_symbol_table(use_numpy=False, **new_syms)
-
-  for method in FROM_SCIPY_STATS:
-    syms['scipy_stats_' + method] = getattr(scipy.stats, method)
-
-  for func in FROM_NUMPY_RANDOM:
-    syms['np_random_' + func] = getattr(np.random, func)
-
-  for key in UNWANTED:
-    syms.pop(key, None)
-
-  aeval = Interpreter(symtable=syms, use_numpy=False, minimal=False,
-                    no_if=True, no_for=True, no_while=True, no_try=True,
-                    no_functiondef=True, no_ifexp=True, no_listcomp=False,
-                    no_augassign=False, no_assert=True, no_delete=True,
-                    no_raise=True, no_print=True)
-
-  return aeval(literal)
-  </token>
-
-  <token name="@GET_SEARCH_PARAMS_FUNCTION@">
-def get_search_params(params_builder):
-  search_params = {}
-
-  for p in params_builder['param_set']:
-    search_p = p['search_param_selector']['search_p']
-    if search_p.strip() == '':
-      continue
-    param_type = p['search_param_selector']['selected_param_type']
-
-    lst = search_p.split(":")
-    assert (len(lst) == 2), "Error, make sure there is one and only one colon in search parameter input."
-    literal = lst[1].strip()
-    ev = safe_eval(literal)
-    if param_type == "final_estimator_p":
-      search_params["estimator__" + lst[0].strip()] = ev
-    else:
-      search_params["preprocessing_" + param_type[5:6] + "__" + lst[0].strip()] = ev
-
-  return search_params
-  </token>
-
-  <token name="@GET_ESTIMATOR_FUNCTION@">
-def get_estimator(estimator_json):
-  estimator_module = estimator_json['selected_module']
-  estimator_cls = estimator_json['selected_estimator']
-
-  if estimator_module == "xgboost":
-    cls = getattr(xgboost, estimator_cls)
-  else:
-    module = getattr(sklearn, estimator_module)
-    cls = getattr(module, estimator_cls)
-
-  estimator = cls()
-
-  estimator_params = estimator_json['text_params'].strip()
-  if estimator_params != "":
-    try:
-      params = ast.literal_eval('{' + estimator_params + '}')
-    except ValueError:
-      sys.exit("Unsupported parameter input: `%s`" %estimator_params)
-    estimator.set_params(**params)
-
-  return estimator
-  </token>
-
-  <token name="@GET_CV_FUNCTION@">
-def get_cv(literal):
-  if literal == "":
-    return None
-  if re.match(r'^\d+$', literal):
-    return int(literal)
-  m = re.match(r'^(?P&lt;method&gt;\w+)\((?P&lt;args&gt;.*)\)$', literal)
-  if m:
-    my_class = getattr( model_selection, m.group('method') )
-    args = safe_eval( 'dict('+ m.group('args') + ')' )
-    return my_class( **args )
-  sys.exit("Unsupported CV input: %s" %literal)
-  </token>
-
   <xml name="python_requirements">
       <requirements>
           <requirement type="package" version="2.7">python</requirement>
           <requirement type="package" version="0.19.1">scikit-learn</requirement>
           <requirement type="package" version="0.22.0">pandas</requirement>
           <requirement type="package" version="0.72.1">xgboost</requirement>
+          <requirement type="package" version="0.9.12">asteval</requirement>
           <yield />
       </requirements>
   </xml>
@@ -439,10 +236,6 @@
     <param argument="fit_intercept" type="boolean" optional="true" truevalue="booltrue" falsevalue="boolfalse" checked="@CHECKED@" label="Estimate the intercept" help="If false, the data is assumed to be already centered."/>
   </xml>
 
-  <xml name="n_jobs" token_default_value="1" token_label="The number of jobs to run in parallel for both fit and predict">
-    <param argument="n_jobs" type="integer" value="@DEFAULT_VALUE@" optional="true" label="@LABEL@" help="If -1, then the number of jobs is set to the number of cores"/>
-  </xml>
-
   <xml name="n_iter" token_default_value="5" token_help_text="The number of passes over the training data (aka epochs). ">
     <param argument="n_iter" type="integer" optional="true" value="@DEFAULT_VALUE@" label="Number of iterations" help="@HELP_TEXT@"/>
   </xml>
@@ -542,7 +335,7 @@
     <conditional name="column_selector_options_1">
       <expand macro="samples_column_selector_options" multiple="@MULTIPLE1@"/>
     </conditional>
-    <param name="infile2" type="data" format="tabular" label="Dataset containing class labels:"/>
+    <param name="infile2" type="data" format="tabular" label="Dataset containing class labels or target values:"/>
     <param name="header2" type="boolean" optional="true" truevalue="booltrue" falsevalue="boolfalse" checked="False" label="Does the dataset contain header:" />
     <conditional name="column_selector_options_2">
       <expand macro="samples_column_selector_options" column_option="selected_column_selector_option2" col_name="col2" multiple="@MULTIPLE2@" infile="infile2"/>
@@ -1031,6 +824,16 @@
     </when>
   </xml>
 
+  <xml name="cv">
+    <param argument="cv" type="text" value="" optional="true" label="cv" help="Optional. Integer or evalable splitter object, e.g., StratifiedKFold(n_splits=3, shuffle=True, random_state=10). Leave blank for default." >
+      <sanitizer>
+        <valid initial="default">
+          <add value="&apos;"/>
+        </valid>
+      </sanitizer>
+    </param>
+  </xml>
+
   <xml name="feature_selection_all">
     <conditional name="fs_algorithm_selector">
       <param name="selected_algorithm" type="select" label="Select a feature selection algorithm">
@@ -1109,10 +912,9 @@
         <expand macro="estimator_selector_all"/>
         <section name="options" title="Advanced Options" expanded="False">
           <param argument="step" type="float" value="1" label="step" optional="true" help="Default = 1. " />
-          <param argument="cv" type="integer" value="" optional="true" label="cv" help="Determines the cross-validation splitting strategy" />
-          <param argument="scoring" type="text" value="" optional="true" label="scoring" help="A string (see model evaluation documentation) or a scorer callable object / function with signature scorer(estimator, X, y)."/>
+          <expand macro="cv"/>
+          <expand macro="scoring_selection"/>
           <param argument="verbose" type="integer" value="0" label="verbose" help="Controls verbosity of output." />
-          <param argument="n_jobs" type="integer" value="1" label="n_jobs" help="Number of cores to run in parallel while fitting across folds. Defaults to 1 core."/>
         </section>
       </when>
       <when value="VarianceThreshold">
@@ -1159,14 +961,106 @@
   </xml>
 
   <xml name="model_validation_common_options">
-    <param argument="cv" type="text" value="" size="50" optional="true" label="cv" help="Optional. Integer or evalable splitter object, e.g., StratifiedKFold(n_splits=3, shuffle=True, random_state=10). Leave blank for default." />
-    <expand macro="n_jobs"/>
+    <expand macro="cv"/>
     <expand macro="verbose"/>
     <yield/>
   </xml>
 
-  <xml name="scoring">
-    <param argument="scoring" type="text" value="" optional="true" label="scoring" help="A metric used to evaluate the estimator"/>
+  <xml name="scoring_selection"> <!-- NOTE(review): option values are presumably handed to scikit-learn as scorer names; each must match its 'when' value exactly -->
+    <conditional name="scoring">
+      <param name="primary_scoring" type="select" multiple="false" label="Select the primary metric (scoring):" help="Metric to refit the best estimator.">
+        <option value="default" selected="true">default with estimator</option>
+        <option value="accuracy">Classification -- 'accuracy'</option>
+        <option value="balanced_accuracy">Classification -- 'balanced_accuracy'</option>
+        <option value="average_precision">Classification -- 'average_precision'</option>
+        <option value="f1">Classification -- 'f1'</option>
+        <option value="f1_micro">Classification -- 'f1_micro'</option>
+        <option value="f1_macro">Classification -- 'f1_macro'</option>
+        <option value="f1_weighted">Classification -- 'f1_weighted'</option>
+        <option value="f1_samples">Classification -- 'f1_samples'</option>
+        <option value="neg_log_loss">Classification -- 'neg_log_loss'</option>
+        <option value="precision">Classification -- 'precision'</option>
+        <option value="precision_micro">Classification -- 'precision_micro'</option>
+        <option value="precision_macro">Classification -- 'precision_macro'</option>
+        <option value="precision_weighted">Classification -- 'precision_weighted'</option>
+        <option value="precision_samples">Classification -- 'precision_samples'</option>
+        <option value="recall">Classification -- 'recall'</option>
+        <option value="recall_micro">Classification -- 'recall_micro'</option>
+        <option value="recall_macro">Classification -- 'recall_macro'</option>
+        <option value="recall_weighted">Classification -- 'recall_weighted'</option>
+        <option value="recall_samples">Classification -- 'recall_samples'</option>
+        <option value="roc_auc">Classification -- 'roc_auc'</option>
+        <option value="explained_variance">Regression -- 'explained_variance'</option>
+        <option value="neg_mean_absolute_error">Regression -- 'neg_mean_absolute_error'</option>
+        <option value="neg_mean_squared_error">Regression -- 'neg_mean_squared_error'</option>
+        <option value="neg_mean_squared_log_error">Regression -- 'neg_mean_squared_log_error'</option>
+        <option value="neg_median_absolute_error">Regression -- 'neg_median_absolute_error'</option>
+        <option value="r2">Regression -- 'r2'</option>
+      </param>
+      <when value="default"/>
+      <when value="accuracy"><expand macro="secondary_scoring_selection_classification"/></when>
+      <when value="balanced_accuracy"><expand macro="secondary_scoring_selection_classification"/></when>
+      <when value="average_precision"><expand macro="secondary_scoring_selection_classification"/></when>
+      <when value="f1"><expand macro="secondary_scoring_selection_classification"/></when>
+      <when value="f1_micro"><expand macro="secondary_scoring_selection_classification"/></when>
+      <when value="f1_macro"><expand macro="secondary_scoring_selection_classification"/></when>
+      <when value="f1_weighted"><expand macro="secondary_scoring_selection_classification"/></when>
+      <when value="f1_samples"><expand macro="secondary_scoring_selection_classification"/></when>
+      <when value="neg_log_loss"><expand macro="secondary_scoring_selection_classification"/></when>
+      <when value="precision"><expand macro="secondary_scoring_selection_classification"/></when>
+      <when value="precision_micro"><expand macro="secondary_scoring_selection_classification"/></when>
+      <when value="precision_macro"><expand macro="secondary_scoring_selection_classification"/></when>
+      <when value="precision_weighted"><expand macro="secondary_scoring_selection_classification"/></when>
+      <when value="precision_samples"><expand macro="secondary_scoring_selection_classification"/></when>
+      <when value="recall"><expand macro="secondary_scoring_selection_classification"/></when>
+      <when value="recall_micro"><expand macro="secondary_scoring_selection_classification"/></when>
+      <when value="recall_macro"><expand macro="secondary_scoring_selection_classification"/></when>
+      <when value="recall_weighted"><expand macro="secondary_scoring_selection_classification"/></when>
+      <when value="recall_samples"><expand macro="secondary_scoring_selection_classification"/></when>
+      <when value="roc_auc"><expand macro="secondary_scoring_selection_classification"/></when>
+      <when value="explained_variance"><expand macro="secondary_scoring_selection_regression"/></when>
+      <when value="neg_mean_absolute_error"><expand macro="secondary_scoring_selection_regression"/></when>
+      <when value="neg_mean_squared_error"><expand macro="secondary_scoring_selection_regression"/></when>
+      <when value="neg_mean_squared_log_error"><expand macro="secondary_scoring_selection_regression"/></when>
+      <when value="neg_median_absolute_error"><expand macro="secondary_scoring_selection_regression"/></when>
+      <when value="r2"><expand macro="secondary_scoring_selection_regression"/></when>
+    </conditional>
+  </xml>
+
+  <xml name="secondary_scoring_selection_classification"> <!-- NOTE(review): option values are presumably handed to scikit-learn as scorer names; keep spelling exact -->
+    <param name="secondary_scoring" type="select" multiple="true" label="Additional scoring used in multi-metric mode:" help="If the same metric with the primary is chosen, the metric will be ignored.">
+      <option value="accuracy">Classification -- 'accuracy'</option>
+      <option value="balanced_accuracy">Classification -- 'balanced_accuracy'</option>
+      <option value="average_precision">Classification -- 'average_precision'</option>
+      <option value="f1">Classification -- 'f1'</option>
+      <option value="f1_micro">Classification -- 'f1_micro'</option>
+      <option value="f1_macro">Classification -- 'f1_macro'</option>
+      <option value="f1_weighted">Classification -- 'f1_weighted'</option>
+      <option value="f1_samples">Classification -- 'f1_samples'</option>
+      <option value="neg_log_loss">Classification -- 'neg_log_loss'</option>
+      <option value="precision">Classification -- 'precision'</option>
+      <option value="precision_micro">Classification -- 'precision_micro'</option>
+      <option value="precision_macro">Classification -- 'precision_macro'</option>
+      <option value="precision_weighted">Classification -- 'precision_weighted'</option>
+      <option value="precision_samples">Classification -- 'precision_samples'</option>
+      <option value="recall">Classification -- 'recall'</option>
+      <option value="recall_micro">Classification -- 'recall_micro'</option>
+      <option value="recall_macro">Classification -- 'recall_macro'</option>
+      <option value="recall_weighted">Classification -- 'recall_weighted'</option>
+      <option value="recall_samples">Classification -- 'recall_samples'</option>
+      <option value="roc_auc">Classification -- 'roc_auc'</option>
+    </param>
+  </xml>
+
+  <xml name="secondary_scoring_selection_regression">
+    <param name="secondary_scoring" type="select" multiple="true" label="Additional scoring used in multi-metric mode:" help="If the same metric with the primary is chosen, the metric will be ignored.">
+      <option value="explained_variance">Regression -- 'explained_variance'</option>
+      <option value="neg_mean_absolute_error">Regression -- 'neg_mean_absolute_error'</option>
+      <option value="neg_mean_squared_error">Regression -- 'neg_mean_squared_error'</option>
+      <option value="neg_mean_squared_log_error">Regression -- 'neg_mean_squared_log_error'</option>
+      <option value="neg_median_absolute_error">Regression -- 'neg_median_absolute_error'</option>
+      <option value="r2">Regression -- 'r2'</option>
+    </param>
   </xml>
 
   <xml name="pre_dispatch" token_type="hidden" token_default_value="all" token_help="Number of predispatched jobs for parallel execution">
@@ -1210,7 +1104,7 @@
   </xml>
 
   <xml name="search_param_input" token_label="Estimator parameter:" token_help="One parameter per box. For example: C: [1, 10, 100, 1000]. See bottom for more examples">
-    <param name="search_p" type="text" value="" size="100" optional="true" label="@LABEL@" help="@HELP@">
+    <param name="search_p" type="text" value="" optional="true" label="@LABEL@" help="@HELP@">
       <sanitizer>
         <valid initial="default">
           <add value="&apos;"/>
@@ -1223,12 +1117,12 @@
   </xml>
 
   <xml name="search_cv_options">
-      <expand macro="scoring"/>
+      <expand macro="scoring_selection"/>
       <expand macro="model_validation_common_options"/>
       <expand macro="pre_dispatch" value="2*n_jobs" help="Controls the number of jobs that get dispatched during parallel execution"/>
       <param argument="iid" type="boolean" truevalue="booltrue" falsevalue="boolfalse" checked="true" label="iid" help="If True, data is identically distributed across the folds"/>
       <param argument="refit" type="boolean" truevalue="booltrue" falsevalue="boolfalse" checked="true" label="refit" help="Refit an estimator using the best found parameters on the whole dataset."/>
-      <!--error_score-->
+      <param argument="error_score" type="boolean" truevalue="booltrue" falsevalue="boolfalse" checked="true" label="Raise fit error:" help="If false, the metric score is assigned to 0 if an error occurs in estimator fitting and FitFailedWarning is raised."/>
       <param argument="return_train_score" type="boolean" truevalue="booltrue" falsevalue="boolfalse" checked="false" label="return_train_score" help=""/>
   </xml>
 
@@ -1307,7 +1201,7 @@
           <option value="RandomForestClassifier">RandomForestClassifier</option>
           <option value="RandomForestRegressor">RandomForestRegressor</option>
           <option value="RandomTreesEmbedding">RandomTreesEmbedding</option>
-          <option value="VotingClassifier">VotingClassifier</option>
+          <!--option value="VotingClassifier">VotingClassifier</option-->
         </param>
         <expand macro="estimator_params_text"/>
       </when>
@@ -1330,12 +1224,11 @@
       </when>
       <when value="neighbors">
         <param name="selected_estimator" type="select" label="Choose estimator class:">
-          <option value="BallTree" selected="true">BallTree</option>
-          <option value="DistanceMetric">DistanceMetric</option>
-          <option value="KDTree">KDTree</option>
+          <option value="KNeighborsClassifier" selected="true">KNeighborsClassifier</option>
+          <option value="KNeighborsRegressor">KNeighborsRegressor</option>
+          <!--option value="BallTree">BallTree</option-->
+          <!--option value="KDTree">KDTree</option-->
           <option value="KernelDensity">KernelDensity</option>
-          <option value="KNeighborsClassifier">KNeighborsClassifier</option>
-          <option value="KNeighborsRegressor">KNeighborsRegressor</option>
           <option value="LocalOutlierFactor">LocalOutlierFactor</option>
           <option value="RadiusNeighborsClassifier">RadiusNeighborsClassifier</option>
           <option value="RadiusNeighborsRegressor">RadiusNeighborsRegressor</option>
@@ -1354,9 +1247,9 @@
     </conditional>
   </xml>
 
-  <xml name="estimator_params_text" token_label="Type in estimator parameters:"
-        token_help="Parameters in dictionary without braces ('{}'), e.g., 'C': 1, 'kernel': 'linear'. No double quotes. Leave this box blank for default estimator.">
-    <param name="text_params" type="text" value="" size="50" optional="true" label="@LABEL@" help="@HELP@">
+  <xml name="estimator_params_text" token_label="Type in parameter settings if different from default:" token_default_value=''
+        token_help="Dictionary-capable, e.g., C=1, kernel='linear'. No double quotes. Leave this box blank for default estimator.">
+    <param name="text_params" type="text" value="@DEFAULT_VALUE@" optional="true" label="@LABEL@" help="@HELP@">
       <sanitizer>
         <valid initial="default">
           <add value="&apos;"/>
@@ -1374,20 +1267,20 @@
         <option value="SkewedChi2Sampler">SkewedChi2Sampler</option>
       </param>
       <when value="Nystroem">
-        <expand macro="estimator_params_text" label="Type in kernel approximater parameters:"
-              help="Parameters in dictionary without braces ('{}'), e.g., 'n_components': 100, 'kernel': 'rbf'. No double quotes. Leave this box blank for class default."/>
+        <expand macro="estimator_params_text"
+              help="Default(=blank): coef0=None, degree=None, gamma=None, kernel='rbf', kernel_params=None, n_components=100, random_state=None. No double quotes"/>
       </when>
       <when value="RBFSampler">
-        <expand macro="estimator_params_text" label="Type in kernel approximater parameters:"
-              help="Parameters in dictionary without braces ('{}'), e.g., 'n_components': 100, 'gamma': 1.0. No double quotes. Leave this box blank for class default."/>
+        <expand macro="estimator_params_text"
+              help="Default(=blank): gamma=1.0, n_components=100, random_state=None."/>
       </when>
       <when value="AdditiveChi2Sampler">
-        <expand macro="estimator_params_text" label="Type in kernel approximater parameters:"
-              help="Parameters in dictionary without braces ('{}'), e.g., 'sample_steps': 2, 'sample_interval': None. No double quotes. Leave this box blank for class default."/>
+        <expand macro="estimator_params_text"
+              help="Default(=blank): sample_interval=None, sample_steps=2."/>
       </when>
       <when value="SkewedChi2Sampler">
-        <expand macro="estimator_params_text" label="Type in kernel approximater parameters:"
-              help="Parameters in dictionary without braces ('{}'), e.g., 'n_components': 100, 'skewedness': 1.0. No double quotes. Leave this box blank for class default."/>
+        <expand macro="estimator_params_text"
+              help="Default(=blank): n_components=100, random_state=None, skewedness=1.0."/>
       </when>
     </conditional>
   </xml>
@@ -1406,60 +1299,56 @@
         <option value="NMF">NMF</option>
         <option value="PCA">PCA</option>
         <option value="SparsePCA">SparsePCA</option>
-        <option value="SparseCoder">SparseCoder</option>
+        <!--option value="SparseCoder">SparseCoder</option-->
         <option value="TruncatedSVD">TruncatedSVD</option>
       </param>
       <when value="DictionaryLearning">
-        <expand macro="estimator_params_text" label="Type in maxtrix decomposition parameters:"
-              help="Parameters in dictionary without braces ('{}'), e.g., 'n_components': None, 'alpha': 1.0. No double quotes. Leave this box blank for class default."/>
+        <expand macro="estimator_params_text"
+              help="Default(=blank): alpha=1, code_init=None, dict_init=None, fit_algorithm='lars', max_iter=1000, n_components=None, random_state=None, split_sign=False, tol=1e-08, transform_algorithm='omp', transform_alpha=None, transform_n_nonzero_coefs=None, verbose=False."/>
       </when>
       <when value="FactorAnalysis">
-        <expand macro="estimator_params_text" label="Type in maxtrix decomposition parameters:"
-              help="Parameters in dictionary without braces ('{}'), e.g., 'n_components': 100, 'random_state': 42. No double quotes. Leave this box blank for class default."/>
+        <expand macro="estimator_params_text"
+              help="Default(=blank): copy=True, iterated_power=3, max_iter=1000, n_components=None, noise_variance_init=None, random_state=0, svd_method='randomized', tol=0.01."/>
       </when>
       <when value="FastICA">
-        <expand macro="estimator_params_text" label="Type in maxtrix decomposition parameters:"
-              help="Parameters in dictionary without braces ('{}'), e.g., 'n_components': 100, 'random_state': 42. No double quotes. Leave this box blank for class default."/>
+        <expand macro="estimator_params_text"
+              help="Default(=blank): algorithm='parallel', fun='logcosh', fun_args=None, max_iter=200, n_components=None, random_state=None, tol=0.0001, w_init=None, whiten=True. No double quotes."/>
       </when>
       <when value="IncrementalPCA">
-        <expand macro="estimator_params_text" label="Type in maxtrix decomposition parameters:"
-              help="Parameters in dictionary without braces ('{}'), e.g., 'n_components': 100, 'whiten': False. No double quotes. Leave this box blank for class default."/>
+        <expand macro="estimator_params_text"
+              help="Default(=blank): batch_size=None, copy=True, n_components=None, whiten=False."/>
       </when>
       <when value="KernelPCA">
-        <expand macro="estimator_params_text" label="Type in maxtrix decomposition parameters:"
-              help="Parameters in dictionary without braces ('{}'), e.g., 'n_components': 100, 'random_state': 42. No double quotes. Leave this box blank for class default."/>
+        <expand macro="estimator_params_text"
+              help="Default(=blank): alpha=1.0, coef0=1, copy_X=True, degree=3, eigen_solver='auto', fit_inverse_transform=False, gamma=None, kernel='linear', kernel_params=None, max_iter=None, n_components=None, random_state=None, remove_zero_eig=False, tol=0. No double quotes."/>
       </when>
       <when value="LatentDirichletAllocation">
-        <expand macro="estimator_params_text" label="Type in maxtrix decomposition parameters:"
-              help="Parameters in dictionary without braces ('{}'), e.g., 'n_components': 100, 'random_state': 42. No double quotes. Leave this box blank for class default."/>
+        <expand macro="estimator_params_text"
+              help="Default(=blank): batch_size=128, doc_topic_prior=None, evaluate_every=-1, learning_decay=0.7, learning_method=None, learning_offset=10.0, max_doc_update_iter=100, max_iter=10, mean_change_tol=0.001, n_components=10, n_topics=None, perp_tol=0.1, random_state=None, topic_word_prior=None, total_samples=1000000.0, verbose=0."/>
       </when>
       <when value="MiniBatchDictionaryLearning">
-        <expand macro="estimator_params_text" label="Type in maxtrix decomposition parameters:"
-              help="Parameters in dictionary without braces ('{}'), e.g., 'n_components': 100, 'random_state': 42. No double quotes. Leave this box blank for class default."/>
+        <expand macro="estimator_params_text"
+              help="Default(=blank): alpha=1, batch_size=3, dict_init=None, fit_algorithm='lars', n_components=None, n_iter=1000, random_state=None, shuffle=True, split_sign=False, transform_algorithm='omp', transform_alpha=None, transform_n_nonzero_coefs=None, verbose=False."/>
       </when>
       <when value="MiniBatchSparsePCA">
-        <expand macro="estimator_params_text" label="Type in maxtrix decomposition parameters:"
-              help="Parameters in dictionary without braces ('{}'), e.g., 'n_components': 100, 'random_state': 42. No double quotes. Leave this box blank for class default."/>
+        <expand macro="estimator_params_text"
+              help="Default(=blank): alpha=1, batch_size=3, callback=None, method='lars', n_components=None, n_iter=100, random_state=None, ridge_alpha=0.01, shuffle=True, verbose=False."/>
       </when>
       <when value="NMF">
-        <expand macro="estimator_params_text" label="Type in maxtrix decomposition parameters:"
-              help="Parameters in dictionary without braces ('{}'), e.g., 'n_components': 100, 'init': 'random'. No double quotes. Leave this box blank for class default."/>
+        <expand macro="estimator_params_text"
+              help="Default(=blank): alpha=0.0, beta_loss='frobenius', init=None, l1_ratio=0.0, max_iter=200, n_components=None, random_state=None, shuffle=False, solver='cd', tol=0.0001, verbose=0."/>
       </when>
       <when value="PCA">
-        <expand macro="estimator_params_text" label="Type in maxtrix decomposition parameters:"
-              help="Parameters in dictionary without braces ('{}'), e.g., 'n_components': 100, 'random_state': 42. No double quotes. Leave this box blank for class default."/>
+        <expand macro="estimator_params_text"
+              help="Default(=blank): copy=True, iterated_power='auto', n_components=None, random_state=None, svd_solver='auto', tol=0.0, whiten=False."/>
       </when>
       <when value="SparsePCA">
-        <expand macro="estimator_params_text" label="Type in maxtrix decomposition parameters:"
-              help="Parameters in dictionary without braces ('{}'), e.g., 'n_components': 100, 'random_state': 42. No double quotes. Leave this box blank for class default."/>
-      </when>
-      <when value="SparseCoder">
-        <expand macro="estimator_params_text" label="Type in maxtrix decomposition parameters:"
-              help="Parameters in dictionary without braces ('{}'), e.g., 'transform_algorithm': 'omp', 'transform_alpha': 1.0. No double quotes. Leave this box blank for class default."/>
+        <expand macro="estimator_params_text"
+              help="Default(=blank): U_init=None, V_init=None, alpha=1, max_iter=1000, method='lars', n_components=None, random_state=None, ridge_alpha=0.01, tol=1e-08, verbose=False."/>
       </when>
       <when value="TruncatedSVD">
-        <expand macro="estimator_params_text" label="Type in maxtrix decomposition parameters:"
-              help="Parameters in dictionary without braces ('{}'), e.g., 'n_components': 2, 'algorithm': 'randomized'. No double quotes. Leave this box blank for default estimator."/>
+        <expand macro="estimator_params_text"
+              help="Default(=blank): algorithm='randomized', n_components=2, n_iter=5, random_state=None, tol=0.0."/>
       </when>
     </conditional>
   </xml>
@@ -1470,8 +1359,45 @@
         <option value="FeatureAgglomeration" selected="true">FeatureAgglomeration</option>
       </param>
       <when value="FeatureAgglomeration">
-        <expand macro="estimator_params_text" label="Type in parameters:"
-              help="Parameters in dictionary without braces ('{}'), e.g., 'n_clusters': 2, 'affinity': 'euclidean'. No double quotes. Leave this box blank for class default."/>
+        <expand macro="estimator_params_text"
+              help="Default(=blank): affinity='euclidean', compute_full_tree='auto', connectivity=None, linkage='ward', memory=None, n_clusters=2, pooling_func=np.mean."/>
+      </when>
+    </conditional>
+  </xml>
+
+  <!-- Macro "skrebate": a conditional over the algorithms ReliefF, SURF, SURFstar, MultiSURF, MultiSURFstar and TuRF. Every branch expands the estimator_params_text macro and lists that algorithm's defaults in the help text. -->
+  <xml name="skrebate">
+    <conditional name="skrebate_selector">
+      <param name="select_algorithm" type="select" label="Choose the algorithm:">
+        <option value="ReliefF">ReliefF</option>
+        <option value="SURF">SURF</option>
+        <option value="SURFstar">SURFstar</option>
+        <option value="MultiSURF">MultiSURF</option>
+        <option value="MultiSURFstar">MultiSURFstar</option>
+        <option value="TuRF">TuRF</option>
+      </param>
+      <when value="ReliefF">
+        <expand macro="estimator_params_text"
+              help="Default(=blank): discrete_threshold=10, n_features_to_select=10, n_neighbors=100, verbose=False."/>
+      </when>
+      <when value="SURF">
+        <expand macro="estimator_params_text"
+              help="Default(=blank): discrete_threshold=10, n_features_to_select=10, verbose=False."/>
+      </when>
+      <when value="SURFstar">
+        <expand macro="estimator_params_text"
+              help="Default(=blank): discrete_threshold=10, n_features_to_select=10, verbose=False."/>
+      </when>
+      <when value="MultiSURF">
+        <expand macro="estimator_params_text"
+              help="Default(=blank): discrete_threshold=10, n_features_to_select=10, verbose=False."/>
+      </when>
+      <when value="MultiSURFstar">
+        <expand macro="estimator_params_text"
+              help="Default(=blank): discrete_threshold=10, n_features_to_select=10, verbose=False."/>
+      </when>
+      <when value="TuRF">
+        <expand macro="estimator_params_text"
+              help="Default(=blank): core_algorithm='ReliefF', discrete_threshold=10, n_features_to_select=10, n_neighbors=100, pct=0.5, verbose=False."/>
       </when>
     </conditional>
   </xml>
Binary file test-data/pipeline09 has changed
Binary file test-data/pipeline10 has changed
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/utils.py	Fri Aug 17 12:28:21 2018 -0400
@@ -0,0 +1,251 @@
+import sys
+import os
+import pandas
+import re
+import pickle
+import warnings
+import numpy as np
+import xgboost
+import scipy
+import sklearn
+import ast
+from asteval import Interpreter, make_symbol_table
+from sklearn import metrics, model_selection, ensemble, svm, linear_model, naive_bayes, tree, neighbors
+
+# Number of parallel workers: the GALAXY_SLOTS environment variable, or 1 when it is unset.
+N_JOBS = int( os.environ.get('GALAXY_SLOTS', 1) )
+
+def read_columns(f, c=None, c_option='by_index_number', return_df=False, **args):
+    """Read a delimited file with pandas and keep or drop the columns named by `c`.
+
+    Parameters
+    ----------
+    f : path or file-like object handed to `pandas.read_csv`.
+    c : list of 1-based column positions for the `*_by_index_number` options,
+        or a comma-separated string of header names for the `*_by_header_name`
+        options.
+    c_option : one of 'by_index_number', 'all_but_by_index_number',
+        'by_header_name', 'all_but_by_header_name'; any other value keeps
+        every column.
+    return_df : when true, also return the filtered DataFrame.
+    **args : forwarded to `pandas.read_csv`.
+
+    Returns
+    -------
+    `data.values`, or the tuple `(data.values, data)` when `return_df` is true.
+    """
+    data = pandas.read_csv(f, **args)
+
+    if c_option in ('by_index_number', 'all_but_by_index_number'):
+        # Positions arrive 1-based; pandas indexes from 0.
+        cols = [i - 1 for i in c]
+        if c_option == 'by_index_number':
+            data = data.iloc[:, cols]
+        else:
+            data = data.drop(data.columns[cols], axis=1)
+    elif c_option in ('by_header_name', 'all_but_by_header_name'):
+        cols = [e.strip() for e in c.split(',')]
+        if c_option == 'by_header_name':
+            data = data[cols]
+        else:
+            data = data.drop(cols, axis=1)
+
+    y = data.values
+    if return_df:
+        return y, data
+    return y
+
+
+def feature_selector(inputs):
+    """Instantiate one of the sklearn.feature_selection classes.
+
+    `inputs` supplies "selected_algorithm" (the class name) and "options" (its
+    keyword arguments). Depending on the algorithm it also supplies
+    "model_inputter" (SelectFromModel), "estimator_selector" (RFE, RFECV) or
+    "score_func" (every algorithm other than those three and
+    VarianceThreshold). Returns the constructed selector object.
+    """
+    algorithm = inputs["selected_algorithm"]
+    selector_cls = getattr(sklearn.feature_selection, algorithm)
+    options = inputs["options"]
+
+    if algorithm == "VarianceThreshold":
+        return selector_cls(**options)
+
+    if algorithm == 'SelectFromModel':
+        # A blank value or the text 'None' both mean "no threshold".
+        if not options['threshold'] or options['threshold'] == 'None':
+            options['threshold'] = None
+        model_inputter = inputs['model_inputter']
+        if model_inputter['input_mode'] != 'prefitted':
+            estimator = get_estimator(model_inputter["estimator_selector"])
+            return selector_cls(estimator, **options)
+        # NOTE: pickle.load can execute arbitrary code; the model file must be trusted.
+        with open(model_inputter['fitted_estimator'], 'rb') as model_handler:
+            fitted_estimator = pickle.load(model_handler)
+        return selector_cls(fitted_estimator, prefit=True, **options)
+
+    if algorithm in ('RFE', 'RFECV'):
+        if algorithm == 'RFECV':
+            # Resolve the scoring and cv form values, then use the allotted workers.
+            options['scoring'] = get_scoring(options['scoring'])
+            options['n_jobs'] = N_JOBS
+            options['cv'] = get_cv(options['cv'].strip())
+        estimator = get_estimator(inputs["estimator_selector"])
+        return selector_cls(estimator, **options)
+
+    # Every other algorithm is built from a score function looked up by name.
+    score_func = getattr(sklearn.feature_selection, inputs["score_func"])
+    return selector_cls(score_func, **options)
+ 
+
+def get_X_y(params, file1, file2):
+    """Load the feature matrix X from `file1` and the target vector y from `file2`.
+
+    All settings are read from
+    params["selected_tasks"]["selected_algorithms"]["input_options"].
+    X is read as tab-separated text when "selected_input" is "tabular" and
+    with scipy.io.mmread otherwise. y is always read as tab-separated text
+    and flattened with ravel().
+
+    Returns the tuple (X, y).
+    """
+    input_options = params["selected_tasks"]["selected_algorithms"]["input_options"]
+    selectable = ("by_index_number", "all_but_by_index_number",
+                  "by_header_name", "all_but_by_header_name")
+
+    def read_tabular(path, header_key, selector_key, option_key, col_key):
+        # Shared tab-separated read; the X and y inputs differ only in their keys.
+        header = 'infer' if input_options[header_key] else None
+        column_option = input_options[selector_key][option_key]
+        if column_option in selectable:
+            c = input_options[selector_key][col_key]
+        else:
+            c = None
+        return read_columns(
+            path,
+            c=c,
+            c_option=column_option,
+            sep='\t',
+            header=header,
+            parse_dates=True
+        )
+
+    if input_options["selected_input"] == "tabular":
+        X = read_tabular(file1, "header1", "column_selector_options_1",
+                         "selected_column_selector_option", "col1")
+    else:
+        # Imported here because this module has no top-level import of mmread;
+        # without it the call depends on the exec'ing script having imported it.
+        from scipy.io import mmread
+        X = mmread(file1)
+
+    y = read_tabular(file2, "header2", "column_selector_options_2",
+                     "selected_column_selector_option2", "col2")
+    return X, y.ravel()
+
+
+class SafeEval(Interpreter):
+    """asteval Interpreter restricted to a hand-picked symbol table.
+
+    The table always contains `np_arange` and `ensemble_ExtraTreesClassifier`.
+    With `load_scipy`, every rv_continuous / rv_discrete instance found in
+    scipy.stats.distributions is added as `scipy_stats_<name>`. With
+    `load_numpy`, the listed numpy.random attributes are added as
+    `np_random_<name>`. The builtins open, type, dir, id, str and repr are
+    removed from the table.
+    """
+
+    def __init__(self, load_scipy=False, load_numpy=False):
+        # Allowed symbol table. Add more if needed.
+        syms = make_symbol_table(
+            use_numpy=False,
+            np_arange=np.arange,
+            ensemble_ExtraTreesClassifier=ensemble.ExtraTreesClassifier
+        )
+
+        if load_scipy:
+            # A bare `import scipy` does not guarantee the stats submodule is loaded.
+            import scipy.stats
+            rv_types = (scipy.stats.rv_continuous, scipy.stats.rv_discrete)
+            for key, obj in scipy.stats.distributions.__dict__.items():
+                if isinstance(obj, rv_types):
+                    syms['scipy_stats_' + key] = obj
+
+        if load_numpy:
+            from_numpy_random = ['beta', 'binomial', 'bytes', 'chisquare', 'choice', 'dirichlet', 'division',
+                                'exponential', 'f', 'gamma', 'geometric', 'gumbel', 'hypergeometric',
+                                'laplace', 'logistic', 'lognormal', 'logseries', 'mtrand', 'multinomial',
+                                'multivariate_normal', 'negative_binomial', 'noncentral_chisquare', 'noncentral_f',
+                                'normal', 'pareto', 'permutation', 'poisson', 'power', 'rand', 'randint',
+                                'randn', 'random', 'random_integers', 'random_sample', 'ranf', 'rayleigh',
+                                'sample', 'seed', 'set_state', 'shuffle', 'standard_cauchy', 'standard_exponential',
+                                'standard_gamma', 'standard_normal', 'standard_t', 'triangular', 'uniform',
+                                'vonmises', 'wald', 'weibull', 'zipf' ]
+            for f in from_numpy_random:
+                # Skip names the installed numpy does not provide instead of
+                # failing construction with AttributeError.
+                if hasattr(np.random, f):
+                    syms['np_random_' + f] = getattr(np.random, f)
+
+        # File opening and other unneeded functions are dropped.
+        for key in ('open', 'type', 'dir', 'id', 'str', 'repr'):
+            syms.pop(key, None)
+
+        # NOTE(review): the no_* flags presumably switch off the matching
+        # language constructs - confirm against the installed asteval version.
+        super(SafeEval, self).__init__( symtable=syms, use_numpy=False, minimal=False,
+                                        no_if=True, no_for=True, no_while=True, no_try=True,
+                                        no_functiondef=True, no_ifexp=True, no_listcomp=False,
+                                        no_augassign=False, no_assert=True, no_delete=True,
+                                        no_raise=True, no_print=True)
+
+
+def get_search_params(params_builder):
+    """Turn the `name: expression` entries of params_builder['param_set'] into a dict.
+
+    Entries whose text is blank are skipped. Each expression is evaluated with
+    a SafeEval interpreter (scipy and numpy symbols loaded). The result is
+    stored under `estimator__<name>` when the entry's 'selected_param_type' is
+    "final_estimator_p", otherwise under `preprocessing_<c>__<name>` where
+    <c> is the character at index 5 of the param type.
+
+    Raises AssertionError when an entry does not contain exactly one colon.
+    """
+    search_params = {}
+    safe_eval = SafeEval(load_scipy=True, load_numpy=True)
+
+    for p in params_builder['param_set']:
+        selector = p['search_param_selector']
+        search_p = selector['search_p']
+        if search_p.strip() == '':
+            continue
+        param_type = selector['selected_param_type']
+
+        lst = search_p.split(":")
+        # Explicit check instead of `assert`, which is stripped under `python -O`;
+        # the exception type and message are unchanged.
+        if len(lst) != 2:
+            raise AssertionError("Error, make sure there is one and only one colon in search parameter input.")
+        ev = safe_eval(lst[1].strip())
+
+        if param_type == "final_estimator_p":
+            key = "estimator__" + lst[0].strip()
+        else:
+            # presumably index 5 holds the preprocessing step number - verify against the tool form
+            key = "preprocessing_" + param_type[5:6] + "__" + lst[0].strip()
+        search_params[key] = ev
+
+    return search_params
+
+
+def get_estimator(estimator_json):
+    """Instantiate the estimator described by `estimator_json`.
+
+    The class named by 'selected_estimator' is looked up in xgboost when
+    'selected_module' is "xgboost", otherwise in the sklearn submodule of that
+    name, and constructed with no arguments. A non-blank 'text_params' string
+    is evaluated as `dict(<text>)` and applied with set_params. Estimators
+    that expose an `n_jobs` parameter get it set to N_JOBS.
+
+    Exits the process when evaluating 'text_params' raises ValueError.
+    """
+    estimator_module = estimator_json['selected_module']
+    estimator_cls = estimator_json['selected_estimator']
+
+    if estimator_module == "xgboost":
+        module = xgboost
+    else:
+        module = getattr(sklearn, estimator_module)
+    estimator = getattr(module, estimator_cls)()
+
+    estimator_params = estimator_json['text_params'].strip()
+    if estimator_params != "":
+        # Local evaluator: this file defines no module-level `safe_eval`, so
+        # the name has to be bound here before it is called.
+        safe_eval = SafeEval()
+        try:
+            params = safe_eval('dict(' + estimator_params + ')')
+        except ValueError:
+            sys.exit("Unsupported parameter input: `%s`" %estimator_params)
+        estimator.set_params(**params)
+    if 'n_jobs' in estimator.get_params():
+        estimator.set_params( n_jobs=N_JOBS )
+
+    return estimator
+
+
+def get_cv(literal):
+    """Translate the cross-validation text from the tool form.
+
+    Returns None for an empty string, an int for an all-digit string, and for
+    text shaped like `Name(args)` an instance of `model_selection.Name` built
+    from the evaluated `dict(args)`. Any other text exits the process with an
+    "Unsupported CV input" message.
+    """
+    if literal == "":
+        return None
+    if literal.isdigit():
+        return int(literal)
+    m = re.match(r'^(?P<method>\w+)\((?P<args>.*)\)$', literal)
+    if not m:
+        sys.exit("Unsupported CV input: %s" %literal)
+    my_class = getattr( model_selection, m.group('method') )
+    # The interpreter is built only on this path; the blank and integer cases
+    # above never evaluate anything.
+    safe_eval = SafeEval()
+    args = safe_eval( 'dict('+ m.group('args') + ')' )
+    return my_class( **args )
+
+
+def get_scoring(scoring_json):
+    """Resolve the scoring selection into a scorer, a dict of scorers, or None.
+
+    Returns None when 'primary_scoring' is "default". When 'secondary_scoring'
+    is neither the text 'None' nor equal to the primary name, returns a dict
+    holding the primary scorer under the key 'primary' and each distinct
+    comma-separated secondary scorer under its own name. Otherwise returns the
+    single primary scorer.
+
+    Scorers are taken from metrics.SCORERS; a 'balanced_accuracy' entry is
+    added to that registry when it is missing.
+    """
+    def balanced_accuracy_score(y_true, y_pred):
+        # Mean over classes of the confusion-matrix diagonal divided by the row sums.
+        C = metrics.confusion_matrix(y_true, y_pred)
+        with np.errstate(divide='ignore', invalid='ignore'):
+            # Cast to float first: `/` on the integer matrix floor-divides under
+            # Python 2 (this file is loaded with execfile), which would collapse
+            # every per-class value to 0 or 1 and never produce the NaN that is
+            # checked for below.
+            per_class = np.diag(C).astype(float) / C.sum(axis=1)
+        if np.any(np.isnan(per_class)):
+            warnings.warn('y_pred contains classes not in y_true')
+            per_class = per_class[~np.isnan(per_class)]
+        return np.mean(per_class)
+
+    primary = scoring_json['primary_scoring']
+    if primary == "default":
+        return None
+
+    my_scorers = metrics.SCORERS
+    if 'balanced_accuracy' not in my_scorers:
+        my_scorers['balanced_accuracy'] = metrics.make_scorer(balanced_accuracy_score)
+
+    secondary = scoring_json['secondary_scoring']
+    if secondary != 'None' and secondary != primary:
+        scoring = {'primary': my_scorers[primary]}
+        for scorer in secondary.split(','):
+            if scorer != primary:
+                scoring[scorer] = my_scorers[scorer]
+        return scoring
+
+    return my_scorers[primary]
+