Repository 'sklearn_data_preprocess'
hg clone https://toolshed.g2.bx.psu.edu/repos/bgruening/sklearn_data_preprocess

Changeset 19:f196d4715cfb (2018-08-17)
Previous changeset 18:a886cf4c8392 (2018-08-07) Next changeset 20:2bda387c73e4 (2018-08-23)
Commit message:
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit d00173591e4a783a4c1cb2664e4bb192ab5414f7
modified:
main_macros.xml
pre_process.xml
added:
test-data/pipeline09
test-data/pipeline10
utils.py
b
diff -r a886cf4c8392 -r f196d4715cfb main_macros.xml
--- a/main_macros.xml Tue Aug 07 05:47:31 2018 -0400
+++ b/main_macros.xml Fri Aug 17 12:28:58 2018 -0400
[
b'@@ -1,216 +1,13 @@\n <macros>\n   <token name="@VERSION@">0.9</token>\n \n-  <token name="@COLUMNS_FUNCTION@">\n-def read_columns(f, c=None, c_option=\'by_index_number\', return_df=False, **args):\n-  data = pandas.read_csv(f, **args)\n-  if c_option == \'by_index_number\':\n-    cols = list(map(lambda x: x - 1, c))\n-    data = data.iloc[:,cols]\n-  if c_option == \'all_but_by_index_number\':\n-    cols = list(map(lambda x: x - 1, c))\n-    data.drop(data.columns[cols], axis=1, inplace=True)\n-  if c_option == \'by_header_name\':\n-    cols = [e.strip() for e in c.split(\',\')]\n-    data = data[cols]\n-  if c_option == \'all_but_by_header_name\':\n-    cols = [e.strip() for e in c.split(\',\')]\n-    data.drop(cols, axis=1, inplace=True)\n-  y = data.values\n-  if return_df:\n-    return y, data\n-  else:\n-    return y\n-  return y\n-  </token>\n-\n-## generate an instance for one of sklearn.feature_selection classes\n-  <token name="@FEATURE_SELECTOR_FUNCTION@">\n-def feature_selector(inputs):\n-  selector = inputs["selected_algorithm"]\n-  selector = getattr(sklearn.feature_selection, selector)\n-  options = inputs["options"]\n-\n-  if inputs[\'selected_algorithm\'] == \'SelectFromModel\':\n-    if not options[\'threshold\'] or options[\'threshold\'] == \'None\':\n-      options[\'threshold\'] = None\n-    if inputs[\'model_inputter\'][\'input_mode\'] == \'prefitted\':\n-      model_file = inputs[\'model_inputter\'][\'fitted_estimator\']\n-      with open(model_file, \'rb\') as model_handler:\n-        fitted_estimator = pickle.load(model_handler)\n-      new_selector = selector(fitted_estimator, prefit=True, **options)\n-    else:\n-      estimator_json = inputs[\'model_inputter\']["estimator_selector"]\n-      estimator = get_estimator(estimator_json)\n-      new_selector = selector(estimator, **options)\n-\n-  elif inputs[\'selected_algorithm\'] in [\'RFE\', \'RFECV\']:\n-    if \'scoring\' in options and (not options[\'scoring\'] or options[\'scoring\'] == \'None\'):\n-      options[\'scoring\'] = None\n-    estimator=get_estimator(inputs["estimator_selector"])\n-    new_selector = selector(estimator, **options)\n-\n-  elif inputs[\'selected_algorithm\'] == "VarianceThreshold":\n-    new_selector = selector(**options)\n-\n-  else:\n-    score_func = inputs["score_func"]\n-    score_func = getattr(sklearn.feature_selection, score_func)\n-    new_selector = selector(score_func, **options)\n-\n-  return new_selector\n-  </token>\n-\n-  <token name="@GET_X_y_FUNCTION@">\n-def get_X_y(params, file1, file2):\n-  input_type = params["selected_tasks"]["selected_algorithms"]["input_options"]["selected_input"]\n-  if input_type=="tabular":\n-    header = \'infer\' if params["selected_tasks"]["selected_algorithms"]["input_options"]["header1"] else None\n-    column_option = params["selected_tasks"]["selected_algorithms"]["input_options"]["column_selector_options_1"]["selected_column_selector_option"]\n-    if column_option in ["by_index_number", "all_but_by_index_number", "by_header_name", "all_but_by_header_name"]:\n-      c = params["selected_tasks"]["selected_algorithms"]["input_options"]["column_selector_options_1"]["col1"]\n-    else:\n-      c = None\n-    X = read_columns(\n-      file1,\n-      c = c,\n-      c_option = column_option,\n-      sep=\'\\t\',\n-      header=header,\n-      parse_dates=True\n-    )\n-  else:\n-    X = mmread(file1)\n-\n-  header = \'infer\' if params["selected_tasks"]["selected_algorithms"]["input_options"]["header2"] else None\n-  column_option = params["selected_tasks"]["selected_algorithms"]["input_options"]["column_selector_options_2"]["selected_column_selector_option2"]\n-  if column_option in ["by_index_number", "all_but_by_index_number", "by_header_name", "all_but_by_header_name"]:\n-    c = params["selected_tasks"]["selected_algorithms"]["input_options"]["column_selector_options_2"]["col2"]\n-  else:\n-    c = None\n-  y = read_columns(\n-    file2,\n-    c = c,\n-    c_option = column_option,\n-    sep=\'\\t\',\n-    header=header,\n-    parse_dates=True\n-  )\n-  y=y.ravel()\n-  return X, y\n-  </token>\n-\n-  <token name="@SAFE_EVAL_FUNCTION@">\n-de'..b'copy=True, iterated_power=\'auto\', n_components=None, random_state=None, svd_solver=\'auto\', tol=0.0, whiten=False."/>\n       </when>\n       <when value="SparsePCA">\n-        <expand macro="estimator_params_text" label="Type in maxtrix decomposition parameters:"\n-              help="Parameters in dictionary without braces (\'{}\'), e.g., \'n_components\': 100, \'random_state\': 42. No double quotes. Leave this box blank for class default."/>\n-      </when>\n-      <when value="SparseCoder">\n-        <expand macro="estimator_params_text" label="Type in maxtrix decomposition parameters:"\n-              help="Parameters in dictionary without braces (\'{}\'), e.g., \'transform_algorithm\': \'omp\', \'transform_alpha\': 1.0. No double quotes. Leave this box blank for class default."/>\n+        <expand macro="estimator_params_text"\n+              help="Default(=blank): U_init=None, V_init=None, alpha=1, max_iter=1000, method=\'lars\', n_components=None, random_state=None, ridge_alpha=0.01, tol=1e-08, verbose=False."/>\n       </when>\n       <when value="TruncatedSVD">\n-        <expand macro="estimator_params_text" label="Type in maxtrix decomposition parameters:"\n-              help="Parameters in dictionary without braces (\'{}\'), e.g., \'n_components\': 2, \'algorithm\': \'randomized\'. No double quotes. Leave this box blank for default estimator."/>\n+        <expand macro="estimator_params_text"\n+              help="Default(=blank): algorithm=\'randomized\', n_components=2, n_iter=5, random_state=None, tol=0.0."/>\n       </when>\n     </conditional>\n   </xml>\n@@ -1470,8 +1359,45 @@\n         <option value="FeatureAgglomeration" selected="true">FeatureAgglomeration</option>\n       </param>\n       <when value="FeatureAgglomeration">\n-        <expand macro="estimator_params_text" label="Type in parameters:"\n-              help="Parameters in dictionary without braces (\'{}\'), e.g., \'n_clusters\': 2, \'affinity\': \'euclidean\'. No double quotes. Leave this box blank for class default."/>\n+        <expand macro="estimator_params_text"\n+              help="Default(=blank): affinity=\'euclidean\', compute_full_tree=\'auto\', connectivity=None, linkage=\'ward\', memory=None, n_clusters=2, pooling_func=np.mean."/>\n+      </when>\n+    </conditional>\n+  </xml>\n+\n+  <xml name="skrebate">\n+    <conditional name="skrebate_selector">\n+      <param name="select_algorithm" type="select" label="Choose the algorithm:">\n+        <option value="ReliefF">ReliefF</option>\n+        <option value="SURF">SURF</option>\n+        <option value="SURFstar">SURFstar</option>\n+        <option value="MultiSURF">MultiSURF</option>\n+        <option value="MultiSURFstar">MultiSURFstar</option>\n+        <option value="TuRF">TuRF</option>\n+      </param>\n+      <when value="ReliefF">\n+        <expand macro="estimator_params_text"\n+              help="Default(=blank): discrete_threshold=10, n_features_to_select=10, n_neighbors=100, verbose=False."/>\n+      </when>\n+      <when value="SURF">\n+        <expand macro="estimator_params_text"\n+              help="Default(=blank): discrete_threshold=10, n_features_to_select=10, verbose=False."/>\n+      </when>\n+      <when value="SURFstar">\n+        <expand macro="estimator_params_text"\n+              help="Default(=blank): discrete_threshold=10, n_features_to_select=10, verbose=False."/>\n+      </when>\n+      <when value="MultiSURF">\n+        <expand macro="estimator_params_text"\n+              help="Default(=blank): discrete_threshold=10, n_features_to_select=10, verbose=False."/>\n+      </when>\n+      <when value="MultiSURFstar">\n+        <expand macro="estimator_params_text"\n+              help="Default(=blank): discrete_threshold=10, n_features_to_select=10, verbose=False."/>\n+      </when>\n+      <when value="TuRF">\n+        <expand macro="estimator_params_text"\n+              help="Default(=blank): core_algorithm=\'ReliefF\', discrete_threshold=10, n_features_to_select=10, n_neighbors=100, pct=0.5, verbose=False."/>\n       </when>\n     </conditional>\n   </xml>\n'
b
diff -r a886cf4c8392 -r f196d4715cfb pre_process.xml
--- a/pre_process.xml Tue Aug 07 05:47:31 2018 -0400
+++ b/pre_process.xml Fri Aug 17 12:28:58 2018 -0400
[
@@ -19,12 +19,11 @@
 import json
 import pandas
 import pickle
-import numpy as np
 from scipy.io import mmread
 from scipy.io import mmwrite
 from sklearn import preprocessing
 
-@COLUMNS_FUNCTION@
+execfile("$__tool_directory__/utils.py")
 
 input_json_path = sys.argv[1]
 with open(input_json_path, "r") as param_handler:
b
diff -r a886cf4c8392 -r f196d4715cfb test-data/pipeline09
b
Binary file test-data/pipeline09 has changed
b
diff -r a886cf4c8392 -r f196d4715cfb test-data/pipeline10
b
Binary file test-data/pipeline10 has changed
b
diff -r a886cf4c8392 -r f196d4715cfb utils.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/utils.py Fri Aug 17 12:28:58 2018 -0400
[
b'@@ -0,0 +1,251 @@\n+import sys\n+import os\n+import pandas\n+import re\n+import pickle\n+import warnings\n+import numpy as np\n+import xgboost\n+import scipy\n+import sklearn\n+import ast\n+from asteval import Interpreter, make_symbol_table\n+from sklearn import metrics, model_selection, ensemble, svm, linear_model, naive_bayes, tree, neighbors\n+\n+N_JOBS = int( os.environ.get(\'GALAXY_SLOTS\', 1) )\n+\n+def read_columns(f, c=None, c_option=\'by_index_number\', return_df=False, **args):\n+    data = pandas.read_csv(f, **args)\n+    if c_option == \'by_index_number\':\n+        cols = list(map(lambda x: x - 1, c))\n+        data = data.iloc[:,cols]\n+    if c_option == \'all_but_by_index_number\':\n+        cols = list(map(lambda x: x - 1, c))\n+        data.drop(data.columns[cols], axis=1, inplace=True)\n+    if c_option == \'by_header_name\':\n+        cols = [e.strip() for e in c.split(\',\')]\n+        data = data[cols]\n+    if c_option == \'all_but_by_header_name\':\n+        cols = [e.strip() for e in c.split(\',\')]\n+        data.drop(cols, axis=1, inplace=True)\n+    y = data.values\n+    if return_df:\n+        return y, data\n+    else:\n+        return y\n+    return y\n+\n+\n+## generate an instance for one of sklearn.feature_selection classes\n+def feature_selector(inputs):\n+    selector = inputs["selected_algorithm"]\n+    selector = getattr(sklearn.feature_selection, selector)\n+    options = inputs["options"]\n+\n+    if inputs[\'selected_algorithm\'] == \'SelectFromModel\':\n+        if not options[\'threshold\'] or options[\'threshold\'] == \'None\':\n+            options[\'threshold\'] = None\n+        if inputs[\'model_inputter\'][\'input_mode\'] == \'prefitted\':\n+            model_file = inputs[\'model_inputter\'][\'fitted_estimator\']\n+            with open(model_file, \'rb\') as model_handler:\n+                fitted_estimator = pickle.load(model_handler)\n+            new_selector = selector(fitted_estimator, prefit=True, **options)\n+        else:\n+            estimator_json = inputs[\'model_inputter\']["estimator_selector"]\n+            estimator = get_estimator(estimator_json)\n+            new_selector = selector(estimator, **options)\n+\n+    elif inputs[\'selected_algorithm\'] == \'RFE\':\n+        estimator=get_estimator(inputs["estimator_selector"])\n+        new_selector = selector(estimator, **options)\n+\n+    elif inputs[\'selected_algorithm\'] == \'RFECV\':\n+        options[\'scoring\'] = get_scoring(options[\'scoring\'])\n+        options[\'n_jobs\'] = N_JOBS\n+        options[\'cv\'] = get_cv( options[\'cv\'].strip() )\n+        estimator=get_estimator(inputs["estimator_selector"])\n+        new_selector = selector(estimator, **options)\n+\n+    elif inputs[\'selected_algorithm\'] == "VarianceThreshold":\n+        new_selector = selector(**options)\n+\n+    else:\n+        score_func = inputs["score_func"]\n+        score_func = getattr(sklearn.feature_selection, score_func)\n+        new_selector = selector(score_func, **options)\n+\n+    return new_selector\n+ \n+\n+def get_X_y(params, file1, file2):\n+    input_type = params["selected_tasks"]["selected_algorithms"]["input_options"]["selected_input"]\n+    if input_type=="tabular":\n+        header = \'infer\' if params["selected_tasks"]["selected_algorithms"]["input_options"]["header1"] else None\n+        column_option = params["selected_tasks"]["selected_algorithms"]["input_options"]["column_selector_options_1"]["selected_column_selector_option"]\n+        if column_option in ["by_index_number", "all_but_by_index_number", "by_header_name", "all_but_by_header_name"]:\n+            c = params["selected_tasks"]["selected_algorithms"]["input_options"]["column_selector_options_1"]["col1"]\n+        else:\n+            c = None\n+        X = read_columns(\n+            file1,\n+            c = c,\n+            c_option = column_option,\n+            sep=\'\\t\',\n+            header=header,\n+            parse_dates=True\n+        )\n+    else:\n+        X = mmread(file1)\n+\n+    header = \'infer\' if params["selected_tasks"]["selected_algorithms"]["input_options"]["header2"] else None\n+    co'..b'                               \'vonmises\', \'wald\', \'weibull\', \'zipf\' ]\n+            for f in from_numpy_random:\n+                syms[\'np_random_\' + f] = getattr(np.random, f)\n+\n+        for key in unwanted:\n+            syms.pop(key, None)\n+\n+        super(SafeEval, self).__init__( symtable=syms, use_numpy=False, minimal=False,\n+                                        no_if=True, no_for=True, no_while=True, no_try=True,\n+                                        no_functiondef=True, no_ifexp=True, no_listcomp=False,\n+                                        no_augassign=False, no_assert=True, no_delete=True,\n+                                        no_raise=True, no_print=True)\n+\n+\n+def get_search_params(params_builder):\n+    search_params = {}\n+    safe_eval = SafeEval(load_scipy=True, load_numpy=True)\n+\n+    for p in params_builder[\'param_set\']:\n+        search_p = p[\'search_param_selector\'][\'search_p\']\n+        if search_p.strip() == \'\':\n+            continue\n+        param_type = p[\'search_param_selector\'][\'selected_param_type\']\n+\n+        lst = search_p.split(":")\n+        assert (len(lst) == 2), "Error, make sure there is one and only one colon in search parameter input."\n+        literal = lst[1].strip()\n+        ev = safe_eval(literal)\n+        if param_type == "final_estimator_p":\n+            search_params["estimator__" + lst[0].strip()] = ev\n+        else:\n+            search_params["preprocessing_" + param_type[5:6] + "__" + lst[0].strip()] = ev\n+\n+    return search_params\n+\n+\n+def get_estimator(estimator_json):\n+    estimator_module = estimator_json[\'selected_module\']\n+    estimator_cls = estimator_json[\'selected_estimator\']\n+\n+    if estimator_module == "xgboost":\n+        cls = getattr(xgboost, estimator_cls)\n+    else:\n+        module = getattr(sklearn, estimator_module)\n+        cls = getattr(module, estimator_cls)\n+\n+    estimator = cls()\n+\n+    estimator_params = estimator_json[\'text_params\'].strip()\n+    if estimator_params != "":\n+        try:\n+            params = safe_eval(\'dict(\' + estimator_params + \')\')\n+        except ValueError:\n+            sys.exit("Unsupported parameter input: `%s`" %estimator_params)\n+        estimator.set_params(**params)\n+    if \'n_jobs\' in estimator.get_params():\n+        estimator.set_params( n_jobs=N_JOBS )\n+\n+    return estimator\n+\n+\n+def get_cv(literal):\n+    safe_eval = SafeEval()\n+    if literal == "":\n+        return None\n+    if literal.isdigit():\n+        return int(literal)\n+    m = re.match(r\'^(?P<method>\\w+)\\((?P<args>.*)\\)$\', literal)\n+    if m:\n+        my_class = getattr( model_selection, m.group(\'method\') )\n+        args = safe_eval( \'dict(\'+ m.group(\'args\') + \')\' )\n+        return my_class( **args )\n+    sys.exit("Unsupported CV input: %s" %literal)\n+\n+\n+def get_scoring(scoring_json):\n+    def balanced_accuracy_score(y_true, y_pred):\n+        C = metrics.confusion_matrix(y_true, y_pred)\n+        with np.errstate(divide=\'ignore\', invalid=\'ignore\'):\n+            per_class = np.diag(C) / C.sum(axis=1)\n+        if np.any(np.isnan(per_class)):\n+            warnings.warn(\'y_pred contains classes not in y_true\')\n+            per_class = per_class[~np.isnan(per_class)]\n+        score = np.mean(per_class)\n+        return score\n+\n+    if scoring_json[\'primary_scoring\'] == "default":\n+        return None\n+\n+    my_scorers = metrics.SCORERS\n+    if \'balanced_accuracy\' not in my_scorers:\n+        my_scorers[\'balanced_accuracy\'] = metrics.make_scorer(balanced_accuracy_score)\n+\n+    if scoring_json[\'secondary_scoring\'] != \'None\'\\\n+            and scoring_json[\'secondary_scoring\'] != scoring_json[\'primary_scoring\']:\n+        scoring = {}\n+        scoring[\'primary\'] = my_scorers[ scoring_json[\'primary_scoring\'] ]\n+        for scorer in scoring_json[\'secondary_scoring\'].split(\',\'):\n+            if scorer != scoring_json[\'primary_scoring\']:\n+                scoring[scorer] = my_scorers[scorer]\n+        return scoring\n+\n+    return my_scorers[ scoring_json[\'primary_scoring\'] ]\n+\n'