Repository 'sklearn_data_preprocess'
hg clone https://toolshed.g2.bx.psu.edu/repos/bgruening/sklearn_data_preprocess

Changeset 15:dad38f036e83 (2018-07-13)
Previous changeset 14:f9def78f6cd5 (2018-07-10) Next changeset 16:23f26ac9c7b3 (2018-08-04)
Commit message:
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit f54ff2ba2f8e7542d68966ce5a6b17d7f624ac48
modified:
main_macros.xml
pre_process.xml
removed:
test-data/mv_result07.tabular
b
diff -r f9def78f6cd5 -r dad38f036e83 main_macros.xml
--- a/main_macros.xml Tue Jul 10 03:12:09 2018 -0400
+++ b/main_macros.xml Fri Jul 13 03:55:44 2018 -0400
[
b'@@ -35,7 +35,8 @@\n     if not options[\'threshold\'] or options[\'threshold\'] == \'None\':\n       options[\'threshold\'] = None\n       if \'extra_estimator\' in inputs and inputs[\'extra_estimator\'][\'has_estimator\'] == \'no_load\':\n-        fitted_estimator = pickle.load(open("inputs[\'extra_estimator\'][\'fitted_estimator\']", \'r\'))\n+        with open("inputs[\'extra_estimator\'][\'fitted_estimator\']", \'rb\') as model_handler:\n+          fitted_estimator = pickle.load(model_handler)\n         new_selector = selector(fitted_estimator, prefit=True, **options)\n       else:\n         estimator=inputs["estimator"]\n@@ -83,7 +84,7 @@\n       parse_dates=True\n     )\n   else:\n-    X = mmread(open(file1, \'r\'))\n+    X = mmread(file1)\n \n   header = \'infer\' if params["selected_tasks"]["selected_algorithms"]["input_options"]["header2"] else None\n   column_option = params["selected_tasks"]["selected_algorithms"]["input_options"]["column_selector_options_2"]["selected_column_selector_option2"]\n@@ -432,19 +433,6 @@\n \n \n   <!--Data interface-->\n-  <xml name="tabular_input">\n-    <param name="infile" type="data" format="tabular" label="Data file with numeric values"/>\n-    <param name="start_column" type="data_column" data_ref="infile" optional="True" label="Select a subset of data. Start column:" />\n-    <param name="end_column" type="data_column" data_ref="infile" optional="True" label="End column:" />\n-  </xml>\n-\n-  <xml name="sample_cols" token_label1="File containing true class labels:" token_label2="File containing predicted class labels:" token_multiple1="False" token_multiple2="False" token_format1="tabular" token_format2="tabular" token_help1="" token_help2="">\n-    <param name="infile1" type="data" format="@FORMAT1@" label="@LABEL1@" help="@HELP1@"/>\n-    <param name="col1" multiple="@MULTIPLE1@" type="data_column" data_ref="infile1" label="Select target column(s):"/>\n-    <param name="infile2" type="data" format="@FORMAT2@" label="@LABEL2@" help="@HELP2@"/>\n-    <param name="col2" multiple="@MULTIPLE2@" type="data_column" data_ref="infile2" label="Select target column(s):"/>\n-    <yield/>\n-  </xml>\n \n   <xml name="samples_tabular" token_multiple1="false" token_multiple2="false">\n     <param name="infile1" type="data" format="tabular" label="Training samples dataset:"/>\n@@ -472,13 +460,13 @@\n       <param name="@COL_NAME@" multiple="@MULTIPLE@" type="data_column" data_ref="@INFILE@" label="Select target column(s):"/>\n     </when>\n     <when value="by_header_name">\n-      <param name="@COL_NAME@" type="text" value="" label="Type header name(s):" help="String seperate by colon. For example: target1,target2"/>\n+      <param name="@COL_NAME@" type="text" value="" label="Type header name(s):" help="Comma-separated string. For example: target1,target2"/>\n     </when>\n     <when value="all_but_by_index_number">\n       <param name="@COL_NAME@" multiple="@MULTIPLE@" type="data_column" data_ref="@INFILE@" label="Select target column(s):"/>\n     </when>\n     <when value="all_but_by_header_name">\n-      <param name="@COL_NAME@" type="text" value="" label="Type header name(s):" help="String seperate by colon. For example: target1,target2"/>\n+      <param name="@COL_NAME@" type="text" value="" label="Type header name(s):" help="Comma-separated string. For example: target1,target2"/>\n     </when>\n     <when value="all_columns">\n     </when>\n@@ -553,11 +541,6 @@\n     </conditional>\n   </xml>\n \n-  <xml name="multitype_input" token_format="tabular" token_help="All datasets with tabular format are supporetd.">\n-    <param name="infile_transform" type="data" format="@FORMAT@" label="Select a dataset to transform:" help="@HELP@"/>\n-  </xml>\n-\n-\n   <!--Advanced options-->\n   <xml name="nn_advanced_options">\n     <section name="options" title="Advanced Options" expanded="False">\n@@ -822,9 +805,17 @@\n     </param>\n   </xml>\n \n+  <xml name="sparse_preprocessors_ext">\n+    <expand macro="sparse_preprocessors">\n+      <option value="KernelCenterer">Kernel Centerer (Centers a kernel '..b' Options" expanded="False">\n+              <!--feature_range-->\n+              <param argument="copy" type="boolean" optional="true" truevalue="booltrue" falsevalue="boolflase" checked="true"\n+                  label="Use a copy of data for precomputing normalization" help=" "/>\n+          </section>\n+      </when>\n+      <when value="PolynomialFeatures">\n+          <section name="options" title="Advanced Options" expanded="False">\n+              <param argument="degree" type="integer" optional="true" value="2" label="The degree of the polynomial features " help=""/>\n+              <param argument="interaction_only" type="boolean" optional="true" truevalue="booltrue" falsevalue="boolflase" checked="false" label="Produce interaction features only" help="(Features that are products of at most degree distinct input features) "/>\n+              <param argument="include_bias" type="boolean" optional="true" truevalue="booltrue" falsevalue="boolflase" checked="true" label="Include a bias column" help="Feature in which all polynomial powers are zero "/>\n+          </section>\n+      </when>\n+      <when value="RobustScaler">\n+          <section name="options" title="Advanced Options" expanded="False">\n+              <!--=True, =True, copy=True-->\n+              <param argument="with_centering" type="boolean" optional="true" truevalue="booltrue" falsevalue="boolflase" checked="true"\n+                  label="Center the data before scaling" help=" "/>\n+              <param argument="with_scaling" type="boolean" optional="true" truevalue="booltrue" falsevalue="boolflase" checked="true"\n+                  label="Scale the data to interquartile range" help=" "/>\n+              <param argument="copy" type="boolean" optional="true" truevalue="booltrue" falsevalue="boolflase" checked="true"\n+                  label="Use a copy of data for inplace scaling" help=" "/>\n+          </section>\n+      </when>\n+    </expand>\n+  </xml>\n+\n   <xml name="estimator_input_no_fit">\n     <expand macro="feature_selection_estimator" />\n     <conditional name="extra_estimator">\n@@ -892,6 +914,7 @@\n       <expand macro="feature_selection_estimator_choices" />\n     </conditional>\n   </xml>\n+\n   <xml name="feature_selection_all">\n     <conditional name="feature_selection_algorithms">\n       <param name="selected_algorithm" type="select" label="Select a feature selection algorithm">\n@@ -1014,6 +1037,7 @@\n       </when-->\n     </conditional>\n   </xml>\n+\n   <xml name="feature_selection_score_function">\n     <param argument="score_func" type="select" label="Select a score function">\n       <option value="chi2">chi2 - Compute chi-squared stats between each non-negative feature and class</option>\n@@ -1023,6 +1047,7 @@\n       <option value="mutual_info_regression">mutual_info_regression - Estimate mutual information for a continuous target variable</option>\n     </param>\n   </xml>\n+\n   <xml name="feature_selection_estimator">\n     <param argument="estimator" type="select" label="Select an estimator" help="The base estimator from which the transformer is built.">\n       <option value="svm.SVR(kernel=&quot;linear&quot;)">svm.SVR(kernel=&quot;linear&quot;)</option>\n@@ -1032,6 +1057,7 @@\n       <option value="ensemble.RandomForestRegressor(n_estimators = 1000, random_state = 42)">ensemble.RandomForestRegressor(n_estimators = 1000, random_state = 42)</option>\n     </param>\n   </xml>\n+\n   <xml name="feature_selection_extra_estimator">   \n       <param name="has_estimator" type="select" label="Does your estimator on the list above?">\n         <option value="yes">Yes, my estimator is on the list</option>\n@@ -1039,6 +1065,7 @@\n         <yield/>\n       </param>\n   </xml>\n+\n   <xml name="feature_selection_estimator_choices">\n     <when value="yes">\n     </when>\n@@ -1047,6 +1074,7 @@\n     </when>\n     <yield/>\n   </xml>\n+\n   <xml name="feature_selection_methods">\n     <conditional name="select_methods">\n       <param name="selected_method" type="select" label="Select an operation">\n'
b
diff -r f9def78f6cd5 -r dad38f036e83 pre_process.xml
--- a/pre_process.xml Tue Jul 10 03:12:09 2018 -0400
+++ b/pre_process.xml Fri Jul 13 03:55:44 2018 -0400
[
b'@@ -24,19 +24,32 @@\n from scipy.io import mmwrite\n from sklearn import preprocessing\n \n+@COLUMNS_FUNCTION@\n+\n input_json_path = sys.argv[1]\n-params = json.load(open(input_json_path, "r"))\n+with open(input_json_path, "r") as param_handler:\n+    params = json.load(param_handler)\n \n #if $input_type.selected_input_type == "sparse":\n-X = mmread(open("$infile", \'r\'))\n+X = mmread("$infile")\n #else:\n-X = pandas.read_csv("$infile", sep=\'\\t\', header=None, index_col=None, parse_dates=True, encoding=None, tupleize_cols=False )\n-#end if\n-\n-#if $input_type.pre_processors.infile_transform.ext == \'txt\':\n-y = mmread(open("$infile", \'r\'))\n-#else:\n-y = pandas.read_csv("$infile", sep=\'\\t\', header=None, index_col=None, parse_dates=True, encoding=None, tupleize_cols=False )\n+header = \'infer\' if params["input_type"]["header1"] else None\n+column_option = params["input_type"]["column_selector_options_1"]["selected_column_selector_option"]\n+if column_option in ["by_index_number", "all_but_by_index_number", "by_header_name", "all_but_by_header_name"]:\n+    c = params["input_type"]["column_selector_options_1"]["col1"]\n+else:\n+    c = None\n+X = read_columns(\n+        "$input_type.infile",\n+        c = c,\n+        c_option = column_option,\n+        sep=\'\\t\',\n+        header=header,\n+        parse_dates=True,\n+        encoding=None,\n+        index_col=None,\n+        tupleize_cols=False\n+)\n #end if\n \n preprocessor = params["input_type"]["pre_processors"]["selected_pre_processor"]\n@@ -45,17 +58,19 @@\n my_class = getattr(preprocessing, preprocessor)\n estimator = my_class(**options)\n estimator.fit(X)\n-result = estimator.transform(y)\n+result = estimator.transform(X)\n \n-#if $input_type.pre_processors.infile_transform.ext == \'txt\':\n-mmwrite(open("$outfile_transform" , \'w+\'), result)\n+#if $input_type.selected_input_type == "sparse":\n+with open("$outfile_transform", "w+") as transform_handler:\n+    mmwrite(transform_handler, result)\n #else:\n res = pandas.DataFrame(result)\n res.to_csv(path_or_buf = "$outfile_transform", sep="\\t", index=False, header=None)\n #end if\n \n #if $save:\n-pickle.dump(estimator,open("$outfile_fit", \'w+\'), pickle.HIGHEST_PROTOCOL)\n+with open("$outfile_fit", \'wb\') as out_handler:\n+    pickle.dump(estimator, out_handler, pickle.HIGHEST_PROTOCOL)\n #end if\n         ]]>\n         </configfile>\n@@ -67,49 +82,14 @@\n                 <option value="sparse">Sparse</option>\n             </param>\n             <when value="tabular">\n-                <param name="infile" type="data" format="tabular" label="Select a tabular file you want to train your preprocessor on its data:"/>\n+                <param name="infile" type="data" format="tabular" label="Select a tabular file you want to train your preprocessor on its data:" />\n+                <param name="header1" type="boolean" optional="true" truevalue="booltrue" falsevalue="boolfalse" checked="false" label="Does the dataset contain header:" />\n+                <conditional name="column_selector_options_1">\n+                    <expand macro="samples_column_selector_options" multiple="true" column_option="selected_column_selector_option" col_name="col1" infile="infile"/>\n+                </conditional>\n                 <conditional name="pre_processors">\n-                    <expand macro="sparse_preprocessors">\n-                        <option value="KernelCenterer">Kernel Centerer (Centers a kernel matrix)</option>\n-                        <option value="MinMaxScaler">Minmax Scaler (Scales features to a range)</option>\n-                        <option value="PolynomialFeatures">Polynomial Features (Generates polynomial and interaction features)</option>\n-                        <option value="RobustScaler">Robust Scaler (Scales features using outlier-invariance statistics)</option>\n-                    </expand>\n-                    <expand macro="sparse_preprocessor_options">\n-                        <when value="KernelCenterer">\n-                            <expand macro="multitype_input"/>\n-            '..b's>\n         <test>\n             <param name="infile" value="train.tabular" ftype="tabular"/>\n-            <param name="infile_transform" value="train.tabular" ftype="tabular"/>\n+            <param name="selected_column_selector_option" value="all_columns"/>\n             <param name="selected_input_type" value="tabular"/>\n             <param name="selected_pre_processor" value="KernelCenterer"/>\n             <param name="save" value="true"/>\n@@ -142,7 +122,7 @@\n         </test>\n         <test>\n             <param name="infile" value="train.tabular" ftype="tabular"/>\n-            <param name="infile_transform" value="train.tabular" ftype="tabular"/>\n+            <param name="selected_column_selector_option" value="all_columns"/>\n             <param name="selected_input_type" value="tabular"/>\n             <param name="selected_pre_processor" value="MinMaxScaler"/>\n             <param name="save" value="true"/>\n@@ -151,7 +131,7 @@\n         </test>\n         <test>\n             <param name="infile" value="train.tabular" ftype="tabular"/>\n-            <param name="infile_transform" value="train.tabular" ftype="tabular"/>\n+            <param name="selected_column_selector_option" value="all_columns"/>\n             <param name="selected_input_type" value="tabular"/>\n             <param name="selected_pre_processor" value="PolynomialFeatures"/>\n             <param name="save" value="true"/>\n@@ -160,7 +140,7 @@\n         </test>\n         <test>\n             <param name="infile" value="train.tabular" ftype="tabular"/>\n-            <param name="infile_transform" value="train.tabular" ftype="tabular"/>\n+            <param name="selected_column_selector_option" value="all_columns"/>\n             <param name="selected_input_type" value="tabular"/>\n             <param name="selected_pre_processor" value="RobustScaler"/>\n             <param name="save" value="true"/>\n@@ -169,7 +149,6 @@\n         </test>\n         <test>\n             <param name="infile" value="csr_sparse2.mtx" ftype="txt"/>\n-            <param name="infile_transform" value="csr_sparse2.mtx" ftype="txt"/>\n             <param name="selected_input_type" value="sparse"/>\n             <param name="selected_pre_processor" value="Binarizer"/>\n             <param name="save" value="true"/>\n@@ -178,7 +157,6 @@\n         </test>\n         <test>\n             <param name="infile" value="csr_sparse2.mtx" ftype="txt"/>\n-            <param name="infile_transform" value="csr_sparse2.mtx" ftype="txt"/>\n             <param name="selected_input_type" value="sparse"/>\n             <param name="selected_pre_processor" value="Imputer"/>\n             <param name="save" value="true"/>\n@@ -188,8 +166,8 @@\n         </test>\n         <test>\n             <param name="infile" value="train.tabular" ftype="tabular"/>\n-            <param name="infile_transform" value="train.tabular" ftype="tabular"/>\n             <param name="selected_input_type" value="tabular"/>\n+            <param name="selected_column_selector_option" value="all_columns"/>\n             <param name="selected_pre_processor" value="StandardScaler"/>\n             <param name="save" value="true"/>\n             <output name="outfile_transform" file="prp_result07" ftype="tabular"/>\n@@ -197,7 +175,6 @@\n         </test>\n         <test>\n             <param name="infile" value="csr_sparse2.mtx" ftype="txt"/>\n-            <param name="infile_transform" value="csr_sparse2.mtx" ftype="txt"/>\n             <param name="selected_input_type" value="sparse"/>\n             <param name="selected_pre_processor" value="MaxAbsScaler"/>\n             <param name="save" value="true"/>\n@@ -206,7 +183,6 @@\n         </test>\n         <test>\n             <param name="infile" value="csr_sparse2.mtx" ftype="txt"/>\n-            <param name="infile_transform" value="csr_sparse2.mtx" ftype="txt"/>\n             <param name="selected_input_type" value="sparse"/>\n             <param name="selected_pre_processor" value="Normalizer"/>\n             <param name="save" value="true"/>\n'
b
diff -r f9def78f6cd5 -r dad38f036e83 test-data/mv_result07.tabular
--- a/test-data/mv_result07.tabular Tue Jul 10 03:12:09 2018 -0400
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
b
@@ -1,1 +0,0 @@
-0.7824428015300172