Mercurial > repos > bgruening > sklearn_data_preprocess
changeset 15:dad38f036e83 draft
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit f54ff2ba2f8e7542d68966ce5a6b17d7f624ac48
author | bgruening |
---|---|
date | Fri, 13 Jul 2018 03:55:44 -0400 |
parents | f9def78f6cd5 |
children | 23f26ac9c7b3 |
files | main_macros.xml pre_process.xml test-data/mv_result07.tabular |
diffstat | 3 files changed, 95 insertions(+), 92 deletions(-) [+] |
line wrap: on
line diff
--- a/main_macros.xml Tue Jul 10 03:12:09 2018 -0400 +++ b/main_macros.xml Fri Jul 13 03:55:44 2018 -0400 @@ -35,7 +35,8 @@ if not options['threshold'] or options['threshold'] == 'None': options['threshold'] = None if 'extra_estimator' in inputs and inputs['extra_estimator']['has_estimator'] == 'no_load': - fitted_estimator = pickle.load(open("inputs['extra_estimator']['fitted_estimator']", 'r')) + with open("inputs['extra_estimator']['fitted_estimator']", 'rb') as model_handler: + fitted_estimator = pickle.load(model_handler) new_selector = selector(fitted_estimator, prefit=True, **options) else: estimator=inputs["estimator"] @@ -83,7 +84,7 @@ parse_dates=True ) else: - X = mmread(open(file1, 'r')) + X = mmread(file1) header = 'infer' if params["selected_tasks"]["selected_algorithms"]["input_options"]["header2"] else None column_option = params["selected_tasks"]["selected_algorithms"]["input_options"]["column_selector_options_2"]["selected_column_selector_option2"] @@ -432,19 +433,6 @@ <!--Data interface--> - <xml name="tabular_input"> - <param name="infile" type="data" format="tabular" label="Data file with numeric values"/> - <param name="start_column" type="data_column" data_ref="infile" optional="True" label="Select a subset of data. Start column:" /> - <param name="end_column" type="data_column" data_ref="infile" optional="True" label="End column:" /> - </xml> - - <xml name="sample_cols" token_label1="File containing true class labels:" token_label2="File containing predicted class labels:" token_multiple1="False" token_multiple2="False" token_format1="tabular" token_format2="tabular" token_help1="" token_help2=""> - <param name="infile1" type="data" format="@FORMAT1@" label="@LABEL1@" help="@HELP1@"/> - <param name="col1" multiple="@MULTIPLE1@" type="data_column" data_ref="infile1" label="Select target column(s):"/> - <param name="infile2" type="data" format="@FORMAT2@" label="@LABEL2@" help="@HELP2@"/> - <param name="col2" multiple="@MULTIPLE2@" type="data_column" data_ref="infile2" label="Select target column(s):"/> - <yield/> - </xml> <xml name="samples_tabular" token_multiple1="false" token_multiple2="false"> <param name="infile1" type="data" format="tabular" label="Training samples dataset:"/> @@ -472,13 +460,13 @@ <param name="@COL_NAME@" multiple="@MULTIPLE@" type="data_column" data_ref="@INFILE@" label="Select target column(s):"/> </when> <when value="by_header_name"> - <param name="@COL_NAME@" type="text" value="" label="Type header name(s):" help="String seperate by colon. For example: target1,target2"/> + <param name="@COL_NAME@" type="text" value="" label="Type header name(s):" help="Comma-separated string. For example: target1,target2"/> </when> <when value="all_but_by_index_number"> <param name="@COL_NAME@" multiple="@MULTIPLE@" type="data_column" data_ref="@INFILE@" label="Select target column(s):"/> </when> <when value="all_but_by_header_name"> - <param name="@COL_NAME@" type="text" value="" label="Type header name(s):" help="String seperate by colon. For example: target1,target2"/> + <param name="@COL_NAME@" type="text" value="" label="Type header name(s):" help="Comma-separated string. For example: target1,target2"/> </when> <when value="all_columns"> </when> @@ -553,11 +541,6 @@ </conditional> </xml> - <xml name="multitype_input" token_format="tabular" token_help="All datasets with tabular format are supporetd."> - <param name="infile_transform" type="data" format="@FORMAT@" label="Select a dataset to transform:" help="@HELP@"/> - </xml> - - <!--Advanced options--> <xml name="nn_advanced_options"> <section name="options" title="Advanced Options" expanded="False"> @@ -822,9 +805,17 @@ </param> </xml> + <xml name="sparse_preprocessors_ext"> + <expand macro="sparse_preprocessors"> + <option value="KernelCenterer">Kernel Centerer (Centers a kernel matrix)</option> + <option value="MinMaxScaler">Minmax Scaler (Scales features to a range)</option> + <option value="PolynomialFeatures">Polynomial Features (Generates polynomial and interaction features)</option> + <option value="RobustScaler">Robust Scaler (Scales features using outlier-invariance statistics)</option> + </expand> + </xml> + <xml name="sparse_preprocessor_options"> <when value="Binarizer"> - <expand macro="multitype_input" format="tabular,txt" help="Tabular and sparse datasets are supporetd."/> <section name="options" title="Advanced Options" expanded="False"> <param argument="copy" type="boolean" optional="true" truevalue="booltrue" falsevalue="boolfalse" checked="true" label="Use a copy of data for precomputing binarization" help=" "/> @@ -834,7 +825,6 @@ </section> </when> <when value="Imputer"> - <expand macro="multitype_input" format="tabular,txt" help="Tabular and sparse datasets are supporetd."/> <section name="options" title="Advanced Options" expanded="False"> <param argument="copy" type="boolean" optional="true" truevalue="booltrue" falsevalue="boolfalse" checked="true" label="Use a copy of data for precomputing imputation" help=" "/> @@ -854,7 +844,6 @@ </section> </when> <when value="StandardScaler"> - <expand macro="multitype_input"/> <section name="options" title="Advanced Options" expanded="False"> <param argument="copy" type="boolean" optional="true" truevalue="booltrue" falsevalue="boolfalse" checked="true" label="Use a copy of data for performing inplace scaling" help=" "/> @@ -865,14 +854,12 @@ </section> </when> <when value="MaxAbsScaler"> - <expand macro="multitype_input" format="tabular,txt" help="Tabular and sparse datasets are supporetd."/> <section name="options" title="Advanced Options" expanded="False"> <param argument="copy" type="boolean" optional="true" truevalue="booltrue" falsevalue="boolfalse" checked="true" label="Use a copy of data for precomputing scaling" help=" "/> </section> </when> <when value="Normalizer"> - <expand macro="multitype_input" format="tabular,txt" help="Tabular and sparse datasets are supporetd."/> <section name="options" title="Advanced Options" expanded="False"> <param argument="norm" type="select" optional="true" label="The norm to use to normalize non zero samples" help=" "> <option value="l1" selected="true">l1</option> @@ -885,6 +872,41 @@ </when> <yield/> </xml> + + <xml name="sparse_preprocessor_options_ext"> + <expand macro="sparse_preprocessor_options"> + <when value="KernelCenterer"> + <section name="options" title="Advanced Options" expanded="False"> + </section> + </when> + <when value="MinMaxScaler"> + <section name="options" title="Advanced Options" expanded="False"> + <!--feature_range--> + <param argument="copy" type="boolean" optional="true" truevalue="booltrue" falsevalue="boolflase" checked="true" + label="Use a copy of data for precomputing normalization" help=" "/> + </section> + </when> + <when value="PolynomialFeatures"> + <section name="options" title="Advanced Options" expanded="False"> + <param argument="degree" type="integer" optional="true" value="2" label="The degree of the polynomial features " help=""/> + <param argument="interaction_only" type="boolean" optional="true" truevalue="booltrue" falsevalue="boolflase" checked="false" label="Produce interaction features only" help="(Features that are products of at most degree distinct input features) "/> + <param argument="include_bias" type="boolean" optional="true" truevalue="booltrue" falsevalue="boolflase" checked="true" label="Include a bias column" help="Feature in which all polynomial powers are zero "/> + </section> + </when> + <when value="RobustScaler"> + <section name="options" title="Advanced Options" expanded="False"> + <!--=True, =True, copy=True--> + <param argument="with_centering" type="boolean" optional="true" truevalue="booltrue" falsevalue="boolflase" checked="true" + label="Center the data before scaling" help=" "/> + <param argument="with_scaling" type="boolean" optional="true" truevalue="booltrue" falsevalue="boolflase" checked="true" + label="Scale the data to interquartile range" help=" "/> + <param argument="copy" type="boolean" optional="true" truevalue="booltrue" falsevalue="boolflase" checked="true" + label="Use a copy of data for inplace scaling" help=" "/> + </section> + </when> + </expand> + </xml> + <xml name="estimator_input_no_fit"> <expand macro="feature_selection_estimator" /> <conditional name="extra_estimator"> @@ -892,6 +914,7 @@ <expand macro="feature_selection_estimator_choices" /> </conditional> </xml> + <xml name="feature_selection_all"> <conditional name="feature_selection_algorithms"> <param name="selected_algorithm" type="select" label="Select a feature selection algorithm"> @@ -1014,6 +1037,7 @@ </when--> </conditional> </xml> + <xml name="feature_selection_score_function"> <param argument="score_func" type="select" label="Select a score function"> <option value="chi2">chi2 - Compute chi-squared stats between each non-negative feature and class</option> @@ -1023,6 +1047,7 @@ <option value="mutual_info_regression">mutual_info_regression - Estimate mutual information for a continuous target variable</option> </param> </xml> + <xml name="feature_selection_estimator"> <param argument="estimator" type="select" label="Select an estimator" help="The base estimator from which the transformer is built."> <option value="svm.SVR(kernel="linear")">svm.SVR(kernel="linear")</option> @@ -1032,6 +1057,7 @@ <option value="ensemble.RandomForestRegressor(n_estimators = 1000, random_state = 42)">ensemble.RandomForestRegressor(n_estimators = 1000, random_state = 42)</option> </param> </xml> + <xml name="feature_selection_extra_estimator"> <param name="has_estimator" type="select" label="Does your estimator on the list above?"> <option value="yes">Yes, my estimator is on the list</option> @@ -1039,6 +1065,7 @@ <yield/> </param> </xml> + <xml name="feature_selection_estimator_choices"> <when value="yes"> </when> @@ -1047,6 +1074,7 @@ </when> <yield/> </xml> + <xml name="feature_selection_methods"> <conditional name="select_methods"> <param name="selected_method" type="select" label="Select an operation">
--- a/pre_process.xml Tue Jul 10 03:12:09 2018 -0400 +++ b/pre_process.xml Fri Jul 13 03:55:44 2018 -0400 @@ -24,19 +24,32 @@ from scipy.io import mmwrite from sklearn import preprocessing +@COLUMNS_FUNCTION@ + input_json_path = sys.argv[1] -params = json.load(open(input_json_path, "r")) +with open(input_json_path, "r") as param_handler: + params = json.load(param_handler) #if $input_type.selected_input_type == "sparse": -X = mmread(open("$infile", 'r')) +X = mmread("$infile") #else: -X = pandas.read_csv("$infile", sep='\t', header=None, index_col=None, parse_dates=True, encoding=None, tupleize_cols=False ) -#end if - -#if $input_type.pre_processors.infile_transform.ext == 'txt': -y = mmread(open("$infile", 'r')) -#else: -y = pandas.read_csv("$infile", sep='\t', header=None, index_col=None, parse_dates=True, encoding=None, tupleize_cols=False ) +header = 'infer' if params["input_type"]["header1"] else None +column_option = params["input_type"]["column_selector_options_1"]["selected_column_selector_option"] +if column_option in ["by_index_number", "all_but_by_index_number", "by_header_name", "all_but_by_header_name"]: + c = params["input_type"]["column_selector_options_1"]["col1"] +else: + c = None +X = read_columns( + "$input_type.infile", + c = c, + c_option = column_option, + sep='\t', + header=header, + parse_dates=True, + encoding=None, + index_col=None, + tupleize_cols=False +) #end if preprocessor = params["input_type"]["pre_processors"]["selected_pre_processor"] @@ -45,17 +58,19 @@ my_class = getattr(preprocessing, preprocessor) estimator = my_class(**options) estimator.fit(X) -result = estimator.transform(y) +result = estimator.transform(X) -#if $input_type.pre_processors.infile_transform.ext == 'txt': -mmwrite(open("$outfile_transform" , 'w+'), result) +#if $input_type.selected_input_type == "sparse": +with open("$outfile_transform", "w+") as transform_handler: + mmwrite(transform_handler, result) #else: res = pandas.DataFrame(result) res.to_csv(path_or_buf = "$outfile_transform", sep="\t", index=False, header=None) #end if #if $save: -pickle.dump(estimator,open("$outfile_fit", 'w+'), pickle.HIGHEST_PROTOCOL) +with open("$outfile_fit", 'wb') as out_handler: + pickle.dump(estimator, out_handler, pickle.HIGHEST_PROTOCOL) #end if ]]> </configfile> @@ -67,49 +82,14 @@ <option value="sparse">Sparse</option> </param> <when value="tabular"> - <param name="infile" type="data" format="tabular" label="Select a tabular file you want to train your preprocessor on its data:"/> + <param name="infile" type="data" format="tabular" label="Select a tabular file you want to train your preprocessor on its data:" /> + <param name="header1" type="boolean" optional="true" truevalue="booltrue" falsevalue="boolfalse" checked="false" label="Does the dataset contain header:" /> + <conditional name="column_selector_options_1"> + <expand macro="samples_column_selector_options" multiple="true" column_option="selected_column_selector_option" col_name="col1" infile="infile"/> + </conditional> <conditional name="pre_processors"> - <expand macro="sparse_preprocessors"> - <option value="KernelCenterer">Kernel Centerer (Centers a kernel matrix)</option> - <option value="MinMaxScaler">Minmax Scaler (Scales features to a range)</option> - <option value="PolynomialFeatures">Polynomial Features (Generates polynomial and interaction features)</option> - <option value="RobustScaler">Robust Scaler (Scales features using outlier-invariance statistics)</option> - </expand> - <expand macro="sparse_preprocessor_options"> - <when value="KernelCenterer"> - <expand macro="multitype_input"/> - <section name="options" title="Advanced Options" expanded="False"> - </section> - </when> - <when value="MinMaxScaler"> - <expand macro="multitype_input"/> - <section name="options" title="Advanced Options" expanded="False"> - <!--feature_range--> - <param argument="copy" type="boolean" optional="true" truevalue="booltrue" falsevalue="boolflase" checked="true" - label="Use a copy of data for precomputing normalization" help=" "/> - </section> - </when> - <when value="PolynomialFeatures"> - <expand macro="multitype_input"/> - <section name="options" title="Advanced Options" expanded="False"> - <param argument="degree" type="integer" optional="true" value="2" label="The degree of the polynomial features " help=""/> - <param argument="interaction_only" type="boolean" optional="true" truevalue="booltrue" falsevalue="boolflase" checked="false" label="Produce interaction features only" help="(Features that are products of at most degree distinct input features) "/> - <param argument="include_bias" type="boolean" optional="true" truevalue="booltrue" falsevalue="boolflase" checked="true" label="Include a bias column" help="Feature in which all polynomial powers are zero "/> - </section> - </when> - <when value="RobustScaler"> - <expand macro="multitype_input"/> - <section name="options" title="Advanced Options" expanded="False"> - <!--=True, =True, copy=True--> - <param argument="with_centering" type="boolean" optional="true" truevalue="booltrue" falsevalue="boolflase" checked="true" - label="Center the data before scaling" help=" "/> - <param argument="with_scaling" type="boolean" optional="true" truevalue="booltrue" falsevalue="boolflase" checked="true" - label="Scale the data to interquartile range" help=" "/> - <param argument="copy" type="boolean" optional="true" truevalue="booltrue" falsevalue="boolflase" checked="true" - label="Use a copy of data for inplace scaling" help=" "/> - </section> - </when> - </expand> + <expand macro="sparse_preprocessors_ext" /> + <expand macro="sparse_preprocessor_options_ext" /> </conditional> </when> <when value="sparse"> @@ -133,7 +113,7 @@ <tests> <test> <param name="infile" value="train.tabular" ftype="tabular"/> - <param name="infile_transform" value="train.tabular" ftype="tabular"/> + <param name="selected_column_selector_option" value="all_columns"/> <param name="selected_input_type" value="tabular"/> <param name="selected_pre_processor" value="KernelCenterer"/> <param name="save" value="true"/> @@ -142,7 +122,7 @@ </test> <test> <param name="infile" value="train.tabular" ftype="tabular"/> - <param name="infile_transform" value="train.tabular" ftype="tabular"/> + <param name="selected_column_selector_option" value="all_columns"/> <param name="selected_input_type" value="tabular"/> <param name="selected_pre_processor" value="MinMaxScaler"/> <param name="save" value="true"/> @@ -151,7 +131,7 @@ </test> <test> <param name="infile" value="train.tabular" ftype="tabular"/> - <param name="infile_transform" value="train.tabular" ftype="tabular"/> + <param name="selected_column_selector_option" value="all_columns"/> <param name="selected_input_type" value="tabular"/> <param name="selected_pre_processor" value="PolynomialFeatures"/> <param name="save" value="true"/> @@ -160,7 +140,7 @@ </test> <test> <param name="infile" value="train.tabular" ftype="tabular"/> - <param name="infile_transform" value="train.tabular" ftype="tabular"/> + <param name="selected_column_selector_option" value="all_columns"/> <param name="selected_input_type" value="tabular"/> <param name="selected_pre_processor" value="RobustScaler"/> <param name="save" value="true"/> @@ -169,7 +149,6 @@ </test> <test> <param name="infile" value="csr_sparse2.mtx" ftype="txt"/> - <param name="infile_transform" value="csr_sparse2.mtx" ftype="txt"/> <param name="selected_input_type" value="sparse"/> <param name="selected_pre_processor" value="Binarizer"/> <param name="save" value="true"/> @@ -178,7 +157,6 @@ </test> <test> <param name="infile" value="csr_sparse2.mtx" ftype="txt"/> - <param name="infile_transform" value="csr_sparse2.mtx" ftype="txt"/> <param name="selected_input_type" value="sparse"/> <param name="selected_pre_processor" value="Imputer"/> <param name="save" value="true"/> @@ -188,8 +166,8 @@ </test> <test> <param name="infile" value="train.tabular" ftype="tabular"/> - <param name="infile_transform" value="train.tabular" ftype="tabular"/> <param name="selected_input_type" value="tabular"/> + <param name="selected_column_selector_option" value="all_columns"/> <param name="selected_pre_processor" value="StandardScaler"/> <param name="save" value="true"/> <output name="outfile_transform" file="prp_result07" ftype="tabular"/> @@ -197,7 +175,6 @@ </test> <test> <param name="infile" value="csr_sparse2.mtx" ftype="txt"/> - <param name="infile_transform" value="csr_sparse2.mtx" ftype="txt"/> <param name="selected_input_type" value="sparse"/> <param name="selected_pre_processor" value="MaxAbsScaler"/> <param name="save" value="true"/> @@ -206,7 +183,6 @@ </test> <test> <param name="infile" value="csr_sparse2.mtx" ftype="txt"/> - <param name="infile_transform" value="csr_sparse2.mtx" ftype="txt"/> <param name="selected_input_type" value="sparse"/> <param name="selected_pre_processor" value="Normalizer"/> <param name="save" value="true"/>