Mercurial > repos > bgruening > sklearn_data_preprocess
comparison pre_process.xml @ 15:dad38f036e83 draft
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit f54ff2ba2f8e7542d68966ce5a6b17d7f624ac48
author | bgruening |
---|---|
date | Fri, 13 Jul 2018 03:55:44 -0400 |
parents | 29899feb4d44 |
children | f196d4715cfb |
comparison
equal
deleted
inserted
replaced
14:f9def78f6cd5 | 15:dad38f036e83 |
---|---|
22 import numpy as np | 22 import numpy as np |
23 from scipy.io import mmread | 23 from scipy.io import mmread |
24 from scipy.io import mmwrite | 24 from scipy.io import mmwrite |
25 from sklearn import preprocessing | 25 from sklearn import preprocessing |
26 | 26 |
27 @COLUMNS_FUNCTION@ | |
28 | |
27 input_json_path = sys.argv[1] | 29 input_json_path = sys.argv[1] |
28 params = json.load(open(input_json_path, "r")) | 30 with open(input_json_path, "r") as param_handler: |
31 params = json.load(param_handler) | |
29 | 32 |
30 #if $input_type.selected_input_type == "sparse": | 33 #if $input_type.selected_input_type == "sparse": |
31 X = mmread(open("$infile", 'r')) | 34 X = mmread("$infile") |
32 #else: | 35 #else: |
33 X = pandas.read_csv("$infile", sep='\t', header=None, index_col=None, parse_dates=True, encoding=None, tupleize_cols=False ) | 36 header = 'infer' if params["input_type"]["header1"] else None |
34 #end if | 37 column_option = params["input_type"]["column_selector_options_1"]["selected_column_selector_option"] |
35 | 38 if column_option in ["by_index_number", "all_but_by_index_number", "by_header_name", "all_but_by_header_name"]: |
36 #if $input_type.pre_processors.infile_transform.ext == 'txt': | 39 c = params["input_type"]["column_selector_options_1"]["col1"] |
37 y = mmread(open("$infile", 'r')) | 40 else: |
38 #else: | 41 c = None |
39 y = pandas.read_csv("$infile", sep='\t', header=None, index_col=None, parse_dates=True, encoding=None, tupleize_cols=False ) | 42 X = read_columns( |
43 "$input_type.infile", | |
44 c = c, | |
45 c_option = column_option, | |
46 sep='\t', | |
47 header=header, | |
48 parse_dates=True, | |
49 encoding=None, | |
50 index_col=None, | |
51 tupleize_cols=False | |
52 ) | |
40 #end if | 53 #end if |
41 | 54 |
42 preprocessor = params["input_type"]["pre_processors"]["selected_pre_processor"] | 55 preprocessor = params["input_type"]["pre_processors"]["selected_pre_processor"] |
43 options = params["input_type"]["pre_processors"]["options"] | 56 options = params["input_type"]["pre_processors"]["options"] |
44 | 57 |
45 my_class = getattr(preprocessing, preprocessor) | 58 my_class = getattr(preprocessing, preprocessor) |
46 estimator = my_class(**options) | 59 estimator = my_class(**options) |
47 estimator.fit(X) | 60 estimator.fit(X) |
48 result = estimator.transform(y) | 61 result = estimator.transform(X) |
49 | 62 |
50 #if $input_type.pre_processors.infile_transform.ext == 'txt': | 63 #if $input_type.selected_input_type == "sparse": |
51 mmwrite(open("$outfile_transform" , 'w+'), result) | 64 with open("$outfile_transform", "w+") as transform_handler: |
65 mmwrite(transform_handler, result) | |
52 #else: | 66 #else: |
53 res = pandas.DataFrame(result) | 67 res = pandas.DataFrame(result) |
54 res.to_csv(path_or_buf = "$outfile_transform", sep="\t", index=False, header=None) | 68 res.to_csv(path_or_buf = "$outfile_transform", sep="\t", index=False, header=None) |
55 #end if | 69 #end if |
56 | 70 |
57 #if $save: | 71 #if $save: |
58 pickle.dump(estimator,open("$outfile_fit", 'w+'), pickle.HIGHEST_PROTOCOL) | 72 with open("$outfile_fit", 'wb') as out_handler: |
73 pickle.dump(estimator, out_handler, pickle.HIGHEST_PROTOCOL) | |
59 #end if | 74 #end if |
60 ]]> | 75 ]]> |
61 </configfile> | 76 </configfile> |
62 </configfiles> | 77 </configfiles> |
63 <inputs> | 78 <inputs> |
65 <param name="selected_input_type" type="select" label="Select the type of your input data:"> | 80 <param name="selected_input_type" type="select" label="Select the type of your input data:"> |
66 <option value="tabular" selected="true">Tabular</option> | 81 <option value="tabular" selected="true">Tabular</option> |
67 <option value="sparse">Sparse</option> | 82 <option value="sparse">Sparse</option> |
68 </param> | 83 </param> |
69 <when value="tabular"> | 84 <when value="tabular"> |
70 <param name="infile" type="data" format="tabular" label="Select a tabular file you want to train your preprocessor on its data:"/> | 85 <param name="infile" type="data" format="tabular" label="Select a tabular file you want to train your preprocessor on its data:" /> |
86 <param name="header1" type="boolean" optional="true" truevalue="booltrue" falsevalue="boolfalse" checked="false" label="Does the dataset contain header:" /> | |
87 <conditional name="column_selector_options_1"> | |
88 <expand macro="samples_column_selector_options" multiple="true" column_option="selected_column_selector_option" col_name="col1" infile="infile"/> | |
89 </conditional> | |
71 <conditional name="pre_processors"> | 90 <conditional name="pre_processors"> |
72 <expand macro="sparse_preprocessors"> | 91 <expand macro="sparse_preprocessors_ext" /> |
73 <option value="KernelCenterer">Kernel Centerer (Centers a kernel matrix)</option> | 92 <expand macro="sparse_preprocessor_options_ext" /> |
74 <option value="MinMaxScaler">Minmax Scaler (Scales features to a range)</option> | |
75 <option value="PolynomialFeatures">Polynomial Features (Generates polynomial and interaction features)</option> | |
76 <option value="RobustScaler">Robust Scaler (Scales features using outlier-invariance statistics)</option> | |
77 </expand> | |
78 <expand macro="sparse_preprocessor_options"> | |
79 <when value="KernelCenterer"> | |
80 <expand macro="multitype_input"/> | |
81 <section name="options" title="Advanced Options" expanded="False"> | |
82 </section> | |
83 </when> | |
84 <when value="MinMaxScaler"> | |
85 <expand macro="multitype_input"/> | |
86 <section name="options" title="Advanced Options" expanded="False"> | |
87 <!--feature_range--> | |
88 <param argument="copy" type="boolean" optional="true" truevalue="booltrue" falsevalue="boolflase" checked="true" | |
89 label="Use a copy of data for precomputing normalization" help=" "/> | |
90 </section> | |
91 </when> | |
92 <when value="PolynomialFeatures"> | |
93 <expand macro="multitype_input"/> | |
94 <section name="options" title="Advanced Options" expanded="False"> | |
95 <param argument="degree" type="integer" optional="true" value="2" label="The degree of the polynomial features " help=""/> | |
96 <param argument="interaction_only" type="boolean" optional="true" truevalue="booltrue" falsevalue="boolflase" checked="false" label="Produce interaction features only" help="(Features that are products of at most degree distinct input features) "/> | |
97 <param argument="include_bias" type="boolean" optional="true" truevalue="booltrue" falsevalue="boolflase" checked="true" label="Include a bias column" help="Feature in which all polynomial powers are zero "/> | |
98 </section> | |
99 </when> | |
100 <when value="RobustScaler"> | |
101 <expand macro="multitype_input"/> | |
102 <section name="options" title="Advanced Options" expanded="False"> | |
103 <!--=True, =True, copy=True--> | |
104 <param argument="with_centering" type="boolean" optional="true" truevalue="booltrue" falsevalue="boolflase" checked="true" | |
105 label="Center the data before scaling" help=" "/> | |
106 <param argument="with_scaling" type="boolean" optional="true" truevalue="booltrue" falsevalue="boolflase" checked="true" | |
107 label="Scale the data to interquartile range" help=" "/> | |
108 <param argument="copy" type="boolean" optional="true" truevalue="booltrue" falsevalue="boolflase" checked="true" | |
109 label="Use a copy of data for inplace scaling" help=" "/> | |
110 </section> | |
111 </when> | |
112 </expand> | |
113 </conditional> | 93 </conditional> |
114 </when> | 94 </when> |
115 <when value="sparse"> | 95 <when value="sparse"> |
116 <param name="infile" type="data" format="txt" label="Select a sparse representation you want to train your preprocessor on its data:"/> | 96 <param name="infile" type="data" format="txt" label="Select a sparse representation you want to train your preprocessor on its data:"/> |
117 <conditional name="pre_processors"> | 97 <conditional name="pre_processors"> |
131 </data> | 111 </data> |
132 </outputs> | 112 </outputs> |
133 <tests> | 113 <tests> |
134 <test> | 114 <test> |
135 <param name="infile" value="train.tabular" ftype="tabular"/> | 115 <param name="infile" value="train.tabular" ftype="tabular"/> |
136 <param name="infile_transform" value="train.tabular" ftype="tabular"/> | 116 <param name="selected_column_selector_option" value="all_columns"/> |
137 <param name="selected_input_type" value="tabular"/> | 117 <param name="selected_input_type" value="tabular"/> |
138 <param name="selected_pre_processor" value="KernelCenterer"/> | 118 <param name="selected_pre_processor" value="KernelCenterer"/> |
139 <param name="save" value="true"/> | 119 <param name="save" value="true"/> |
140 <output name="outfile_transform" file="prp_result01" ftype="tabular"/> | 120 <output name="outfile_transform" file="prp_result01" ftype="tabular"/> |
141 <output name="outfile_fit" file="prp_model01" ftype="zip" compare="sim_size" delta="500"/> | 121 <output name="outfile_fit" file="prp_model01" ftype="zip" compare="sim_size" delta="500"/> |
142 </test> | 122 </test> |
143 <test> | 123 <test> |
144 <param name="infile" value="train.tabular" ftype="tabular"/> | 124 <param name="infile" value="train.tabular" ftype="tabular"/> |
145 <param name="infile_transform" value="train.tabular" ftype="tabular"/> | 125 <param name="selected_column_selector_option" value="all_columns"/> |
146 <param name="selected_input_type" value="tabular"/> | 126 <param name="selected_input_type" value="tabular"/> |
147 <param name="selected_pre_processor" value="MinMaxScaler"/> | 127 <param name="selected_pre_processor" value="MinMaxScaler"/> |
148 <param name="save" value="true"/> | 128 <param name="save" value="true"/> |
149 <output name="outfile_transform" file="prp_result02" ftype="tabular"/> | 129 <output name="outfile_transform" file="prp_result02" ftype="tabular"/> |
150 <output name="outfile_fit" file="prp_model02" ftype="zip" compare="sim_size" delta="500"/> | 130 <output name="outfile_fit" file="prp_model02" ftype="zip" compare="sim_size" delta="500"/> |
151 </test> | 131 </test> |
152 <test> | 132 <test> |
153 <param name="infile" value="train.tabular" ftype="tabular"/> | 133 <param name="infile" value="train.tabular" ftype="tabular"/> |
154 <param name="infile_transform" value="train.tabular" ftype="tabular"/> | 134 <param name="selected_column_selector_option" value="all_columns"/> |
155 <param name="selected_input_type" value="tabular"/> | 135 <param name="selected_input_type" value="tabular"/> |
156 <param name="selected_pre_processor" value="PolynomialFeatures"/> | 136 <param name="selected_pre_processor" value="PolynomialFeatures"/> |
157 <param name="save" value="true"/> | 137 <param name="save" value="true"/> |
158 <output name="outfile_transform" file="prp_result03" ftype="tabular"/> | 138 <output name="outfile_transform" file="prp_result03" ftype="tabular"/> |
159 <output name="outfile_fit" file="prp_model03" ftype="zip" compare="sim_size" delta="500"/> | 139 <output name="outfile_fit" file="prp_model03" ftype="zip" compare="sim_size" delta="500"/> |
160 </test> | 140 </test> |
161 <test> | 141 <test> |
162 <param name="infile" value="train.tabular" ftype="tabular"/> | 142 <param name="infile" value="train.tabular" ftype="tabular"/> |
163 <param name="infile_transform" value="train.tabular" ftype="tabular"/> | 143 <param name="selected_column_selector_option" value="all_columns"/> |
164 <param name="selected_input_type" value="tabular"/> | 144 <param name="selected_input_type" value="tabular"/> |
165 <param name="selected_pre_processor" value="RobustScaler"/> | 145 <param name="selected_pre_processor" value="RobustScaler"/> |
166 <param name="save" value="true"/> | 146 <param name="save" value="true"/> |
167 <output name="outfile_transform" file="prp_result04" ftype="tabular"/> | 147 <output name="outfile_transform" file="prp_result04" ftype="tabular"/> |
168 <output name="outfile_fit" file="prp_model04" ftype="zip" compare="sim_size" delta="500"/> | 148 <output name="outfile_fit" file="prp_model04" ftype="zip" compare="sim_size" delta="500"/> |
169 </test> | 149 </test> |
170 <test> | 150 <test> |
171 <param name="infile" value="csr_sparse2.mtx" ftype="txt"/> | 151 <param name="infile" value="csr_sparse2.mtx" ftype="txt"/> |
172 <param name="infile_transform" value="csr_sparse2.mtx" ftype="txt"/> | |
173 <param name="selected_input_type" value="sparse"/> | 152 <param name="selected_input_type" value="sparse"/> |
174 <param name="selected_pre_processor" value="Binarizer"/> | 153 <param name="selected_pre_processor" value="Binarizer"/> |
175 <param name="save" value="true"/> | 154 <param name="save" value="true"/> |
176 <output name="outfile_transform" file="prp_result05" ftype="tabular"/> | 155 <output name="outfile_transform" file="prp_result05" ftype="tabular"/> |
177 <output name="outfile_fit" file="prp_model05" ftype="zip" compare="sim_size" delta="500"/> | 156 <output name="outfile_fit" file="prp_model05" ftype="zip" compare="sim_size" delta="500"/> |
178 </test> | 157 </test> |
179 <test> | 158 <test> |
180 <param name="infile" value="csr_sparse2.mtx" ftype="txt"/> | 159 <param name="infile" value="csr_sparse2.mtx" ftype="txt"/> |
181 <param name="infile_transform" value="csr_sparse2.mtx" ftype="txt"/> | |
182 <param name="selected_input_type" value="sparse"/> | 160 <param name="selected_input_type" value="sparse"/> |
183 <param name="selected_pre_processor" value="Imputer"/> | 161 <param name="selected_pre_processor" value="Imputer"/> |
184 <param name="save" value="true"/> | 162 <param name="save" value="true"/> |
185 <param name="axis" value="true"/> | 163 <param name="axis" value="true"/> |
186 <output name="outfile_transform" file="prp_result06" ftype="tabular"/> | 164 <output name="outfile_transform" file="prp_result06" ftype="tabular"/> |
187 <output name="outfile_fit" file="prp_model06" ftype="zip" compare="sim_size" delta="500"/> | 165 <output name="outfile_fit" file="prp_model06" ftype="zip" compare="sim_size" delta="500"/> |
188 </test> | 166 </test> |
189 <test> | 167 <test> |
190 <param name="infile" value="train.tabular" ftype="tabular"/> | 168 <param name="infile" value="train.tabular" ftype="tabular"/> |
191 <param name="infile_transform" value="train.tabular" ftype="tabular"/> | 169 <param name="selected_input_type" value="tabular"/> |
192 <param name="selected_input_type" value="tabular"/> | 170 <param name="selected_column_selector_option" value="all_columns"/> |
193 <param name="selected_pre_processor" value="StandardScaler"/> | 171 <param name="selected_pre_processor" value="StandardScaler"/> |
194 <param name="save" value="true"/> | 172 <param name="save" value="true"/> |
195 <output name="outfile_transform" file="prp_result07" ftype="tabular"/> | 173 <output name="outfile_transform" file="prp_result07" ftype="tabular"/> |
196 <output name="outfile_fit" file="prp_model07" ftype="zip" compare="sim_size" delta="500"/> | 174 <output name="outfile_fit" file="prp_model07" ftype="zip" compare="sim_size" delta="500"/> |
197 </test> | 175 </test> |
198 <test> | 176 <test> |
199 <param name="infile" value="csr_sparse2.mtx" ftype="txt"/> | 177 <param name="infile" value="csr_sparse2.mtx" ftype="txt"/> |
200 <param name="infile_transform" value="csr_sparse2.mtx" ftype="txt"/> | |
201 <param name="selected_input_type" value="sparse"/> | 178 <param name="selected_input_type" value="sparse"/> |
202 <param name="selected_pre_processor" value="MaxAbsScaler"/> | 179 <param name="selected_pre_processor" value="MaxAbsScaler"/> |
203 <param name="save" value="true"/> | 180 <param name="save" value="true"/> |
204 <output name="outfile_transform" file="prp_result08" ftype="tabular"/> | 181 <output name="outfile_transform" file="prp_result08" ftype="tabular"/> |
205 <output name="outfile_fit" file="prp_model08" ftype="zip" compare="sim_size" delta="500"/> | 182 <output name="outfile_fit" file="prp_model08" ftype="zip" compare="sim_size" delta="500"/> |
206 </test> | 183 </test> |
207 <test> | 184 <test> |
208 <param name="infile" value="csr_sparse2.mtx" ftype="txt"/> | 185 <param name="infile" value="csr_sparse2.mtx" ftype="txt"/> |
209 <param name="infile_transform" value="csr_sparse2.mtx" ftype="txt"/> | |
210 <param name="selected_input_type" value="sparse"/> | 186 <param name="selected_input_type" value="sparse"/> |
211 <param name="selected_pre_processor" value="Normalizer"/> | 187 <param name="selected_pre_processor" value="Normalizer"/> |
212 <param name="save" value="true"/> | 188 <param name="save" value="true"/> |
213 <output name="outfile_transform" file="prp_result09" ftype="tabular"/> | 189 <output name="outfile_transform" file="prp_result09" ftype="tabular"/> |
214 <output name="outfile_fit" file="prp_model09" ftype="zip" compare="sim_size" delta="500"/> | 190 <output name="outfile_fit" file="prp_model09" ftype="zip" compare="sim_size" delta="500"/> |