Mercurial > repos > bgruening > sklearn_ensemble
comparison ensemble.xml @ 5:f1761288587e draft
planemo upload for repository https://github.com/bgruening/galaxytools/tools/sklearn commit 35fa73d6e9ba8f0789ddfb743d893d950a68af02
author | bgruening |
---|---|
date | Tue, 10 Apr 2018 15:18:51 -0400 |
parents | 0431274c367d |
children | cd595710f0c0 |
comparison
equal
deleted
inserted
replaced
4:0431274c367d | 5:f1761288587e |
---|---|
29 | 29 |
30 #if $selected_tasks.selected_task == "train": | 30 #if $selected_tasks.selected_task == "train": |
31 | 31 |
32 algorithm = params["selected_tasks"]["selected_algorithms"]["selected_algorithm"] | 32 algorithm = params["selected_tasks"]["selected_algorithms"]["selected_algorithm"] |
33 options = params["selected_tasks"]["selected_algorithms"]["options"] | 33 options = params["selected_tasks"]["selected_algorithms"]["options"] |
34 if "select_max_features" in options: | |
35 if options["select_max_features"]["max_features"] == "number_input": | |
36 options["select_max_features"]["max_features"] = options["select_max_features"]["num_max_features"] | |
37 options["select_max_features"].pop("num_max_features") | |
38 options["max_features"] = options["select_max_features"]["max_features"] | |
39 options.pop("select_max_features") | |
40 if "presort" in options: | |
41 if options["presort"] == "true": | |
42 options["presort"] = True | |
43 if options["presort"] == "false": | |
44 options["presort"] = False | |
45 if "min_samples_leaf" in options and options["min_samples_leaf"] == 1.0: | |
46 options["min_samples_leaf"] = 1 | |
47 if "min_samples_split" in options and options["min_samples_split"] > 1.0: | |
48 options["min_samples_split"] = int(options["min_samples_split"]) | |
34 input_type = params["selected_tasks"]["selected_algorithms"]["input_options"]["selected_input"] | 49 input_type = params["selected_tasks"]["selected_algorithms"]["input_options"]["selected_input"] |
35 if input_type=="tabular": | 50 if input_type=="tabular": |
36 header = 'infer' if params["selected_tasks"]["selected_algorithms"]["input_options"]["header1"] else None | 51 header = 'infer' if params["selected_tasks"]["selected_algorithms"]["input_options"]["header1"] else None |
37 X = read_columns( | 52 X = read_columns( |
38 "$selected_tasks.selected_algorithms.input_options.infile1", | 53 "$selected_tasks.selected_algorithms.input_options.infile1", |
50 "$selected_tasks.selected_algorithms.input_options.col2", | 65 "$selected_tasks.selected_algorithms.input_options.col2", |
51 sep='\t', | 66 sep='\t', |
52 header=header, | 67 header=header, |
53 parse_dates=True | 68 parse_dates=True |
54 ) | 69 ) |
70 y=y.ravel() | |
55 | 71 |
56 my_class = getattr(sklearn.ensemble, algorithm) | 72 my_class = getattr(sklearn.ensemble, algorithm) |
57 estimator = my_class(**options) | 73 estimator = my_class(**options) |
58 estimator.fit(X,y) | 74 estimator.fit(X,y) |
59 pickle.dump(estimator,open("$outfile_fit", 'w+'), pickle.HIGHEST_PROTOCOL) | 75 pickle.dump(estimator,open("$outfile_fit", 'w+'), pickle.HIGHEST_PROTOCOL) |
60 | 76 |
61 #else: | 77 #else: |
62 classifier_object = pickle.load(open("$selected_tasks.infile_model", 'r')) | 78 classifier_object = pickle.load(open("$selected_tasks.infile_model", 'r')) |
63 data = pandas.read_csv("$selected_tasks.infile_data", sep='\t', header=0, index_col=None, parse_dates=True, encoding=None, tupleize_cols=False) | 79 header = 'infer' if params["selected_tasks"]["header"] else None |
80 data = pandas.read_csv("$selected_tasks.infile_data", sep='\t', header=header, index_col=None, parse_dates=True, encoding=None, tupleize_cols=False) | |
64 prediction = classifier_object.predict(data) | 81 prediction = classifier_object.predict(data) |
65 prediction_df = pandas.DataFrame(prediction) | 82 prediction_df = pandas.DataFrame(prediction) |
66 res = pandas.concat([data, prediction_df], axis=1) | 83 res = pandas.concat([data, prediction_df], axis=1) |
67 res.to_csv(path_or_buf = "$outfile_predict", sep="\t", index=False) | 84 res.to_csv(path_or_buf = "$outfile_predict", sep="\t", index=False) |
68 #end if | 85 #end if |
73 <inputs> | 90 <inputs> |
74 <expand macro="sl_Conditional" model="zip"> | 91 <expand macro="sl_Conditional" model="zip"> |
75 <param name="selected_algorithm" type="select" label="Select an ensemble method:"> | 92 <param name="selected_algorithm" type="select" label="Select an ensemble method:"> |
76 <option value="RandomForestClassifier" selected="true">Random forest classifier</option> | 93 <option value="RandomForestClassifier" selected="true">Random forest classifier</option> |
77 <option value="AdaBoostClassifier">Ada boost classifier</option> | 94 <option value="AdaBoostClassifier">Ada boost classifier</option> |
95 <option value="GradientBoostingClassifier">Gradient Boosting Classifier</option> | |
78 <option value="RandomForestRegressor">Random forest regressor</option> | 96 <option value="RandomForestRegressor">Random forest regressor</option> |
79 <option value="AdaBoostRegressor">Ada boost regressor</option> | 97 <option value="AdaBoostRegressor">Ada boost regressor</option> |
98 <option value="GradientBoostingRegressor">Gradient Boosting Regressor</option> | |
80 </param> | 99 </param> |
81 <when value="RandomForestClassifier"> | 100 <when value="RandomForestClassifier"> |
82 <expand macro="sl_mixed_input"/> | 101 <expand macro="sl_mixed_input"/> |
83 <section name="options" title="Advanced Options" expanded="False"> | 102 <section name="options" title="Advanced Options" expanded="False"> |
84 <expand macro="n_estimators"/> | 103 <expand macro="n_estimators"/> |
89 <expand macro="min_samples_leaf"/> | 108 <expand macro="min_samples_leaf"/> |
90 <expand macro="min_weight_fraction_leaf"/> | 109 <expand macro="min_weight_fraction_leaf"/> |
91 <expand macro="max_leaf_nodes"/> | 110 <expand macro="max_leaf_nodes"/> |
92 <expand macro="bootstrap"/> | 111 <expand macro="bootstrap"/> |
93 <expand macro="warm_start" checked="false"/> | 112 <expand macro="warm_start" checked="false"/> |
113 <expand macro="n_jobs"/> | |
94 <expand macro="random_state"/> | 114 <expand macro="random_state"/> |
95 <expand macro="oob_score"/> | 115 <expand macro="oob_score"/> |
96 <!--class_weight=None--> | 116 <!--class_weight=None--> |
97 </section> | 117 </section> |
98 </when> | 118 </when> |
107 <option value="SAMME">SAMME</option> | 127 <option value="SAMME">SAMME</option> |
108 </param> | 128 </param> |
109 <expand macro="random_state"/> | 129 <expand macro="random_state"/> |
110 </section> | 130 </section> |
111 </when> | 131 </when> |
132 <when value="GradientBoostingClassifier"> | |
133 <expand macro="sl_mixed_input"/> | |
134 <section name="options" title="Advanced Options" expanded="False"> | |
135 <!--base_estimator=None--> | |
136 <param argument="loss" type="select" label="Loss function"> | |
137 <option value="deviance" selected="true">deviance - logistic regression with probabilistic outputs</option> | |
138 <option value="exponential">exponential - gradient boosting recovers the AdaBoost algorithm</option> | |
139 </param> | |
140 <expand macro="learning_rate" default_value='0.1'/> | |
141 <expand macro="n_estimators" default_value="100" help="The number of boosting stages to perform"/> | |
142 <expand macro="max_depth" default_value="3" help="maximum depth of the individual regression estimators"/> | |
143 <expand macro="criterion2"> | |
144 <option value="friedman_mse" selected="true">friedman_mse - mean squared error with improvement score by Friedman</option> | |
145 </expand> | |
146 <expand macro="min_samples_split" type="float"/> | |
147 <expand macro="min_samples_leaf" type="float" label="The minimum number of samples required to be at a leaf node"/> | |
148 <expand macro="min_weight_fraction_leaf"/> | |
149 <expand macro="subsample"/> | |
150 <expand macro="max_features"/> | |
151 <expand macro="max_leaf_nodes"/> | |
152 <expand macro="min_impurity_decrease"/> | |
153 <expand macro="verbose"/> | |
154 <expand macro="warm_start" checked="false"/> | |
155 <expand macro="random_state"/> | |
156 <expand macro="presort"/> | |
157 </section> | |
158 </when> | |
112 <when value="RandomForestRegressor"> | 159 <when value="RandomForestRegressor"> |
113 <expand macro="sl_mixed_input"/> | 160 <expand macro="sl_mixed_input"/> |
114 <section name="options" title="Advanced Options" expanded="False"> | 161 <section name="options" title="Advanced Options" expanded="False"> |
115 <expand macro="n_estimators"/> | 162 <expand macro="n_estimators"/> |
163 <expand macro="criterion2"/> | |
116 <expand macro="max_features"/> | 164 <expand macro="max_features"/> |
117 <expand macro="max_depth"/> | 165 <expand macro="max_depth"/> |
118 <expand macro="min_samples_split"/> | 166 <expand macro="min_samples_split"/> |
119 <expand macro="min_samples_leaf"/> | 167 <expand macro="min_samples_leaf"/> |
120 <expand macro="min_weight_fraction_leaf"/> | 168 <expand macro="min_weight_fraction_leaf"/> |
121 <expand macro="max_leaf_nodes"/> | 169 <expand macro="max_leaf_nodes"/> |
170 <expand macro="min_impurity_decrease"/> | |
122 <expand macro="bootstrap"/> | 171 <expand macro="bootstrap"/> |
172 <expand macro="oob_score"/> | |
173 <expand macro="n_jobs"/> | |
174 <expand macro="random_state"/> | |
175 <expand macro="verbose"/> | |
123 <expand macro="warm_start" checked="false"/> | 176 <expand macro="warm_start" checked="false"/> |
124 <expand macro="random_state"/> | |
125 <expand macro="oob_score"/> | |
126 </section> | 177 </section> |
127 </when> | 178 </when> |
128 <when value="AdaBoostRegressor"> | 179 <when value="AdaBoostRegressor"> |
129 <expand macro="sl_mixed_input"/> | 180 <expand macro="sl_mixed_input"/> |
130 <section name="options" title="Advanced Options" expanded="False"> | 181 <section name="options" title="Advanced Options" expanded="False"> |
137 <option value="exponential">exponential</option> | 188 <option value="exponential">exponential</option> |
138 </param> | 189 </param> |
139 <expand macro="random_state"/> | 190 <expand macro="random_state"/> |
140 </section> | 191 </section> |
141 </when> | 192 </when> |
193 <when value="GradientBoostingRegressor"> | |
194 <expand macro="sl_mixed_input"/> | |
195 <section name="options" title="Advanced Options" expanded="False"> | |
196 <param argument="loss" type="select" label="Loss function"> | |
197 <option value="ls" selected="true">ls - least squares regression</option> | |
198 <option value="lad">lad - least absolute deviation</option> | |
199 <option value="huber">huber - combination of least squares regression and least absolute deviation</option> | |
200 <option value="quantile">quantile - use alpha to specify the quantile</option> | |
201 </param> | |
202 <expand macro="learning_rate" default_value="0.1"/> | |
203 <expand macro="n_estimators" default_value="100" help="The number of boosting stages to perform"/> | |
204 <expand macro="max_depth" default_value="3" help="maximum depth of the individual regression estimators"/> | |
205 <expand macro="criterion2"> | |
206 <option value="friedman_mse" selected="true">friedman_mse - mean squared error with improvement score by Friedman</option> | |
207 </expand> | |
208 <expand macro="min_samples_split" type="float"/> | |
209 <expand macro="min_samples_leaf" type="float" label="The minimum number of samples required to be at a leaf node"/> | |
210 <expand macro="min_weight_fraction_leaf"/> | |
211 <expand macro="subsample"/> | |
212 <expand macro="max_features"/> | |
213 <expand macro="max_leaf_nodes"/> | |
214 <expand macro="min_impurity_decrease"/> | |
215 <param argument="alpha" type="float" value="0.9" label="alpha" help="The alpha-quantile of the huber loss function and the quantile loss function" /> | |
216 <!--base_estimator=None--> | |
217 <expand macro="verbose"/> | |
218 <expand macro="warm_start" checked="false"/> | |
219 <expand macro="random_state"/> | |
220 <expand macro="presort"/> | |
221 </section> | |
222 </when> | |
142 </expand> | 223 </expand> |
143 </inputs> | 224 </inputs> |
144 | 225 |
145 <expand macro="output"/> | 226 <expand macro="output"/> |
146 | 227 |
159 <param name="infile_model" value="rfc_model01" ftype="zip"/> | 240 <param name="infile_model" value="rfc_model01" ftype="zip"/> |
160 <param name="infile_data" value="test.tabular" ftype="tabular"/> | 241 <param name="infile_data" value="test.tabular" ftype="tabular"/> |
161 <param name="selected_task" value="load"/> | 242 <param name="selected_task" value="load"/> |
162 <output name="outfile_predict" file="rfc_result01" compare="sim_size" delta="500"/> | 243 <output name="outfile_predict" file="rfc_result01" compare="sim_size" delta="500"/> |
163 </test> | 244 </test> |
164 | |
165 <test> | 245 <test> |
166 <param name="infile1" value="regression_train.tabular" ftype="tabular"/> | 246 <param name="infile1" value="regression_train.tabular" ftype="tabular"/> |
167 <param name="infile2" value="regression_train.tabular" ftype="tabular"/> | 247 <param name="infile2" value="regression_train.tabular" ftype="tabular"/> |
168 <param name="col1" value="1,2,3,4,5"/> | 248 <param name="col1" value="1,2,3,4,5"/> |
169 <param name="col2" value="6"/> | 249 <param name="col2" value="6"/> |
175 <test> | 255 <test> |
176 <param name="infile_model" value="rfr_model01" ftype="zip"/> | 256 <param name="infile_model" value="rfr_model01" ftype="zip"/> |
177 <param name="infile_data" value="regression_test.tabular" ftype="tabular"/> | 257 <param name="infile_data" value="regression_test.tabular" ftype="tabular"/> |
178 <param name="selected_task" value="load"/> | 258 <param name="selected_task" value="load"/> |
179 <output name="outfile_predict" file="rfr_result01" compare="sim_size" delta="500"/> | 259 <output name="outfile_predict" file="rfr_result01" compare="sim_size" delta="500"/> |
260 </test> | |
261 <test> | |
262 <param name="infile1" value="regression_X.tabular" ftype="tabular"/> | |
263 <param name="infile2" value="regression_y.tabular" ftype="tabular"/> | |
264 <param name="header1" value="True"/> | |
265 <param name="col1" value="1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17"/> | |
266 <param name="header2" value="True"/> | |
267 <param name="col2" value="1"/> | |
268 <param name="selected_task" value="train"/> | |
269 <param name="selected_algorithm" value="GradientBoostingRegressor"/> | |
270 <param name="max_features" value="number_input"/> | |
271 <param name="num_max_features" value=""/> | |
272 <param name="random_state" value="42"/> | |
273 <output name="outfile_fit" file="gbr_model01" compare="sim_size" delta="500"/> | |
274 </test> | |
275 <test> | |
276 <param name="infile_model" value="gbr_model01" ftype="zip"/> | |
277 <param name="infile_data" value="regression_test_X.tabular" ftype="tabular"/> | |
278 <param name="selected_task" value="load"/> | |
279 <param name="header" value="True"/> | |
280 <output name="outfile_predict" file="gbr_prediction_result01.tabular" compare="sim_size" delta="500"/> | |
281 </test> | |
282 <test> | |
283 <param name="infile1" value="train.tabular" ftype="tabular"/> | |
284 <param name="infile2" value="train.tabular" ftype="tabular"/> | |
285 <param name="col1" value="1,2,3,4"/> | |
286 <param name="col2" value="5"/> | |
287 <param name="selected_task" value="train"/> | |
288 <param name="selected_algorithm" value="GradientBoostingClassifier"/> | |
289 <output name="outfile_fit" file="gbc_model01" compare="sim_size" delta="500"/> | |
290 </test> | |
291 <test> | |
292 <param name="infile_model" value="gbc_model01" ftype="zip"/> | |
293 <param name="infile_data" value="test.tabular" ftype="tabular"/> | |
294 <param name="selected_task" value="load"/> | |
295 <output name="outfile_predict" file="gbc_result01" compare="sim_size" delta="500"/> | |
180 </test> | 296 </test> |
181 </tests> | 297 </tests> |
182 <help><![CDATA[ | 298 <help><![CDATA[ |
183 ***What it does*** | 299 ***What it does*** |
184 The goal of ensemble methods is to combine the predictions of several base estimators built with a given learning algorithm in order to improve generalizability / robustness over a single estimator. This tool offers two sets of ensemble algorithms for classification and regression: random forests and ADA boosting which are based on sklearn.ensemble library from Scikit-learn. Here you can find out about the input, output and methods presented in the tools. For information about ensemble methods and parameters settings please refer to `Scikit-learn ensemble`_. | 300 The goal of ensemble methods is to combine the predictions of several base estimators built with a given learning algorithm in order to improve generalizability / robustness over a single estimator. This tool offers ensemble algorithms for classification and regression: random forests, AdaBoost, and gradient boosting, which are based on the sklearn.ensemble module of Scikit-learn. Here you can find out about the input, output and methods presented in the tools. For information about ensemble methods and parameter settings please refer to `Scikit-learn ensemble`_. |