comparison ensemble.xml @ 5:f1761288587e draft

planemo upload for repository https://github.com/bgruening/galaxytools/tools/sklearn commit 35fa73d6e9ba8f0789ddfb743d893d950a68af02
author bgruening
date Tue, 10 Apr 2018 15:18:51 -0400
parents 0431274c367d
children cd595710f0c0
comparison
equal deleted inserted replaced
4:0431274c367d 5:f1761288587e
29 29
30 #if $selected_tasks.selected_task == "train": 30 #if $selected_tasks.selected_task == "train":
31 31
32 algorithm = params["selected_tasks"]["selected_algorithms"]["selected_algorithm"] 32 algorithm = params["selected_tasks"]["selected_algorithms"]["selected_algorithm"]
33 options = params["selected_tasks"]["selected_algorithms"]["options"] 33 options = params["selected_tasks"]["selected_algorithms"]["options"]
34 if "select_max_features" in options:
35 if options["select_max_features"]["max_features"] == "number_input":
36 options["select_max_features"]["max_features"] = options["select_max_features"]["num_max_features"]
37 options["select_max_features"].pop("num_max_features")
38 options["max_features"] = options["select_max_features"]["max_features"]
39 options.pop("select_max_features")
40 if "presort" in options:
41 if options["presort"] == "true":
42 options["presort"] = True
43 if options["presort"] == "false":
44 options["presort"] = False
45 if "min_samples_leaf" in options and options["min_samples_leaf"] == 1.0:
46 options["min_samples_leaf"] = 1
47 if "min_samples_split" in options and options["min_samples_split"] > 1.0:
48 options["min_samples_split"] = int(options["min_samples_split"])
34 input_type = params["selected_tasks"]["selected_algorithms"]["input_options"]["selected_input"] 49 input_type = params["selected_tasks"]["selected_algorithms"]["input_options"]["selected_input"]
35 if input_type=="tabular": 50 if input_type=="tabular":
36 header = 'infer' if params["selected_tasks"]["selected_algorithms"]["input_options"]["header1"] else None 51 header = 'infer' if params["selected_tasks"]["selected_algorithms"]["input_options"]["header1"] else None
37 X = read_columns( 52 X = read_columns(
38 "$selected_tasks.selected_algorithms.input_options.infile1", 53 "$selected_tasks.selected_algorithms.input_options.infile1",
50 "$selected_tasks.selected_algorithms.input_options.col2", 65 "$selected_tasks.selected_algorithms.input_options.col2",
51 sep='\t', 66 sep='\t',
52 header=header, 67 header=header,
53 parse_dates=True 68 parse_dates=True
54 ) 69 )
70 y=y.ravel()
55 71
56 my_class = getattr(sklearn.ensemble, algorithm) 72 my_class = getattr(sklearn.ensemble, algorithm)
57 estimator = my_class(**options) 73 estimator = my_class(**options)
58 estimator.fit(X,y) 74 estimator.fit(X,y)
59 pickle.dump(estimator,open("$outfile_fit", 'w+'), pickle.HIGHEST_PROTOCOL) 75 pickle.dump(estimator,open("$outfile_fit", 'w+'), pickle.HIGHEST_PROTOCOL)
60 76
61 #else: 77 #else:
62 classifier_object = pickle.load(open("$selected_tasks.infile_model", 'r')) 78 classifier_object = pickle.load(open("$selected_tasks.infile_model", 'r'))
63 data = pandas.read_csv("$selected_tasks.infile_data", sep='\t', header=0, index_col=None, parse_dates=True, encoding=None, tupleize_cols=False) 79 header = 'infer' if params["selected_tasks"]["header"] else None
80 data = pandas.read_csv("$selected_tasks.infile_data", sep='\t', header=header, index_col=None, parse_dates=True, encoding=None, tupleize_cols=False)
64 prediction = classifier_object.predict(data) 81 prediction = classifier_object.predict(data)
65 prediction_df = pandas.DataFrame(prediction) 82 prediction_df = pandas.DataFrame(prediction)
66 res = pandas.concat([data, prediction_df], axis=1) 83 res = pandas.concat([data, prediction_df], axis=1)
67 res.to_csv(path_or_buf = "$outfile_predict", sep="\t", index=False) 84 res.to_csv(path_or_buf = "$outfile_predict", sep="\t", index=False)
68 #end if 85 #end if
73 <inputs> 90 <inputs>
74 <expand macro="sl_Conditional" model="zip"> 91 <expand macro="sl_Conditional" model="zip">
75 <param name="selected_algorithm" type="select" label="Select an ensemble method:"> 92 <param name="selected_algorithm" type="select" label="Select an ensemble method:">
76 <option value="RandomForestClassifier" selected="true">Random forest classifier</option> 93 <option value="RandomForestClassifier" selected="true">Random forest classifier</option>
77 <option value="AdaBoostClassifier">Ada boost classifier</option> 94 <option value="AdaBoostClassifier">Ada boost classifier</option>
95 <option value="GradientBoostingClassifier">Gradient Boosting Classifier</option>
78 <option value="RandomForestRegressor">Random forest regressor</option> 96 <option value="RandomForestRegressor">Random forest regressor</option>
79 <option value="AdaBoostRegressor">Ada boost regressor</option> 97 <option value="AdaBoostRegressor">Ada boost regressor</option>
98 <option value="GradientBoostingRegressor">Gradient Boosting Regressor</option>
80 </param> 99 </param>
81 <when value="RandomForestClassifier"> 100 <when value="RandomForestClassifier">
82 <expand macro="sl_mixed_input"/> 101 <expand macro="sl_mixed_input"/>
83 <section name="options" title="Advanced Options" expanded="False"> 102 <section name="options" title="Advanced Options" expanded="False">
84 <expand macro="n_estimators"/> 103 <expand macro="n_estimators"/>
89 <expand macro="min_samples_leaf"/> 108 <expand macro="min_samples_leaf"/>
90 <expand macro="min_weight_fraction_leaf"/> 109 <expand macro="min_weight_fraction_leaf"/>
91 <expand macro="max_leaf_nodes"/> 110 <expand macro="max_leaf_nodes"/>
92 <expand macro="bootstrap"/> 111 <expand macro="bootstrap"/>
93 <expand macro="warm_start" checked="false"/> 112 <expand macro="warm_start" checked="false"/>
113 <expand macro="n_jobs"/>
94 <expand macro="random_state"/> 114 <expand macro="random_state"/>
95 <expand macro="oob_score"/> 115 <expand macro="oob_score"/>
96 <!--class_weight=None--> 116 <!--class_weight=None-->
97 </section> 117 </section>
98 </when> 118 </when>
107 <option value="SAMME">SAMME</option> 127 <option value="SAMME">SAMME</option>
108 </param> 128 </param>
109 <expand macro="random_state"/> 129 <expand macro="random_state"/>
110 </section> 130 </section>
111 </when> 131 </when>
132 <when value="GradientBoostingClassifier">
133 <expand macro="sl_mixed_input"/>
134 <section name="options" title="Advanced Options" expanded="False">
135 <!--base_estimator=None-->
136 <param argument="loss" type="select" label="Loss function">
137 <option value="deviance" selected="true">deviance - logistic regression with probabilistic outputs</option>
138 <option value="exponential">exponential - gradient boosting recovers the AdaBoost algorithm</option>
139 </param>
140 <expand macro="learning_rate" default_value='0.1'/>
141 <expand macro="n_estimators" default_value="100" help="The number of boosting stages to perform"/>
142 <expand macro="max_depth" default_value="3" help="maximum depth of the individual regression estimators"/>
143 <expand macro="criterion2">
144 <option value="friedman_mse" selected="true">friedman_mse - mean squared error with improvement score by Friedman</option>
145 </expand>
146 <expand macro="min_samples_split" type="float"/>
147 <expand macro="min_samples_leaf" type="float" label="The minimum number of samples required to be at a leaf node"/>
148 <expand macro="min_weight_fraction_leaf"/>
149 <expand macro="subsample"/>
150 <expand macro="max_features"/>
151 <expand macro="max_leaf_nodes"/>
152 <expand macro="min_impurity_decrease"/>
153 <expand macro="verbose"/>
154 <expand macro="warm_start" checked="false"/>
155 <expand macro="random_state"/>
156 <expand macro="presort"/>
157 </section>
158 </when>
112 <when value="RandomForestRegressor"> 159 <when value="RandomForestRegressor">
113 <expand macro="sl_mixed_input"/> 160 <expand macro="sl_mixed_input"/>
114 <section name="options" title="Advanced Options" expanded="False"> 161 <section name="options" title="Advanced Options" expanded="False">
115 <expand macro="n_estimators"/> 162 <expand macro="n_estimators"/>
163 <expand macro="criterion2"/>
116 <expand macro="max_features"/> 164 <expand macro="max_features"/>
117 <expand macro="max_depth"/> 165 <expand macro="max_depth"/>
118 <expand macro="min_samples_split"/> 166 <expand macro="min_samples_split"/>
119 <expand macro="min_samples_leaf"/> 167 <expand macro="min_samples_leaf"/>
120 <expand macro="min_weight_fraction_leaf"/> 168 <expand macro="min_weight_fraction_leaf"/>
121 <expand macro="max_leaf_nodes"/> 169 <expand macro="max_leaf_nodes"/>
170 <expand macro="min_impurity_decrease"/>
122 <expand macro="bootstrap"/> 171 <expand macro="bootstrap"/>
172 <expand macro="oob_score"/>
173 <expand macro="n_jobs"/>
174 <expand macro="random_state"/>
175 <expand macro="verbose"/>
123 <expand macro="warm_start" checked="false"/> 176 <expand macro="warm_start" checked="false"/>
124 <expand macro="random_state"/>
125 <expand macro="oob_score"/>
126 </section> 177 </section>
127 </when> 178 </when>
128 <when value="AdaBoostRegressor"> 179 <when value="AdaBoostRegressor">
129 <expand macro="sl_mixed_input"/> 180 <expand macro="sl_mixed_input"/>
130 <section name="options" title="Advanced Options" expanded="False"> 181 <section name="options" title="Advanced Options" expanded="False">
137 <option value="exponential">exponential</option> 188 <option value="exponential">exponential</option>
138 </param> 189 </param>
139 <expand macro="random_state"/> 190 <expand macro="random_state"/>
140 </section> 191 </section>
141 </when> 192 </when>
193 <when value="GradientBoostingRegressor">
194 <expand macro="sl_mixed_input"/>
195 <section name="options" title="Advanced Options" expanded="False">
196 <param argument="loss" type="select" label="Loss function">
197 <option value="ls" selected="true">ls - least squares regression</option>
198 <option value="lad">lad - least absolute deviation</option>
199 <option value="huber">huber - combination of least squares regression and least absolute deviation</option>
200 <option value="quantile">quantile - use alpha to specify the quantile</option>
201 </param>
202 <expand macro="learning_rate" default_value="0.1"/>
203 <expand macro="n_estimators" default_value="100" help="The number of boosting stages to perform"/>
204 <expand macro="max_depth" default_value="3" help="maximum depth of the individual regression estimators"/>
205 <expand macro="criterion2">
206 <option value="friedman_mse" selected="true">friedman_mse - mean squared error with improvement score by Friedman</option>
207 </expand>
208 <expand macro="min_samples_split" type="float"/>
209 <expand macro="min_samples_leaf" type="float" label="The minimum number of samples required to be at a leaf node"/>
210 <expand macro="min_weight_fraction_leaf"/>
211 <expand macro="subsample"/>
212 <expand macro="max_features"/>
213 <expand macro="max_leaf_nodes"/>
214 <expand macro="min_impurity_decrease"/>
215 <param argument="alpha" type="float" value="0.9" label="alpha" help="The alpha-quantile of the huber loss function and the quantile loss function" />
216 <!--base_estimator=None-->
217 <expand macro="verbose"/>
218 <expand macro="warm_start" checked="false"/>
219 <expand macro="random_state"/>
220 <expand macro="presort"/>
221 </section>
222 </when>
142 </expand> 223 </expand>
143 </inputs> 224 </inputs>
144 225
145 <expand macro="output"/> 226 <expand macro="output"/>
146 227
159 <param name="infile_model" value="rfc_model01" ftype="zip"/> 240 <param name="infile_model" value="rfc_model01" ftype="zip"/>
160 <param name="infile_data" value="test.tabular" ftype="tabular"/> 241 <param name="infile_data" value="test.tabular" ftype="tabular"/>
161 <param name="selected_task" value="load"/> 242 <param name="selected_task" value="load"/>
162 <output name="outfile_predict" file="rfc_result01" compare="sim_size" delta="500"/> 243 <output name="outfile_predict" file="rfc_result01" compare="sim_size" delta="500"/>
163 </test> 244 </test>
164
165 <test> 245 <test>
166 <param name="infile1" value="regression_train.tabular" ftype="tabular"/> 246 <param name="infile1" value="regression_train.tabular" ftype="tabular"/>
167 <param name="infile2" value="regression_train.tabular" ftype="tabular"/> 247 <param name="infile2" value="regression_train.tabular" ftype="tabular"/>
168 <param name="col1" value="1,2,3,4,5"/> 248 <param name="col1" value="1,2,3,4,5"/>
169 <param name="col2" value="6"/> 249 <param name="col2" value="6"/>
175 <test> 255 <test>
176 <param name="infile_model" value="rfr_model01" ftype="zip"/> 256 <param name="infile_model" value="rfr_model01" ftype="zip"/>
177 <param name="infile_data" value="regression_test.tabular" ftype="tabular"/> 257 <param name="infile_data" value="regression_test.tabular" ftype="tabular"/>
178 <param name="selected_task" value="load"/> 258 <param name="selected_task" value="load"/>
179 <output name="outfile_predict" file="rfr_result01" compare="sim_size" delta="500"/> 259 <output name="outfile_predict" file="rfr_result01" compare="sim_size" delta="500"/>
260 </test>
261 <test>
262 <param name="infile1" value="regression_X.tabular" ftype="tabular"/>
263 <param name="infile2" value="regression_y.tabular" ftype="tabular"/>
264 <param name="header1" value="True"/>
265 <param name="col1" value="1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17"/>
266 <param name="header2" value="True"/>
267 <param name="col2" value="1"/>
268 <param name="selected_task" value="train"/>
269 <param name="selected_algorithm" value="GradientBoostingRegressor"/>
270 <param name="max_features" value="number_input"/>
271 <param name="num_max_features" value=""/>
272 <param name="random_state" value="42"/>
273 <output name="outfile_fit" file="gbr_model01" compare="sim_size" delta="500"/>
274 </test>
275 <test>
276 <param name="infile_model" value="gbr_model01" ftype="zip"/>
277 <param name="infile_data" value="regression_test_X.tabular" ftype="tabular"/>
278 <param name="selected_task" value="load"/>
279 <param name="header" value="True"/>
280 <output name="outfile_predict" file="gbr_prediction_result01.tabular" compare="sim_size" delta="500"/>
281 </test>
282 <test>
283 <param name="infile1" value="train.tabular" ftype="tabular"/>
284 <param name="infile2" value="train.tabular" ftype="tabular"/>
285 <param name="col1" value="1,2,3,4"/>
286 <param name="col2" value="5"/>
287 <param name="selected_task" value="train"/>
288 <param name="selected_algorithm" value="GradientBoostingClassifier"/>
289 <output name="outfile_fit" file="gbc_model01" compare="sim_size" delta="500"/>
290 </test>
291 <test>
292 <param name="infile_model" value="gbc_model01" ftype="zip"/>
293 <param name="infile_data" value="test.tabular" ftype="tabular"/>
294 <param name="selected_task" value="load"/>
295 <output name="outfile_predict" file="gbc_result01" compare="sim_size" delta="500"/>
180 </test> 296 </test>
181 </tests> 297 </tests>
182 <help><![CDATA[ 298 <help><![CDATA[
183 ***What it does*** 299 ***What it does***
184 The goal of ensemble methods is to combine the predictions of several base estimators built with a given learning algorithm in order to improve generalizability / robustness over a single estimator. This tool offers two sets of ensemble algorithms for classification and regression: random forests and ADA boosting which are based on sklearn.ensemble library from Scikit-learn. Here you can find out about the input, output and methods presented in the tools. For information about ensemble methods and parameters settings please refer to `Scikit-learn ensemble`_. 300 The goal of ensemble methods is to combine the predictions of several base estimators built with a given learning algorithm in order to improve generalizability / robustness over a single estimator. This tool offers three sets of ensemble algorithms for classification and regression: random forests, AdaBoost and gradient boosting, all of which are based on the sklearn.ensemble library from Scikit-learn. Here you can find out about the input, output and methods presented in this tool. For information about ensemble methods and parameter settings please refer to `Scikit-learn ensemble`_.