lightgbm.xml @ 0:13226b2ddfb4 draft

"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit 756f8be9c3cd437e131e6410cd625c24fe078e8c"
author bgruening
date Wed, 22 Jan 2020 07:51:20 -0500
<tool id="sklearn_lightgbm" name="LightGBM" version="@VERSION@">
    <description>- train and apply LightGBM models</description>
    <macros>
        <import>main_macros.xml</import>
    </macros>
    <expand macro="python_requirements">
        <requirement type="package" version="2.3.0">lightgbm</requirement>
    </expand>
    <expand macro="macro_stdio"/>
    <version_command>echo "@VERSION@"</version_command>
    <command><![CDATA[
        python '$lightgbm_script' '$inputs'
    ]]>
    </command>
    <configfiles>
        <inputs name="inputs"/>
        <configfile name="lightgbm_script">
<![CDATA[
import json
import os
import sys

import lightgbm as lgb
import pandas

from scipy.io import mmread
from galaxy_ml.utils import get_X_y


# Number of threads granted to the job by Galaxy.
N_JOBS = int(os.environ.get('GALAXY_SLOTS', 1))

# Get inputs, outputs.
input_json_path = sys.argv[1]
with open(input_json_path, "r") as param_handler:
    params = json.load(param_handler)
print(params)

# Put all Cheetah templating up here to avoid confusion.
#if $selected_tasks.selected_task == "train":
infile1 = "$selected_tasks.selected_algorithms.input_options.infile1"
infile2 = "$selected_tasks.selected_algorithms.input_options.infile2"
#else:
infile_model = "$selected_tasks.infile_model"
infile_data = "$selected_tasks.infile_data"
#end if
outfile_fit = "$outfile_fit"
outfile_predict = "$outfile_predict"

# All Python from here on out:

if params["selected_tasks"]["selected_task"] == "train":
    algorithm = params["selected_tasks"]["selected_algorithms"]["selected_algorithm"]
    options = params["selected_tasks"]["selected_algorithms"]["options"]
    options['num_threads'] = N_JOBS
    # The form exposes the scikit-learn style name "max_leaf_nodes";
    # translate it to LightGBM's native parameter "num_leaves".
    if "max_leaf_nodes" in options:
        options["num_leaves"] = options.pop("max_leaf_nodes")

    X, y = get_X_y(params, infile1, infile2)
    lgb_train = lgb.Dataset(X, y)
    gbm = lgb.train(options, lgb_train)
    gbm.save_model(outfile_fit)

else:
    gbm = lgb.Booster(model_file=infile_model)
    header = 'infer' if params["selected_tasks"]["header"] else None
    data = pandas.read_csv(infile_data, sep='\t', header=header, index_col=None, parse_dates=True, encoding=None)
    prediction = gbm.predict(data, num_iteration=gbm.best_iteration)
    prediction_df = pandas.DataFrame(prediction, columns=["predicted"])
    res = pandas.concat([data, prediction_df], axis=1)
    res.to_csv(path_or_buf=outfile_predict, sep="\t", index=False)
]]>
        </configfile>
    </configfiles>
    <inputs>
        <expand macro="sl_Conditional">
            <param name="selected_algorithm" type="select" label="Classification or regression?">
                <option value="LightGBMClassifier">Classification</option>
                <option value="LightGBMRegressor">Regression</option>
            </param>
            <when value="LightGBMClassifier">
                <expand macro="sl_mixed_input"/>
                <section name="options" title="Advanced Options" expanded="False">
                    <param argument="objective" type="select" label="Loss function">
                        <option value="binary">Binary log loss classification</option>
                        <option value="multiclass">Multiclass - softmax objective function</option>
                        <option value="multiclassova">Multiclass - one-vs-all binary objective function</option>
                    </param>
                    <param name="num_class" label="Number of classes" type="integer" value="1"/>
                    <expand macro="n_estimators" default_value="100" help="The number of boosting stages to perform"/>
                    <expand macro="max_depth" default_value="3" help="Maximum depth of the individual regression estimators"/>
                    <expand macro="learning_rate" default_value="0.1"/>
                    <expand macro="max_leaf_nodes"/>
                    <expand macro="subsample"/>
                    <expand macro="verbose"/>
                    <expand macro="feature_fraction"/>
                    <expand macro="lambda_l1"/>
                    <expand macro="lambda_l2"/>
                    <expand macro="min_gain_to_split"/>
                    <expand macro="min_child_weight"/>
                    <expand macro="random_state"/>
                </section>
            </when>

            <when value="LightGBMRegressor">
                <expand macro="sl_mixed_input"/>
                <section name="options" title="Advanced Options" expanded="False">
                    <param argument="objective" type="select" label="Loss function">
                        <option value="l1">Absolute loss</option>
                        <option value="l2">Square loss</option>
                        <option value="rmse">Root mean square loss</option>
                        <option value="huber">Huber loss - combination of least squares regression and least absolute deviation</option>
                        <option value="quantile">Quantile - use alpha to specify the quantile</option>
                    </param>
                    <expand macro="n_estimators" default_value="100" help="The number of boosting stages to perform"/>
                    <expand macro="max_depth" default_value="3" help="Maximum depth of the individual regression estimators"/>
                    <expand macro="learning_rate" default_value="0.1"/>
                    <expand macro="max_leaf_nodes"/>
                    <expand macro="subsample"/>
                    <expand macro="verbose"/>
                    <expand macro="feature_fraction"/>
                    <expand macro="lambda_l1"/>
                    <expand macro="lambda_l2"/>
                    <expand macro="min_gain_to_split"/>
                    <expand macro="min_child_weight"/>
                    <expand macro="random_state"/>
                </section>
            </when>
        </expand>
    </inputs>

    <outputs>
        <data format="tabular" name="outfile_predict">
            <filter>selected_tasks['selected_task'] == 'load'</filter>
        </data>
        <data format="txt" name="outfile_fit" label="${tool.name}.${selected_tasks.selected_algorithms.selected_algorithm}">
            <filter>selected_tasks['selected_task'] == 'train'</filter>
        </data>
    </outputs>

    <tests>
        <test>
            <param name="infile1" value="regression_X.tabular" ftype="tabular"/>
            <param name="infile2" value="regression_y.tabular" ftype="tabular"/>
            <param name="header1" value="True"/>
            <param name="selected_column_selector_option" value="all_columns"/>
            <param name="header2" value="True"/>
            <param name="col2" value="1"/>
            <param name="selected_task" value="train"/>
            <param name="selected_algorithm" value="LightGBMRegressor"/>
            <param name="objective" value="l2"/>
            <param name="n_estimators" value="10000"/>
            <param name="learning_rate" value="0.02"/>
            <param name="max_leaf_nodes" value="32"/>
            <param name="feature_fraction" value="0.9"/>
            <param name="subsample" value="0.9"/>
            <param name="max_depth" value="8"/>
            <param name="lambda_l1" value="0.04"/>
            <param name="lambda_l2" value="0.07"/>
            <param name="min_gain_to_split" value="0.02"/>
            <param name="min_child_weight" value="39.0"/>
            <param name="verbose" value="-1"/>
            <param name="random_state" value="1"/>
            <output name="outfile_fit" file="lgb_regr_model.txt" compare="sim_size" delta="5"/>
        </test>
        <test>
            <param name="infile_model" value="lgb_regr_model.txt" ftype="txt"/>
            <param name="infile_data" value="regression_X.tabular" ftype="tabular"/>
            <param name="selected_task" value="load"/>
            <param name="header" value="True"/>
            <output name="outfile_predict" file="lgb_prediction_result01.tabular"/>
        </test>
        <test>
            <param name="infile1" value="train.tabular" ftype="tabular"/>
            <param name="infile2" value="train.tabular" ftype="tabular"/>
            <param name="col1" value="1,2,3,4"/>
            <param name="col2" value="5"/>
            <param name="selected_task" value="train"/>
            <param name="selected_algorithm" value="LightGBMClassifier"/>
            <param name="objective" value="binary"/>
            <param name="n_estimators" value="10000"/>
            <param name="learning_rate" value="0.02"/>
            <param name="max_leaf_nodes" value="32"/>
            <param name="feature_fraction" value="0.9"/>
            <param name="subsample" value="0.9"/>
            <param name="max_depth" value="8"/>
            <param name="lambda_l1" value="0.04"/>
            <param name="lambda_l2" value="0.07"/>
            <param name="min_gain_to_split" value="0.02"/>
            <param name="min_child_weight" value="39.0"/>
            <param name="verbose" value="-1"/>
            <param name="random_state" value="1"/>
            <output name="outfile_fit" file="lgb_class_model.txt" compare="sim_size" delta="5"/>
        </test>
    </tests>
    <help><![CDATA[
**What it does**

LightGBM is a gradient boosting framework that uses tree-based learning algorithms.

For information about the algorithm and parameter settings please refer to the `LightGBM website`_.

.. _`LightGBM website`: https://lightgbm.readthedocs.io/en/latest/index.html

**1 - Methods**

There are two operations available (both are sketched in code below):

1 - Train a model: a training set containing samples and their respective labels (or target values) is used as input. Based on the options selected, an estimator object is fitted to the data and returned.

2 - Load a model and predict: an existing model is used to predict the class labels (or regression values) for a new dataset.

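The sketch below shows, in plain Python, roughly what this tool's internal script does for both operations. It is a minimal illustration rather than the tool itself: the toy X/y arrays and the file name lgb_model.txt are stand-ins for whatever data and outputs you configure in Galaxy.

::

    import lightgbm as lgb
    import numpy as np

    # Toy data standing in for the Galaxy-selected feature columns and labels.
    X = np.random.rand(100, 3)
    y = X.sum(axis=1)

    # Operation 1 - train: wrap the data, fit a boosted-tree model, save it as text.
    options = {"objective": "l2", "learning_rate": 0.1, "num_leaves": 32, "verbose": -1}
    gbm = lgb.train(options, lgb.Dataset(X, y))
    gbm.save_model("lgb_model.txt")

    # Operation 2 - load and predict: reload the text model and score new samples.
    gbm = lgb.Booster(model_file="lgb_model.txt")
    predictions = gbm.predict(X, num_iteration=gbm.best_iteration)
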
**2 - Training input**

When you choose to train a model, you need a features dataset X and a labels set y. This tool expects tabular or sparse data for X and a single tabular column for y. You can select a subset of columns in a tabular dataset as your features dataset or labels column. Some examples are shown below.

**Sample tabular features dataset**

The following training dataset contains 3 feature columns and a column containing class labels. You can simply select the first 3 columns as features and the last column as labels:

::

    4.01163365529 -6.10797684314 8.29829894763 1
    10.0788438916 1.59539821454 10.0684278289 0
    -5.17607775503 -0.878286135332 6.92941850665 2
    4.00975406235 -7.11847496542 9.3802423585 1
    4.61204065139 -5.71217537352 9.12509610964 1

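For illustration, that column selection corresponds to the following pandas operations (the file name train.tabular is hypothetical, and a tab-separated file without a header row is assumed; inside Galaxy you use the column selectors instead)::

    import pandas

    data = pandas.read_csv("train.tabular", sep="\t", header=None)
    X = data.iloc[:, 0:3].values  # first three columns -> features
    y = data.iloc[:, 3].values    # last column -> class labels
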

**Sample sparse features dataset**

In this case you cannot specify a column range.

::

    4 1048577 8738
    1 271 0.02083333333333341
    1 1038 0.02461995616119806
    2 829017 0.01629088031127686
    2 829437 0.01209127083516686
    2 830752 0.02535100632816968
    3 1047487 0.01485722929945572
    3 1047980 0.02640566620767753
    3 1048475 0.01665869913262564
    4 608 0.01662975263094352
    4 1651 0.02519674277562741
    4 4053 0.04223659971350601

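LightGBM can consume such a sparse matrix directly. A minimal sketch, assuming the file is in Matrix Market coordinate format (which scipy's mmread parses) and that the labels come from a separate single-column file; both file names are hypothetical::

    import numpy as np
    import lightgbm as lgb
    from scipy.io import mmread

    X_sparse = mmread("sparse_features.mtx").tocsr()  # sparse feature matrix
    y = np.loadtxt("labels.tabular")                  # label vector
    lgb_train = lgb.Dataset(X_sparse, y)              # Dataset accepts scipy sparse matrices
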

**3 - Training output**

The trained model is saved as a text file.


**4 - Prediction input**

When you choose to load a model and predict, the tool expects an already trained estimator and a tabular dataset as input. The dataset contains the new samples which you want to classify or for which you want to predict values.


.. class:: warningmark

The number of feature columns must be the same in the training and prediction datasets!


**5 - Prediction output**

The tool predicts the class labels (or regression values) for the new samples and appends them as the last column of the prediction dataset. The resulting dataset (i.e. the tabular input plus a column of predicted values) is returned as a tabular file, in the same format as the training dataset.

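A condensed sketch of this prediction path, mirroring the tool's internal script (the file names are the ones used in this tool's tests)::

    import lightgbm as lgb
    import pandas

    gbm = lgb.Booster(model_file="lgb_regr_model.txt")
    data = pandas.read_csv("regression_X.tabular", sep="\t", header=0)
    prediction = gbm.predict(data, num_iteration=gbm.best_iteration)
    data["predicted"] = prediction  # appended as the last column
    data.to_csv("lgb_prediction_result01.tabular", sep="\t", index=False)
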
    ]]></help>
    <expand macro="sklearn_citation">
        <citation type="bibtex">
            @incollection{NIPS2017_6907,
                title = {LightGBM: A Highly Efficient Gradient Boosting Decision Tree},
                author = {Ke, Guolin and Meng, Qi and Finley, Thomas and Wang, Taifeng and Chen, Wei and Ma, Weidong and Ye, Qiwei and Liu, Tie-Yan},
                booktitle = {Advances in Neural Information Processing Systems 30},
                editor = {I. Guyon and U. V. Luxburg and S. Bengio and H. Wallach and R. Fergus and S. Vishwanathan and R. Garnett},
                pages = {3146--3154},
                year = {2017},
                publisher = {Curran Associates, Inc.},
                url = {http://papers.nips.cc/paper/6907-lightgbm-a-highly-efficient-gradient-boosting-decision-tree.pdf}
            }
        </citation>
    </expand>
</tool>