comparison train_test_split.xml @ 11:5da2217cd788 draft

planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit 9981e25b00de29ed881b2229a173a8c812ded9bb
author bgruening
date Wed, 09 Aug 2023 13:33:25 +0000
parents ce2fd1edbc6e
children
comparison
equal deleted inserted replaced
10:6e25381dad5c 11:5da2217cd788
1 <tool id="sklearn_train_test_split" name="Split Dataset" version="@VERSION@"> 1 <tool id="sklearn_train_test_split" name="Split Dataset" version="@VERSION@" profile="@PROFILE@">
2 <description>into training and test subsets</description> 2 <description>into training and test subsets</description>
3 <macros> 3 <macros>
4 <import>main_macros.xml</import> 4 <import>main_macros.xml</import>
5 <macro name="label_input" token_label="Select the dataset containing labels"> 5 <macro name="label_input" token_label="Select the dataset containing labels">
6 <param name="labels" type="data" format="tabular" label="@LABEL@"/> 6 <param name="labels" type="data" format="tabular" label="@LABEL@" />
7 <param name="header1" type="boolean" optional="true" truevalue="booltrue" falsevalue="boolfalse" checked="true" label="Does the dataset contain header?" /> 7 <param name="header1" type="boolean" optional="true" truevalue="booltrue" falsevalue="boolfalse" checked="true" label="Does the dataset contain header?" />
8 <param name="col" type="data_column" data_ref="labels" label="Select target column"/> 8 <param name="col" type="data_column" data_ref="labels" label="Select target column" />
9 </macro> 9 </macro>
10 </macros> 10 </macros>
11 <expand macro="python_requirements"/> 11 <expand macro="python_requirements" />
12 <expand macro="macro_stdio"/> 12 <expand macro="macro_stdio" />
13 <version_command>echo "@VERSION@"</version_command> 13 <version_command>echo "@VERSION@"</version_command>
14 <command detect_errors="exit_code"><![CDATA[ 14 <command detect_errors="exit_code"><![CDATA[
15 python '$__tool_directory__/train_test_split.py' 15 python '$__tool_directory__/train_test_split.py'
16 --inputs '$inputs' 16 --inputs '$inputs'
17 --infile_array '$infile_array' 17 --infile_array '$infile_array'
27 --outfile_train '$out_train' 27 --outfile_train '$out_train'
28 --outfile_test '$out_test' 28 --outfile_test '$out_test'
29 ]]> 29 ]]>
30 </command> 30 </command>
31 <configfiles> 31 <configfiles>
32 <inputs name="inputs"/> 32 <inputs name="inputs" />
33 </configfiles> 33 </configfiles>
34 <inputs> 34 <inputs>
35 <param name="infile_array" type="data" format="tabular" label="Select the dataset containing array to split" help="This tool only supports to split one array at each tool run. If X, y are in separate files, the splitting task could be done by invoking this tool twice in which this input dataset is swapped while all other parameters are kept the same."/> 35 <param name="infile_array" type="data" format="tabular" label="Select the dataset containing array to split" help="This tool only supports to split one array at each tool run. If X, y are in separate files, the splitting task could be done by invoking this tool twice in which this input dataset is swapped while all other parameters are kept the same." />
36 <param name="header0" type="boolean" optional="true" truevalue="booltrue" falsevalue="boolfalse" checked="true" label="Does the dataset contain header?" /> 36 <param name="header0" type="boolean" optional="true" truevalue="booltrue" falsevalue="boolfalse" checked="true" label="Does the dataset contain header?" />
37 <conditional name="mode_selection"> 37 <conditional name="mode_selection">
38 <param name="selected_mode" type="select" label="Select the splitting mode"> 38 <param name="selected_mode" type="select" label="Select the splitting mode">
39 <option value="train_test_split" selected="true">Train Test Split</option> 39 <option value="train_test_split" selected="true">Train Test Split</option>
40 <option value="cv_splitter">Cross-Validation Splitter</option> 40 <option value="cv_splitter">Cross-Validation Splitter</option>
41 </param> 41 </param>
42 <when value="train_test_split"> 42 <when value="train_test_split">
43 <section name="options" title="Options" expanded="true"> 43 <section name="options" title="Options" expanded="true">
44 <param argument="test_size" type="float" min="0" optional="false" value="0.25" label="Test size:" 44 <param argument="test_size" type="float" min="0" optional="false" value="0.25" label="Test size:"
45 help="A float number, 0.0 - 1.0, represents the proportion of the dataset to be included in the test split."/> 45 help="A float number, 0.0 - 1.0, represents the proportion of the dataset to be included in the test split." />
46 <param argument="random_state" type="integer" optional="true" value="" label="Random seed number:"/> 46 <param argument="random_state" type="integer" optional="true" value="" label="Random seed number:" />
47 <conditional name="shuffle_selection"> 47 <conditional name="shuffle_selection">
48 <param name="shuffle" type="select" label="Shuffle strategy"> 48 <param name="shuffle" type="select" label="Shuffle strategy">
49 <option value="None">None - No shuffle</option> 49 <option value="None">None - No shuffle</option>
50 <option value="simple">Shuffle</option> 50 <option value="simple">Shuffle</option>
51 <option value="stratified">Stratified Shuffle</option> 51 <option value="stratified">Stratified Shuffle</option>
52 <option value="group">Group Shuffle</option> 52 <option value="group">Group Shuffle</option>
53 </param> 53 </param>
54 <when value="None"/> 54 <when value="None" />
55 <when value="simple"/> 55 <when value="simple" />
56 <when value="stratified"> 56 <when value="stratified">
57 <expand macro="label_input"/> 57 <expand macro="label_input" />
58 </when> 58 </when>
59 <when value="group"> 59 <when value="group">
60 <expand macro="label_input" label="Select the dataset containing groups"/> 60 <expand macro="label_input" label="Select the dataset containing groups" />
61 </when> 61 </when>
62 </conditional> 62 </conditional>
63 </section> 63 </section>
64 </when> 64 </when>
65 <when value="cv_splitter"> 65 <when value="cv_splitter">
81 <option value="GroupShuffleSplit">GroupShuffleSplit</option> 81 <option value="GroupShuffleSplit">GroupShuffleSplit</option>
82 <option value="LeaveOneGroupOut">LeaveOneGroupOut</option> 82 <option value="LeaveOneGroupOut">LeaveOneGroupOut</option>
83 <option value="LeavePGroupsOut">LeavePGroupsOut</option> 83 <option value="LeavePGroupsOut">LeavePGroupsOut</option>
84 </param> 84 </param>
85 <when value="KFold"> 85 <when value="KFold">
86 <expand macro="cv_n_splits"/> 86 <expand macro="cv_n_splits" />
87 <expand macro="cv_shuffle"/> 87 <expand macro="cv_shuffle" />
88 <expand macro="random_state"/> 88 <expand macro="random_state" />
89 </when> 89 </when>
90 <when value="RepeatedKFold"> 90 <when value="RepeatedKFold">
91 <expand macro="cv_n_splits" value="5"/> 91 <expand macro="cv_n_splits" value="5" />
92 <param argument="n_repeats" type="integer" value="10" label="n_repeats" help="Number of times cross-validator needs to be repeated." /> 92 <param argument="n_repeats" type="integer" value="10" label="n_repeats" help="Number of times cross-validator needs to be repeated." />
93 <expand macro="random_state" /> 93 <expand macro="random_state" />
94 </when> 94 </when>
95 <when value="StratifiedKFold"> 95 <when value="StratifiedKFold">
96 <expand macro="cv_n_splits"/> 96 <expand macro="cv_n_splits" />
97 <expand macro="cv_shuffle"/> 97 <expand macro="cv_shuffle" />
98 <expand macro="random_state"/> 98 <expand macro="random_state" />
99 <section name="target_input" title="Target values" expanded="true"> 99 <section name="target_input" title="Target values" expanded="true">
100 <expand macro="label_input"/> 100 <expand macro="label_input" />
101 </section> 101 </section>
102 </when> 102 </when>
103 <when value="RepeatedStratifiedKFold"> 103 <when value="RepeatedStratifiedKFold">
104 <expand macro="cv_n_splits" value="5"/> 104 <expand macro="cv_n_splits" value="5" />
105 <param argument="n_repeats" type="integer" value="10" label="n_repeats" help="Number of times cross-validator needs to be repeated." /> 105 <param argument="n_repeats" type="integer" value="10" label="n_repeats" help="Number of times cross-validator needs to be repeated." />
106 <expand macro="random_state" /> 106 <expand macro="random_state" />
107 <section name="target_input" title="Target values" expanded="true"> 107 <section name="target_input" title="Target values" expanded="true">
108 <expand macro="label_input"/> 108 <expand macro="label_input" />
109 </section> 109 </section>
110 </when> 110 </when>
111 <when value="LeaveOneOut"> 111 <when value="LeaveOneOut">
112 </when> 112 </when>
113 <when value="LeavePOut"> 113 <when value="LeavePOut">
114 <param argument="p" type="integer" value="" label="p" help="Integer. Size of the test sets."/> 114 <param argument="p" type="integer" value="" label="p" help="Integer. Size of the test sets." />
115 </when> 115 </when>
116 <when value="ShuffleSplit"> 116 <when value="ShuffleSplit">
117 <expand macro="cv_n_splits" value="10" help="Number of re-shuffling and splitting iterations."/> 117 <expand macro="cv_n_splits" value="10" help="Number of re-shuffling and splitting iterations." />
118 <expand macro="cv_test_size" value="0.1" /> 118 <expand macro="cv_test_size" value="0.1" />
119 <expand macro="random_state"/> 119 <expand macro="random_state" />
120 </when> 120 </when>
121 <when value="StratifiedShuffleSplit"> 121 <when value="StratifiedShuffleSplit">
122 <expand macro="cv_n_splits" value="10" help="Number of re-shuffling and splitting iterations."/> 122 <expand macro="cv_n_splits" value="10" help="Number of re-shuffling and splitting iterations." />
123 <expand macro="cv_test_size" value="0.1" /> 123 <expand macro="cv_test_size" value="0.1" />
124 <expand macro="random_state"/> 124 <expand macro="random_state" />
125 <section name="target_input" title="Target values" expanded="true"> 125 <section name="target_input" title="Target values" expanded="true">
126 <expand macro="label_input"/> 126 <expand macro="label_input" />
127 </section> 127 </section>
128 </when> 128 </when>
129 <when value="TimeSeriesSplit"> 129 <when value="TimeSeriesSplit">
130 <expand macro="cv_n_splits"/> 130 <expand macro="cv_n_splits" />
131 <param argument="max_train_size" type="integer" value="" optional="true" label="Maximum size of the training set" help="Maximum size for a single training set." /> 131 <param argument="max_train_size" type="integer" value="" optional="true" label="Maximum size of the training set" help="Maximum size for a single training set." />
132 </when> 132 </when>
133 <when value="PredefinedSplit"> 133 <when value="PredefinedSplit">
134 <param argument="test_fold" type="text" value="" area="true" label="test_fold" help="List, e.g., [0, 1, -1, 1], represents two test sets, [X[0]] and [X[1], X[3]], X[2] is excluded from any test set due to '-1'."/> 134 <param argument="test_fold" type="text" value="" area="true" label="test_fold" help="List, e.g., [0, 1, -1, 1], represents two test sets, [X[0]] and [X[1], X[3]], X[2] is excluded from any test set due to '-1'." />
135 </when> 135 </when>
136 <when value="OrderedKFold"> 136 <when value="OrderedKFold">
137 <expand macro="cv_n_splits"/> 137 <expand macro="cv_n_splits" />
138 <expand macro="cv_shuffle"/> 138 <expand macro="cv_shuffle" />
139 <expand macro="random_state"/> 139 <expand macro="random_state" />
140 <section name="target_input" title="Target values" expanded="true"> 140 <expand macro="cv_n_stratification_bins" />
141 <expand macro="label_input" label="Select the dataset containing target values"/> 141 <section name="target_input" title="Target values" expanded="true">
142 <expand macro="label_input" label="Select the dataset containing target values" />
142 </section> 143 </section>
143 </when> 144 </when>
144 <when value="RepeatedOrderedKFold"> 145 <when value="RepeatedOrderedKFold">
145 <expand macro="cv_n_splits"/> 146 <expand macro="cv_n_splits" />
146 <param argument="n_repeats" type="integer" value="5"/> 147 <param argument="n_repeats" type="integer" value="5" />
147 <expand macro="random_state"/> 148 <expand macro="random_state" />
148 <section name="target_input" title="Target values" expanded="true"> 149 <expand macro="cv_n_stratification_bins" />
149 <expand macro="label_input" label="Select the dataset containing target values"/> 150 <section name="target_input" title="Target values" expanded="true">
151 <expand macro="label_input" label="Select the dataset containing target values" />
150 </section> 152 </section>
151 </when> 153 </when>
152 <when value="GroupKFold"> 154 <when value="GroupKFold">
153 <expand macro="cv_n_splits"/> 155 <expand macro="cv_n_splits" />
154 <expand macro="cv_groups" /> 156 <expand macro="cv_groups" />
155 </when> 157 </when>
156 <when value="GroupShuffleSplit"> 158 <when value="GroupShuffleSplit">
157 <expand macro="cv_n_splits" value="5"/> 159 <expand macro="cv_n_splits" value="5" />
158 <expand macro="cv_test_size"/> 160 <expand macro="cv_test_size" />
159 <expand macro="random_state"/> 161 <expand macro="random_state" />
160 <expand macro="cv_groups"/> 162 <expand macro="cv_groups" />
161 </when> 163 </when>
162 <when value="LeaveOneGroupOut"> 164 <when value="LeaveOneGroupOut">
163 <expand macro="cv_groups"/> 165 <expand macro="cv_groups" />
164 </when> 166 </when>
165 <when value="LeavePGroupsOut"> 167 <when value="LeavePGroupsOut">
166 <param argument="n_groups" type="integer" value="" label="n_groups" help="Number of groups (p) to leave out in the test split." /> 168 <param argument="n_groups" type="integer" value="" label="n_groups" help="Number of groups (p) to leave out in the test split." />
167 <expand macro="cv_groups"/> 169 <expand macro="cv_groups" />
168 </when> 170 </when>
169 </conditional> 171 </conditional>
170 <param name="nth_split" type="integer" min="1" value="1" label="Type the index of split to output" help="Split index starts from 1 to total = n_splits (x n_repeats). (nth_split)"/> 172 <param name="nth_split" type="integer" min="1" value="1" label="Type the index of split to output" help="Split index starts from 1 to total = n_splits (x n_repeats). (nth_split)" />
171 </when> 173 </when>
172 </conditional> 174 </conditional>
173 </inputs> 175 </inputs>
174 <outputs> 176 <outputs>
175 <data format="tabular" name="out_train" label="${tool.name} on ${on_string} (train)"/> 177 <data format="tabular" name="out_train" label="${tool.name} on ${on_string} (train)" />
176 <data format="tabular" name="out_test" label="${tool.name} on ${on_string} (test)"/> 178 <data format="tabular" name="out_test" label="${tool.name} on ${on_string} (test)" />
177 </outputs> 179 </outputs>
178 <tests> 180 <tests>
179 <test> 181 <test>
180 <param name="infile_array" value="regression_X.tabular" ftype="tabular"/> 182 <param name="infile_array" value="regression_X.tabular" ftype="tabular" />
181 <param name="header0" value="true"/> 183 <param name="header0" value="true" />
182 <conditional name="mode_selection"> 184 <conditional name="mode_selection">
183 <param name="selected_mode" value="train_test_split"/> 185 <param name="selected_mode" value="train_test_split" />
184 <section name="options"> 186 <section name="options">
185 <param name="random_state" value="123"/> 187 <param name="random_state" value="123" />
186 <conditional name="shuffle_selection"> 188 <conditional name="shuffle_selection">
187 <param name="shuffle" value="simple"/> 189 <param name="shuffle" value="simple" />
188 </conditional> 190 </conditional>
189 </section> 191 </section>
190 </conditional> 192 </conditional>
191 <output name="out_train" file="train_test_split_train01.tabular" ftype="tabular"/> 193 <output name="out_train" file="train_test_split_train01.tabular" ftype="tabular" />
192 <output name="out_test" file="train_test_split_test01.tabular" ftype="tabular"/> 194 <output name="out_test" file="train_test_split_test01.tabular" ftype="tabular" />
193 </test> 195 </test>
194 <test> 196 <test>
195 <param name="infile_array" value="regression_X.tabular" ftype="tabular"/> 197 <param name="infile_array" value="regression_X.tabular" ftype="tabular" />
196 <param name="header0" value="true"/> 198 <param name="header0" value="true" />
197 <conditional name="mode_selection"> 199 <conditional name="mode_selection">
198 <param name="selected_mode" value="cv_splitter"/> 200 <param name="selected_mode" value="cv_splitter" />
199 <conditional name="cv_selector"> 201 <conditional name="cv_selector">
200 <param name="selected_cv" value="ShuffleSplit"/> 202 <param name="selected_cv" value="ShuffleSplit" />
201 <param name="random_state" value="123"/> 203 <param name="random_state" value="123" />
202 <param name="n_splits" value="2"/> 204 <param name="n_splits" value="2" />
203 <param name="test_size" value="0.25"/> 205 <param name="test_size" value="0.25" />
204 </conditional> 206 </conditional>
205 </conditional> 207 </conditional>
206 <output name="out_train" file="train_test_split_train01.tabular" ftype="tabular"/> 208 <output name="out_train" file="train_test_split_train01.tabular" ftype="tabular" />
207 <output name="out_test" file="train_test_split_test01.tabular" ftype="tabular"/> 209 <output name="out_test" file="train_test_split_test01.tabular" ftype="tabular" />
208 </test> 210 </test>
209 <test> 211 <test>
210 <param name="infile_array" value="imblearn_X.tabular" ftype="tabular"/> 212 <param name="infile_array" value="imblearn_X.tabular" ftype="tabular" />
211 <param name="header0" value="false"/> 213 <param name="header0" value="false" />
212 <conditional name="mode_selection"> 214 <conditional name="mode_selection">
213 <param name="selected_mode" value="train_test_split"/> 215 <param name="selected_mode" value="train_test_split" />
214 <section name="options"> 216 <section name="options">
215 <param name="test_size" value="0.2"/> 217 <param name="test_size" value="0.2" />
216 <param name="random_state" value="123"/> 218 <param name="random_state" value="123" />
217 <conditional name="shuffle_selection"> 219 <conditional name="shuffle_selection">
218 <param name="shuffle" value="stratified"/> 220 <param name="shuffle" value="stratified" />
219 <param name="labels" value="imblearn_y.tabular" ftype="tabular"/> 221 <param name="labels" value="imblearn_y.tabular" ftype="tabular" />
220 <param name="header1" value="false"/> 222 <param name="header1" value="false" />
221 <param name="col" value="1"/> 223 <param name="col" value="1" />
222 </conditional> 224 </conditional>
223 </section> 225 </section>
224 </conditional> 226 </conditional>
225 <output name="out_train" file="train_test_split_train02.tabular" ftype="tabular"/> 227 <output name="out_train" file="train_test_split_train02.tabular" ftype="tabular" />
226 <output name="out_test" file="train_test_split_test02.tabular" ftype="tabular"/> 228 <output name="out_test" file="train_test_split_test02.tabular" ftype="tabular" />
227 </test> 229 </test>
228 <test> 230 <test>
229 <param name="infile_array" value="imblearn_X.tabular" ftype="tabular"/> 231 <param name="infile_array" value="imblearn_X.tabular" ftype="tabular" />
230 <param name="header0" value="false"/> 232 <param name="header0" value="false" />
231 <conditional name="mode_selection"> 233 <conditional name="mode_selection">
232 <param name="selected_mode" value="cv_splitter"/> 234 <param name="selected_mode" value="cv_splitter" />
233 <conditional name="cv_selector"> 235 <conditional name="cv_selector">
234 <param name="selected_cv" value="StratifiedShuffleSplit"/> 236 <param name="selected_cv" value="StratifiedShuffleSplit" />
235 <param name="random_state" value="123"/> 237 <param name="random_state" value="123" />
236 <param name="test_size" value="0.2"/> 238 <param name="test_size" value="0.2" />
237 <param name="n_splits" value="1"/> 239 <param name="n_splits" value="1" />
238 <section name="target_input"> 240 <section name="target_input">
239 <param name="labels" value="imblearn_y.tabular" ftype="tabular"/> 241 <param name="labels" value="imblearn_y.tabular" ftype="tabular" />
240 <param name="header1" value="false"/> 242 <param name="header1" value="false" />
241 <param name="col" value="1"/> 243 <param name="col" value="1" />
242 </section> 244 </section>
243 </conditional> 245 </conditional>
244 </conditional> 246 </conditional>
245 <output name="out_train" file="train_test_split_train02.tabular" ftype="tabular"/> 247 <output name="out_train" file="train_test_split_train02.tabular" ftype="tabular" />
246 <output name="out_test" file="train_test_split_test02.tabular" ftype="tabular"/> 248 <output name="out_test" file="train_test_split_test02.tabular" ftype="tabular" />
247 </test> 249 </test>
248 <test> 250 <test>
249 <param name="infile_array" value="regression_X.tabular" ftype="tabular"/> 251 <param name="infile_array" value="regression_X.tabular" ftype="tabular" />
250 <param name="header0" value="true"/> 252 <param name="header0" value="true" />
251 <conditional name="mode_selection"> 253 <conditional name="mode_selection">
252 <param name="selected_mode" value="cv_splitter"/> 254 <param name="selected_mode" value="cv_splitter" />
253 <conditional name="cv_selector"> 255 <conditional name="cv_selector">
254 <param name="selected_cv" value="OrderedKFold"/> 256 <param name="selected_cv" value="OrderedKFold" />
255 <param name="random_state" value="123"/> 257 <param name="random_state" value="123" />
256 <param name="shuffle" value="true"/> 258 <param name="shuffle" value="true" />
257 <param name="n_splits" value="5"/> 259 <param name="n_splits" value="5" />
258 <section name="target_input"> 260 <section name="target_input">
259 <param name="labels" value="regression_y.tabular" ftype="tabular"/> 261 <param name="labels" value="regression_y.tabular" ftype="tabular" />
260 <param name="header1" value="true"/> 262 <param name="header1" value="true" />
261 <param name="col" value="1"/> 263 <param name="col" value="1" />
262 </section> 264 </section>
263 </conditional> 265 </conditional>
264 </conditional> 266 </conditional>
265 <output name="out_train" file="train_test_split_train03.tabular" ftype="tabular"/> 267 <output name="out_train" file="train_test_split_train03.tabular" ftype="tabular" />
266 <output name="out_test" file="train_test_split_test03.tabular" ftype="tabular"/> 268 <output name="out_test" file="train_test_split_test03.tabular" ftype="tabular" />
267 </test> 269 </test>
268 </tests> 270 </tests>
269 <help><![CDATA[ 271 <help><![CDATA[
270 **What it does** 272 **What it does**
271 273 This tool implements the splitter function and classes from the `sklearn.model_selection` module to split contents (rows) of a table into two subsets for training and test, respectively. The simple train test split mode not only supports shuffle split and stratified shuffle split natively carried by the `train_test_split` function, but also gets extended to do group shuffle. The cross-validation splitter mode supports more diverse splitting strategies. Each tool run outputs one split, train and test. To get different splitting sets, for example, nested CV, multiple tool runs are needed with different `nth_split`.
272 This tool implements the splitter function and classes from the `sklearn.model_selection` module to split contents (rows) of a table into
273 two subsets for training and test, respectively. The simple train test split mode not only supports shuffle split and stratified
274 shuffle split natively carried by the `train_test_split` function, but also gets extended to do group shuffle.
275 The cross-validation splitter mode supports more diverse splitting strategies. Each tool run outputs one split, train and test.
276 To get different splitting sets, for example, nested CV, multiple tool runs are needed with different `nth_split`.
277 Example: 6-fold CV. Set `n_splits` to 6. Run the tool 6 times with the same parameters, but set `nth_split` according to the number of the run (1-6).
278 274
279 - Train Test Split mode 275 - Train Test Split mode
280 - direct split, no shuffle 276 - direct split, no shuffle
281 - shuffle split 277 - shuffle split
282 - stratified shuffle split 278 - stratified shuffle split
291 Input: a tabular dataset. 287 Input: a tabular dataset.
292 288
293 Output: two tabular datasets containing training and test subsets, respectively. 289 Output: two tabular datasets containing training and test subsets, respectively.
294 290
295 ]]></help> 291 ]]></help>
296 <expand macro="sklearn_citation"/> 292 <expand macro="sklearn_citation" />
297 </tool> 293 </tool>