comparison split_file_to_collection.xml @ 3:2ddc36385d7a draft

"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 8d069684e155d2f5b6fae06d14d98ce41321da53"
author bgruening
date Tue, 10 Sep 2019 12:31:15 -0400
parents d150ac3d853d
children 0850f2dfba13
comparison
equal deleted inserted replaced
2:d150ac3d853d 3:2ddc36385d7a
48 --by '$split_parms.split_by.select_split_by' 48 --by '$split_parms.split_by.select_split_by'
49 #if $split_parms.split_by.select_split_by == "col": 49 #if $split_parms.split_by.select_split_by == "col":
50 --id_column '$split_parms.split_by.id_col' 50 --id_column '$split_parms.split_by.id_col'
51 --match '$split_parms.split_by.match_regex' 51 --match '$split_parms.split_by.match_regex'
52 --sub '$split_parms.split_by.sub_regex' 52 --sub '$split_parms.split_by.sub_regex'
53 #else 53 #else
54 --numnew '$split_parms.split_by.numnew' 54 --numnew '$split_parms.split_by.numnew'
55 #if $split_parms.split_by.select_allocate.allocate == "random": 55 #if $split_parms.split_by.select_allocate.allocate == "random":
56 --rand 56 --rand
57 --seed '$split_parms.split_by.rand.seed' 57 --seed '$split_parms.split_by.rand.seed'
58 #end if 58 #end if
59 #if $split_parms.split_by.select_allocate.allocate == "batch": 59 #if $split_parms.split_by.select_allocate.allocate == "batch":
188 <param name="input" value="test.tabular" ftype="tabular"/> 188 <param name="input" value="test.tabular" ftype="tabular"/>
189 <param name="select_ftype" value="tabular"/> 189 <param name="select_ftype" value="tabular"/>
190 <param name="select_split_by" value="row"/> 190 <param name="select_split_by" value="row"/>
191 <param name="top" value="2"/> 191 <param name="top" value="2"/>
192 <param name="numnew" value="2"/> 192 <param name="numnew" value="2"/>
193 <param name="newfilenames" value="test"/> 193 <param name="newfilenames" value="test"/>
194 <output_collection name="list_output_tab" type="list"> 194 <output_collection name="list_output_tab" type="list">
195 <element name="test_0.tabular" file="test_0.tabular" ftype="tabular"/> 195 <element name="test_000000.tabular" file="test_0.tabular" ftype="tabular"/>
196 <element name="test_1.tabular" file="test_1.tabular" ftype="tabular"/> 196 <element name="test_000001.tabular" file="test_1.tabular" ftype="tabular"/>
197 </output_collection> 197 </output_collection>
198 </test> 198 </test>
199 <test> 199 <test>
200 <param name="input" value="test.tabular" ftype="tabular"/> 200 <param name="input" value="test.tabular" ftype="tabular"/>
201 <param name="select_ftype" value="tabular"/> 201 <param name="select_ftype" value="tabular"/>
202 <param name="select_split_by" value="row"/> 202 <param name="select_split_by" value="row"/>
203 <param name="top" value="2"/> 203 <param name="top" value="2"/>
204 <param name="numnew" value="2"/> 204 <param name="numnew" value="2"/>
205 <param name="newfilenames" value="batch_tab"/> 205 <param name="newfilenames" value="batch_tab"/>
206 <param name="allocate" value="batch"/> 206 <param name="allocate" value="batch"/>
207 <output_collection name="list_output_tab" type="list"> 207 <output_collection name="list_output_tab" type="list">
208 <element name="batch_tab_0.tabular" file="batch_tab_0.tabular" ftype="tabular"/> 208 <element name="batch_tab_000000.tabular" file="batch_tab_0.tabular" ftype="tabular"/>
209 <element name="batch_tab_1.tabular" file="batch_tab_1.tabular" ftype="tabular"/> 209 <element name="batch_tab_000001.tabular" file="batch_tab_1.tabular" ftype="tabular"/>
210 </output_collection>
211 </test>
212 <test>
213 <param name="select_ftype" value="txt"/>
214 <param name="input" value="karyotype.txt" ftype="txt"/>
215 <param name="numnew" value="24"/>
216 <param name="newfilenames" value="chr"/>
217 <param name="allocate" value="batch"/>
218
219 <output_collection name="list_output_txt" type="list">
220 <element name="chr_000000.txt" file="chr_000000.txt" ftype="txt"/>
221 <element name="chr_000001.txt" file="chr_000001.txt" ftype="txt"/>
222 <element name="chr_000002.txt" file="chr_000002.txt" ftype="txt"/>
223 <element name="chr_000003.txt" file="chr_000003.txt" ftype="txt"/>
224 <element name="chr_000004.txt" file="chr_000004.txt" ftype="txt"/>
225 <element name="chr_000005.txt" file="chr_000005.txt" ftype="txt"/>
226 <element name="chr_000006.txt" file="chr_000006.txt" ftype="txt"/>
227 <element name="chr_000007.txt" file="chr_000007.txt" ftype="txt"/>
228 <element name="chr_000008.txt" file="chr_000008.txt" ftype="txt"/>
229 <element name="chr_000009.txt" file="chr_000009.txt" ftype="txt"/>
230 <element name="chr_000010.txt" file="chr_000010.txt" ftype="txt"/>
231 <element name="chr_000011.txt" file="chr_000011.txt" ftype="txt"/>
232 <element name="chr_000012.txt" file="chr_000012.txt" ftype="txt"/>
233 <element name="chr_000013.txt" file="chr_000013.txt" ftype="txt"/>
234 <element name="chr_000014.txt" file="chr_000014.txt" ftype="txt"/>
235 <element name="chr_000015.txt" file="chr_000015.txt" ftype="txt"/>
236 <element name="chr_000016.txt" file="chr_000016.txt" ftype="txt"/>
237 <element name="chr_000017.txt" file="chr_000017.txt" ftype="txt"/>
238 <element name="chr_000018.txt" file="chr_000018.txt" ftype="txt"/>
239 <element name="chr_000019.txt" file="chr_000019.txt" ftype="txt"/>
240 <element name="chr_000020.txt" file="chr_000020.txt" ftype="txt"/>
241 <element name="chr_000021.txt" file="chr_000021.txt" ftype="txt"/>
242 <element name="chr_000022.txt" file="chr_000022.txt" ftype="txt"/>
243 <element name="chr_000023.txt" file="chr_000023.txt" ftype="txt"/>
210 </output_collection> 244 </output_collection>
211 </test> 245 </test>
212 <test> 246 <test>
213 <param name="input" value="psm.tabular" ftype="tabular"/> 247 <param name="input" value="psm.tabular" ftype="tabular"/>
214 <param name="select_ftype" value="tabular"/> 248 <param name="select_ftype" value="tabular"/>
228 <param name="input" value="demo758Dacentroid.mgf" ftype="mgf"/> 262 <param name="input" value="demo758Dacentroid.mgf" ftype="mgf"/>
229 <param name="select_ftype" value="mgf"/> 263 <param name="select_ftype" value="mgf"/>
230 <param name="numnew" value="3"/> 264 <param name="numnew" value="3"/>
231 <param name="newfilenames" value="demo"/> 265 <param name="newfilenames" value="demo"/>
232 <output_collection name="list_output_mgf" type="list"> 266 <output_collection name="list_output_mgf" type="list">
233 <element name="demo_0.mgf" file="demo_0.mgf" ftype="mgf"/> 267 <element name="demo_000000.mgf" file="demo_0.mgf" ftype="mgf"/>
234 <element name="demo_1.mgf" file="demo_1.mgf" ftype="mgf"/> 268 <element name="demo_000001.mgf" file="demo_1.mgf" ftype="mgf"/>
235 <element name="demo_2.mgf" file="demo_2.mgf" ftype="mgf"/> 269 <element name="demo_000002.mgf" file="demo_2.mgf" ftype="mgf"/>
236 </output_collection> 270 </output_collection>
237 </test> 271 </test>
238 <test> 272 <test>
239 <param name="input" value="test.fasta" ftype="fasta"/> 273 <param name="input" value="test.fasta" ftype="fasta"/>
240 <param name="select_ftype" value="fasta"/> 274 <param name="select_ftype" value="fasta"/>
241 <param name="numnew" value="2"/> 275 <param name="numnew" value="2"/>
242 <param name="newfilenames" value="test"/> 276 <param name="newfilenames" value="test"/>
243 <output_collection name="list_output_fasta" type="list"> 277 <output_collection name="list_output_fasta" type="list">
244 <element name="test_0.fasta" file="test_0.fasta" ftype="fasta"/> 278 <element name="test_000000.fasta" file="test_0.fasta" ftype="fasta"/>
245 <element name="test_1.fasta" file="test_1.fasta" ftype="fasta"/> 279 <element name="test_000001.fasta" file="test_1.fasta" ftype="fasta"/>
246 </output_collection> 280 </output_collection>
247 </test> 281 </test>
248 <test> 282 <test>
249 <param name="input" value="test.fastq" ftype="fastq"/> 283 <param name="input" value="test.fastq" ftype="fastq"/>
250 <param name="select_ftype" value="fastq"/> 284 <param name="select_ftype" value="fastq"/>
251 <param name="numnew" value="2"/> 285 <param name="numnew" value="2"/>
252 <param name="newfilenames" value="test"/> 286 <param name="newfilenames" value="test"/>
253 <output_collection name="list_output_fastq" type="list"> 287 <output_collection name="list_output_fastq" type="list">
254 <element name="test_0.fastq" file="test_0.fastq" ftype="fastq"/> 288 <element name="test_000000.fastq" file="test_0.fastq" ftype="fastq"/>
255 <element name="test_1.fastq" file="test_1.fastq" ftype="fastq"/> 289 <element name="test_000001.fastq" file="test_1.fastq" ftype="fastq"/>
256 </output_collection> 290 </output_collection>
257 </test> 291 </test>
258 <test> 292 <test>
259 <param name="input" value="test.fasta" ftype="fasta"/> 293 <param name="input" value="test.fasta" ftype="fasta"/>
260 <param name="select_ftype" value="fasta"/> 294 <param name="select_ftype" value="fasta"/>
261 <param name="numnew" value="2"/> 295 <param name="numnew" value="2"/>
262 <param name="newfilenames" value="rand"/> 296 <param name="newfilenames" value="rand"/>
263 <param name="allocate" value="random"/> 297 <param name="allocate" value="random"/>
264 <param name="seed" value="1010"/> 298 <param name="seed" value="1010"/>
265 <output_collection name="list_output_fasta" type="list"> 299 <output_collection name="list_output_fasta" type="list">
266 <element name="rand_0.fasta" file="rand_0.fasta" ftype="fasta"/> 300 <element name="rand_000000.fasta" file="rand_0.fasta" ftype="fasta"/>
267 <element name="rand_1.fasta" file="rand_1.fasta" ftype="fasta"/> 301 <element name="rand_000001.fasta" file="rand_1.fasta" ftype="fasta"/>
268 </output_collection> 302 </output_collection>
269 </test> 303 </test>
270 <test> 304 <test>
271 <param name="input" value="test.fasta" ftype="fasta"/> 305 <param name="input" value="test.fasta" ftype="fasta"/>
272 <param name="select_ftype" value="fasta"/> 306 <param name="select_ftype" value="fasta"/>
273 <param name="numnew" value="2"/> 307 <param name="numnew" value="2"/>
274 <param name="newfilenames" value="fasta_batch"/> 308 <param name="newfilenames" value="fasta_batch"/>
275 <param name="allocate" value="batch"/> 309 <param name="allocate" value="batch"/>
276 <output_collection name="list_output_fasta" type="list"> 310 <output_collection name="list_output_fasta" type="list">
277 <element name="fasta_batch_0.fasta" file="fasta_batch_0.fasta" ftype="fasta"/> 311 <element name="fasta_batch_000000.fasta" file="fasta_batch_0.fasta" ftype="fasta"/>
278 <element name="fasta_batch_1.fasta" file="fasta_batch_1.fasta" ftype="fasta"/> 312 <element name="fasta_batch_000001.fasta" file="fasta_batch_1.fasta" ftype="fasta"/>
279 </output_collection> 313 </output_collection>
280 </test> 314 </test>
281 <test> 315 <test>
282 <param name="input" value="test.tabular" ftype="txt"/> 316 <param name="input" value="test.tabular" ftype="txt"/>
283 <param name="select_ftype" value="txt"/> 317 <param name="select_ftype" value="txt"/>
284 <param name="numnew" value="2"/> 318 <param name="numnew" value="2"/>
285 <param name="newfilenames" value="test"/> 319 <param name="newfilenames" value="test"/>
286 <output_collection name="list_output_txt" type="list"> 320 <output_collection name="list_output_txt" type="list">
287 <element name="test_0.txt" file="test_0.tabular" ftype="txt" lines_diff="1"/> 321 <element name="test_000000.txt" file="test_0.tabular" ftype="txt" lines_diff="1"/>
288 <element name="test_1.txt" file="test_1.tabular" ftype="txt" lines_diff="1"/> 322 <element name="test_000001.txt" file="test_1.tabular" ftype="txt" lines_diff="1"/>
289 </output_collection> 323 </output_collection>
290 </test> 324 </test>
291 <test> 325 <test>
292 <param name="input" value="test.tabular" ftype="txt"/> 326 <param name="input" value="test.tabular" ftype="txt"/>
293 <param name="select_ftype" value="generic"/> 327 <param name="select_ftype" value="generic"/>
294 <param name="generic_regex" value="^.*"/> 328 <param name="generic_regex" value="^.*"/>
295 <param name="numnew" value="2"/> 329 <param name="numnew" value="2"/>
296 <param name="newfilenames" value="test"/> 330 <param name="newfilenames" value="test"/>
297 <output_collection name="list_output_generic" type="list"> 331 <output_collection name="list_output_generic" type="list">
298 <element name="test_0" file="test_0.tabular" ftype="txt" lines_diff="1"/> 332 <element name="test_000000" file="test_0.tabular" ftype="txt" lines_diff="1"/>
299 <element name="test_1" file="test_1.tabular" ftype="txt" lines_diff="1"/> 333 <element name="test_000001" file="test_1.tabular" ftype="txt" lines_diff="1"/>
300 </output_collection> 334 </output_collection>
301 </test> 335 </test>
302 <test> 336 <test>
303 <param name="input" value="test.fasta" ftype="fasta"/> 337 <param name="input" value="test.fasta" ftype="fasta"/>
304 <param name="select_ftype" value="generic"/> 338 <param name="select_ftype" value="generic"/>
305 <param name="generic_regex" value="^>.*"/> 339 <param name="generic_regex" value="^>.*"/>
306 <param name="numnew" value="2"/> 340 <param name="numnew" value="2"/>
307 <param name="newfilenames" value="rand"/> 341 <param name="newfilenames" value="rand"/>
308 <param name="allocate" value="random"/> 342 <param name="allocate" value="random"/>
309 <param name="seed" value="1010"/> 343 <param name="seed" value="1010"/>
310 <output_collection name="list_output_generic" type="list"> 344 <output_collection name="list_output_generic" type="list">
311 <element name="rand_0" file="rand_0.fasta" ftype="fasta"/> 345 <element name="rand_000000" file="rand_0.fasta" ftype="fasta"/>
312 <element name="rand_1" file="rand_1.fasta" ftype="fasta"/> 346 <element name="rand_000001" file="rand_1.fasta" ftype="fasta"/>
313 </output_collection> 347 </output_collection>
314 </test> 348 </test>
315 <test> 349 <test>
316 <param name="input" value="3_molecules.sdf" ftype="sdf"/> 350 <param name="input" value="3_molecules.sdf" ftype="sdf"/>
317 <param name="select_ftype" value="generic"/> 351 <param name="select_ftype" value="generic"/>
318 <param name="generic_regex" value="^\$\$\$\$.*"/> 352 <param name="generic_regex" value="^\$\$\$\$.*"/>
319 <param name="numnew" value="1000"/> 353 <param name="numnew" value="1000"/>
320 <param name="newfilenames" value="mol"/> 354 <param name="newfilenames" value="mol"/>
321 <param name="allocate" value="batch"/> 355 <param name="allocate" value="batch"/>
322 <output_collection name="list_output_generic" type="list"> 356 <output_collection name="list_output_generic" type="list">
323 <element name="mol_0" file="mol_0.sdf" ftype="sdf"/> 357 <element name="mol_000000" file="mol_0.sdf" ftype="sdf"/>
324 <element name="mol_1" file="mol_1.sdf" ftype="sdf"/> 358 <element name="mol_000001" file="mol_1.sdf" ftype="sdf"/>
325 <element name="mol_2" file="mol_2.sdf" ftype="sdf"/> 359 <element name="mol_000002" file="mol_2.sdf" ftype="sdf"/>
326 </output_collection> 360 </output_collection>
327 </test> 361 </test>
328 </tests> 362 </tests>
329 <help><![CDATA[ 363 <help><![CDATA[
330 **Split file into a dataset collection** 364 **Split file into a dataset collection**
331 365
332 This tool splits a data set consisting of records into multiple data sets within a collection. 366 This tool splits a data set consisting of records into multiple data sets within a collection.
333 A record can be for instance simply a line, a FASTA sequence (header + sequence), a FASTQ sequence 367 A record can be for instance simply a line, a FASTA sequence (header + sequence), a FASTQ sequence
334 (headers + sequence + qualities), etc. The important property is that the beginning of a new record 368 (headers + sequence + qualities), etc. The important property is that the beginning of a new record
335 can be specified by a regular expression, e.g. ".*" for lines, ">.*" for FASTA, or "@.*" for FASTQ. 369 can be specified by a regular expression, e.g. ".*" for lines, ">.*" for FASTA, or "@.*" for FASTQ.
336 The tool has presets for text, tabular data sets (which are split by line), FASTA, FASTQ, and MGF. 370 The tool has presets for text, tabular data sets (which are split by line), FASTA, FASTQ, and MGF.
337 For other data types the text delimiting records can be specified manually using the generic splitter. 371 For other data types the text delimiting records can be specified manually using the generic splitter.
338 372
339 If splitting by line (or by some other item, like a FASTA entry or an MGF record), the splitting can be done either alternating, in original record order, or at random. 373 If splitting by line (or by some other item, like a FASTA entry or an MGF record), the splitting can be done either alternating, in original record order, or at random.
340 374
341 If t records are to be distributed to n new data sets, then the i-th record goes to data set 375 If t records are to be distributed to n new data sets, then the i-th record goes to data set
342 376
343 * floor(i / t * n) (for batch), 377 * floor(i / t * n) (for batch),
344 * i % n (for alternating), or 378 * i % n (for alternating), or
345 * a random data set 379 * a random data set
346 380
347 For instance, t=5 records are distributed as follows on n=2 data sets 381 For instance, t=5 records are distributed as follows on n=2 data sets
348 382
366 2 1 2 2 400 2 1 2 2
367 3 1 0 0 401 3 1 0 0
368 4 2 1 1 402 4 2 1 1
369 = === === ==== 403 = === === ====
370 404
371 Note that there are no guarantees when splitting at random that every result file will be non-empty, so downstream tools should be able to gracefully handle empty files. 405 Note that there are no guarantees when splitting at random that every result file will be non-empty, so downstream tools should be able to gracefully handle empty files.
372 406
373 If a tabular file is used as input, you may choose to split by line or by column. If split by column, a new file is created for each unique value in the column. 407 If a tabular file is used as input, you may choose to split by line or by column. If split by column, a new file is created for each unique value in the column.
374 In addition, (Python) regular expressions may be used to transform the value in the column to a new value. Caution should be used with this feature, as it could transform all values to the same value, or cause other unexpected behavior. 408 In addition, (Python) regular expressions may be used to transform the value in the column to a new value. Caution should be used with this feature, as it could transform all values to the same value, or cause other unexpected behavior.
375 The default regular expression uses each value in the column without modifying it. 409 The default regular expression uses each value in the column without modifying it.
376 ]]></help> 410 ]]></help>
377 <citations> 411 <citations>
378 <citation type="bibtex"> 412 <citation type="bibtex">
379 @misc{githubsplit, 413 @misc{githubsplit,
380 author = {Easterly, Caleb}, 414 author = {Easterly, Caleb},