comparison split_file_to_collection.xml @ 4:0850f2dfba13 draft

"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 872590086696cfbc248519507ecb9063926297ad"
author bgruening
date Wed, 09 Oct 2019 07:34:49 -0400
parents 2ddc36385d7a
children e77b954f0da5
comparison
equal deleted inserted replaced
3:2ddc36385d7a 4:0850f2dfba13
1 <tool id="split_file_to_collection" name="Split file" version="0.2.0"> 1 <tool id="split_file_to_collection" name="Split file" version="0.3.0">
2 <description>to dataset collection</description> 2 <description>to dataset collection</description>
3 <macros> 3 <macros>
4 <xml name="regex_sanitizer"> 4 <xml name="regex_sanitizer">
5 <sanitizer> 5 <sanitizer>
6 <valid> 6 <valid>
13 <add source="&apos;" target="__sq__"/> 13 <add source="&apos;" target="__sq__"/>
14 </mapping> 14 </mapping>
15 </sanitizer> 15 </sanitizer>
16 </xml> 16 </xml>
17 <xml name="numnew_fname"> 17 <xml name="numnew_fname">
18 <param name="numnew" type="integer" label="Number of new files" min="1" value="1"/> 18 <conditional name="select_mode">
19 <param name="mode" type="select" label="Specify number of output files or number of records per file?" help="Specify the number of records ('chunk size') to place in each file. The 'Number of new files' parameter will be ignored.">
20 <option value="chunk">Number of records per file ('chunk mode')</option>
21 <option value="numnew">Number of output files</option>
22 </param>
23 <when value="chunk">
24 <param name="chunksize" type="integer" label="Chunk size" min="1" value="1" help="Number of records per output file."/>
25 </when>
26 <when value="numnew">
27 <param name="numnew" type="integer" label="Number of new files" min="1" value="1"/>
28 </when>
29 </conditional>
19 <param name="newfilenames" type="text" label="Base name for new files in collection" 30 <param name="newfilenames" type="text" label="Base name for new files in collection"
20 help="This will increment automatically - if input is 'file', then output is 'file0', 'file1', etc." value="split_file"/> 31 help="This will increment automatically - if input is 'file', then output is 'file0', 'file1', etc." value="split_file"/>
21 <conditional name="select_allocate"> 32 <conditional name="select_allocate">
22 <param name="allocate" type="select" label="Method to allocate records to new files" help="See the information section for a diagram"> 33 <param name="allocate" type="select" label="Method to allocate records to new files" help="See the information section for a diagram">
23 <option value="random">At random</option> 34 <option value="random">At random</option>
24 <option value="batch">Maintain record order</option> 35 <option value="batch">Maintain record order</option>
25 <option value="byrow" selected="true">Alternate output files</option> 36 <option value="byrow" selected="true">Alternate output files</option>
26 </param> 37 </param>
27 <when value="random"> 38 <when value="random">
28 <param name="seed" type="integer" label="Random number seed" help="For reproducibility, set this to some arbitrary integer (i.e. '1010')" value="1010"/> 39 <param name="seed" type="integer" label="Random number seed" help="For reproducibility, set this to some arbitrary integer (e.g. '1010')" value="1010"/>
29 </when> 40 </when>
30 <when value="batch"> 41 <when value="batch">
31 </when> 42 </when>
32 <when value="byrow"> 43 <when value="byrow">
33 </when> 44 </when>
49 #if $split_parms.split_by.select_split_by == "col": 60 #if $split_parms.split_by.select_split_by == "col":
50 --id_column '$split_parms.split_by.id_col' 61 --id_column '$split_parms.split_by.id_col'
51 --match '$split_parms.split_by.match_regex' 62 --match '$split_parms.split_by.match_regex'
52 --sub '$split_parms.split_by.sub_regex' 63 --sub '$split_parms.split_by.sub_regex'
53 #else 64 #else
54 --numnew '$split_parms.split_by.numnew' 65 #if $split_parms.split_by.select_mode.mode == "numnew":
66 --numnew '$split_parms.split_by.select_mode.numnew'
67 #else
68 --chunksize $split_parms.split_by.select_mode.chunksize
69 #end if
55 #if $split_parms.split_by.select_allocate.allocate == "random": 70 #if $split_parms.split_by.select_allocate.allocate == "random":
56 --rand 71 --rand
57 --seed '$split_parms.split_by.rand.seed' 72 --seed '$split_parms.split_by.rand.seed'
58 #end if 73 #end if
59 #if $split_parms.split_by.select_allocate.allocate == "batch": 74 #if $split_parms.split_by.select_allocate.allocate == "batch":
61 #end if 76 #end if
62 #end if 77 #end if
63 #else 78 #else
64 #if $split_parms.select_ftype == "generic" 79 #if $split_parms.select_ftype == "generic"
65 --generic_re '$split_parms.generic_regex' 80 --generic_re '$split_parms.generic_regex'
81 #if $split_parms.split_after == 'true':
82 --split_after
83 #end if
66 #end if 84 #end if
67 --numnew '$split_parms.numnew' 85 #if $split_parms.select_mode.mode == "numnew":
86 --numnew '$split_parms.select_mode.numnew'
87 #else
88 --chunksize $split_parms.select_mode.chunksize
89 #end if
68 #if $split_parms.select_allocate.allocate == "random": 90 #if $split_parms.select_allocate.allocate == "random":
69 --rand 91 --rand
70 --seed '$split_parms.select_allocate.seed' 92 --seed '$split_parms.select_allocate.seed'
71 #end if 93 #end if
72 #if $split_parms.select_allocate.allocate == "batch": 94 #if $split_parms.select_allocate.allocate == "batch":
91 <param name="select_ftype" type="select" label="Select the file type to split"> 113 <param name="select_ftype" type="select" label="Select the file type to split">
92 <option value="mgf">MGF</option> 114 <option value="mgf">MGF</option>
93 <option value="fastq">FASTQ</option> 115 <option value="fastq">FASTQ</option>
94 <option value="tabular">Tabular</option> 116 <option value="tabular">Tabular</option>
95 <option value="fasta">FASTA</option> 117 <option value="fasta">FASTA</option>
118 <option value="sdf">SD-files</option>
96 <option value="txt">Text files</option> 119 <option value="txt">Text files</option>
97 <option value="generic">Generic</option> 120 <option value="generic">Generic</option>
98 </param> 121 </param>
99 <when value="tabular"> 122 <when value="tabular">
100 <param name="input" type="data" format="tabular" label="Tabular file to split"/> 123 <param name="input" type="data" format="tabular" label="Tabular file to split"/>
128 </when> 151 </when>
129 <when value="fasta"> 152 <when value="fasta">
130 <param name="input" type="data" format="fasta" label="FASTA file to split"/> 153 <param name="input" type="data" format="fasta" label="FASTA file to split"/>
131 <expand macro="numnew_fname"/> 154 <expand macro="numnew_fname"/>
132 </when> 155 </when>
156 <when value="sdf">
157 <param name="input" type="data" format="sdf" label="SD-file to split"/>
158 <expand macro="numnew_fname"/>
159 </when>
133 <when value="txt"> 160 <when value="txt">
134 <param name="input" type="data" format="txt" label="Text file to split"/> 161 <param name="input" type="data" format="txt" label="Text file to split"/>
135 <expand macro="numnew_fname"/> 162 <expand macro="numnew_fname"/>
136 </when> 163 </when>
137 <when value="generic"> 164 <when value="generic">
138 <param name="input" type="data" format="txt" label="File to split"/> 165 <param name="input" type="data" format="txt" label="File to split"/>
139 <param name="generic_regex" type="text" label="Regex to match record separator" value="^.*"> 166 <param name="generic_regex" type="text" label="Regex to match record separator" value="^.*">
140 <expand macro="regex_sanitizer"/> 167 <expand macro="regex_sanitizer"/>
141 </param> 168 </param>
142 <expand macro="numnew_fname"/> 169 <expand macro="numnew_fname"/>
170 <param name="split_after" type="select" value="false" label="Split records before or after the separator?" help="If before, the separator will appear at the start of each record; if after, at the end">
171 <option value="false" selected="true">Before</option>
172 <option value="true">After</option>
173 </param>
143 </when> 174 </when>
144 </conditional> 175 </conditional>
145 </inputs> 176 </inputs>
146 <outputs> 177 <outputs>
147 <collection name="list_output_tab" type="list" label="${tool.name} on ${on_string}"> 178 <collection name="list_output_tab" type="list" label="${tool.name} on ${on_string}">
157 <filter>split_parms['select_ftype'] == "fasta"</filter> 188 <filter>split_parms['select_ftype'] == "fasta"</filter>
158 </collection> 189 </collection>
159 <collection name="list_output_fastq" type="list" label="${tool.name} on ${on_string}"> 190 <collection name="list_output_fastq" type="list" label="${tool.name} on ${on_string}">
160 <discover_datasets pattern="__name__" directory="out" visible="false" format="fastq"/> 191 <discover_datasets pattern="__name__" directory="out" visible="false" format="fastq"/>
161 <filter>split_parms['select_ftype'] == "fastq"</filter> 192 <filter>split_parms['select_ftype'] == "fastq"</filter>
193 </collection>
194 <collection name="list_output_sdf" type="list" label="${tool.name} on ${on_string}">
195 <discover_datasets pattern="__name__" directory="out" visible="false" format="sdf"/>
196 <filter>split_parms['select_ftype'] == "sdf"</filter>
162 </collection> 197 </collection>
163 <collection name="list_output_txt" type="list" label="${tool.name} on ${on_string}"> 198 <collection name="list_output_txt" type="list" label="${tool.name} on ${on_string}">
164 <discover_datasets pattern="__name__" directory="out" visible="false" format="txt"/> 199 <discover_datasets pattern="__name__" directory="out" visible="false" format="txt"/>
165 <filter>split_parms['select_ftype'] == "txt"</filter> 200 <filter>split_parms['select_ftype'] == "txt"</filter>
166 </collection> 201 </collection>
187 <test> 222 <test>
188 <param name="input" value="test.tabular" ftype="tabular"/> 223 <param name="input" value="test.tabular" ftype="tabular"/>
189 <param name="select_ftype" value="tabular"/> 224 <param name="select_ftype" value="tabular"/>
190 <param name="select_split_by" value="row"/> 225 <param name="select_split_by" value="row"/>
191 <param name="top" value="2"/> 226 <param name="top" value="2"/>
227 <param name="mode" value="numnew"/>
192 <param name="numnew" value="2"/> 228 <param name="numnew" value="2"/>
193 <param name="newfilenames" value="test"/> 229 <param name="newfilenames" value="test"/>
194 <output_collection name="list_output_tab" type="list"> 230 <output_collection name="list_output_tab" type="list">
195 <element name="test_000000.tabular" file="test_0.tabular" ftype="tabular"/> 231 <element name="test_000000.tabular" file="test_0.tabular" ftype="tabular"/>
196 <element name="test_000001.tabular" file="test_1.tabular" ftype="tabular"/> 232 <element name="test_000001.tabular" file="test_1.tabular" ftype="tabular"/>
199 <test> 235 <test>
200 <param name="input" value="test.tabular" ftype="tabular"/> 236 <param name="input" value="test.tabular" ftype="tabular"/>
201 <param name="select_ftype" value="tabular"/> 237 <param name="select_ftype" value="tabular"/>
202 <param name="select_split_by" value="row"/> 238 <param name="select_split_by" value="row"/>
203 <param name="top" value="2"/> 239 <param name="top" value="2"/>
240 <param name="mode" value="numnew"/>
204 <param name="numnew" value="2"/> 241 <param name="numnew" value="2"/>
205 <param name="newfilenames" value="batch_tab"/> 242 <param name="newfilenames" value="batch_tab"/>
206 <param name="allocate" value="batch"/> 243 <param name="allocate" value="batch"/>
207 <output_collection name="list_output_tab" type="list"> 244 <output_collection name="list_output_tab" type="list">
208 <element name="batch_tab_000000.tabular" file="batch_tab_0.tabular" ftype="tabular"/> 245 <element name="batch_tab_000000.tabular" file="batch_tab_0.tabular" ftype="tabular"/>
209 <element name="batch_tab_000001.tabular" file="batch_tab_1.tabular" ftype="tabular"/> 246 <element name="batch_tab_000001.tabular" file="batch_tab_1.tabular" ftype="tabular"/>
210 </output_collection> 247 </output_collection>
211 </test> 248 </test>
212 <test> 249 <test>
250 <param name="input" value="test.tabular" ftype="tabular"/>
251 <param name="select_ftype" value="tabular"/>
252 <param name="select_split_by" value="row"/>
253 <param name="top" value="2"/>
254 <param name="mode" value="chunk"/>
255 <param name="chunksize" value="2"/>
256 <param name="newfilenames" value="batch_tab"/>
257 <param name="allocate" value="batch"/>
258 <output_collection name="list_output_tab" type="list">
259 <element name="batch_tab_000000.tabular" file="batch_tab_0.tabular" ftype="tabular"/>
260 <element name="batch_tab_000001.tabular" file="batch_tab_1.tabular" ftype="tabular"/>
261 </output_collection>
262 </test>
263 <test>
213 <param name="select_ftype" value="txt"/> 264 <param name="select_ftype" value="txt"/>
214 <param name="input" value="karyotype.txt" ftype="txt"/> 265 <param name="input" value="karyotype.txt" ftype="txt"/>
266 <param name="mode" value="numnew"/>
215 <param name="numnew" value="24"/> 267 <param name="numnew" value="24"/>
216 <param name="newfilenames" value="chr"/> 268 <param name="newfilenames" value="chr"/>
217 <param name="allocate" value="batch"/> 269 <param name="allocate" value="batch"/>
218 270
219 <output_collection name="list_output_txt" type="list"> 271 <output_collection name="list_output_txt" type="list">
259 </output_collection> 311 </output_collection>
260 </test> 312 </test>
261 <test> 313 <test>
262 <param name="input" value="demo758Dacentroid.mgf" ftype="mgf"/> 314 <param name="input" value="demo758Dacentroid.mgf" ftype="mgf"/>
263 <param name="select_ftype" value="mgf"/> 315 <param name="select_ftype" value="mgf"/>
316 <param name="mode" value="numnew"/>
264 <param name="numnew" value="3"/> 317 <param name="numnew" value="3"/>
265 <param name="newfilenames" value="demo"/> 318 <param name="newfilenames" value="demo"/>
266 <output_collection name="list_output_mgf" type="list"> 319 <output_collection name="list_output_mgf" type="list">
267 <element name="demo_000000.mgf" file="demo_0.mgf" ftype="mgf"/> 320 <element name="demo_000000.mgf" file="demo_0.mgf" ftype="mgf"/>
268 <element name="demo_000001.mgf" file="demo_1.mgf" ftype="mgf"/> 321 <element name="demo_000001.mgf" file="demo_1.mgf" ftype="mgf"/>
270 </output_collection> 323 </output_collection>
271 </test> 324 </test>
272 <test> 325 <test>
273 <param name="input" value="test.fasta" ftype="fasta"/> 326 <param name="input" value="test.fasta" ftype="fasta"/>
274 <param name="select_ftype" value="fasta"/> 327 <param name="select_ftype" value="fasta"/>
328 <param name="mode" value="numnew"/>
275 <param name="numnew" value="2"/> 329 <param name="numnew" value="2"/>
276 <param name="newfilenames" value="test"/> 330 <param name="newfilenames" value="test"/>
277 <output_collection name="list_output_fasta" type="list"> 331 <output_collection name="list_output_fasta" type="list">
278 <element name="test_000000.fasta" file="test_0.fasta" ftype="fasta"/> 332 <element name="test_000000.fasta" file="test_0.fasta" ftype="fasta"/>
279 <element name="test_000001.fasta" file="test_1.fasta" ftype="fasta"/> 333 <element name="test_000001.fasta" file="test_1.fasta" ftype="fasta"/>
280 </output_collection> 334 </output_collection>
281 </test> 335 </test>
282 <test> 336 <test>
337 <param name="input" value="test.fasta" ftype="fasta"/>
338 <param name="select_ftype" value="fasta"/>
339 <param name="mode" value="chunk"/>
340 <param name="chunksize" value="3"/>
341 <param name="newfilenames" value="test"/>
342 <output_collection name="list_output_fasta" type="list">
343 <element name="test_000000.fasta" file="test_0.fasta" ftype="fasta"/>
344 <element name="test_000001.fasta" file="test_1.fasta" ftype="fasta"/>
345 </output_collection>
346 </test>
347 <test>
283 <param name="input" value="test.fastq" ftype="fastq"/> 348 <param name="input" value="test.fastq" ftype="fastq"/>
284 <param name="select_ftype" value="fastq"/> 349 <param name="select_ftype" value="fastq"/>
350 <param name="mode" value="numnew"/>
285 <param name="numnew" value="2"/> 351 <param name="numnew" value="2"/>
286 <param name="newfilenames" value="test"/> 352 <param name="newfilenames" value="test"/>
287 <output_collection name="list_output_fastq" type="list"> 353 <output_collection name="list_output_fastq" type="list">
288 <element name="test_000000.fastq" file="test_0.fastq" ftype="fastq"/> 354 <element name="test_000000.fastq" file="test_0.fastq" ftype="fastq"/>
289 <element name="test_000001.fastq" file="test_1.fastq" ftype="fastq"/> 355 <element name="test_000001.fastq" file="test_1.fastq" ftype="fastq"/>
290 </output_collection> 356 </output_collection>
291 </test> 357 </test>
292 <test> 358 <test>
293 <param name="input" value="test.fasta" ftype="fasta"/> 359 <param name="input" value="test.fasta" ftype="fasta"/>
294 <param name="select_ftype" value="fasta"/> 360 <param name="select_ftype" value="fasta"/>
361 <param name="mode" value="numnew"/>
295 <param name="numnew" value="2"/> 362 <param name="numnew" value="2"/>
296 <param name="newfilenames" value="rand"/> 363 <param name="newfilenames" value="rand"/>
297 <param name="allocate" value="random"/> 364 <param name="allocate" value="random"/>
298 <param name="seed" value="1010"/> 365 <param name="seed" value="1010"/>
299 <output_collection name="list_output_fasta" type="list"> 366 <output_collection name="list_output_fasta" type="list">
302 </output_collection> 369 </output_collection>
303 </test> 370 </test>
304 <test> 371 <test>
305 <param name="input" value="test.fasta" ftype="fasta"/> 372 <param name="input" value="test.fasta" ftype="fasta"/>
306 <param name="select_ftype" value="fasta"/> 373 <param name="select_ftype" value="fasta"/>
374 <param name="mode" value="numnew"/>
307 <param name="numnew" value="2"/> 375 <param name="numnew" value="2"/>
308 <param name="newfilenames" value="fasta_batch"/> 376 <param name="newfilenames" value="fasta_batch"/>
309 <param name="allocate" value="batch"/> 377 <param name="allocate" value="batch"/>
310 <output_collection name="list_output_fasta" type="list"> 378 <output_collection name="list_output_fasta" type="list">
311 <element name="fasta_batch_000000.fasta" file="fasta_batch_0.fasta" ftype="fasta"/> 379 <element name="fasta_batch_000000.fasta" file="fasta_batch_0.fasta" ftype="fasta"/>
313 </output_collection> 381 </output_collection>
314 </test> 382 </test>
315 <test> 383 <test>
316 <param name="input" value="test.tabular" ftype="txt"/> 384 <param name="input" value="test.tabular" ftype="txt"/>
317 <param name="select_ftype" value="txt"/> 385 <param name="select_ftype" value="txt"/>
386 <param name="mode" value="numnew"/>
318 <param name="numnew" value="2"/> 387 <param name="numnew" value="2"/>
319 <param name="newfilenames" value="test"/> 388 <param name="newfilenames" value="test"/>
320 <output_collection name="list_output_txt" type="list"> 389 <output_collection name="list_output_txt" type="list">
321 <element name="test_000000.txt" file="test_0.tabular" ftype="txt" lines_diff="1"/> 390 <element name="test_000000.txt" file="test_0.tabular" ftype="txt" lines_diff="1"/>
322 <element name="test_000001.txt" file="test_1.tabular" ftype="txt" lines_diff="1"/> 391 <element name="test_000001.txt" file="test_1.tabular" ftype="txt" lines_diff="1"/>
324 </test> 393 </test>
325 <test> 394 <test>
326 <param name="input" value="test.tabular" ftype="txt"/> 395 <param name="input" value="test.tabular" ftype="txt"/>
327 <param name="select_ftype" value="generic"/> 396 <param name="select_ftype" value="generic"/>
328 <param name="generic_regex" value="^.*"/> 397 <param name="generic_regex" value="^.*"/>
398 <param name="mode" value="numnew"/>
329 <param name="numnew" value="2"/> 399 <param name="numnew" value="2"/>
330 <param name="newfilenames" value="test"/> 400 <param name="newfilenames" value="test"/>
331 <output_collection name="list_output_generic" type="list"> 401 <output_collection name="list_output_generic" type="list">
332 <element name="test_000000" file="test_0.tabular" ftype="txt" lines_diff="1"/> 402 <element name="test_000000" file="test_0.tabular" ftype="txt" lines_diff="1"/>
333 <element name="test_000001" file="test_1.tabular" ftype="txt" lines_diff="1"/> 403 <element name="test_000001" file="test_1.tabular" ftype="txt" lines_diff="1"/>
335 </test> 405 </test>
336 <test> 406 <test>
337 <param name="input" value="test.fasta" ftype="fasta"/> 407 <param name="input" value="test.fasta" ftype="fasta"/>
338 <param name="select_ftype" value="generic"/> 408 <param name="select_ftype" value="generic"/>
339 <param name="generic_regex" value="^>.*"/> 409 <param name="generic_regex" value="^>.*"/>
410 <param name="mode" value="numnew"/>
340 <param name="numnew" value="2"/> 411 <param name="numnew" value="2"/>
341 <param name="newfilenames" value="rand"/> 412 <param name="newfilenames" value="rand"/>
342 <param name="allocate" value="random"/> 413 <param name="allocate" value="random"/>
343 <param name="seed" value="1010"/> 414 <param name="seed" value="1010"/>
344 <output_collection name="list_output_generic" type="list"> 415 <output_collection name="list_output_generic" type="list">
346 <element name="rand_000001" file="rand_1.fasta" ftype="fasta"/> 417 <element name="rand_000001" file="rand_1.fasta" ftype="fasta"/>
347 </output_collection> 418 </output_collection>
348 </test> 419 </test>
349 <test> 420 <test>
350 <param name="input" value="3_molecules.sdf" ftype="sdf"/> 421 <param name="input" value="3_molecules.sdf" ftype="sdf"/>
351 <param name="select_ftype" value="generic"/> 422 <param name="select_ftype" value="sdf"/>
352 <param name="generic_regex" value="^\$\$\$\$.*"/> 423 <param name="mode" value="numnew"/>
353 <param name="numnew" value="1000"/> 424 <param name="numnew" value="10"/>
354 <param name="newfilenames" value="mol"/> 425 <param name="newfilenames" value="mol"/>
355 <param name="allocate" value="batch"/> 426 <param name="allocate" value="batch"/>
427 <output_collection name="list_output_sdf" type="list">
428 <element name="mol_000000.sdf" file="mol_0.sdf" ftype="sdf"/>
429 <element name="mol_000001.sdf" file="mol_1.sdf" ftype="sdf"/>
430 <element name="mol_000002.sdf" file="mol_2.sdf" ftype="sdf"/>
431 </output_collection>
432 </test>
433 <test>
434 <param name="input" value="3_molecules.sdf" ftype="sdf"/>
435 <param name="select_ftype" value="sdf"/>
436 <param name="mode" value="chunk"/>
437 <param name="chunksize" value="1"/>
438 <param name="newfilenames" value="mol"/>
439 <param name="allocate" value="batch"/>
440 <output_collection name="list_output_sdf" type="list">
441 <element name="mol_000000.sdf" file="mol_0.sdf" ftype="sdf"/>
442 <element name="mol_000001.sdf" file="mol_1.sdf" ftype="sdf"/>
443 <element name="mol_000002.sdf" file="mol_2.sdf" ftype="sdf"/>
444 </output_collection>
445 </test>
446 <test>
447 <param name="input" value="test.fasta" ftype="fasta"/>
448 <param name="select_ftype" value="generic"/>
449 <param name="generic_regex" value="^>.*"/>
450 <param name="split_after" value="true"/>
451 <param name="mode" value="numnew"/>
452 <param name="numnew" value="2"/>
453 <param name="newfilenames" value="rand"/>
454 <param name="allocate" value="random"/>
455 <param name="seed" value="1010"/>
356 <output_collection name="list_output_generic" type="list"> 456 <output_collection name="list_output_generic" type="list">
357 <element name="mol_000000" file="mol_0.sdf" ftype="sdf"/> 457 <element name="rand_000001" file="split_after.fasta" ftype="fasta"/>
358 <element name="mol_000001" file="mol_1.sdf" ftype="sdf"/>
359 <element name="mol_000002" file="mol_2.sdf" ftype="sdf"/>
360 </output_collection> 458 </output_collection>
361 </test> 459 </test>
362 </tests> 460 </tests>
363 <help><![CDATA[ 461 <help><![CDATA[
364 **Split file into a dataset collection** 462 **Split file into a dataset collection**
365 463
366 This tool splits a data sets consisting of records into multiple data sets within a collection. 464 This tool splits a data set consisting of records into multiple data sets within a collection.
367 A record can be for instance simply a line, a FASTA sequence (header + sequence), a FASTQ sequence 465 A record can be for instance simply a line, a FASTA sequence (header + sequence), a FASTQ sequence
368 (headers + sequence + qualities), etc. The important property is that the begin of a new record 466 (headers + sequence + qualities), etc. The important property is that the beginning of a new record
369 can be speciefied by a regular expression, e.g. ".*" for lines, ">.*" for FASTA, or "@.*" for FASTQ. 467 can be specified by a regular expression, e.g. ".*" for lines, ">.*" for FASTA, or "@.*" for FASTQ.
370 The tool has presets for text, tabular data sets (which are split by line), FASTA, FASTQ, and MGF. 468 The tool has presets for text, tabular data sets (which are split by line), FASTA, FASTQ, SDF and MGF.
371 For other data types the text delimiting records can be specified manually using the generic splitter. 469 For other data types the text delimiting records can be specified manually using the generic splitter.
372 470 If the generic splitter is used, an option is also available to split records either before or after the
373 If splitting by line (or by some other item, like a FASTA entry or an MGF record, the splitting can be either done alternating, in original record order, or at random. 471 separator. If a preset filetype is used, this is selected automatically (after for SDF, before for all
472 others).
473
474 If splitting by line (or by some other item, like a FASTA entry or an MGF record), the splitting can be done either alternately, in original record order, or at random.
374 475
375 If t records are to be distributed to n new data sets, then the i-th record goes to data set 476 If t records are to be distributed to n new data sets, then the i-th record goes to data set
376 477
377 * floor(i / t * n) (for batch), 478 * floor(i / t * n) (for batch),
378 * i % n (for alternating), or 479 * i % n (for alternating), or
405 Note that there are no guarantees when splitting at random that every result file will be non-empty, so downstream tools should be able to gracefully handle empty files. 506 Note that there are no guarantees when splitting at random that every result file will be non-empty, so downstream tools should be able to gracefully handle empty files.
406 507
407 If a tabular file is used as input, you may choose to split by line or by column. If split by column, a new file is created for each unique value in the column. 508 If a tabular file is used as input, you may choose to split by line or by column. If split by column, a new file is created for each unique value in the column.
408 In addition, (Python) regular expressions may be used to transform the value in the column to a new value. Caution should be used with this feature, as it could transform all values to the same value, or other unexpected behavior. 509 In addition, (Python) regular expressions may be used to transform the value in the column to a new value. Caution should be used with this feature, as it could transform all values to the same value, or other unexpected behavior.
409 The default regular expression uses each value in the column without modifying it. 510 The default regular expression uses each value in the column without modifying it.
511
512 Two modes are available for the tool. In the main mode, the number of output files is specified, and the records are distributed among that many files. Alternatively, 'chunk mode' can be selected, which places a fixed number of records (the 'chunk size') into each output file.
513
410 ]]></help> 514 ]]></help>
411 <citations> 515 <citations>
412 <citation type="bibtex"> 516 <citation type="bibtex">
413 @misc{githubsplit, 517 @misc{githubsplit,
414 author = {Easterly, Caleb}, 518 author = {Easterly, Caleb},
415 year = {2018}, 519 year = {2018},
416 title = {A Galxy tool for splitting a file into a collection}, 520 title = {A Galaxy tool for splitting a file into a collection},
417 publisher = {GitHub}, 521 publisher = {GitHub},
418 journal = {GitHub repository}, 522 journal = {GitHub repository},
419 url = {https://github.com/bgruening/galaxytools/tools/text_processing/split_file_to_collection}, 523 url = {https://github.com/bgruening/galaxytools/tools/text_processing/split_file_to_collection},
420 }</citation> 524 }</citation>
421 </citations> 525 </citations>