comparison split_file_to_collection.xml @ 2:d150ac3d853d draft

"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 06ffe450bafa280eee8a4331c9cfc9e1ece7c522"
author bgruening
date Wed, 28 Aug 2019 10:55:25 -0400
parents 750c1684d47c
children 2ddc36385d7a
comparison
equal deleted inserted replaced
1:750c1684d47c 2:d150ac3d853d
1 <tool id="split_file_to_collection" name="Split file" version="0.1.1"> 1 <tool id="split_file_to_collection" name="Split file" version="0.2.0">
2 <description>to dataset collection</description> 2 <description>to dataset collection</description>
3 <macros> 3 <macros>
4 <xml name="regex_sanitizer">
5 <sanitizer>
6 <valid>
7 <add preset="string.printable"/>
8 <remove value="&#92;" />
9 <remove value="&apos;" />
10 </valid>
11 <mapping initial="none">
12 <add source="&#92;" target="__backslash__" />
13 <add source="&apos;" target="__sq__"/>
14 </mapping>
15 </sanitizer>
16 </xml>
4 <xml name="numnew_fname"> 17 <xml name="numnew_fname">
5 <param name="numnew" type="integer" label="Number of new files" min="1" value="1"/> 18 <param name="numnew" type="integer" label="Number of new files" min="1" value="1"/>
6 <param name="newfilenames" type="text" label="Base name for new files in collection" 19 <param name="newfilenames" type="text" label="Base name for new files in collection"
7 help="This will increment automatically - if input is 'file', then output is 'file0', 'file1', etc." value="split_file"/> 20 help="This will increment automatically - if input is 'file', then output is 'file0', 'file1', etc." value="split_file"/>
8 <conditional name="select_allocate"> 21 <conditional name="select_allocate">
46 #if $split_parms.split_by.select_allocate.allocate == "batch": 59 #if $split_parms.split_by.select_allocate.allocate == "batch":
47 --batch 60 --batch
48 #end if 61 #end if
49 #end if 62 #end if
50 #else 63 #else
64 #if $split_parms.select_ftype == "generic"
65 --generic_re '$split_parms.generic_regex'
66 #end if
51 --numnew '$split_parms.numnew' 67 --numnew '$split_parms.numnew'
52 #if $split_parms.select_allocate.allocate == "random": 68 #if $split_parms.select_allocate.allocate == "random":
53 --rand 69 --rand
54 --seed '$split_parms.select_allocate.seed' 70 --seed '$split_parms.select_allocate.seed'
55 #end if 71 #end if
61 --file_names '$split_parms.split_by.newfilenames' 77 --file_names '$split_parms.split_by.newfilenames'
62 --file_ext '$split_parms.select_ftype' 78 --file_ext '$split_parms.select_ftype'
63 #end if 79 #end if
64 #if $split_parms.select_ftype != "tabular": 80 #if $split_parms.select_ftype != "tabular":
65 --file_names '$split_parms.newfilenames' 81 --file_names '$split_parms.newfilenames'
66 --file_ext '$split_parms.select_ftype' 82 #if $split_parms.select_ftype == "generic"
83 --file_ext '$split_parms.input.ext'
84 #else
85 --file_ext '$split_parms.select_ftype'
86 #end if
67 #end if 87 #end if
68 ]]></command> 88 ]]></command>
69 <inputs> 89 <inputs>
70 <conditional name="split_parms"> 90 <conditional name="split_parms">
71 <param name="select_ftype" type="select" label="Select the file type to split"> 91 <param name="select_ftype" type="select" label="Select the file type to split">
72 <option value="mgf">MGF</option> 92 <option value="mgf">MGF</option>
73 <option value="fastq">FASTQ</option> 93 <option value="fastq">FASTQ</option>
74 <option value="tabular">Tabular</option> 94 <option value="tabular">Tabular</option>
75 <option value="fasta">FASTA</option> 95 <option value="fasta">FASTA</option>
96 <option value="txt">Text files</option>
97 <option value="generic">Generic</option>
76 </param> 98 </param>
77 <when value="tabular"> 99 <when value="tabular">
78 <param name="input" type="data" format="tabular" label="Tabular file to split"/> 100 <param name="input" type="data" format="tabular" label="Tabular file to split"/>
79 <param name="top" type="integer" value="0" min="0" label="Number of header lines to transfer to new files"/> 101 <param name="top" type="integer" value="0" min="0" label="Number of header lines to transfer to new files"/>
80 <conditional name="split_by"> 102 <conditional name="split_by">
83 <option value="col">By column</option> 105 <option value="col">By column</option>
84 </param> 106 </param>
85 <when value="col"> 107 <when value="col">
86 <param name="id_col" type="data_column" label="Column to split on" data_ref="input"/> 108 <param name="id_col" type="data_column" label="Column to split on" data_ref="input"/>
87 <param name="match_regex" type="text" label="Regex to match contents of id column" value="(.*)"> 109 <param name="match_regex" type="text" label="Regex to match contents of id column" value="(.*)">
88 <sanitizer> 110 <expand macro="regex_sanitizer"/>
89 <valid> 111 </param>
90 <add preset="string.printable"/>
91 <remove value="&#92;" />
92 <remove value="&apos;" />
93 </valid>
94 <mapping initial="none">
95 <add source="&#92;" target="__backslash__" />
96 <add source="&apos;" target="__sq__"/>
97 </mapping>
98 </sanitizer>
99 </param>
100 <param name="sub_regex" type="text" label="Pattern to replace match with" value="\1"> 112 <param name="sub_regex" type="text" label="Pattern to replace match with" value="\1">
101 <sanitizer> 113 <expand macro="regex_sanitizer"/>
102 <valid>
103 <add preset="string.printable"/>
104 <remove value="&#92;" />
105 <remove value="&apos;" />
106 </valid>
107 <mapping initial="none">
108 <add source="&#92;" target="__backslash__" />
109 <add source="&apos;" target="__sq__"/>
110 </mapping>
111 </sanitizer>
112 </param> 114 </param>
113 </when> 115 </when>
114 <when value="row"> 116 <when value="row">
115 <expand macro="numnew_fname"/> 117 <expand macro="numnew_fname"/>
116 </when> 118 </when>
126 </when> 128 </when>
127 <when value="fasta"> 129 <when value="fasta">
128 <param name="input" type="data" format="fasta" label="FASTA file to split"/> 130 <param name="input" type="data" format="fasta" label="FASTA file to split"/>
129 <expand macro="numnew_fname"/> 131 <expand macro="numnew_fname"/>
130 </when> 132 </when>
133 <when value="txt">
134 <param name="input" type="data" format="txt" label="Text file to split"/>
135 <expand macro="numnew_fname"/>
136 </when>
137 <when value="generic">
138 <param name="input" type="data" format="txt" label="File to split"/>
139 <param name="generic_regex" type="text" label="Regex to match record separator" value="^.*">
140 <expand macro="regex_sanitizer"/>
141 </param>
142 <expand macro="numnew_fname"/>
143 </when>
131 </conditional> 144 </conditional>
132 </inputs> 145 </inputs>
133 <outputs> 146 <outputs>
134 <collection name="list_output_tab" type="list" label="${tool.name} on ${on_string}: output collection"> 147 <collection name="list_output_tab" type="list" label="${tool.name} on ${on_string}">
135 <discover_datasets pattern="__name__" directory="out" visible="false" format="tabular"/> 148 <discover_datasets pattern="__name__" directory="out" visible="false" format="tabular"/>
136 <filter>split_parms['select_ftype'] == "tabular"</filter> 149 <filter>split_parms['select_ftype'] == "tabular"</filter>
137 </collection> 150 </collection>
138 <collection name="list_output_mgf" type="list" label="${tool.name} on ${on_string}: output collection"> 151 <collection name="list_output_mgf" type="list" label="${tool.name} on ${on_string}">
139 <discover_datasets pattern="__name__" directory="out" visible="false" format="mgf"/> 152 <discover_datasets pattern="__name__" directory="out" visible="false" format="mgf"/>
140 <filter>split_parms['select_ftype'] == "mgf"</filter> 153 <filter>split_parms['select_ftype'] == "mgf"</filter>
141 </collection> 154 </collection>
142 <collection name="list_output_fasta" type="list" label="${tool.name} on ${on_string}: output collection"> 155 <collection name="list_output_fasta" type="list" label="${tool.name} on ${on_string}">
143 <discover_datasets pattern="__name__" directory="out" visible="false" format="fasta"/> 156 <discover_datasets pattern="__name__" directory="out" visible="false" format="fasta"/>
144 <filter>split_parms['select_ftype'] == "fasta"</filter> 157 <filter>split_parms['select_ftype'] == "fasta"</filter>
145 </collection> 158 </collection>
146 <collection name="list_output_fastq" type="list" label="${tool.name} on ${on_string}: output collection"> 159 <collection name="list_output_fastq" type="list" label="${tool.name} on ${on_string}">
147 <discover_datasets pattern="__name__" directory="out" visible="false" format="fastq"/> 160 <discover_datasets pattern="__name__" directory="out" visible="false" format="fastq"/>
148 <filter>split_parms['select_ftype'] == "fastq"</filter> 161 <filter>split_parms['select_ftype'] == "fastq"</filter>
162 </collection>
163 <collection name="list_output_txt" type="list" label="${tool.name} on ${on_string}">
164 <discover_datasets pattern="__name__" directory="out" visible="false" format="txt"/>
165 <filter>split_parms['select_ftype'] == "txt"</filter>
166 </collection>
167 <collection name="list_output_generic" type="list" label="${tool.name} on ${on_string}">
168 <discover_datasets pattern="__name_and_ext__" directory="out" visible="false"/>
169 <filter>split_parms['select_ftype'] == "generic"</filter>
149 </collection> 170 </collection>
150 </outputs> 171 </outputs>
151 <tests> 172 <tests>
152 <test> 173 <test>
153 <param name="input" value="test.tabular" ftype="tabular"/> 174 <param name="input" value="test.tabular" ftype="tabular"/>
255 <output_collection name="list_output_fasta" type="list"> 276 <output_collection name="list_output_fasta" type="list">
256 <element name="fasta_batch_0.fasta" file="fasta_batch_0.fasta" ftype="fasta"/> 277 <element name="fasta_batch_0.fasta" file="fasta_batch_0.fasta" ftype="fasta"/>
257 <element name="fasta_batch_1.fasta" file="fasta_batch_1.fasta" ftype="fasta"/> 278 <element name="fasta_batch_1.fasta" file="fasta_batch_1.fasta" ftype="fasta"/>
258 </output_collection> 279 </output_collection>
259 </test> 280 </test>
281 <test>
282 <param name="input" value="test.tabular" ftype="txt"/>
283 <param name="select_ftype" value="txt"/>
284 <param name="numnew" value="2"/>
285 <param name="newfilenames" value="test"/>
286 <output_collection name="list_output_txt" type="list">
287 <element name="test_0.txt" file="test_0.tabular" ftype="txt" lines_diff="1"/>
288 <element name="test_1.txt" file="test_1.tabular" ftype="txt" lines_diff="1"/>
289 </output_collection>
290 </test>
291 <test>
292 <param name="input" value="test.tabular" ftype="txt"/>
293 <param name="select_ftype" value="generic"/>
294 <param name="generic_regex" value="^.*"/>
295 <param name="numnew" value="2"/>
296 <param name="newfilenames" value="test"/>
297 <output_collection name="list_output_generic" type="list">
298 <element name="test_0" file="test_0.tabular" ftype="txt" lines_diff="1"/>
299 <element name="test_1" file="test_1.tabular" ftype="txt" lines_diff="1"/>
300 </output_collection>
301 </test>
302 <test>
303 <param name="input" value="test.fasta" ftype="fasta"/>
304 <param name="select_ftype" value="generic"/>
305 <param name="generic_regex" value="^>.*"/>
306 <param name="numnew" value="2"/>
307 <param name="newfilenames" value="rand"/>
308 <param name="allocate" value="random"/>
309 <param name="seed" value="1010"/>
310 <output_collection name="list_output_generic" type="list">
311 <element name="rand_0" file="rand_0.fasta" ftype="fasta"/>
312 <element name="rand_1" file="rand_1.fasta" ftype="fasta"/>
313 </output_collection>
314 </test>
315 <test>
316 <param name="input" value="3_molecules.sdf" ftype="sdf"/>
317 <param name="select_ftype" value="generic"/>
318 <param name="generic_regex" value="^\$\$\$\$.*"/>
319 <param name="numnew" value="1000"/>
320 <param name="newfilenames" value="mol"/>
321 <param name="allocate" value="batch"/>
322 <output_collection name="list_output_generic" type="list">
323 <element name="mol_0" file="mol_0.sdf" ftype="sdf"/>
324 <element name="mol_1" file="mol_1.sdf" ftype="sdf"/>
325 <element name="mol_2" file="mol_2.sdf" ftype="sdf"/>
326 </output_collection>
327 </test>
260 </tests> 328 </tests>
261 <help><![CDATA[ 329 <help><![CDATA[
262 **Split file into a dataset collection** 330 **Split file into a dataset collection**
263 331
264 This tool can split five types of files into a separate files within a dataset collection: MGF, FASTA, FASTQ, and tabular. 332 This tool splits a data set consisting of records into multiple data sets within a collection.
333 A record can be for instance simply a line, a FASTA sequence (header + sequence), a FASTQ sequence
334 (headers + sequence + qualities), etc. The important property is that the beginning of a new record
335 can be specified by a regular expression, e.g. ".*" for lines, ">.*" for FASTA, or "@.*" for FASTQ.
336 The tool has presets for text, tabular data sets (which are split by line), FASTA, FASTQ, and MGF.
337 For other data types the text delimiting records can be specified manually using the generic splitter.
338
339 If splitting by line (or by some other item, like a FASTA entry or an MGF record), the splitting can be done either alternating, in original record order, or at random.
340
341 If t records are to be distributed to n new data sets, then the i-th record goes to data set
342
343 * floor(i / t * n) (for batch),
344 * i % n (for alternating), or
345 * a random data set
346
347 For instance, t=5 records are distributed as follows on n=2 data sets
348
349 = === === ====
350 i bat alt rand
351 = === === ====
352 0 0 0 0
353 1 0 1 1
354 2 0 0 1
355 3 1 1 0
356 4 1 0 0
357 = === === ====
358
359 If the five records are distributed on n=3 data sets:
360
361 = === === ====
362 i bat alt rand
363 = === === ====
364 0 0 0 0
365 1 0 1 1
366 2 1 2 2
367 3 1 0 0
368 4 2 1 1
369 = === === ====
370
371 Note that there are no guarantees when splitting at random that every result file will be non-empty, so downstream tools should be able to gracefully handle empty files.
372
265 If a tabular file is used as input, you may choose to split by line or by column. If split by column, a new file is created for each unique value in the column. 373 If a tabular file is used as input, you may choose to split by line or by column. If split by column, a new file is created for each unique value in the column.
266 In addition, (Python) regular expressions may be used to transform the value in the column to a new value. Caution should be used with this feature, as it could transform all values to the same value, or other unexpected behavior. 374 In addition, (Python) regular expressions may be used to transform the value in the column to a new value. Caution should be used with this feature, as it could transform all values to the same value, or other unexpected behavior.
267 The default regular expression uses each value in the column without modifying it. 375 The default regular expression uses each value in the column without modifying it.
268
269 If splitting by line (or by some other item, like a FASTA entry or an MGF section), the splitting can be either done sequentially or at random.
270 Note that there are no guarantees when splitting at random that every result file will be non-empty, so downstream tools should be able to gracefully handle empty files.
271
272 **Note**
273
274 Due to current limitations with dataset collections, a log file is produced when running this tool. It will usually be empty, but if the tool fails, any errors will be printed to the log file.
275 ]]></help> 376 ]]></help>
276 <citations> 377 <citations>
277 <citation type="bibtex"> 378 <citation type="bibtex">
278 @misc{githubsplit, 379 @misc{githubsplit,
279 author = {Easterly, Caleb}, 380 author = {Easterly, Caleb},