Mercurial > repos > bgruening > split_file_to_collection
diff split_file_to_collection.xml @ 5:e77b954f0da5 draft
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 6fa2a0294d615c9f267b766337dca0b2d3637219"
author | bgruening |
---|---|
date | Fri, 11 Oct 2019 18:24:43 -0400 |
parents | 0850f2dfba13 |
children | 6cbe2f30c2d7 |
line wrap: on
line diff
--- a/split_file_to_collection.xml Wed Oct 09 07:34:49 2019 -0400 +++ b/split_file_to_collection.xml Fri Oct 11 18:24:43 2019 -0400 @@ -1,4 +1,4 @@ -<tool id="split_file_to_collection" name="Split file" version="0.3.0"> +<tool id="split_file_to_collection" name="Split file" version="0.4.0"> <description>to dataset collection</description> <macros> <xml name="regex_sanitizer"> @@ -77,9 +77,13 @@ #end if #else #if $split_parms.select_ftype == "generic" - --generic_re '$split_parms.generic_regex' - #if $split_parms.split_after == 'true': - --split_after + #if $split_parms.split_method.select_split_method == "regex" + --generic_re '$split_parms.split_method.generic_regex' + #if $split_parms.split_method.split_after == 'true': + --split_after + #end if + #else + --generic_num $split_parms.split_method.record_length #end if #end if #if $split_parms.select_mode.mode == "numnew": @@ -163,14 +167,25 @@ </when> <when value="generic"> <param name="input" type="data" format="txt" label="File to split"/> - <param name="generic_regex" type="text" label="Regex to match record separator" value="^.*"> - <expand macro="regex_sanitizer"/> - </param> + <conditional name="split_method"> + <param name="select_split_method" type="select" label="Method to split files"> + <option value="regex">Specify record separator as regular expression</option> + <option value="number">Specify number of lines after which a record ends</option> + </param> + <when value="regex"> + <param name="generic_regex" type="text" label="Regex to match record separator" value="^.*"> + <expand macro="regex_sanitizer"/> + </param> + <param name="split_after" type="select" value="false" label="Split records before or after the separator?" 
help="If before, the separator will appear at the start of each record; if after, at the end"> + <option value="false" selected="true">Before</option> + <option value="true">After</option> + </param> + </when> + <when value="number"> + <param name="record_length" type="integer" value="1" label="Record length" help="The number of lines after which each record ends"/> + </when> + </conditional> <expand macro="numnew_fname"/> - <param name="split_after" type="select" value="false" label="Split records before or after the separator?" help="If before, the separator will appear at the start of each record; if after, at the end"> - <option value="false" selected="true">Before</option> - <option value="true">After</option> - </param> </when> </conditional> </inputs> @@ -205,6 +220,7 @@ </collection> </outputs> <tests> + <!-- 1 --> <test> <param name="input" value="test.tabular" ftype="tabular"/> <param name="select_ftype" value="tabular"/> @@ -219,6 +235,7 @@ <element name="foo3.tab" file="foo3.tab" ftype="tabular"/> </output_collection> </test> + <!-- 2 --> <test> <param name="input" value="test.tabular" ftype="tabular"/> <param name="select_ftype" value="tabular"/> @@ -232,6 +249,7 @@ <element name="test_000001.tabular" file="test_1.tabular" ftype="tabular"/> </output_collection> </test> + <!-- 3 --> <test> <param name="input" value="test.tabular" ftype="tabular"/> <param name="select_ftype" value="tabular"/> @@ -246,6 +264,7 @@ <element name="batch_tab_000001.tabular" file="batch_tab_1.tabular" ftype="tabular"/> </output_collection> </test> + <!-- 4 --> <test> <param name="input" value="test.tabular" ftype="tabular"/> <param name="select_ftype" value="tabular"/> @@ -260,6 +279,7 @@ <element name="batch_tab_000001.tabular" file="batch_tab_1.tabular" ftype="tabular"/> </output_collection> </test> + <!-- 5 --> <test> <param name="select_ftype" value="txt"/> <param name="input" value="karyotype.txt" ftype="txt"/> @@ -295,6 +315,7 @@ <element name="chr_000023.txt" 
file="chr_000023.txt" ftype="txt"/> </output_collection> </test> + <!-- 6 --> <test> <param name="input" value="psm.tabular" ftype="tabular"/> <param name="select_ftype" value="tabular"/> @@ -310,6 +331,7 @@ <element name="file4.tab" file="file4.tab" ftype="tabular"/> </output_collection> </test> + <!-- 7 splitting of mgf --> <test> <param name="input" value="demo758Dacentroid.mgf" ftype="mgf"/> <param name="select_ftype" value="mgf"/> @@ -322,6 +344,7 @@ <element name="demo_000002.mgf" file="demo_2.mgf" ftype="mgf"/> </output_collection> </test> + <!-- 8 splitting of fasta + desired number of files--> <test> <param name="input" value="test.fasta" ftype="fasta"/> <param name="select_ftype" value="fasta"/> @@ -333,6 +356,7 @@ <element name="test_000001.fasta" file="test_1.fasta" ftype="fasta"/> </output_collection> </test> + <!-- 9 splitting of fasta + desired chunksize --> <test> <param name="input" value="test.fasta" ftype="fasta"/> <param name="select_ftype" value="fasta"/> @@ -344,6 +368,7 @@ <element name="test_000001.fasta" file="test_1.fasta" ftype="fasta"/> </output_collection> </test> + <!-- 10 splitting of fastq, specify desired number of files --> <test> <param name="input" value="test.fastq" ftype="fastq"/> <param name="select_ftype" value="fastq"/> @@ -355,6 +380,23 @@ <element name="test_000001.fastq" file="test_1.fastq" ftype="fastq"/> </output_collection> </test> + <!-- 11 splitting of fastq, specify desired number of files + same as previous test, but by specifying the number of lines per record + explicitly (not using the preset of the python script) --> + <test> + <param name="input" value="test.fastq" ftype="fastq"/> + <param name="select_ftype" value="generic"/> + <param name="select_split_method" value="number"/> + <param name="record_length" value="4"/> + <param name="mode" value="numnew"/> + <param name="numnew" value="2"/> + <param name="newfilenames" value="test"/> + <output_collection name="list_output_generic" type="list"> + <element 
name="test_000000" file="test_0.fastq" ftype="fastq"/> + <element name="test_000001" file="test_1.fastq" ftype="fastq"/> + </output_collection> + </test> + <!-- splitting of fasta w random assignment and specific filename prefix --> <test> <param name="input" value="test.fasta" ftype="fasta"/> <param name="select_ftype" value="fasta"/> @@ -368,6 +410,7 @@ <element name="rand_000001.fasta" file="rand_1.fasta" ftype="fasta"/> </output_collection> </test> + <!-- splitting of fasta w batch assignment and specific filename prefix --> <test> <param name="input" value="test.fasta" ftype="fasta"/> <param name="select_ftype" value="fasta"/> @@ -380,6 +423,7 @@ <element name="fasta_batch_000001.fasta" file="fasta_batch_1.fasta" ftype="fasta"/> </output_collection> </test> + <!-- splitting of txt w default (alternating assignment) --> <test> <param name="input" value="test.tabular" ftype="txt"/> <param name="select_ftype" value="txt"/> @@ -391,9 +435,11 @@ <element name="test_000001.txt" file="test_1.tabular" ftype="txt" lines_diff="1"/> </output_collection> </test> + <!-- generic-regex splitting (of txt) w default assignment (alternating) --> <test> <param name="input" value="test.tabular" ftype="txt"/> <param name="select_ftype" value="generic"/> + <param name="select_split_method" value="regex"/> <param name="generic_regex" value="^.*"/> <param name="mode" value="numnew"/> <param name="numnew" value="2"/> @@ -403,9 +449,11 @@ <element name="test_000001" file="test_1.tabular" ftype="txt" lines_diff="1"/> </output_collection> </test> + <!-- generic-regex splitting (of a fasta) w random assignment --> <test> <param name="input" value="test.fasta" ftype="fasta"/> <param name="select_ftype" value="generic"/> + <param name="select_split_method" value="regex"/> <param name="generic_regex" value="^>.*"/> <param name="mode" value="numnew"/> <param name="numnew" value="2"/> @@ -417,6 +465,7 @@ <element name="rand_000001" file="rand_1.fasta" ftype="fasta"/> </output_collection> 
</test> + <!-- sdf + specify desired number of files --> <test> <param name="input" value="3_molecules.sdf" ftype="sdf"/> <param name="select_ftype" value="sdf"/> @@ -430,6 +479,7 @@ <element name="mol_000002.sdf" file="mol_2.sdf" ftype="sdf"/> </output_collection> </test> + <!-- sdf + specify desired number of records per file (chunksize) --> <test> <param name="input" value="3_molecules.sdf" ftype="sdf"/> <param name="select_ftype" value="sdf"/> @@ -443,10 +493,12 @@ <element name="mol_000002.sdf" file="mol_2.sdf" ftype="sdf"/> </output_collection> </test> + <!-- test split_after (by splitting fasta files after non-header lines) --> <test> <param name="input" value="test.fasta" ftype="fasta"/> <param name="select_ftype" value="generic"/> - <param name="generic_regex" value="^>.*"/> + <param name="select_split_method" value="regex"/> + <param name="generic_regex" value="^[^>].*"/> <param name="split_after" value="true"/> <param name="mode" value="numnew"/> <param name="numnew" value="2"/> @@ -454,7 +506,8 @@ <param name="allocate" value="random"/> <param name="seed" value="1010"/> <output_collection name="list_output_generic" type="list"> - <element name="rand_000001" file="split_after.fasta" ftype="fasta"/> + <element name="rand_000000" file="rand_0.fasta" ftype="fasta"/> + <element name="rand_000001" file="rand_1.fasta" ftype="fasta"/> </output_collection> </test> </tests> @@ -463,10 +516,11 @@ This tool splits a data set consisting of records into multiple data sets within a collection. A record can be for instance simply a line, a FASTA sequence (header + sequence), a FASTQ sequence -(headers + sequence + qualities), etc. The important property is that the beginning of a new record -can be specified by a regular expression, e.g. ".*" for lines, ">.*" for FASTA, or "@.*" for FASTQ. -The tool has presets for text, tabular data sets (which are split by line), FASTA, FASTQ, SDF and MGF. 
-For other data types the text delimiting records can be specified manually using the generic splitter. +(headers + sequence + qualities), etc. The important property is that the records either have a +specific length (e.g. 4 lines for FASTQ) or that the beginning/end of a new record +can be specified by a regular expression, e.g. ".*" for lines or ">.*" for FASTA. +The tool has presets for text, tabular data sets (which are split after each line), FASTA (new records start with ">.*"), FASTQ (records consist of 4 lines), MGF (records start with "^BEGIN IONS") and SDF (records end with "^$$$$"). +For other data types the text delimiting records or the number of lines making up a record can be specified manually using the generic splitter. If the generic splitter is used, an option is also available to split records either before or after the separator. If a preset filetype is used, this is selected automatically (after for SDF, before for all others).