Mercurial > repos > bgruening > split_file_to_collection

diff split_file_to_collection.xml @ 5:e77b954f0da5 draft
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 6fa2a0294d615c9f267b766337dca0b2d3637219"
author: bgruening
date: Fri, 11 Oct 2019 18:24:43 -0400
parents: 0850f2dfba13
children: 6cbe2f30c2d7
--- a/split_file_to_collection.xml	Wed Oct 09 07:34:49 2019 -0400
+++ b/split_file_to_collection.xml	Fri Oct 11 18:24:43 2019 -0400
@@ -1,4 +1,4 @@
-<tool id="split_file_to_collection" name="Split file" version="0.3.0">
+<tool id="split_file_to_collection" name="Split file" version="0.4.0">
     <description>to dataset collection</description>
     <macros>
         <xml name="regex_sanitizer">
@@ -77,9 +77,13 @@
                 #end if
             #else
                 #if $split_parms.select_ftype == "generic"
-                    --generic_re '$split_parms.generic_regex'
-                    #if $split_parms.split_after == 'true':
-                        --split_after
+                    #if $split_parms.split_method.select_split_method == "regex"
+                        --generic_re '$split_parms.split_method.generic_regex'
+                        #if $split_parms.split_method.split_after == 'true':
+                            --split_after
+                        #end if
+                    #else
+                        --generic_num $split_parms.split_method.record_length
                     #end if
                 #end if
                 #if $split_parms.select_mode.mode == "numnew":
@@ -163,14 +167,25 @@
             </when>
             <when value="generic">
                 <param name="input" type="data" format="txt" label="File to split"/>
-                <param name="generic_regex" type="text" label="Regex to match record separator" value="^.*">
-                    <expand macro="regex_sanitizer"/>
-                </param>
+                <conditional name="split_method">
+                    <param name="select_split_method" type="select" label="Method to split files">
+                        <option value="regex">Specify record separator as regular expression</option>
+                        <option value="number">Specify number of lines after which a record ends</option>
+                    </param>
+                    <when value="regex">
+                        <param name="generic_regex" type="text" label="Regex to match record separator" value="^.*">
+                            <expand macro="regex_sanitizer"/>
+                        </param>
+                        <param name="split_after" type="select" value="false" label="Split records before or after the separator?" help="If before, the separator will appear at the start of each record; if after, at the end">
+                            <option value="false" selected="true">Before</option>
+                            <option value="true">After</option>
+                        </param>
+                    </when>
+                    <when value="number">
+                        <param name="record_length" type="integer" value="1" label="Record length" help="The number of lines after which each record ends"/>
+                    </when>
+                </conditional>
                 <expand macro="numnew_fname"/>
-                <param name="split_after" type="select" value="false" label="Split records before or after the separator?" help="If before, the separator will appear at the start of each record; if after, at the end">
-                    <option value="false" selected="true">Before</option>
-                    <option value="true">After</option>
-                </param>
             </when>
         </conditional>
     </inputs>
@@ -205,6 +220,7 @@
         </collection>
     </outputs>
     <tests>
+        <!-- 1 -->
         <test>
             <param name="input" value="test.tabular" ftype="tabular"/>
             <param name="select_ftype" value="tabular"/>
@@ -219,6 +235,7 @@
                 <element name="foo3.tab" file="foo3.tab" ftype="tabular"/>
             </output_collection>
         </test>
+        <!-- 2 -->
         <test>
             <param name="input" value="test.tabular" ftype="tabular"/>
             <param name="select_ftype" value="tabular"/>
@@ -232,6 +249,7 @@
                 <element name="test_000001.tabular" file="test_1.tabular" ftype="tabular"/>
             </output_collection>
         </test>
+        <!-- 3 -->
         <test>
             <param name="input" value="test.tabular" ftype="tabular"/>
             <param name="select_ftype" value="tabular"/>
@@ -246,6 +264,7 @@
                 <element name="batch_tab_000001.tabular" file="batch_tab_1.tabular" ftype="tabular"/>
             </output_collection>
         </test>
+        <!-- 4 -->
         <test>
             <param name="input" value="test.tabular" ftype="tabular"/>
             <param name="select_ftype" value="tabular"/>
@@ -260,6 +279,7 @@
                 <element name="batch_tab_000001.tabular" file="batch_tab_1.tabular" ftype="tabular"/>
             </output_collection>
         </test>
+        <!-- 5 -->
         <test>
             <param name="select_ftype" value="txt"/>
             <param name="input" value="karyotype.txt" ftype="txt"/>
@@ -295,6 +315,7 @@
                 <element name="chr_000023.txt" file="chr_000023.txt" ftype="txt"/>
             </output_collection>
         </test>
+        <!-- 6 -->
         <test>
             <param name="input" value="psm.tabular" ftype="tabular"/>
             <param name="select_ftype" value="tabular"/>
@@ -310,6 +331,7 @@
                 <element name="file4.tab" file="file4.tab" ftype="tabular"/>
             </output_collection>
         </test>
+        <!-- 7 splitting of mgf -->
         <test>
             <param name="input" value="demo758Dacentroid.mgf" ftype="mgf"/>
             <param name="select_ftype" value="mgf"/>
@@ -322,6 +344,7 @@
                 <element name="demo_000002.mgf" file="demo_2.mgf" ftype="mgf"/>
             </output_collection>
         </test>
+        <!-- 8 splitting of fasta + desired number of files-->
         <test>
             <param name="input" value="test.fasta" ftype="fasta"/>
             <param name="select_ftype" value="fasta"/>
@@ -333,6 +356,7 @@
                 <element name="test_000001.fasta" file="test_1.fasta" ftype="fasta"/>
             </output_collection>
         </test>
+        <!-- 9 splitting of fasta + desired chunksize -->
         <test>
             <param name="input" value="test.fasta" ftype="fasta"/>
             <param name="select_ftype" value="fasta"/>
@@ -344,6 +368,7 @@
                 <element name="test_000001.fasta" file="test_1.fasta" ftype="fasta"/>
             </output_collection>
         </test>
+        <!-- 10 splitting of fastq, specify desired number of files -->
         <test>
             <param name="input" value="test.fastq" ftype="fastq"/>
             <param name="select_ftype" value="fastq"/>
@@ -355,6 +380,23 @@
                 <element name="test_000001.fastq" file="test_1.fastq" ftype="fastq"/>
             </output_collection>
         </test>
+        <!-- 11 splitting of fastq, specify desired number of files;
+             same as previous test, but by specifying the number of lines per record
+             explicitly (not using the preset of the python script) -->
+        <test>
+            <param name="input" value="test.fastq" ftype="fastq"/>
+            <param name="select_ftype" value="generic"/>
+            <param name="select_split_method" value="number"/>
+            <param name="record_length" value="4"/>
+            <param name="mode" value="numnew"/>
+            <param name="numnew" value="2"/>
+            <param name="newfilenames" value="test"/>
+            <output_collection name="list_output_generic" type="list">
+                <element name="test_000000" file="test_0.fastq" ftype="fastq"/>
+                <element name="test_000001" file="test_1.fastq" ftype="fastq"/>
+            </output_collection>
+        </test>
+        <!-- splitting of fasta w random assignment and specific filename prefix -->
         <test>
             <param name="input" value="test.fasta" ftype="fasta"/>
             <param name="select_ftype" value="fasta"/>
@@ -368,6 +410,7 @@
                 <element name="rand_000001.fasta" file="rand_1.fasta" ftype="fasta"/>
             </output_collection>
         </test>
+        <!-- splitting of fasta w batch assignment and specific filename prefix -->
         <test>
             <param name="input" value="test.fasta" ftype="fasta"/>
             <param name="select_ftype" value="fasta"/>
@@ -380,6 +423,7 @@
                 <element name="fasta_batch_000001.fasta" file="fasta_batch_1.fasta" ftype="fasta"/>
             </output_collection>
         </test>
+        <!-- splitting of txt w default (alternating assignment) -->
         <test>
             <param name="input" value="test.tabular" ftype="txt"/>
             <param name="select_ftype" value="txt"/>
@@ -391,9 +435,11 @@
                 <element name="test_000001.txt" file="test_1.tabular" ftype="txt" lines_diff="1"/>
             </output_collection>
         </test>
+        <!-- generic-regex splitting (of txt) w default assignment (alternating) -->
         <test>
             <param name="input" value="test.tabular" ftype="txt"/>
             <param name="select_ftype" value="generic"/>
+            <param name="select_split_method" value="regex"/>
             <param name="generic_regex" value="^.*"/>
             <param name="mode" value="numnew"/>
             <param name="numnew" value="2"/>
@@ -403,9 +449,11 @@
                 <element name="test_000001" file="test_1.tabular" ftype="txt" lines_diff="1"/>
             </output_collection>
         </test>
+        <!-- generic-regex splitting (of a fasta) w random assignment -->
         <test>
             <param name="input" value="test.fasta" ftype="fasta"/>
             <param name="select_ftype" value="generic"/>
+            <param name="select_split_method" value="regex"/>
             <param name="generic_regex" value="^>.*"/>
             <param name="mode" value="numnew"/>
             <param name="numnew" value="2"/>
@@ -417,6 +465,7 @@
                 <element name="rand_000001" file="rand_1.fasta" ftype="fasta"/>
             </output_collection>
         </test>
+        <!-- sdf + specify desired number of files -->
         <test>
             <param name="input" value="3_molecules.sdf" ftype="sdf"/>
             <param name="select_ftype" value="sdf"/>
@@ -430,6 +479,7 @@
                 <element name="mol_000002.sdf" file="mol_2.sdf" ftype="sdf"/>
             </output_collection>
         </test>
+        <!-- sdf + specify desired number of records per file (chunksize) -->
         <test>
             <param name="input" value="3_molecules.sdf" ftype="sdf"/>
             <param name="select_ftype" value="sdf"/>
@@ -443,10 +493,12 @@
                 <element name="mol_000002.sdf" file="mol_2.sdf" ftype="sdf"/>
             </output_collection>
         </test>
+        <!-- test split_after (by splitting fasta files after non-header lines) -->
         <test>
             <param name="input" value="test.fasta" ftype="fasta"/>
             <param name="select_ftype" value="generic"/>
-            <param name="generic_regex" value="^>.*"/>
+            <param name="select_split_method" value="regex"/>
+            <param name="generic_regex" value="^[^>].*"/>
             <param name="split_after" value="true"/>
             <param name="mode" value="numnew"/>
             <param name="numnew" value="2"/>
@@ -454,7 +506,8 @@
             <param name="allocate" value="random"/>
             <param name="seed" value="1010"/>
             <output_collection name="list_output_generic" type="list">
-                <element name="rand_000001" file="split_after.fasta" ftype="fasta"/>
+                <element name="rand_000000" file="rand_0.fasta" ftype="fasta"/>
+                <element name="rand_000001" file="rand_1.fasta" ftype="fasta"/>
             </output_collection>
         </test>
     </tests>
@@ -463,10 +516,11 @@
 
 This tool splits a data set consisting of records into multiple data sets within a collection.
 A record can be for instance simply a line, a FASTA sequence (header + sequence), a FASTQ sequence
-(headers + sequence + qualities), etc. The important property is that the beginning of a new record
-can be specified by a regular expression, e.g. ".*" for lines, ">.*" for FASTA, or "@.*" for FASTQ.
-The tool has presets for text, tabular data sets (which are split by line), FASTA, FASTQ, SDF and MGF.
-For other data types the text delimiting records can be specified manually using the generic splitter. 
+(headers + sequence + qualities), etc. The important property is that the records either have a 
+specific length (e.g. 4 lines for FASTQ) or that the beginning/end of a new record
+can be specified by a regular expression, e.g. ".*" for lines or ">.*" for FASTA.
+The tool has presets for text, tabular data sets (which are split after each line), FASTA (new records start with ">.*"), FASTQ (records consist of 4 lines), SDF (records end with "^$$$$") and MGF (records start with "^BEGIN IONS").
+For other data types the text delimiting records or the number of lines making up a record can be specified manually using the generic splitter. 
 If the generic splitter is used, an option is also available to split records either before or after the
 separator. If a preset filetype is used, this is selected automatically (after for SDF, before for all
 others).
author	bgruening
date	Fri, 11 Oct 2019 18:24:43 -0400
parents	0850f2dfba13
children	6cbe2f30c2d7