Mercurial > repos > bgruening > split_file_to_collection

diff split_file_to_collection.xml @ 2:d150ac3d853d draft
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 06ffe450bafa280eee8a4331c9cfc9e1ece7c522"
author: bgruening
date: Wed, 28 Aug 2019 10:55:25 -0400
parents: 750c1684d47c
children: 2ddc36385d7a
--- a/split_file_to_collection.xml	Mon Feb 18 15:20:56 2019 -0500
+++ b/split_file_to_collection.xml	Wed Aug 28 10:55:25 2019 -0400
@@ -1,6 +1,19 @@
-<tool id="split_file_to_collection" name="Split file" version="0.1.1">
+<tool id="split_file_to_collection" name="Split file" version="0.2.0">
     <description>to dataset collection</description>
     <macros>
+        <xml name="regex_sanitizer">
+            <sanitizer>
+                <valid>
+                    <add preset="string.printable"/>
+                    <remove value="&#92;" />
+                    <remove value="&apos;" />
+                </valid>
+                <mapping initial="none">
+                    <add source="&#92;" target="__backslash__" />
+                    <add source="&apos;" target="__sq__"/>
+                </mapping>
+            </sanitizer>
+        </xml>
         <xml name="numnew_fname">
             <param name="numnew" type="integer" label="Number of new files" min="1" value="1"/>
             <param name="newfilenames" type="text" label="Base name for new files in collection"
@@ -48,6 +61,9 @@
                     #end if
                 #end if
             #else
+                #if $split_parms.select_ftype == "generic"
+                    --generic_re '$split_parms.generic_regex'
+                #end if
                 --numnew '$split_parms.numnew'
                 #if $split_parms.select_allocate.allocate == "random":
                     --rand
@@ -63,7 +79,11 @@
         #end if
         #if $split_parms.select_ftype != "tabular":
             --file_names '$split_parms.newfilenames'
-            --file_ext '$split_parms.select_ftype'
+            #if $split_parms.select_ftype == "generic"
+                --file_ext '$split_parms.input.ext'
+            #else
+                --file_ext '$split_parms.select_ftype'
+            #end if
         #end if
     ]]></command>
     <inputs>
@@ -73,6 +93,8 @@
                 <option value="fastq">FASTQ</option>
                 <option value="tabular">Tabular</option>
                 <option value="fasta">FASTA</option>
+                <option value="txt">Text files</option>
+                <option value="generic">Generic</option>
             </param>
             <when value="tabular">
                 <param name="input" type="data" format="tabular" label="Tabular file to split"/>
@@ -85,30 +107,10 @@
                     <when value="col">
                         <param name="id_col" type="data_column" label="Column to split on" data_ref="input"/>
                         <param name="match_regex" type="text" label="Regex to match contents of id column" value="(.*)">
-	 		                <sanitizer>
-          		                <valid>
-            		                <add preset="string.printable"/>
-            		                <remove value="&#92;" />
-           		                    <remove value="&apos;" />
-          			            </valid>
-          		                <mapping initial="none">
-            		                <add source="&#92;" target="__backslash__" />
-            		                <add source="&apos;" target="__sq__"/>
-          		                </mapping>
-			                </sanitizer>
-		                </param>
+                            <expand macro="regex_sanitizer"/>
+                        </param>
                         <param name="sub_regex" type="text" label="Pattern to replace match with" value="\1">
-                            <sanitizer>
-                                <valid>
-                                    <add preset="string.printable"/>
-                                    <remove value="&#92;" />
-                                    <remove value="&apos;" />
-                                </valid>
-                                <mapping initial="none">
-                                    <add source="&#92;" target="__backslash__" />
-                                    <add source="&apos;" target="__sq__"/>
-                                </mapping>
-			                </sanitizer>
+                            <expand macro="regex_sanitizer"/>
                         </param>
                     </when>
                     <when value="row">
@@ -128,25 +130,44 @@
                 <param name="input" type="data" format="fasta" label="FASTA file to split"/>
                 <expand macro="numnew_fname"/>
             </when>
+            <when value="txt">
+                <param name="input" type="data" format="txt" label="Text file to split"/>
+                <expand macro="numnew_fname"/>
+            </when>
+            <when value="generic">
+                <param name="input" type="data" format="txt" label="File to split"/>
+                <param name="generic_regex" type="text" label="Regex to match record separator" value="^.*">
+                    <expand macro="regex_sanitizer"/>
+                </param>
+                <expand macro="numnew_fname"/>
+            </when>
         </conditional>
     </inputs>
     <outputs>
-        <collection name="list_output_tab" type="list" label="${tool.name} on ${on_string}: output collection">
+        <collection name="list_output_tab" type="list" label="${tool.name} on ${on_string}">
             <discover_datasets pattern="__name__" directory="out" visible="false" format="tabular"/>
             <filter>split_parms['select_ftype'] == "tabular"</filter>
         </collection>
-        <collection name="list_output_mgf" type="list" label="${tool.name} on ${on_string}: output collection">
+        <collection name="list_output_mgf" type="list" label="${tool.name} on ${on_string}">
             <discover_datasets pattern="__name__" directory="out" visible="false" format="mgf"/>
             <filter>split_parms['select_ftype'] == "mgf"</filter>
         </collection>
-        <collection name="list_output_fasta" type="list" label="${tool.name} on ${on_string}: output collection">
+        <collection name="list_output_fasta" type="list" label="${tool.name} on ${on_string}">
             <discover_datasets pattern="__name__" directory="out" visible="false" format="fasta"/>
             <filter>split_parms['select_ftype'] == "fasta"</filter>
         </collection>
-        <collection name="list_output_fastq" type="list" label="${tool.name} on ${on_string}: output collection">
+        <collection name="list_output_fastq" type="list" label="${tool.name} on ${on_string}">
             <discover_datasets pattern="__name__" directory="out" visible="false" format="fastq"/>
             <filter>split_parms['select_ftype'] == "fastq"</filter>
         </collection>
+        <collection name="list_output_txt" type="list" label="${tool.name} on ${on_string}">
+            <discover_datasets pattern="__name__" directory="out" visible="false" format="txt"/>
+            <filter>split_parms['select_ftype'] == "txt"</filter>
+        </collection>
+        <collection name="list_output_generic" type="list" label="${tool.name} on ${on_string}">
+            <discover_datasets pattern="__name_and_ext__" directory="out" visible="false"/>
+            <filter>split_parms['select_ftype'] == "generic"</filter>
+        </collection>
     </outputs>
     <tests>
         <test>
@@ -257,21 +278,101 @@
                 <element name="fasta_batch_1.fasta" file="fasta_batch_1.fasta" ftype="fasta"/>
             </output_collection>
         </test> 
+        <test>
+            <param name="input" value="test.tabular" ftype="txt"/>
+            <param name="select_ftype" value="txt"/>
+            <param name="numnew" value="2"/>
+            <param name="newfilenames" value="test"/> 
+            <output_collection name="list_output_txt" type="list">
+                <element name="test_0.txt" file="test_0.tabular" ftype="txt" lines_diff="1"/>
+                <element name="test_1.txt" file="test_1.tabular" ftype="txt" lines_diff="1"/>
+            </output_collection>
+        </test>
+        <test>
+            <param name="input" value="test.tabular" ftype="txt"/>
+            <param name="select_ftype" value="generic"/>
+            <param name="generic_regex" value="^.*"/>
+            <param name="numnew" value="2"/>
+            <param name="newfilenames" value="test"/> 
+            <output_collection name="list_output_generic" type="list">
+                <element name="test_0" file="test_0.tabular" ftype="txt" lines_diff="1"/>
+                <element name="test_1" file="test_1.tabular" ftype="txt" lines_diff="1"/>
+            </output_collection>
+        </test>
+        <test>
+            <param name="input" value="test.fasta" ftype="fasta"/>
+            <param name="select_ftype" value="generic"/>
+            <param name="generic_regex" value="^>.*"/>
+            <param name="numnew" value="2"/>
+            <param name="newfilenames" value="rand"/>
+            <param name="allocate" value="random"/>
+            <param name="seed" value="1010"/> 
+            <output_collection name="list_output_generic" type="list">
+                <element name="rand_0" file="rand_0.fasta" ftype="fasta"/>
+                <element name="rand_1" file="rand_1.fasta" ftype="fasta"/>
+            </output_collection>
+        </test>
+        <test>
+            <param name="input" value="3_molecules.sdf" ftype="sdf"/>
+            <param name="select_ftype" value="generic"/>
+            <param name="generic_regex" value="^\$\$\$\$.*"/>
+            <param name="numnew" value="1000"/>
+            <param name="newfilenames" value="mol"/>
+            <param name="allocate" value="batch"/>
+            <output_collection name="list_output_generic" type="list">
+                <element name="mol_0" file="mol_0.sdf" ftype="sdf"/>
+                <element name="mol_1" file="mol_1.sdf" ftype="sdf"/>
+                <element name="mol_2" file="mol_2.sdf" ftype="sdf"/>
+            </output_collection>
+        </test>
     </tests>
     <help><![CDATA[
 **Split file into a dataset collection**
 
-This tool can split five types of files into a separate files within a dataset collection: MGF, FASTA, FASTQ, and tabular.
+This tool splits a data set consisting of records into multiple data sets within a collection. 
+A record can be for instance simply a line, a FASTA sequence (header + sequence), a FASTQ sequence
+(headers + sequence + qualities), etc. The important property is that the beginning of a new record
+can be specified by a regular expression, e.g. ".*" for lines, ">.*" for FASTA, or "@.*" for FASTQ. 
+The tool has presets for text, tabular data sets (which are split by line), FASTA, FASTQ, and MGF. 
+For other data types the text delimiting records can be specified manually using the generic splitter. 
+
+If splitting by line (or by some other item, like a FASTA entry or an MGF record), the splitting can be done in batches (keeping the original record order), alternating, or at random. 
+
+If t records are to be distributed to n new data sets, then the i-th record goes to data set
+
+* floor(i / t * n) (for batch), 
+* i % n (for alternating), or
+* a random data set
+
+For instance, t=5 records are distributed as follows on n=2 data sets:
+
+= === === ====
+i bat alt rand
+= === === ====
+0 0   0   0
+1 0   1   1
+2 0   0   1
+3 1   1   0
+4 1   0   0
+= === === ====
+
+If the five records are distributed on n=3 data sets:
+
+= === === ====
+i bat alt rand
+= === === ====
+0 0   0   0
+1 0   1   1
+2 1   2   2
+3 1   0   0
+4 2   1   1
+= === === ====
+
+Note that there are no guarantees when splitting at random that every result file will be non-empty, so downstream tools should be able to gracefully handle empty files. 
+
 If a tabular file is used as input, you may choose to split by line or by column. If split by column, a new file is created for each unique value in the column.
 In addition, (Python) regular expressions may be used to transform the value in the column to a new value. Caution should be used with this feature, as it could transform all values to the same value, or other unexpected behavior.
 The default regular expression uses each value in the column without modifying it. 
-
-If splitting by line (or by some other item, like a FASTA entry or an MGF section), the splitting can be either done sequentially or at random. 
-Note that there are no guarantees when splitting at random that every result file will be non-empty, so downstream tools should be able to gracefully handle empty files. 
-
-**Note**
-
-Due to current limitations with dataset collections, a log file is produced when running this tool. It will usually be empty, but if the tool fails, any errors will be printed to the log file. 
     ]]></help>
     <citations>
         <citation type="bibtex">
author	bgruening
date	Wed, 28 Aug 2019 10:55:25 -0400
parents	750c1684d47c
children	2ddc36385d7a