Mercurial > repos > bgruening > split_file_on_column

--- a/split_file_on_column.xml	Mon Jul 04 12:26:46 2022 +0000
+++ b/split_file_on_column.xml	Tue Jul 19 13:25:20 2022 +0000
@@ -1,14 +1,14 @@
-<tool id="tp_split_on_column" name="Split by group" version="0.5">
+<tool id="tp_split_on_column" name="Split by group" version="0.6">
     <requirements>
-        <requirement type="package" version="5.0.1">gawk</requirement>
+        <requirement type="package" version="5.1.0">gawk</requirement>
     </requirements>
     <command>
 <![CDATA[
     mkdir tmp_out &&
     #if $include_header
-        awk -F '\t' 'NR==1{hdr=$0;next}f!="tmp_out/"\$$column".$infile.ext"{if(f) close(f); f="tmp_out/"\$$column".$infile.ext";print hdr>f} {print >> f}' $infile
+        awk -F '\t' 'NR==1{hdr=$0;next}f!="tmp_out/"\$$column".$infile.ext"{if(f) close(f); f="tmp_out/"\$$column".$infile.ext"}; {if (!seen[f]++) print hdr>f; print >> f}' '$infile'
     #else
-        awk -F'\t' '{print > "tmp_out/"\$$column".$infile.ext" }' '$infile'
+        awk -F'\t' '{print >> "tmp_out/"\$$column".$infile.ext" }' '$infile'
     #end if
 ]]>
     </command>
@@ -80,6 +80,43 @@
                 </element>
             </output_collection>
         </test>
+        <test><!-- unsorted grouping column, no header: rows of the same group are not contiguous in 5cols-unsorted.tabular; expects 3 rows in element "1" and 2 rows in element "2" -->
+            <param name="infile" value="5cols-unsorted.tabular" ftype="tabular" />
+            <param name="column" value="5" />
+            <param name="include_header" value="false"/>
+            <output_collection name="split_output" type="list">
+                <element name="1">
+                    <assert_contents>
+                        <has_n_lines n="3" />
+                    </assert_contents>
+                </element>
+                <element name="2">
+                    <assert_contents>
+                        <has_n_lines n="2" />
+                    </assert_contents>
+                </element>
+            </output_collection>
+        </test>
+        <test><!-- unsorted grouping column, with header: each element must contain the header line plus the rows of its group (4 lines for element "1", 3 lines for element "2") -->
+            <param name="infile" value="5cols-unsorted-with-header.tabular" ftype="tabular" />
+            <param name="column" value="5" />
+            <param name="include_header" value="true"/>
+            <output_collection name="split_output" type="list">
+                <element name="1">
+                    <assert_contents>
+                        <has_n_lines n="4" />
+                        <has_line_matching expression="Column1\tColumn2\tColumn3\tColumn4\tColumn5" />
+
+                    </assert_contents>
+                </element>
+                <element name="2">
+                    <assert_contents>
+                        <has_n_lines n="3" />
+                        <has_line_matching expression="Column1\tColumn2\tColumn3\tColumn4\tColumn5" />
+                    </assert_contents>
+                </element>
+            </output_collection>
+        </test>
     </tests>
     <help>
 <![CDATA[
@@ -112,7 +149,7 @@
     chr4 60 80


-will produce a collectiion with 4 elements::
+will produce a collection with 3 elements::

     chr1 10 20
     chr1 30 40
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/5cols-unsorted-with-header.tabular	Tue Jul 19 13:25:20 2022 +0000
@@ -0,0 +1,6 @@
+Column1	Column2	Column3	Column4	Column5
+chr7	56632	56652	cluster	1
+chr7	56736	56756	cluster	2
+chr7	56761	56781	cluster	1
+chr7	56772	56792	cluster	1
+chr7	56775	56795	cluster	2
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/5cols-unsorted.tabular	Tue Jul 19 13:25:20 2022 +0000
@@ -0,0 +1,5 @@
+chr7	56632	56652	cluster	1
+chr7	56736	56756	cluster	2
+chr7	56761	56781	cluster	1
+chr7	56772	56792	cluster	1
+chr7	56775	56795	cluster	2