Repository 'split_file_on_column'
hg clone https://toolshed.g2.bx.psu.edu/repos/bgruening/split_file_on_column

Changeset 4:37a53100b67e (2021-02-25)
Previous changeset 3:b60f2452580e (2015-12-23) Next changeset 5:d4b5b70e82cb (2022-07-04)
Commit message:
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_on_column commit 4d0bfcf37bfbedafc7ff0672dfe452766ca8a606"
modified:
split_file_on_column.xml
test-data/5cols.tabular
added:
test-data/5cols-with-header.tabular
removed:
split_file_on_column.tar.gz
tool_dependencies.xml
b
diff -r b60f2452580e -r 37a53100b67e split_file_on_column.tar.gz
b
Binary file split_file_on_column.tar.gz has changed
b
diff -r b60f2452580e -r 37a53100b67e split_file_on_column.xml
--- a/split_file_on_column.xml Wed Dec 23 03:50:48 2015 -0500
+++ b/split_file_on_column.xml Thu Feb 25 15:54:13 2021 +0000
[
@@ -1,17 +1,24 @@
-<tool id="tp_split_on_column" name="Split file" version="0.2">
+<tool id="tp_split_on_column" name="Split file" version="0.4">
     <description>according to the values of a column</description>
     <requirements>
-        <requirement type="package" version="4.1.0">gnu_awk</requirement>
+        <requirement type="package" version="5.0.1">gawk</requirement>
     </requirements>
     <command>
 <![CDATA[
-        mkdir tmp_out &&
-        awk -F'\t' '{print > "tmp_out/"\$$column".$infile.ext" }' $infile
+    mkdir tmp_out &&
+    #if $include_header
+        awk -F '\t' 'NR==1{hdr=$0;next}f!="tmp_out/"\$$column".$infile.ext"{if(f) close(f); f="tmp_out/"\$$column".$infile.ext";print hdr>f} {print >> f}' $infile
+    #else
+        awk -F'\t' '{print > "tmp_out/"\$$column".$infile.ext" }' '$infile'
+    #end if
 ]]>
     </command>
     <inputs>
         <param format="tabular" name="infile" type="data" label="File to select" />
         <param name="column" label="on column" type="data_column" data_ref="infile" accept_default="true" />
+
+        <param name="include_header" type="boolean" label="Include the header in all split files?"
+               help="Include the first line (the assumed header line) in all split files." />
     </inputs>
     <outputs>
         <collection name="split_output" type="list" label="Table split on first column">
@@ -22,6 +29,45 @@
         <test>
             <param name="infile" value="5cols.tabular" ftype="tabular"/>
             <param name="column" value="5" />
+            <param name="include_header" value="false"/>
+            <output_collection name="split_output" type="list">
+                <element name="1">
+                    <assert_contents>
+                        <has_text_matching expression="chr7\t56632\t56652\tcluster\t1" />
+                    </assert_contents>
+                </element>
+                <element name="2">
+                    <assert_contents>
+                        <has_text_matching expression="chr7\t56761\t56781\tcluster\t2" />
+                    </assert_contents>
+                </element>
+            </output_collection>
+        </test>
+        <test>
+            <param name="infile" value="5cols-with-header.tabular" ftype="tabular" />
+            <param name="column" value="5" />
+            <param name="include_header" value="true"/>
+            <output_collection name="split_output" type="list">
+                <element name="1">
+                    <assert_contents>
+                        <has_line_matching expression="Column1\tColumn2\tColumn3\tColumn4\tColumn5" />
+                        <has_n_lines n="3" />
+                        <has_text_matching expression="chr7\t56632\t56652\tcluster\t1" />
+                    </assert_contents>
+                </element>
+                <element name="2">
+                    <assert_contents>
+                        <has_line_matching expression="Column1\tColumn2\tColumn3\tColumn4\tColumn5" />
+                        <has_n_lines n="4" />
+                        <has_text_matching expression="chr7\t56761\t56781\tcluster\t2" />
+                    </assert_contents>
+                </element>
+            </output_collection>
+        </test>
+        <test>
+            <param name="infile" value="5cols-with-header.tabular" ftype="tabular" />
+            <param name="column" value="5" />
+            <param name="include_header" value="false"/>
             <output_collection name="split_output" type="list">
                 <element name="1">
                     <assert_contents>
@@ -43,12 +89,15 @@
 
 This tool splits a file into different smaller files using a specific column.
 It will work like the group tool, but every group is saved to its own file.
+You have the option to include the header (first line) in all split files.
+If you have a header and don't want to keep it, please remove it before you use this tool,
+for example with the "Remove beginning of a file" tool.
 
 -----
 
 **Example**
 
-Splitting on column 5 from this::
+Splitting a file without header on column 5 from this::
 
     chr7  56632  56652  cluster 1
     chr7  56736  56756  cluster 1
@@ -66,7 +115,19 @@
     chr7  56772  56792  cluster 2
     chr7  56775  56795  cluster 2
 
-
 ]]>
     </help>
+    <citations>
+        <citation type="bibtex">
+@misc{githubsplit_file_on_column,
+      author = {Gruening, Bjoern},
+      year = {2015},
+      title = {split_file_on_column},
+      publisher = {GitHub},
+      journal = {GitHub repository},
+      url = {https://github.com/bgruening/galaxytools},
+     }
+        </citation>
+    </citations>
 </tool>
+
b
diff -r b60f2452580e -r 37a53100b67e test-data/5cols-with-header.tabular
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/5cols-with-header.tabular Thu Feb 25 15:54:13 2021 +0000
b
@@ -0,0 +1,6 @@
+Column1 Column2 Column3 Column4 Column5
+chr7 56632 56652 cluster 1
+chr7 56736 56756 cluster 1
+chr7 56761 56781 cluster 2
+chr7 56772 56792 cluster 2
+chr7 56775 56795 cluster 2
b
diff -r b60f2452580e -r 37a53100b67e test-data/5cols.tabular
--- a/test-data/5cols.tabular Wed Dec 23 03:50:48 2015 -0500
+++ b/test-data/5cols.tabular Thu Feb 25 15:54:13 2021 +0000
b
@@ -1,5 +1,5 @@
 chr7 56632 56652 cluster 1
-chr7 56736 56756 cluster 1
+chr7 56736 56756 cluster 1
 chr7 56761 56781 cluster 2
 chr7 56772 56792 cluster 2
 chr7 56775 56795 cluster 2
b
diff -r b60f2452580e -r 37a53100b67e tool_dependencies.xml
--- a/tool_dependencies.xml Wed Dec 23 03:50:48 2015 -0500
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
b
@@ -1,6 +0,0 @@
-<?xml version="1.0"?>
-<tool_dependency>
-    <package name="gnu_awk" version="4.1.0">
-        <repository changeset_revision="f145f856ec57" name="package_gnu_awk_4_1_0" owner="iuc" toolshed="https://toolshed.g2.bx.psu.edu" />
-    </package>
-</tool_dependency>