Mercurial > repos > bgruening > text_processing

diff sorted_uniq.xml @ 0:5314e5d6f040 draft
Imported from capsule None
author: bgruening
date: Thu, 29 Jan 2015 07:53:17 -0500
children: 37e1eb05b1b4
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/sorted_uniq.xml	Thu Jan 29 07:53:17 2015 -0500
@@ -0,0 +1,105 @@
+<tool id="tp_uniq_tool" name="Unique lines" version="@BASE_VERSION@.0">
+    <description>assuming sorted input file</description>
+    <macros>
+        <import>macros.xml</import>
+    </macros>
+    <expand macro="requirements">
+        <requirement type="package" version="4.2.2-sandbox">gnu_sed</requirement>
+    </expand>
+    <version_command>uniq --version | head -n 1</version_command>
+    <command>
+<![CDATA[
+        uniq
+            #if $skipfields:
+                -f $skipfields
+            #end if
+            $ignorecase
+
+            #if $grouping.grouping_select == 'yes':
+                --group=$grouping.group
+            #else:
+                $grouping.count
+                $grouping.repeated
+                $grouping.uniqueonly
+            #end if
+
+            "$infile"
+
+            #if $grouping.grouping_select == 'no' and $grouping.count:
+                ## count will print the count with spaces in front of the line and
+                ## with a space (not a tab) after the number, we need to cahnge that
+                | sed -e 's/ *//' -e 's/ /\t/'
+            #end if
+            > "$outfile"
+]]>
+    </command>
+    <inputs>
+        <param name="infile" format="tabular" type="data"
+            label="File to scan for unique values" help="Make sure you have sorted this file" />
+
+        <conditional name="grouping">
+            <param name="grouping_select" type="select" label="Do you want to group each unique group?">
+                <option value="no">No</option>
+                <option value="yes">Yes</option>
+            </param>
+            <when value="no">
+                <param name="count" type="boolean" truevalue="-c" falsevalue=""
+                    label="Counting number of occurrences" help="Prefix lines by the number of occurrences. (-c)" />
+                <param name="repeated" type="boolean" truevalue="-d" falsevalue=""
+                    label="Only print duplicate lines" help="(-d)"/>
+                <param name="uniqueonly" type="boolean" checked="True" truevalue="-u" falsevalue=""
+                    label="Only print unique lines" help="(-u)" />
+            </when>
+            <when value="yes">
+                <param name="group" type="select"
+                    label="Output all lines, and delimit each unique group" help="(--group)">
+                    <option value="">No grouping at all</option>
+                    <option value="separate">Separate unique groups with a single delimiter</option>
+                    <option value="prepend">Output a delimiter before each group of unique items</option>
+                    <option value="append">Output a delimiter after each group of unique items</option>
+                    <option value="both">Output a delimiter around each group of unique items</option>
+                </param>
+            </when>
+        </conditional>
+
+        <param name="ignorecase" type="boolean" truevalue="-i" falsevalue=""
+            label="Ignore differences in case when comparing" help="(-i)"/>
+        <param name="skipfields" type="integer" size="2" value="0"
+            label="Avoid comparing the first N fields" help="Use zero to start from the first field. (-f)" />
+    </inputs>
+    <outputs>
+        <data name="outfile" format_source="infile" metadata_source="infile"/>
+    </outputs>
+    <tests>
+        <test>
+            <param name="infile" value="sorted_uniq1.tabular" />
+            <param name="grouping_select" value="no"/>
+            <param name="count" value="True"/>
+            <param name="ignorecase" value="True"/>
+            <param name="uniqueonly" value="True"/>
+            <output name="outfile" file="sorted_uniq_results1.tabular" />
+        </test>
+        <test>
+            <param name="infile" value="sorted_uniq1.tabular" />
+            <param name="ignorecase" value="True"/>
+            <param name="grouping_select" value="yes"/>
+            <param name="group" value="separate"/>
+            <output name="outfile" file="sorted_uniq_results2.tabular" />
+        </test>
+    </tests>
+    <help>
+<![CDATA[
+This tool takes a sorted file and look for lines that are unique.
+
+.. class:: warningmark
+
+Please make sure your file is sorted, or else this tool will give you an erroneous output.
+
+.. class:: infomark
+
+You can sort your file using either the "Sort" tool in "Filter and Sort", or the "Sort" tool in "Unix Tools".
+
+@REFERENCES@
+]]>
+    </help>
+</tool>
author	bgruening
date	Thu, 29 Jan 2015 07:53:17 -0500
parents
children	37e1eb05b1b4