Mercurial > repos > bgruening > text_processing

<tool id="tp_uniq_tool" name="Unique lines" version="@BASE_VERSION@.0">
    <!-- Description text for the tool; reminds users that the input must already be sorted. -->
    <description>assuming sorted input file</description>
    <!-- Import macros.xml. The "requirements" and "citations" macros expanded in this file,
         and presumably the @BASE_VERSION@ and @REFERENCES@ tokens, are defined there
         (macros.xml is not visible from this file; confirm). -->
    <macros>
        <import>macros.xml</import>
    </macros>
    <!-- Expand the shared requirements macro and add sed, which the command template
         pipes through to re-format the counted output of uniq. -->
    <expand macro="requirements">
        <requirement type="package" version="4.2.3.dev0">sed</requirement>
    </expand>
    <!-- Print only the first line of the version banner emitted by uniq. -->
    <version_command>uniq --version | head -n 1</version_command>
    <command>
<![CDATA[
        ## Run GNU uniq over the (pre-sorted) input dataset and write the result to the output dataset.
        uniq
            ## Only add -f when skipfields evaluates as true (presumably any non-zero count).
            #if $skipfields:
                -f $skipfields
            #end if
            $ignorecase

            #if $grouping.grouping_select == 'yes':
                ## The "No grouping at all" choice carries an empty value. uniq rejects an
                ## empty --group= argument, so only pass the option when a method was chosen.
                #if str($grouping.group):
                    --group=$grouping.group
                #end if
            #else:
                ## Each boolean expands to its flag (-c, -d, -u) or to an empty string.
                $grouping.count
                $grouping.repeated
                $grouping.uniqueonly
            #end if

            '$infile'

            #if $grouping.grouping_select == 'no' and $grouping.count:
                ## With -c the count is printed with spaces in front of it and is followed
                ## by a space (not a tab). We need to change that: the first sed expression
                ## strips the leading spaces, the second turns the first remaining space
                ## into a tab.
                | sed -e 's/ *//' -e 's/ /\t/'
            #end if
            > '$outfile'
]]>
    </command>
    <inputs>
        <!-- Dataset to de-duplicate; uniq only compares adjacent lines, hence the sorting hint. -->
        <param name="infile" format="tabular" type="data"
            label="File to scan for unique values" help="Make sure you have sorted this file" />

        <!-- Switch between the grouping mode and the classic count/duplicate/unique flags. -->
        <conditional name="grouping">
            <param name="grouping_select" type="select" label="Do you want to group each unique group?">
                <option value="no">No</option>
                <option value="yes">Yes</option>
            </param>
            <when value="no">
                <!-- Each boolean contributes its flag to the command line when checked. -->
                <param name="count" type="boolean" truevalue="-c" falsevalue=""
                    label="Counting number of occurrences" help="Prefix lines by the number of occurrences. (-c)" />
                <param name="repeated" type="boolean" truevalue="-d" falsevalue=""
                    label="Only print duplicate lines" help="(-d)"/>
                <!-- NOTE(review): checked by default; enabling -d together with -u appears to
                     make uniq print nothing. Confirm this default is intended. -->
                <param name="uniqueonly" type="boolean" checked="True" truevalue="-u" falsevalue=""
                    label="Only print unique lines" help="(-u)" />
            </when>
            <when value="yes">
                <!-- Delimiting method handed to the group option of uniq; the first option
                     has an empty value and stands for "no method". -->
                <param name="group" type="select"
                    label="Output all lines, and delimit each unique group" help="(--group)">
                    <option value="">No grouping at all</option>
                    <option value="separate">Separate unique groups with a single delimiter</option>
                    <option value="prepend">Output a delimiter before each group of unique items</option>
                    <option value="append">Output a delimiter after each group of unique items</option>
                    <option value="both">Output a delimiter around each group of unique items</option>
                </param>
            </when>
        </conditional>

        <param name="ignorecase" type="boolean" truevalue="-i" falsevalue=""
            label="Ignore differences in case when comparing" help="(-i)"/>
        <!-- min="0": a negative field count is not a valid argument for -f, so reject it
             in the form instead of letting the job fail. -->
        <param name="skipfields" type="integer" value="0" min="0"
            label="Avoid comparing the first N fields" help="Use zero to start from the first field. (-f)" />
    </inputs>
    <outputs>
        <!-- Single result dataset; its format and metadata are both taken from the input dataset. -->
        <data name="outfile"
            format_source="infile"
            metadata_source="infile" />
    </outputs>
    <tests>
        <!-- Flag mode: case-insensitive comparison with the count and unique-only flags enabled. -->
        <test>
            <param name="infile" value="sorted_uniq1.tabular" />
            <param name="ignorecase" value="True" />
            <conditional name="grouping">
                <param name="grouping_select" value="no" />
                <param name="count" value="True" />
                <param name="uniqueonly" value="True" />
            </conditional>
            <output name="outfile" file="sorted_uniq_results1.tabular" />
        </test>
        <!-- Grouping mode: case-insensitive comparison, groups delimited with the "separate" method. -->
        <test>
            <param name="infile" value="sorted_uniq1.tabular" />
            <param name="ignorecase" value="True" />
            <conditional name="grouping">
                <param name="grouping_select" value="yes" />
                <param name="group" value="separate" />
            </conditional>
            <output name="outfile" file="sorted_uniq_results2.tabular" />
        </test>
    </tests>
    <help>
<![CDATA[
This tool takes a sorted file and looks for lines that are unique.

.. class:: warningmark

Please make sure your file is sorted, or else this tool will give you an erroneous output.

.. class:: infomark

You can sort your file using either the "Sort" tool in "Filter and Sort", or the "Sort" tool in "Unix Tools".

@REFERENCES@
]]>
    </help>
    <expand macro="citations" />
</tool>
author	bgruening
date	Sat, 06 May 2017 13:09:05 -0400
parents	20344ce0c811
children	74a8bef53a00