Mercurial > repos > bgruening > text_processing
diff sorted_uniq.xml @ 0:5314e5d6f040 draft
Imported from capsule None
author | bgruening |
---|---|
date | Thu, 29 Jan 2015 07:53:17 -0500 |
parents | |
children | 37e1eb05b1b4 |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/sorted_uniq.xml Thu Jan 29 07:53:17 2015 -0500
@@ -0,0 +1,119 @@
+<tool id="tp_uniq_tool" name="Unique lines" version="@BASE_VERSION@.0">
+    <!--
+        Galaxy wrapper around "uniq": collapses runs of identical adjacent lines,
+        which is why the input has to be sorted beforehand. Depending on the
+        "grouping" choice it either filters/counts lines or delimits the groups.
+        The "requirements" macro and the @BASE_VERSION@ / @REFERENCES@ tokens are
+        presumably defined in macros.xml (not visible here).
+    -->
+    <description>assuming sorted input file</description>
+    <macros>
+        <import>macros.xml</import>
+    </macros>
+    <!-- sed is only used to re-format the output of "uniq -c" (see the command below). -->
+    <expand macro="requirements">
+        <requirement type="package" version="4.2.2-sandbox">gnu_sed</requirement>
+    </expand>
+    <version_command>uniq --version | head -n 1</version_command>
+    <command>
+<![CDATA[
+    uniq
+        #if $skipfields:
+            -f $skipfields
+        #end if
+        $ignorecase
+
+        #if $grouping.grouping_select == 'yes':
+            ## The "No grouping at all" choice has an empty value; emitting a bare
+            ## "--group=" would hand uniq an empty method name, so skip the flag.
+            #if str($grouping.group) != '':
+                --group=$grouping.group
+            #end if
+        #else:
+            $grouping.count
+            $grouping.repeated
+            $grouping.uniqueonly
+        #end if
+
+        "$infile"
+
+        #if $grouping.grouping_select == 'no' and $grouping.count:
+            ## count will print the count with spaces in front of the line and
+            ## with a space (not a tab) after the number, we need to change that:
+            ## strip the leading spaces, then turn the first space into a tab.
+            | sed -e 's/ *//' -e 's/ /\t/'
+        #end if
+    > "$outfile"
+]]>
+    </command>
+    <inputs>
+        <param name="infile" format="tabular" type="data"
+            label="File to scan for unique values" help="Make sure you have sorted this file" />
+
+        <!-- "no": plain filtering/counting flags; "yes": delimit groups of identical lines. -->
+        <conditional name="grouping">
+            <param name="grouping_select" type="select" label="Do you want to group each unique group?">
+                <option value="no">No</option>
+                <option value="yes">Yes</option>
+            </param>
+            <when value="no">
+                <param name="count" type="boolean" truevalue="-c" falsevalue=""
+                    label="Counting number of occurrences" help="Prefix lines by the number of occurrences. (-c)" />
+                <param name="repeated" type="boolean" truevalue="-d" falsevalue=""
+                    label="Only print duplicate lines" help="(-d)"/>
+                <param name="uniqueonly" type="boolean" checked="True" truevalue="-u" falsevalue=""
+                    label="Only print unique lines" help="(-u)" />
+            </when>
+            <when value="yes">
+                <param name="group" type="select"
+                    label="Output all lines, and delimit each unique group" help="(--group)">
+                    <option value="">No grouping at all</option>
+                    <option value="separate">Separate unique groups with a single delimiter</option>
+                    <option value="prepend">Output a delimiter before each group of unique items</option>
+                    <option value="append">Output a delimiter after each group of unique items</option>
+                    <option value="both">Output a delimiter around each group of unique items</option>
+                </param>
+            </when>
+        </conditional>
+
+        <param name="ignorecase" type="boolean" truevalue="-i" falsevalue=""
+            label="Ignore differences in case when comparing" help="(-i)"/>
+        <param name="skipfields" type="integer" size="2" value="0" min="0"
+            label="Avoid comparing the first N fields" help="Use zero to start from the first field. (-f)" />
+    </inputs>
+    <outputs>
+        <data name="outfile" format_source="infile" metadata_source="infile"/>
+    </outputs>
+    <tests>
+        <test>
+            <param name="infile" value="sorted_uniq1.tabular" />
+            <param name="grouping_select" value="no"/>
+            <param name="count" value="True"/>
+            <param name="ignorecase" value="True"/>
+            <param name="uniqueonly" value="True"/>
+            <output name="outfile" file="sorted_uniq_results1.tabular" />
+        </test>
+        <test>
+            <param name="infile" value="sorted_uniq1.tabular" />
+            <param name="ignorecase" value="True"/>
+            <param name="grouping_select" value="yes"/>
+            <param name="group" value="separate"/>
+            <output name="outfile" file="sorted_uniq_results2.tabular" />
+        </test>
+    </tests>
+    <help>
+<![CDATA[
+This tool takes a sorted file and looks for lines that are unique.
+
+.. class:: warningmark
+
+Please make sure your file is sorted, or else this tool will give you an erroneous output.
+
+.. class:: infomark
+
+You can sort your file using either the "Sort" tool in "Filter and Sort", or the "Sort" tool in "Unix Tools".
+
+@REFERENCES@
+]]>
+    </help>
+</tool>