Mercurial > repos > jjohnson > pandas_pivot_table

<tool id="pandas_pivot_table" name="Pivot Table" version="@VERSION@.1" python_template_version="3.5">
    <description>transform tabular data</description>
    <macros>
        <!-- Version token: used (with a ".1" suffix) as the tool version and as the pinned pandas package version. -->
        <token name="@VERSION@">1.1.4</token>
        <!-- The tokens below compose, bottom-up, the regular expression used to validate the "aggfunc" parameter. -->
        <!-- AGGFUNC: one single-quoted aggregate function name. -->
        <token name="@AGGFUNC@">'(min|max|sum|size|count|mean|std|var|prod|all|any)'</token>
        <!-- AGGFUNCS: either one function, or a bracketed comma-separated list of functions ("[[]" is a literal "["). -->
        <token name="@AGGFUNCS@">(@AGGFUNC@|[[]\s*@AGGFUNC@(,\s*@AGGFUNC@)*])</token>
        <!-- AGGITEM: one dictionary entry, a single-quoted key without whitespace followed by ":" and its function(s). -->
        <token name="@AGGITEM@">'\S+'\s*:\s*@AGGFUNCS@</token>
        <!-- AGGDICT: a brace-delimited, comma-separated list of AGGITEM entries. -->
        <token name="@AGGDICT@">{@AGGITEM@(,\s*@AGGITEM@)*}</token>
        <!-- AGGF: the complete expression, function(s) applied to every value column or a per-column dictionary. -->
        <token name="@AGGF@">(@AGGFUNCS@|@AGGDICT@)</token>
    </macros>
    <requirements>
        <!-- The pandas package version is taken from the same token that forms the tool version. -->
        <requirement type="package" version="@VERSION@">pandas</requirement>
    </requirements>
    <command detect_errors="exit_code"><![CDATA[
      ## Build the command line for pandas_pivot_table.py.
      ## Every user-supplied value is passed as a single, shell-quoted argument.
      ## The aggfunc value legitimately contains single quotes and its sanitizer
      ## lets every printable character through, so it is quoted with shlex.quote
      ## rather than wrapped in double quotes (inside double quotes the shell would
      ## still expand dollar signs, backticks and embedded double quotes).
      #import shlex
      #set $aggfunc_arg = shlex.quote(str($aggfunc))
      python '$__tool_directory__/pandas_pivot_table.py'
        ## skiprows is an integer parameter; 0 is falsy, so the option is omitted then.
        #if $skiprows
            --skiprows $skiprows
        #end if
        ## Column naming: a generated prefix, explicit names, or (no option) the first line.
        #if $header.header_choice == 'prefix'
            --prefix '$header.prefix'
        #elif $header.header_choice == 'enter_names'
            --header '$header.names'
        #end if
        --index '$pvt_index'
        --columns '$pvt_columns'
        --values '$pvt_values'
        --aggfunc=$aggfunc_arg
        ## Optional settings are only passed when the user supplied a value.
        #if $fill_value
            --fill_value '$fill_value'
        #end if
        #if $float_format
            --float_format '$float_format'
        #end if
        --input '$input'
        --output '$output'
    ]]></command>
    <inputs>
        <!-- Tabular dataset to be pivoted. -->
        <param name="input" type="data" format="tabular" label="Tabular table for pivot transformation"/>
        <!-- How column names are obtained: first line, user-entered names, or prefix + position. -->
        <conditional name="header">
            <param name="header_choice" type="select" label="Use as header">
                <option value="first_line">Dataset has column names in the first line</option>
                <option value="enter_names">Enter names for columns</option>
                <option value="prefix">Prefix + column position (0-indexed)</option>
            </param>
            <when value="first_line"/>
            <when value="enter_names">
                <param name="names" type="text" value="" label="Names for columns (no duplicates) separated by commas">
                    <validator type="regex" message="Column names separated by commas">^[A-Za-z]\w*(,[A-Za-z]\w*)*$</validator>
                </param>
            </when>
            <when value="prefix">
                <param name="prefix" type="text" value="C" label="Prefix before each column number (0-indexed)">
                    <validator type="regex" message="A-Za-z,A-Za-z0-9_">^[A-Za-z]\w*$</validator>
                </param>
            </when>
        </conditional>
        <param name="skiprows" type="integer" value="0" min="0" label="Skip table rows"/>
        <!-- Index, column and value selections are comma-separated column names without whitespace. -->
        <param name="pvt_index" type="text" value="" label="Pivot table index columns">
            <validator type="regex" message="Column names separated by commas">^\S+(,\S+)*$</validator>
        </param>
        <param name="pvt_columns" type="text" value="" label="Pivot table columns to split into output columns">
            <validator type="regex" message="Column names separated by commas">^\S+(,\S+)*$</validator>
        </param>
        <param name="pvt_values" type="text" value="" label="Pivot table value  columns">
            <validator type="regex" message="Column names separated by commas">^\S+(,\S+)*$</validator>
        </param>
        <!-- Aggregate function(s): a quoted name, a list of names, or a dictionary of column : function(s). -->
        <param name="aggfunc" type="text" value="" label="Pivot table aggregate function">
            <help><![CDATA[
                <ul>
                <li>Available Number Functions: @AGGFUNC@</li>
                <li>Specify functions as (remember the single quotes):</li>
                    <ul>
                      <li>  - A single function applied to each <i>value</i> column:  <b>'min'</b></li>
                      <li>  - An array of functions applied to each <i>value</i> column:  <b>['min', 'max', 'mean', 'std']</b></li>
                      <li>  - A dictionary of <i>value column : function(s)</i>: <b>{'A' : 'sum', 'B' : ['min', 'max']}</b></li>
                    </ul>
                </ul>
            ]]></help>
            <!-- Anchored at both ends so that text following a valid expression is rejected
                 instead of being accepted on the strength of a matching prefix. -->
            <validator type="regex" message="Do not forget the single quotes">^@AGGF@$</validator>
            <!-- Quotes, brackets and braces must reach the tool unchanged, so all printable characters are allowed. -->
            <sanitizer>
                <valid initial="string.printable">
                </valid>
            </sanitizer>
        </param>
        <param name="fill_value" type="text" value="" optional="true" label="Fill value (optional)"
            help="Value to replace missing values with (in the resulting pivot table, after aggregation) default is an empty field"/>
        <param name="float_format" type="text" value="" optional="true" label="Output floating point format (optional)">
            <help><![CDATA[Default is six decimal places: <i>%0.<b>6</b>f</i> For scientific: <i>%0.6<b>e</b></i>]]></help>
            <!-- The dot is escaped so that only a literal decimal point separates width and precision. -->
            <validator type="regex" message="%0.6f">^%\d+\.\d+[fFeEgGn]$</validator>
            <!-- Only the characters that can occur in such a format string are let through. -->
            <sanitizer>
                <valid initial="string.digits">
                    <add value="%" />
                    <add value="." />
                    <add value="f" />
                    <add value="e" />
                    <add value="g" />
                    <add value="F" />
                    <add value="E" />
                    <add value="G" />
                    <add value="n" />
                </valid>
            </sanitizer>
        </param>
    </inputs>
    <outputs>
        <!-- The single tabular result; its path is handed to the script through the output option of the command. -->
        <data name="output" format="tabular"/>
    </outputs>
    <tests>
        <!-- Test 1: a single aggregate function applied to one value column. -->
        <test>
            <param name="input" value="table1.tsv" ftype="tabular"/>
            <conditional name="header">
                <param name="header_choice" value="first_line"/>
            </conditional>
            <param name="pvt_index" value="A"/>
            <param name="pvt_columns" value="C"/>
            <param name="pvt_values" value="D"/>
            <param name="aggfunc" value="'max'"/>
            <output name="output">
                <assert_contents>
                    <has_text_matching expression="bar\t7\t6"/>
                </assert_contents>
            </output>
        </test>
        <!-- Test 2: a list of aggregate functions applied to one value column. -->
        <test>
            <param name="input" value="table1.tsv" ftype="tabular"/>
            <conditional name="header">
                <param name="header_choice" value="first_line"/>
            </conditional>
            <param name="pvt_index" value="A"/>
            <param name="pvt_columns" value="C"/>
            <param name="pvt_values" value="D"/>
            <param name="aggfunc" value="['min','max']"/>
            <output name="output">
                <assert_contents>
                    <has_text_matching expression="bar\t4\t5\t7\t6"/>
                </assert_contents>
            </output>
        </test>
        <!-- Test 3: a dictionary of per-column functions with a two-column index. -->
        <test>
            <param name="input" value="table1.tsv" ftype="tabular"/>
            <conditional name="header">
                <param name="header_choice" value="first_line"/>
            </conditional>
            <param name="pvt_index" value="C,B"/>
            <param name="pvt_columns" value="A"/>
            <param name="pvt_values" value="D,E"/>
            <param name="aggfunc" value="{'D' : ['min','sum'], 'E' : 'mean'}"/>
            <output name="output">
                <assert_contents>
                    <has_text_matching expression="C\tB\tbar_min_D\tfoo_min_D\tbar_sum_D\tfoo_sum_D\tbar_mean_E\tfoo_mean_E"/>
                    <has_text_matching expression="large\tone\t4[.]\d+\t2[.]\d+\t4[.]\d+\t4[.]\d+\t6[.]\d+\t4[.]5\d+"/>
                </assert_contents>
            </output>
        </test>
    </tests>
    <help><![CDATA[
Perform a pivot table operation on a tabular dataset.

This tool uses the Python pandas_ package to read_ a tabular file, perform a pivot_table_ operation, and write_ the result out as a tabular dataset.

.. _pandas: https://pandas.pydata.org/pandas-docs/stable/index.html
.. _pivot_table: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.pivot_table.html
.. _read: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.read_table.html
.. _write: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.to_csv.html

**Examples**

**Input dataset**::

    A   B   C     D  E
    foo one small 1  2
    foo one large 2  4
    foo one large 2  5
    foo two small 3  5
    foo two small 3  6
    bar one large 4  6
    bar one small 5  8
    bar two small 6  9
    bar two large 7  9


**Example 1**

  Params:  *Index: A  Columns: C  values: D  aggfunc: 'max'*

  Output::

    A   large_D  small_D
    bar 7        6
    foo 2        3


**Example 2**

  Params:  *Index: A  Columns: C  values: D  aggfunc: ['min', 'max']*

  Output::

    A   large_D_min small_D_min large_D_max small_D_max
    bar 4           5           7           6
    foo 2           1           2           3


**Example 3**

  Params: *Index: A  Columns: C  values: D,E  aggfunc: 'mean'*

  Output::

    A   large_D  small_D  large_E  small_E
    bar 5.500000 5.500000 7.500000 8.500000
    foo 2.000000 2.333333 4.500000 4.333333


**Example 4**

  Params:  *Index: A  Columns: C  values: D,E  aggfunc: {'D' : ['min','sum'], 'E' : 'mean'}*

  Output::

    A   large_min_D small_min_D large_sum_D small_sum_D large_mean_E small_mean_E
    bar 4           5           11          11          7.500000     8.500000
    foo 2           1           4           7           4.500000     4.333333


**Example 5**

  Params:  *Index: B,C  Columns: A  values: E  aggfunc: ['min','mean','max']*

  Output::

    B   C     bar_E_min foo_E_min bar_E_mean foo_E_mean bar_E_max foo_E_max
    one large 6.000000  4.000000  6.000000   4.500000   6.000000  5.000000
    one small 8.000000  2.000000  8.000000   2.000000   8.000000  2.000000
    two large 9.000000            9.000000              9.000000
    two small 9.000000  5.000000  9.000000   5.500000   9.000000  6.000000

    ]]></help>
    <citations>
        <!-- A citation of type "doi" takes the bare DOI, without a "doi:" scheme prefix. -->
        <citation type="doi">10.5281/zenodo.4161697</citation>
    </citations>
</tool>
author	jjohnson
date	Thu, 17 Dec 2020 22:23:11 +0000
parents	6f05390deffa
children	eaf2444a2a50