changeset 0:621144f8dbe9 draft

"planemo upload for repository https://github.com/jj-umn/galaxytools/tree/master/pandas_pivot_table/ commit 80684939b0bf75abb5cc70a9878054c1f734b651-dirty"
author jjohnson
date Wed, 02 Dec 2020 22:59:57 +0000
parents
children c02f59711eb6
files pandas_pivot_table.py pandas_pivot_table.xml test-data/table1.tsv
diffstat 3 files changed, 350 insertions(+), 0 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/pandas_pivot_table.py	Wed Dec 02 22:59:57 2020 +0000
@@ -0,0 +1,128 @@
+#!/usr/bin/env python
+
+import argparse
+import json
+import re
+import pandas as pd
+import sys
+from json.decoder import JSONDecodeError
+
+
+def __main__():
+    p = argparse.ArgumentParser()
+    p.add_argument(
+        '-i', '--input',
+        type=argparse.FileType('r'),
+        required=True,
+        help='Tabular input file to pivot'
+    )
+    p.add_argument(
+        '-o', '--output',
+        type=argparse.FileType('w'),
+        required=True,
+        help='Output file'
+    )
+    p.add_argument(
+        '-S', '--skiprows',
+        type=int,
+        default=0,
+        help='Input column names'
+    )
+    p.add_argument(
+        '-H', '--header',
+        default=None,
+        help='Input column names'
+    )
+    p.add_argument(
+        '-P', '--prefix',
+        default=None,
+        help='Prefix for input column names'
+    )
+    p.add_argument(
+        '-I', '--index',
+        help='index columns'
+    )
+    p.add_argument(
+        '-C', '--columns',
+        help='columns values which are returned as columns'
+    )
+    p.add_argument(
+        '-V', '--values',
+        help='values'
+    )
+    p.add_argument(
+        '-F', '--aggfunc',
+        help='aggregate functions on the values'
+    )
+    p.add_argument(
+        '-N', '--fill_value',
+        default=None,
+        help='fill value for missing values'
+    )
+    args = p.parse_args()
+
+    def getValueType(val):
+        if val or 0. == val:
+            try:
+                return int(val)
+            except ValueError:
+                try:
+                    return float(val)
+                except ValueError:
+                    return val
+        return None
+
+    def getColumn(name, dfcols):
+        if name in dfcols:
+            return name
+        else:
+            try:
+                i = int(name)
+                return dfcols[i]
+            except:
+                print('%s not a column in %s' % (name, dfcols), file=sys.stderr)
+                exit(1)
+
+    def getColumns(val, dfcols):
+        fields = [v.strip() for v in val.split(',')]
+        cols = []
+        for name in fields:
+            cols.append(getColumn(name, dfcols))
+        return cols
+
+    def getAggFunc(funcStr, dfcols):
+        af = funcStr
+        try:
+            af = json.loads(funcStr)
+        except JSONDecodeError as de:
+            print('"%s" is not a json string: ' % funcStr, de.msg, file=sys.stderr)
+            exit(1)
+        if isinstance(af, dict):
+            aggfunc = {getColumn(k, dfcols) : v for k,v in af.items()}
+        elif isinstance(af, list):
+            aggfunc = af
+        else:
+            aggfunc = af
+        return aggfunc
+
+    if args.prefix:
+        df = pd.read_table(args.input, skiprows=args.skiprows, header=None, prefix=args.prefix)
+    elif args.header:
+        df = pd.read_table(args.input, skiprows=args.skiprows, header=args.header)
+    else:
+        df = pd.read_table(args.input, skiprows=args.skiprows)
+    df_columns = df.columns.tolist()
+    index = getColumns(args.index, df_columns)
+    columns = getColumns(args.columns, df_columns)
+    values = getColumns(args.values, df_columns)
+    fill_value = getValueType(args.fill_value)
+    aggfunc = getAggFunc(args.aggfunc, values)
+    pdf = df.pivot_table(index=index, columns=columns,
+                         values=values, aggfunc=aggfunc,
+                         fill_value=fill_value)
+    pdf_cols = ['_'.join(reversed(p)) if isinstance(p, tuple) else p for p in pdf.columns.tolist()]
+    pdf.to_csv(args.output, sep='\t', float_format='%0.6f', header=pdf_cols)
+
+
+if __name__ == "__main__":
+    __main__()
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/pandas_pivot_table.xml	Wed Dec 02 22:59:57 2020 +0000
@@ -0,0 +1,212 @@
+<tool id="pandas_pivot_table" name="Pivot Table" version="@VERSION@.0" python_template_version="3.5">
+    <description>transform tabular data</description>
+    <macros>
+        <token name="@VERSION@">1.1.4</token>
+    </macros>
+    <requirements>
+        <requirement type="package" version="@VERSION@">pandas</requirement>
+    </requirements>
+    <command detect_errors="exit_code"><![CDATA[
+      $__tool_directory__/pandas_pivot_table.py 
+        #if $skiprows
+            --skiprows $skiprows
+        #end if
+        #if $header.header_choice == 'prefix'
+            --prefix $header.prefix
+        #elif $header.header_choice == 'enter_names'
+            --header $header.names
+        #end if
+        --index '$pvt_index'
+        --columns '$pvt_columns'
+        --values '$pvt_values'
+        --aggfunc='$aggfunc'
+        --input '$input'
+        --output '$output'
+    ]]></command>
+    <inputs>
+        <param name="input" type="data" format="tabular" label="Tabular table for pivot transformation"/>
+        <conditional name="header">
+            <param name="header_choice" type="select" label="Use as header">
+                <option value="first_line">Dataset has column names in the first line</option>
+                <option value="enter_names">Enter names for columns</option>
+                <option value="prefix">Prefix + column position (0-indexed)</option>
+            </param>
+            <when value="first_line"/>
+            <when value="enter_names">
+                <param name="names" type="text" value="" label="Names for columns (no duplicates) separated by commas">
+                    <validator  type="regex" message="Column names separated by commas">^[A-Za-z]\w*(,[A-Za-z]\w*)*$</validator>
+                </param>
+            </when>
+            <when value="prefix">
+                <param name="prefix" type="text" value="C" label="Prefix before each column number (0-indexed)">
+                    <validator  type="regex" message="A-Za-z,A-Za-z0-9_">^[A-Za-z]\w*$</validator>
+                </param>
+            </when>
+        </conditional>         
+	<param name="skiprows" type="integer" value="0" min="0" label="Skip table rows"/>
+	<param name="pvt_index" type="text" value="" label="Pivot table index columns">
+            <validator type="regex" message="Column names separated by commas">^\S+(,\S+)*$</validator>
+        </param> 
+	<param name="pvt_columns" type="text" value="" label="Pivot table columns to split into output columns">
+            <validator type="regex" message="Column names separated by commas">^\S+(,\S+)*$</validator>
+        </param> 
+	<param name="pvt_values" type="text" value="" label="Pivot table value  columns">
+            <validator type="regex" message="Column names separated by commas">^\S+(,\S+)*$</validator>
+        </param> 
+	<param name="aggfunc" type="text" value="" label="Pivot table aggregate function">
+            <help><![CDATA[A valid JSON string, e.g.:  
+              <ul>
+              <li>A single function applied to each <i>value</i> column:  <b>"min"</b></li>
+              <li>An array of functions applied to each <i>value</i> column:  <b>["min", "max", "mean", "std"]</b></li>
+              <li>A dictionary of <i>value column : functions</i>: <b>{"A" : "sum", "B" : ["min", "max"]}</b></li>
+              </ul>
+            ]]></help>
+            <sanitizer>
+                <valid initial="string.printable">
+                   <remove value="&apos;"/>
+                </valid>
+                <mapping initial="none">
+                    <add source="&apos;" target="__sq__"/>
+                </mapping>
+            </sanitizer>
+        </param> 
+    </inputs>
+    <outputs>
+        <data name="output" format="tabular"/>
+    </outputs>
+    <tests>
+        <test>
+            <param name="input" ftype="tabular" value="table1.tsv"/>
+            <conditional name="header">
+                <param name="header_choice" value="first_line"/>
+            </conditional>
+	    <param name="pvt_index" value="A" />
+	    <param name="pvt_columns" value="C"/>
+	    <param name="pvt_values" value="D"/>
+	    <param name="aggfunc" value="&quot;max&quot;"/>
+            <output name="output">
+                <assert_contents>
+                    <has_text_matching expression="bar\t7\t6" />
+                </assert_contents>
+            </output> 
+        </test>
+        <test>
+            <param name="input" ftype="tabular" value="table1.tsv"/>
+            <conditional name="header">
+                <param name="header_choice" value="first_line"/>
+            </conditional>
+	    <param name="pvt_index" value="A" />
+	    <param name="pvt_columns" value="C"/>
+	    <param name="pvt_values" value="D"/>
+	    <param name="aggfunc" value="[&quot;min&quot;,&quot;max&quot;]"/>
+            <output name="output">
+                <assert_contents>
+                    <has_text_matching expression="bar\t4\t5\t7\t6" />
+                </assert_contents>
+            </output> 
+        </test>
+        <test>
+            <param name="input" ftype="tabular" value="table1.tsv"/>
+            <conditional name="header">
+                <param name="header_choice" value="first_line"/>
+            </conditional>
+	    <param name="pvt_index" value="C,B" />
+	    <param name="pvt_columns" value="A"/>
+	    <param name="pvt_values" value="D,E"/>
+	    <param name="aggfunc" value="{&quot;D&quot; : [&quot;min&quot;,&quot;sum&quot;], &quot;E&quot; : &quot;mean&quot;}"/>
+            <output name="output">
+                <assert_contents>
+                    <has_text_matching expression="C\tB\tbar_min_D\tfoo_min_D\tbar_sum_D\tfoo_sum_D\tbar_mean_E\tfoo_mean_E"/>
+                    <has_text_matching expression="large\tone\t4[.]\d+\t2[.]\d+\t4[.]\d+\t4[.]\d+\t6[.]\d+\t4[.]5\d+"/>
+                </assert_contents>
+            </output> 
+        </test>
+
+    </tests>
+    <help><![CDATA[
+Perform a pivot table operation on a tabular dataset.
+
+This uses the python pandas_ package to read_ a tabular file, perform a pivot_table_ operation, and write_ out the result as a tabular dataset. 
+
+.. _pandas: https://pandas.pydata.org/pandas-docs/stable/index.html
+.. _pivot_table: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.pivot_table.html
+.. _read: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.read_table.html
+.. _write: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.to_csv.html
+
+**Examples**
+
+**Input dataset**::
+
+    A   B   C     D  E
+    foo one small 1  2
+    foo one large 2  4
+    foo one large 2  5
+    foo two small 3  5
+    foo two small 3  6
+    bar one large 4  6
+    bar one small 5  8
+    bar two small 6  9
+    bar two large 7  9
+
+
+**Example 1**
+  
+  Params:  *Index: A  Columns: C  values: D  aggfunc: "max"*
+  
+  Output::
+  
+    A   large_D	 small_D 
+    bar 7        6
+    foo 2        3
+
+
+**Example 2**
+  
+  Params:  *Index: A  Columns: C  values: D  aggfunc: ["min", "max"]*
+  
+  Output::
+  
+    A   large_D_min small_D_min large_D_max small_D_max
+    bar 4           5           7           6
+    foo 2           1           2           3
+
+
+**Example 3**
+  
+  Params: *Index: A  Columns: C  values: D,E  aggfunc: "mean"*
+  
+  Output::
+  
+    A   large_D	 small_D  large_E  small_E
+    bar 5.500000 5.500000 7.500000 8.500000
+    foo 2.000000 2.333333 4.500000 4.333333
+
+
+**Example 4**
+  
+  Params:  *Index: A  Columns: C  values: D,E  aggfunc: {"D" : [ "min","sum"], "E" : "mean"}*
+  
+  Output::
+  
+    A   large_min_D small_min_D large_sum_D small_sum_D large_mean_E small_mean_E
+    bar 4           5           11          11          7.500000     8.500000
+    foo 2           1           4           7           4.500000     4.333333
+
+
+**Example 5**
+  
+  Params:  *Index: B,C  Columns: A  values: E  aggfunc: ["min","mean","max"]*
+  
+  Output::
+  
+    B   C     bar_E_min foo_E_min bar_E_mean foo_E_mean bar_E_max foo_E_max
+    one large 6.000000  4.000000  6.000000   4.500000   6.000000  5.000000
+    one small 8.000000  2.000000  8.000000   2.000000   8.000000  2.000000
+    two large 9.000000            9.000000              9.000000          
+    two small 9.000000  5.000000  9.000000   5.500000   9.000000  6.000000
+
+    ]]></help>
+    <citations>
+        <citation type="doi">doi:10.5281/zenodo.4161697</citation>
+    </citations>
+</tool>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/table1.tsv	Wed Dec 02 22:59:57 2020 +0000
@@ -0,0 +1,10 @@
+A	B	C	D	E
+foo	one	small	1	2
+foo	one	large	2	4
+foo	one	large	2	5
+foo	two	small	3	5
+foo	two	small	3	6
+bar	one	large	4	6
+bar	one	small	5	8
+bar	two	small	6	9
+bar	two	large	7	9