Mercurial > repos > jjohnson > pandas_pivot_table
changeset 0:621144f8dbe9 draft
"planemo upload for repository https://github.com/jj-umn/galaxytools/tree/master/pandas_pivot_table/ commit 80684939b0bf75abb5cc70a9878054c1f734b651-dirty"
author | jjohnson |
---|---|
date | Wed, 02 Dec 2020 22:59:57 +0000 |
parents | |
children | c02f59711eb6 |
files | pandas_pivot_table.py pandas_pivot_table.xml test-data/table1.tsv |
diffstat | 3 files changed, 350 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/pandas_pivot_table.py Wed Dec 02 22:59:57 2020 +0000 @@ -0,0 +1,128 @@ +#!/usr/bin/env python + +import argparse +import json +import re +import pandas as pd +import sys +from json.decoder import JSONDecodeError + + +def __main__(): + p = argparse.ArgumentParser() + p.add_argument( + '-i', '--input', + type=argparse.FileType('r'), + required=True, + help='Tabular input file to pivot' + ) + p.add_argument( + '-o', '--output', + type=argparse.FileType('w'), + required=True, + help='Output file' + ) + p.add_argument( + '-S', '--skiprows', + type=int, + default=0, + help='Input column names' + ) + p.add_argument( + '-H', '--header', + default=None, + help='Input column names' + ) + p.add_argument( + '-P', '--prefix', + default=None, + help='Prefix for input column names' + ) + p.add_argument( + '-I', '--index', + help='index columns' + ) + p.add_argument( + '-C', '--columns', + help='columns values which are returned as columns' + ) + p.add_argument( + '-V', '--values', + help='values' + ) + p.add_argument( + '-F', '--aggfunc', + help='aggregate functions on the values' + ) + p.add_argument( + '-N', '--fill_value', + default=None, + help='fill value for missing values' + ) + args = p.parse_args() + + def getValueType(val): + if val or 0. == val: + try: + return int(val) + except ValueError: + try: + return float(val) + except ValueError: + return val + return None + + def getColumn(name, dfcols): + if name in dfcols: + return name + else: + try: + i = int(name) + return dfcols[i] + except: + print('%s not a column in %s' % (name, dfcols), file=sys.stderr) + exit(1) + + def getColumns(val, dfcols): + fields = [v.strip() for v in val.split(',')] + cols = [] + for name in fields: + cols.append(getColumn(name, dfcols)) + return cols + + def getAggFunc(funcStr, dfcols): + af = funcStr + try: + af = json.loads(funcStr) + except JSONDecodeError as de: + print('"%s" is not a json string: ' % funcStr, de.msg, file=sys.stderr) + exit(1) + if isinstance(af, dict): + aggfunc = {getColumn(k, dfcols) : v for k,v in af.items()} + elif isinstance(af, list): + aggfunc = af + else: + aggfunc = af + return aggfunc + + if args.prefix: + df = pd.read_table(args.input, skiprows=args.skiprows, header=None, prefix=args.prefix) + elif args.header: + df = pd.read_table(args.input, skiprows=args.skiprows, header=args.header) + else: + df = pd.read_table(args.input, skiprows=args.skiprows) + df_columns = df.columns.tolist() + index = getColumns(args.index, df_columns) + columns = getColumns(args.columns, df_columns) + values = getColumns(args.values, df_columns) + fill_value = getValueType(args.fill_value) + aggfunc = getAggFunc(args.aggfunc, values) + pdf = df.pivot_table(index=index, columns=columns, + values=values, aggfunc=aggfunc, + fill_value=fill_value) + pdf_cols = ['_'.join(reversed(p)) if isinstance(p, tuple) else p for p in pdf.columns.tolist()] + pdf.to_csv(args.output, sep='\t', float_format='%0.6f', header=pdf_cols) + + +if __name__ == "__main__": + __main__()
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/pandas_pivot_table.xml Wed Dec 02 22:59:57 2020 +0000 @@ -0,0 +1,212 @@ +<tool id="pandas_pivot_table" name="Pivot Table" version="@VERSION@.0" python_template_version="3.5"> + <description>transform tabular data</description> + <macros> + <token name="@VERSION@">1.1.4</token> + </macros> + <requirements> + <requirement type="package" version="@VERSION@">pandas</requirement> + </requirements> + <command detect_errors="exit_code"><![CDATA[ + $__tool_directory__/pandas_pivot_table.py + #if $skiprows + --skiprows $skiprows + #end if + #if $header.header_choice == 'prefix' + --prefix $header.prefix + #elif $header.header_choice == 'enter_names' + --header $header.names + #end if + --index '$pvt_index' + --columns '$pvt_columns' + --values '$pvt_values' + --aggfunc='$aggfunc' + --input '$input' + --output '$output' + ]]></command> + <inputs> + <param name="input" type="data" format="tabular" label="Tabular table for pivot transformation"/> + <conditional name="header"> + <param name="header_choice" type="select" label="Use as header"> + <option value="first_line">Dataset has column names in the first line</option> + <option value="enter_names">Enter names for columns</option> + <option value="prefix">Prefix + column position (0-indexed)</option> + </param> + <when value="first_line"/> + <when value="enter_names"> + <param name="names" type="text" value="" label="Names for columns (no duplicates) separated by commas"> + <validator type="regex" message="Column names separated by commas">^[A-Za-z]\w*(,[A-Za-z]\w*)*$</validator> + </param> + </when> + <when value="prefix"> + <param name="prefix" type="text" value="C" label="Prefix before each column number (0-indexed)"> + <validator type="regex" message="A-Za-z,A-Za-z0-9_">^[A-Za-z]\w*$</validator> + </param> + </when> + </conditional> + <param name="skiprows" type="integer" value="0" min="0" label="Skip table rows"/> + <param name="pvt_index" type="text" value="" label="Pivot table index columns"> + <validator type="regex" message="Column names separated by commas">^\S+(,\S+)*$</validator> + </param> + <param name="pvt_columns" type="text" value="" label="Pivot table columns to split into output columns"> + <validator type="regex" message="Column names separated by commas">^\S+(,\S+)*$</validator> + </param> + <param name="pvt_values" type="text" value="" label="Pivot table value columns"> + <validator type="regex" message="Column names separated by commas">^\S+(,\S+)*$</validator> + </param> + <param name="aggfunc" type="text" value="" label="Pivot table aggregate function"> + <help><![CDATA[A valid JSON string, e.g.: + <ul> + <li>A single function applied to each <i>value</i> column: <b>"min"</b></li> + <li>An array of functions applied to each <i>value</i> column: <b>["min", "max", "mean", "std"]</b></li> + <li>A dictionary of <i>value column : functions</i>: <b>{"A" : "sum", "B" : ["min", "max"]}</b></li> + </ul> + ]]></help> + <sanitizer> + <valid initial="string.printable"> + <remove value="'"/> + </valid> + <mapping initial="none"> + <add source="'" target="__sq__"/> + </mapping> + </sanitizer> + </param> + </inputs> + <outputs> + <data name="output" format="tabular"/> + </outputs> + <tests> + <test> + <param name="input" ftype="tabular" value="table1.tsv"/> + <conditional name="header"> + <param name="header_choice" value="first_line"/> + </conditional> + <param name="pvt_index" value="A" /> + <param name="pvt_columns" value="C"/> + <param name="pvt_values" value="D"/> + <param name="aggfunc" value=""max""/> + <output name="output"> + <assert_contents> + <has_text_matching expression="bar\t7\t6" /> + </assert_contents> + </output> + </test> + <test> + <param name="input" ftype="tabular" value="table1.tsv"/> + <conditional name="header"> + <param name="header_choice" value="first_line"/> + </conditional> + <param name="pvt_index" value="A" /> + <param name="pvt_columns" value="C"/> + <param name="pvt_values" value="D"/> + <param name="aggfunc" value="["min","max"]"/> + <output name="output"> + <assert_contents> + <has_text_matching expression="bar\t4\t5\t7\t6" /> + </assert_contents> + </output> + </test> + <test> + <param name="input" ftype="tabular" value="table1.tsv"/> + <conditional name="header"> + <param name="header_choice" value="first_line"/> + </conditional> + <param name="pvt_index" value="C,B" /> + <param name="pvt_columns" value="A"/> + <param name="pvt_values" value="D,E"/> + <param name="aggfunc" value="{"D" : ["min","sum"], "E" : "mean"}"/> + <output name="output"> + <assert_contents> + <has_text_matching expression="C\tB\tbar_min_D\tfoo_min_D\tbar_sum_D\tfoo_sum_D\tbar_mean_E\tfoo_mean_E"/> + <has_text_matching expression="large\tone\t4[.]\d+\t2[.]\d+\t4[.]\d+\t4[.]\d+\t6[.]\d+\t4[.]5\d+"/> + </assert_contents> + </output> + </test> + + </tests> + <help><![CDATA[ +Perform a pivot table operation on a tabular dataset. + +This uses the python pandas_ package to read_ a tabular file, perform a pivot_table_ operation, and write_ out the result as a tabular dataset. + +.. _pandas: https://pandas.pydata.org/pandas-docs/stable/index.html +.. _pivot_table: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.pivot_table.html +.. _read: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.read_table.html +.. _write: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.to_csv.html + +**Examples** + +**Input dataset**:: + + A B C D E + foo one small 1 2 + foo one large 2 4 + foo one large 2 5 + foo two small 3 5 + foo two small 3 6 + bar one large 4 6 + bar one small 5 8 + bar two small 6 9 + bar two large 7 9 + + +**Example 1** + + Params: *Index: A Columns: C values: D aggfunc: "max"* + + Output:: + + A large_D small_D + bar 7 6 + foo 2 3 + + +**Example 2** + + Params: *Index: A Columns: C values: D aggfunc: ["min", "max"]* + + Output:: + + A large_D_min small_D_min large_D_max small_D_max + bar 4 5 7 6 + foo 2 1 2 3 + + +**Example 3** + + Params: *Index: A Columns: C values: D,E aggfunc: "mean"* + + Output:: + + A large_D small_D large_E small_E + bar 5.500000 5.500000 7.500000 8.500000 + foo 2.000000 2.333333 4.500000 4.333333 + + +**Example 4** + + Params: *Index: A Columns: C values: D,E aggfunc: {"D" : [ "min","sum"], "E" : "mean"}* + + Output:: + + A large_min_D small_min_D large_sum_D small_sum_D large_mean_E small_mean_E + bar 4 5 11 11 7.500000 8.500000 + foo 2 1 4 7 4.500000 4.333333 + + +**Example 5** + + Params: *Index: B,C Columns: A values: E aggfunc: ["min","mean","max"]* + + Output:: + + B C bar_E_min foo_E_min bar_E_mean foo_E_mean bar_E_max foo_E_max + one large 6.000000 4.000000 6.000000 4.500000 6.000000 5.000000 + one small 8.000000 2.000000 8.000000 2.000000 8.000000 2.000000 + two large 9.000000 9.000000 9.000000 + two small 9.000000 5.000000 9.000000 5.500000 9.000000 6.000000 + + ]]></help> + <citations> + <citation type="doi">doi:10.5281/zenodo.4161697</citation> + </citations> +</tool>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/table1.tsv Wed Dec 02 22:59:57 2020 +0000 @@ -0,0 +1,10 @@ +A B C D E +foo one small 1 2 +foo one large 2 4 +foo one large 2 5 +foo two small 3 5 +foo two small 3 6 +bar one large 4 6 +bar one small 5 8 +bar two small 6 9 +bar two large 7 9