Mercurial > repos > devteam > column_maker
changeset 5:9cd341095afd draft
"planemo upload for repository https://github.com/galaxyproject/tools-devteam/tree/master/tools/column_maker commit a993d43d9d1702a6cf584683cf72527a3f999236"
author | devteam |
---|---|
date | Wed, 30 Dec 2020 00:50:15 +0000 |
parents | 6e8d94597139 |
children | 13b6f0007d9e |
files | column_maker.py column_maker.xml |
diffstat | 2 files changed, 198 insertions(+), 177 deletions(-) [+] |
line wrap: on
line diff
--- a/column_maker.py Wed Jul 15 10:38:50 2020 -0400 +++ b/column_maker.py Wed Dec 30 00:50:15 2020 +0000 @@ -5,31 +5,48 @@ original file. The tool will skip over invalid lines within the file, informing the user about the number of lines skipped. """ -from __future__ import print_function +import argparse +import json import re -import sys - -assert sys.version_info[:2] >= (2, 4) -inp_file = sys.argv[1] -out_file = sys.argv[2] -expr = sys.argv[3] -round_result = sys.argv[4] +parser = argparse.ArgumentParser() +parser.add_argument('input', type=argparse.FileType('r'), help="input file") +parser.add_argument('output', type=argparse.FileType('wt'), help="output file") +parser.add_argument('cond', nargs='?', type=str, help="expression") +parser.add_argument('round', nargs='?', type=str, choices=['yes', 'no'], + help="round result") +parser.add_argument('columns', nargs='?', type=int, help="number of columns") +parser.add_argument('column_types', nargs='?', type=str, help="comma separated list of column types") +parser.add_argument('avoid_scientific_notation', nargs='?', type=str, choices=['yes', 'no'], + help="avoid scientific notation") +parser.add_argument('--load_json', default=None, type=argparse.FileType('r'), + help="overwrite parsed arguments from json file") +args = parser.parse_args() + +argparse_dict = vars(args) +if args.load_json: + json_dict = json.load(args.load_json) + argparse_dict.update(json_dict) + +fh = argparse_dict['input'] +out = argparse_dict['output'] +expr = argparse_dict['cond'] +round_result = argparse_dict['round'] try: - in_columns = int(sys.argv[5]) + in_columns = int(argparse_dict['columns']) except Exception: exit("Missing or invalid 'columns' metadata value, click the pencil icon in the history item and select the Auto-detect option to correct it. This tool can only be used with tab-delimited data.") if in_columns < 2: # To be considered tabular, data must fulfill requirements of the sniff.is_column_based() method. 
exit("Missing or invalid 'columns' metadata value, click the pencil icon in the history item and select the Auto-detect option to correct it. This tool can only be used with tab-delimited data.") try: - in_column_types = sys.argv[6].split(',') + in_column_types = argparse_dict['column_types'].split(',') except Exception: exit("Missing or invalid 'column_types' metadata value, click the pencil icon in the history item and select the Auto-detect option to correct it. This tool can only be used with tab-delimited data.") if len(in_column_types) != in_columns: exit("The 'columns' metadata setting does not conform to the 'column_types' metadata setting, click the pencil icon in the history item and select the Auto-detect option to correct it. This tool can only be used with tab-delimited data.") -avoid_scientific_notation = sys.argv[7] +avoid_scientific_notation = argparse_dict['avoid_scientific_notation'] # Unescape if input has been escaped mapped_str = { @@ -74,7 +91,6 @@ invalid_line = None lines_kept = 0 total_lines = 0 -out = open(out_file, 'wt') # Read input file, skipping invalid lines, and perform computation that will result in a new column code = ''' @@ -89,7 +105,6 @@ ) from numpy import format_float_positional -fh = open(inp_file) for i, line in enumerate(fh): total_lines += 1 line = line.rstrip('\\r\\n')
--- a/column_maker.xml Wed Jul 15 10:38:50 2020 -0400 +++ b/column_maker.xml Wed Dec 30 00:50:15 2020 +0000 @@ -1,164 +1,170 @@ -<tool id="Add_a_column1" name="Compute" version="1.3.1"> - <description>an expression on every row</description> - <requirements> - <requirement type="package" version="2.7.13">python</requirement> - <requirement type="package" version="4.4">sed</requirement> - <requirement type="package" version="1.14">numpy</requirement> - </requirements> - <command detect_errors="aggressive"><![CDATA[ - #if $header_lines_conditional.header_lines_select == "yes": - (sed -n '1,1p' '$input' | sed "s|$|%${header_lines_conditional.header_new_column_name}|" | tr "%" "\t") > header && - sed '1,1d' '$input' > data && - #else: - touch header && - ln -s '$input' data && - #end if - - python '$__tool_directory__/column_maker.py' - data column_maker_output - "$cond" - $round - ${input.metadata.columns} - "${input.metadata.column_types}" - $avoid_scientific_notation && - cat header column_maker_output > '$out_file1' - ]]></command> - <inputs> - <param name="cond" type="text" value="c3-c2" label="Add expression"/> - <param format="tabular" name="input" type="data" label="as a new column to" help="Dataset missing? 
See TIP below"/> - <param name="round" type="select" label="Round result?"> - <option value="no">NO</option> - <option value="yes">YES</option> - </param> - <conditional name="header_lines_conditional"> - <param name="header_lines_select" type="select" label="Skip a header line" help="# characters are already considered as comments and kept" > - <option value="no" >no</option> - <option value="yes" >yes</option> - </param> - <when value="no"> - </when> - <when value="yes"> - <param name="header_new_column_name" type="text" value="New Column" label="The new column name" /> - </when> - </conditional> - <param name="avoid_scientific_notation" type="select" label="Avoid scientific notation" help="If yes, use fully expanded decimal representation when writing new columns (use only if expression produces decimal numbers)."> - <option value="no">no</option> - <option value="yes">yes</option> - </param> - </inputs> - <outputs> - <data format_source="input" name="out_file1" metadata_source="input"/> - </outputs> - <tests> - <test> - <param name="cond" value="c3-c2"/> - <param name="input" value="1.bed"/> - <param name="round" value="no"/> - <output name="out_file1" file="column_maker_out1.interval"/> - </test> - <test> - <param name="cond" value="c4*1"/> - <param name="input" value="1.interval"/> - <param name="round" value="no"/> - <output name="out_file1" file="column_maker_out2.interval"/> - </test> - <test> - <param name="cond" value="c4*1"/> - <param name="input" value="1.header.tsv"/> - <param name="round" value="no"/> - <conditional name="header_lines_conditional"> - <param name="header_lines_select" value="yes" /> - <param name="header_new_column_name" value="value1_again" /> - </conditional> - <output name="out_file1" file="column_maker_out2.header.tsv"/> - </test> - <test> - <param name="cond" value="c4*1"/> - <param name="input" value="1.interval"/> - <param name="round" value="yes"/> - <output name="out_file1" file="column_maker_out3.interval"/> - </test> - 
<test> - <param name="cond" value="float(.0000000000001)"/> - <param name="input" value="1.bed"/> - <param name="round" value="no"/> - <output name="out_file1"> - <assert_contents> - <has_text text="CCDS10397" /> - <has_text text="1e-13" /> - </assert_contents> - </output> - </test> - <test> - <param name="cond" value="float(.0000000000001)"/> - <param name="input" value="1.bed"/> - <param name="round" value="no"/> - <param name="avoid_scientific_notation" value="yes"/> - <output name="out_file1"> - <assert_contents> - <has_text text="CCDS10397" /> - <has_text text=".0000000000001" /> - <not_has_text text="1e-13" /> - </assert_contents> - </output> - </test> - </tests> - <help> - - .. class:: infomark - -**TIP:** If your data is not TAB delimited, use *Text Manipulation->Convert* - ------ - -**What it does** - -This tool computes an expression for every row of a dataset and appends the result as a new column (field). - -- Columns are referenced with **c** and a **number**. For example, **c1** refers to the first column of a tab-delimited file - -- **c3-c2** will add a length column to the dataset if **c2** and **c3** are start and end position - ------ - -**Example** - -If this is your input:: - - chr1 151077881 151077918 2 200 - - chr1 151081985 151082078 3 500 + - -computing "c4*c5" will produce:: - - chr1 151077881 151077918 2 200 - 400.0 - chr1 151081985 151082078 3 500 + 1500.0 - -if, at the same time, "Round result?" is set to **YES** results will look like this:: - - chr1 151077881 151077918 2 200 - 400 - chr1 151081985 151082078 3 500 + 1500 - -You can also use this tool to evaluate expressions. 
For example, computing "c3>=c2" for Input will result in the following:: - - chr1 151077881 151077918 2 200 - True - chr1 151081985 151082078 3 500 + True - -or computing "type(c2)==type('') for Input will return:: - - chr1 151077881 151077918 2 200 - False - chr1 151081985 151082078 3 500 + False - - -The following built-in functions are available:: - - abs | all | any | bin | bool | chr | ceil | cmp | complex - - divmod | exp | float | log | log10 | floor | hex | int | len | long - - max | min | oct | ord | pow | range | reversed - - round | sorted | sqrt | str | sum | type | unichr | unicode | - - </help> - <citations /> -</tool> +<tool id="Add_a_column1" name="Compute" version="1.4"> + <description>an expression on every row</description> + <requirements> + <requirement type="package" version="3.8">python</requirement> + <requirement type="package" version="4.4">sed</requirement> + <requirement type="package" version="1.19.1">numpy</requirement> + </requirements> + <command detect_errors="aggressive"><![CDATA[ + #if $header_lines_conditional.header_lines_select == "yes": + (sed -n '1,1p' '$input' | sed "s|$|%${header_lines_conditional.header_new_column_name}|" | tr "%" "\t") > header && + sed '1,1d' '$input' > data && + #else: + touch header && + ln -s '$input' data && + #end if + + ## inject columns and column_types metadata into inputs json + #import json + #set inputs_dict = json.load(open($inputs)) + #set inputs_dict['columns'] = $input.metadata.columns + #set inputs_dict['column_types'] = $input.metadata.column_types + #set x = json.dump($inputs_dict, open($inputs, 'w')) + + python '$__tool_directory__/column_maker.py' + data column_maker_output + --load_json '$inputs' + && cat header column_maker_output > '$out_file1' + ]]></command> + <configfiles> + <inputs name="inputs"/> + </configfiles> + <inputs> + <param name="cond" type="text" value="c3-c2" label="Add expression"/> + <param format="tabular" name="input" type="data" label="as a new column to" 
help="Dataset missing? See TIP below"/> + <param name="round" type="select" label="Round result?"> + <option value="no">NO</option> + <option value="yes">YES</option> + </param> + <conditional name="header_lines_conditional"> + <param name="header_lines_select" type="select" label="Skip a header line" help="# characters are already considered as comments and kept" > + <option value="no" >no</option> + <option value="yes" >yes</option> + </param> + <when value="no"> + </when> + <when value="yes"> + <param name="header_new_column_name" type="text" value="New Column" label="The new column name" /> + </when> + </conditional> + <param name="avoid_scientific_notation" type="select" label="Avoid scientific notation" help="If yes, use fully expanded decimal representation when writing new columns (use only if expression produces decimal numbers)."> + <option value="no">no</option> + <option value="yes">yes</option> + </param> + </inputs> + <outputs> + <data format_source="input" name="out_file1" metadata_source="input"/> + </outputs> + <tests> + <test> + <param name="cond" value="c3-c2"/> + <param name="input" value="1.bed"/> + <param name="round" value="no"/> + <output name="out_file1" file="column_maker_out1.interval"/> + </test> + <test> + <param name="cond" value="c4*1"/> + <param name="input" value="1.interval"/> + <param name="round" value="no"/> + <output name="out_file1" file="column_maker_out2.interval"/> + </test> + <test> + <param name="cond" value="c4*1"/> + <param name="input" value="1.header.tsv"/> + <param name="round" value="no"/> + <conditional name="header_lines_conditional"> + <param name="header_lines_select" value="yes" /> + <param name="header_new_column_name" value="value1_again" /> + </conditional> + <output name="out_file1" file="column_maker_out2.header.tsv"/> + </test> + <test> + <param name="cond" value="c4*1"/> + <param name="input" value="1.interval"/> + <param name="round" value="yes"/> + <output name="out_file1" 
file="column_maker_out3.interval"/> + </test> + <test> + <param name="cond" value="float(.0000000000001)"/> + <param name="input" value="1.bed"/> + <param name="round" value="no"/> + <output name="out_file1"> + <assert_contents> + <has_text text="CCDS10397" /> + <has_text text="1e-13" /> + </assert_contents> + </output> + </test> + <test> + <param name="cond" value="float(.0000000000001)"/> + <param name="input" value="1.bed"/> + <param name="round" value="no"/> + <param name="avoid_scientific_notation" value="yes"/> + <output name="out_file1"> + <assert_contents> + <has_text text="CCDS10397" /> + <has_text text=".0000000000001" /> + <not_has_text text="1e-13" /> + </assert_contents> + </output> + </test> + </tests> + <help> + + .. class:: infomark + +**TIP:** If your data is not TAB delimited, use *Text Manipulation->Convert* + +----- + +**What it does** + +This tool computes an expression for every row of a dataset and appends the result as a new column (field). + +- Columns are referenced with **c** and a **number**. For example, **c1** refers to the first column of a tab-delimited file + +- **c3-c2** will add a length column to the dataset if **c2** and **c3** are start and end position + +----- + +**Example** + +If this is your input:: + + chr1 151077881 151077918 2 200 - + chr1 151081985 151082078 3 500 + + +computing "c4*c5" will produce:: + + chr1 151077881 151077918 2 200 - 400.0 + chr1 151081985 151082078 3 500 + 1500.0 + +if, at the same time, "Round result?" is set to **YES** results will look like this:: + + chr1 151077881 151077918 2 200 - 400 + chr1 151081985 151082078 3 500 + 1500 + +You can also use this tool to evaluate expressions. 
For example, computing "c3>=c2" for Input will result in the following:: + + chr1 151077881 151077918 2 200 - True + chr1 151081985 151082078 3 500 + True + +or computing "type(c2)==type('')" for Input will return:: + + chr1 151077881 151077918 2 200 - False + chr1 151081985 151082078 3 500 + False + + +The following built-in functions are available:: + + abs | all | any | bin | bool | chr | ceil | cmp | complex + + divmod | exp | float | log | log10 | floor | hex | int | len | long + + max | min | oct | ord | pow | range | reversed + + round | sorted | sqrt | str | sum | type | unichr | unicode | + + </help> + <citations /> +</tool>