Galaxy |

Changeset 9:6595517c2dd8 (2022-07-28)

Previous changeset: 8:02026300aa45 (2021-03-09) | Next changeset: 10:aff5135563c6 (2024-08-22)

Commit message:
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/column_maker commit fe76077775aaca531f6a563fdfcbd73fbf1528e7

modified:
column_maker.py
column_maker.xml

added:
test-data/bed12.bed
test-data/bed12_modified.bed
test-data/bed_from_chrom_pos_ref.bed
test-data/chrom_pos_ref.tab
test-data/olympics.tsv
test-data/olympics_bmi_out.tab
test-data/short_line_test.tab
test-data/short_line_test_out.tab

diff -r 02026300aa45 -r 6595517c2dd8 column_maker.py
--- a/column_maker.py Tue Mar 09 18:33:10 2021 +0000
+++ b/column_maker.py Thu Jul 28 15:28:30 2022 +0000

[

b'@@ -1,13 +1,14 @@\n #!/usr/bin/env python\n """\n-This tool takes a tab-delimited textfile as input and creates another column in\n-the file which is the result of a computation performed on every row in the\n-original file. The tool will skip over invalid lines within the file,\n-informing the user about the number of lines skipped.\n+This tool takes a tab-delimited textfile as input and creates new columns in\n+the file which are the result of a computation performed on every row in the\n+original file. The tool will skip over empty and comment (starting with a #)\n+lines within the file. It does not change the formatting of any original,\n+retained columns.\n """\n \n import argparse\n-import json\n+import enum\n import re\n import sys\n # Functions that may be used in the compute expression\n@@ -20,143 +21,371 @@\n sqrt,\n )\n \n-from numpy import format_float_positional # noqa: F401\n+from numpy import format_float_positional\n+\n+\n+class Mode(enum.Enum):\n+ APPEND = \'\'\n+ INSERT = \'I\'\n+ REPLACE = \'R\'\n+\n+\n+def from_str(s, to_type):\n+ if to_type is list:\n+ return [part.strip(\' \') for part in s.split(\',\')]\n+ else:\n+ return to_type(s)\n+\n+\n+def to_str(obj):\n+ if type(obj) is list:\n+ return \',\'.join([to_str(i) for i in obj])\n+ if args.avoid_scientific_notation and type(obj) is float:\n+ return format_float_positional(obj)\n+ return str(obj)\n+\n \n parser = argparse.ArgumentParser()\n-parser.add_argument(\'input\', type=argparse.FileType(\'r\'), help="input file")\n-parser.add_argument(\'output\', type=argparse.FileType(\'wt\'), help="output file")\n-parser.add_argument(\'cond\', nargs=\'?\', type=str, help="expression")\n-parser.add_argument(\'columns\', nargs=\'?\', type=int, help="number of columns")\n-parser.add_argument(\'column_types\', nargs=\'?\', type=str, help="comma separated list of column types")\n-parser.add_argument(\'--round\', action="store_true",\n- help="round 
result")\n-parser.add_argument(\'--avoid_scientific_notation\', action="store_true",\n- help="avoid scientific notation")\n-parser.add_argument(\'--header_new_column_name\', default=None, type=str,\n- help="First line of input is a header line with column "\n- "names and this should become the name of the new "\n- "column")\n-parser.add_argument(\'--load_json\', default=None, type=argparse.FileType(\'r\'),\n- help="overwrite parsed arguments from json file")\n+parser.add_argument(\'input\', type=str, help=\'input file\')\n+parser.add_argument(\'output\', type=str, help=\'output file\')\n+parser.add_argument(\n+ \'-t\', \'--column-types\', nargs=\'?\', required=True,\n+ help=\'A comma-separated list of column types in the input file\'\n+)\n+parser.add_argument(\n+ \'--avoid-scientific-notation\', action=\'store_true\',\n+ help=\'avoid scientific notation\'\n+)\n+parser.add_argument(\n+ \'--header\', action=\'store_true\',\n+ help=\'The input has a header line with column names. \'\n+ \'Actions must specify names of newly calculated columns.\'\n+)\n+parser.add_argument(\n+ \'--fail-on-non-existent-columns\', action=\'store_true\',\n+ help=\'If an action references a column number that is not existent \'\n+ \'when the expression gets computed, the default behavior is to treat \'\n+ \'this as a case of rows for which the expression cannot be computed. \'\n+ \'The behavior of the tool will then depend on which of the \'\n+ \'non-computable switches is in effect. 
With this flag, in contrast, \'\n+ \'the tool will fail directly upon encountering a non-existing column.\'\n+)\n+non_computable = parser.add_mutually_exclusive_group()\n+non_computable.add_argument(\'--fail-on-non-computable\', action=\'store_true\')\n+non_computable.add_argument(\'--skip-non-computable\', action=\'store_true\')\n+non_computable.add_argument(\'--keep-non-computable\', action=\'store_true\')\n+non_computable.add_argument(\'--non-computable-blank\', action=\'store_true\')\n+non_computable.add_argument('..b' if not invalid_line:\n+ first_invalid_line = i + 1\n+ invalid_line = line\n+ break\n+ if args.keep_non_computable:\n+ # write the original line unchanged and stop computing\n+ # for this line\n+ out.write(line + \'\\n\')\n+ break\n+ if args.non_computable_blank:\n+ new_val = \'\'\n+ elif args.non_computable_default is not None:\n+ new_val = args.non_computable_default\n+ else:\n+ # --fail_on_non_computable\n+ # (which is default behavior, too)\n+ sys.exit(\n+ \'Could not compute a new column value using "%s" on \'\n+ \'line #%d: "%s". 
Error was "%s"\'\n+ % (ex, i, line, str(e))\n+ )\n+ if mode is Mode.INSERT:\n+ fields.insert(col_idx, new_val)\n+ typed_fields.insert(col_idx, new_val)\n+ elif mode is Mode.REPLACE:\n+ if col_idx > len(fields):\n+ # Intentionally allow "replacing" one column beyond\n+ # current fields since this can be used to fix\n+ # short lines in the input.\n+ sys.exit(\n+ \'Cannot replace column #%d in line with %d columns: \'\n+ \'"%s"\' % (col_idx + 1, len(fields), line)\n+ )\n+ fields[col_idx:col_idx + 1] = [new_val]\n+ typed_fields[col_idx:col_idx + 1] = [new_val]\n+ else:\n+ fields.append(new_val)\n+ typed_fields.append(new_val)\n+ else:\n+ fields = [to_str(field) for field in fields]\n+ out.write(\'\\t\'.join(fields) + \'\\n\')\n+ lines_computed += 1\n \n-valid_expr = True\n-try:\n- exec(code)\n-except Exception as e:\n- if str(e).startswith(\'invalid syntax\'):\n- valid_expr = False\n- sys.exit(\'Expression "%s" likely invalid. See tool tips, syntax and examples.\' % expr)\n- else:\n- sys.exit(str(e))\n-finally:\n- out.close()\n \n-if valid_expr:\n- valid_lines = total_lines - skipped_lines\n- print(\'Creating column %d with expression %s\' % (in_columns + 1, expr))\n- if valid_lines > 0:\n- print(\'kept %4.2f%% of %d lines.\' % (100.0 * lines_kept / valid_lines,\n- total_lines))\n- else:\n- print(\'Possible invalid expression "%s" or non-existent column referenced. See tool tips, syntax and examples.\' % expr)\n- if skipped_lines > 0:\n- print(\'Skipped %d invalid lines starting at line #%d: "%s"\' %\n- (skipped_lines, first_invalid_line, invalid_line))\n+valid_lines = total_lines - skipped_lines\n+if valid_lines > 0:\n+ print(\n+ \'Computed new column values for %4.2f%% of %d lines written.\'\n+ % (100.0 * lines_computed / valid_lines, valid_lines)\n+ )\n+elif args.fail_on_non_existent_columns:\n+ # Warn the user that there could be an issue with an expression.\n+ print(\n+ \'Could not compute a new column for any input row! 
\'\n+ \'Please check your expression(s) "%s" for problems.\'\n+ % actions\n+ )\n+else:\n+ # Same, but the problem could also be a reference to a non-existent\n+ # column.\n+ print(\n+ \'Could not compute a new column for any input row! \'\n+ \'Please check your expression(s) "%s" for references to non-existent \'\n+ \'columns or other problems.\'\n+ % actions\n+ )\n+if skipped_lines > 0:\n+ print(\'Skipped %d invalid lines starting at line #%d: "%s"\' %\n+ (skipped_lines, first_invalid_line, invalid_line))\n+if lines_computed < valid_lines:\n+ print(\n+ \'Rewrote %d lines unmodified because computation of a new value failed\'\n+ % (valid_lines - lines_computed)\n+ )\n'

diff -r 02026300aa45 -r 6595517c2dd8 column_maker.xml
--- a/column_maker.xml Tue Mar 09 18:33:10 2021 +0000
+++ b/column_maker.xml Thu Jul 28 15:28:30 2022 +0000

[

b'@@ -1,101 +1,161 @@\n-<tool id="Add_a_column1" name="Compute" version="1.6">\n- <description>an expression on every row</description>\n+<tool id="Add_a_column1" name="Compute" version="2.0">\n+ <description>on rows</description>\n+ <macros>\n+ <xml name="compute_repeat">\n+ <repeat name="expressions" title="Expressions" min="1" default="1">\n+ <param name="cond" type="text" value="c3-c2" label="Add expression">\n+ <sanitizer>\n+ <valid initial="default">\n+ <add value="<" />\n+ <add value=">" />\n+ <add value=""" />\n+ <add value="'" />\n+ </valid>\n+ </sanitizer>\n+ </param>\n+ <conditional name="add_column">\n+ <param name="mode" type="select" label="Mode of the operation">\n+ <option value="">Append</option>\n+ <option value="I">Insert</option>\n+ <option value="R">Replace</option>\n+ </param>\n+ <when value="">\n+ <param name="pos" type="hidden" value="" />\n+ </when>\n+ <when value="I">\n+ <param name="pos" type="integer" min="1" value="1" label="Insert new column before existing column number" />\n+ </when>\n+ <when value="R">\n+ <param name="pos" type="integer" min="1" value="1" label="Use new column to replace column number" />\n+ </when>\n+ </conditional>\n+ <yield />\n+ </repeat>\n+ </xml>\n+ </macros>\n <requirements>\n <requirement type="package" version="3.8">python</requirement>\n- <requirement type="package" version="1.19.1">numpy</requirement>\n+ <requirement type="package" version="1.23.1">numpy</requirement>\n </requirements>\n <command detect_errors="aggressive"><![CDATA[\n-## inject colums and column_types metadata into inputs json\n-#import json\n-#set inputs_dict = json.load(open($inputs))\n-#set inputs_dict[\'columns\'] = $input.metadata.columns\n-#set inputs_dict[\'column_types\'] = $input.metadata.column_types\n-## flatten conditional\n-#if $header_lines_conditional.header_lines_select == "yes":\n- #set inputs_dict[\'header_new_column_name\'] = str($header_lines_conditional.header_new_column_name)\n+python 
\'$__tool_directory__/column_maker.py\'\n+#if str($error_handling.auto_col_types) == \'on\':\n+ #set $col_types = $input.metadata.column_types\n+#else:\n+ #set $col_types = \',\'.join([\'str\' for t in $input.metadata.column_types.split(\',\')])\n #end if\n-#set x = json.dump($inputs_dict, open($inputs, \'w\'))\n-\n-python \'$__tool_directory__/column_maker.py\'\n+--column-types $col_types\n+$avoid_scientific_notation\n+#if str($ops.header_lines_select) == \'yes\':\n+ --header\n+#end if\n+--file \'$expressions_file\'\n+$error_handling.fail_on_non_existent_columns\n+$error_handling.non_computable.action\n+#if str($error_handling.non_computable.action) == \'--non-computable-default\':\n+ \'$error_handling.non_computable.default_value\'\n+#end if\n \'$input\'\n \'$out_file1\'\n---load_json \'$inputs\'\n ]]></command>\n <configfiles>\n- <inputs name="inputs"/>\n+ <configfile name="expressions_file"><![CDATA[\n+#if str($ops.header_lines_select) == \'yes\':\n+ #for $expr in $ops.expressions:\n+${expr.cond};${expr.add_column.pos}${expr.add_column.mode};${expr.new_column_name}\n+ #end for\n+#else:\n+ #for $expr in $ops.expressions:\n+${expr.cond};${expr.add_column.pos}${expr.add_column.mode};\n+ #end for\n+#end if\n+]]></configfile>\n </configfiles>\n <inputs>\n- <param name="cond" type="text" value="c3-c2" label="Add expression">\n- <sanitizer>\n- <valid initial="default">\n- <add value="'..b'uting "type(c2)==type(\'\') for Input will return::\n+-----\n+\n+**Error handling**\n+\n+The tool will always fail on syntax errors in and other unrecoverable parsing\n+errors with any of your expressions. For other problems, however, it offers\n+control over how they should be handled:\n \n- chr1 151077881 151077918 2 200 - False\n- chr1 151081985 151082078 3 500 + False\n+1. The default for "Autodetect column types" is "Yes", which means the tool\n+ will evaluate each column value as the type that Galaxy assumes for the\n+ column. 
This default behavior will allow you to write simpler expressions.\n+ The arithmetic expression "c4 * c5" from the first simple example,\n+ for instance, works only because Galaxy realizes that c4 and c5 are integer\n+ columns. Occasionally, this autodetection can cause issues. A common\n+ such situation are missing values in columns that Galaxy thinks are of\n+ numeric type. If you\'re getting errors like "Failed to convert some of the\n+ columns in line #X ...", a solution might be to turn off column type\n+ autodetection. The price you will have to pay for doing so is that now you\n+ will have to handle type conversions yourself. In the first example you would\n+ now have to use the epression: "int(c4) * int(c5)".\n \n+2. By default, if any expression references columns that are not existing before\n+ that expression gets computed, the tool will fail, but you can uncheck the\n+ "Fail on references to non-existent columns" option. If you do so, the result\n+ will depend on your choice for "If an expression cannot be computed for a row"\n+ (see 3.)\n \n-The following built-in functions are available::\n+3. 
The default for rows, for which an expression fails to compute is, again, to\n+ fail the tool run, but you can also choose to:\n+\n+ - skip the row on output\n+\n+ This is a simple way to only keep lines conforming to an expected standard.\n+ It is also easy to mask problems with your expressions with this option so\n+ take a look at the results and try to understand what gets skipped and for\n+ what reasons (the stdout of the tool will contain information about both).\n+\n+ - keep the row unchanged\n \n- abs | all | any | bin | bool | chr | ceil | cmp | complex\n+ This can be a good solution if your input contains special separator lines\n+ that don\'t follow the general tabular format of other lines and you would\n+ like to keep those lines\n+\n+ - produce an empty column value for the row\n \n- divmod | exp | float | log | log10 | floor | hex | int | len | long\n+ This will use the empty string as a substitute for non-computable items.\n+ Different from the "keep the row unchanged option" the problematic line will\n+ have a column added or changed. This option is a good choice for inputs\n+ in which all rows have the same tabular layout where you want to make sure\n+ that the same is true for the output, i.e. that all output lines still have\n+ the same number of columns.\n+\n+ - fill in a replacement value\n+\n+ This option is very similar to the previous one, but lets you control the\n+ replacement value.\n+\n+**Example**\n+\n+In the following input::\n \n- max | min | oct | ord | pow | range | reversed\n+ chr1 151077881 151077918 2 200 -\n+ chr1 151081985 151082078 3 500 +\n+ chr1 151090031 151090938 4 700\n+\n+the last line does not have a strand column. 
This violates the bed file format\n+specification, which says that unknown strand is to be encoded as ``.`` in the\n+strand column.\n+\n+You can fix the file with the following tool run:\n \n- round | sorted | sqrt | str | sum | type | unichr | unicode |\n+**Add expression**: `c6`\n+\n+**Mode of the operation**: `Replace`\n+\n+**Use new column to replace column number**: `6`\n \n+**Fail on references to non-existent columns**: `No`\n+\n+**If an expression cannot be computed for a row**: `Fill in a replacement value`\n+\n+**Replacement value**: `.`\n ]]></help>\n <citations />\n </tool>\n'

diff -r 02026300aa45 -r 6595517c2dd8 test-data/bed12.bed
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/bed12.bed Thu Jul 28 15:28:30 2022 +0000

@@ -0,0 +1,3 @@
+chr1 14756 15038 JUNC00000001 294 - 14756 15038 255,0,0 2 73,69 0,213
+chr1 14969 15836 JUNC00000002 144 - 14969 15836 255,0,0 2 69,41 0,826
+chr1 15905 16677 JUNC00000003 12 - 15905 16677 255,0,0 2 42,71 0,701

diff -r 02026300aa45 -r 6595517c2dd8 test-data/bed12_modified.bed
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/bed12_modified.bed Thu Jul 28 15:28:30 2022 +0000

@@ -0,0 +1,3 @@
+chr1 14756 15038 JUNC00000001 73 - 14756 15038 255,0,0 2 73,69 0,213
+chr1 14969 15836 JUNC00000002 69 - 14969 15836 255,0,0 2 69,41 0,826
+chr1 15905 16677 JUNC00000003 71 - 15905 16677 255,0,0 2 42,71 0,701

diff -r 02026300aa45 -r 6595517c2dd8 test-data/bed_from_chrom_pos_ref.bed
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/bed_from_chrom_pos_ref.bed Thu Jul 28 15:28:30 2022 +0000

@@ -0,0 +1,6 @@
+NC_045512.2 28361 28370
+NC_045512.2 28880 28881
+NC_045512.2 28881 28882
+NC_045512.2 28882 28883
+NC_045512.2 29509 29510
+NC_045512.2 29733 29759

diff -r 02026300aa45 -r 6595517c2dd8 test-data/chrom_pos_ref.tab
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/chrom_pos_ref.tab Thu Jul 28 15:28:30 2022 +0000

@@ -0,0 +1,6 @@
+NC_045512.2 28361 GGAGAACGCA
+NC_045512.2 28881 G
+NC_045512.2 28882 G
+NC_045512.2 28883 G
+NC_045512.2 29510 A
+NC_045512.2 29733 CGAGGCCACGCGGAGTACGATCGAGTG

diff -r 02026300aa45 -r 6595517c2dd8 test-data/olympics.tsv
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/olympics.tsv Thu Jul 28 15:28:30 2022 +0000

@@ -0,0 +1,7 @@
+athlete_id name sex birth_year birth_day birth_place height weight team noc games year season city sport event medal
+1 Jean-François Blanchy M 1886 12 December Bordeaux, Gironde (FRA) NA NA France FRA 1912 Summer Olympics 1912 Summer Stockholm Tennis Doubles, Men NA
+1 Jean-François Blanchy M 1886 12 December Bordeaux, Gironde (FRA) NA NA France FRA 1912 Summer Olympics 1912 Summer Stockholm Tennis Singles, Men NA
+1 Jean-François Blanchy M 1886 12 December Bordeaux, Gironde (FRA) NA NA France FRA 1920 Summer Olympics 1920 Summer Antwerpen Tennis Doubles, Men NA
+1 Jean-François Blanchy M 1886 12 December Bordeaux, Gironde (FRA) NA NA France FRA 1920 Summer Olympics 1920 Summer Antwerpen Tennis Doubles, Mixed NA
+1 Jean-François Blanchy M 1886 12 December Bordeaux, Gironde (FRA) NA NA France FRA 1920 Summer Olympics 1920 Summer Antwerpen Tennis Singles, Men NA
+2 Arnaud Boetsch M 1969 1 April Meulan, Yvelines (FRA) 183 76 France FRA 1996 Summer Olympics 1996 Summer Atlanta Tennis Doubles, Men NA

diff -r 02026300aa45 -r 6595517c2dd8 test-data/olympics_bmi_out.tab
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/olympics_bmi_out.tab Thu Jul 28 15:28:30 2022 +0000

@@ -0,0 +1,7 @@
+athlete_id name sex birth_year birth_day birth_place height weight team noc games year season city sport event medal BMI
+1 Jean-François Blanchy M 1886 12 December Bordeaux, Gironde (FRA) NA NA France FRA 1912 Summer Olympics 1912 Summer Stockholm Tennis Doubles, Men NA NA
+1 Jean-François Blanchy M 1886 12 December Bordeaux, Gironde (FRA) NA NA France FRA 1912 Summer Olympics 1912 Summer Stockholm Tennis Singles, Men NA NA
+1 Jean-François Blanchy M 1886 12 December Bordeaux, Gironde (FRA) NA NA France FRA 1920 Summer Olympics 1920 Summer Antwerpen Tennis Doubles, Men NA NA
+1 Jean-François Blanchy M 1886 12 December Bordeaux, Gironde (FRA) NA NA France FRA 1920 Summer Olympics 1920 Summer Antwerpen Tennis Doubles, Mixed NA NA
+1 Jean-François Blanchy M 1886 12 December Bordeaux, Gironde (FRA) NA NA France FRA 1920 Summer Olympics 1920 Summer Antwerpen Tennis Singles, Men NA NA
+2 Arnaud Boetsch M 1969 1 April Meulan, Yvelines (FRA) 183 76 France FRA 1996 Summer Olympics 1996 Summer Atlanta Tennis Doubles, Men NA 22.694018931589476

diff -r 02026300aa45 -r 6595517c2dd8 test-data/short_line_test.tab
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/short_line_test.tab Thu Jul 28 15:28:30 2022 +0000

@@ -0,0 +1,3 @@
+chr1 151077881 151077918 2 200 -
+chr1 151081985 151082078 3 500 +
+chr1 151090031 151090938 4 700

diff -r 02026300aa45 -r 6595517c2dd8 test-data/short_line_test_out.tab
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/short_line_test_out.tab Thu Jul 28 15:28:30 2022 +0000

@@ -0,0 +1,3 @@
+chr1 151077881 151077918 2 200 -
+chr1 151081985 151082078 3 500 +
+chr1 151090031 151090938 4 700 .