# HG changeset patch
# User iuc
# Date 1659022110 0
# Node ID 6595517c2dd8e3f0cb01a3b0b749ba1d6ae8079c
# Parent 02026300aa45d26266c8f03b4481766768e40518
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/column_maker commit fe76077775aaca531f6a563fdfcbd73fbf1528e7

diff -r 02026300aa45 -r 6595517c2dd8 column_maker.py
--- a/column_maker.py Tue Mar 09 18:33:10 2021 +0000
+++ b/column_maker.py Thu Jul 28 15:28:30 2022 +0000
@@ -1,13 +1,14 @@
 #!/usr/bin/env python
 """
-This tool takes a tab-delimited textfile as input and creates another column in
-the file which is the result of a computation performed on every row in the
-original file. The tool will skip over invalid lines within the file,
-informing the user about the number of lines skipped.
+This tool takes a tab-delimited textfile as input and creates new columns in
+the file which are the result of a computation performed on every row in the
+original file. The tool will skip over empty and comment (starting with a #)
+lines within the file. It does not change the formatting of any original,
+retained columns.
 """
 
 import argparse
-import json
+import enum
 import re
 import sys
 # Functions that may be used in the compute expression
@@ -20,143 +21,371 @@
     sqrt,
 )
 
-from numpy import format_float_positional  # noqa: F401
+from numpy import format_float_positional
+
+
+class Mode(enum.Enum):
+    APPEND = ''
+    INSERT = 'I'
+    REPLACE = 'R'
+
+
+def from_str(s, to_type):
+    if to_type is list:
+        return [part.strip(' ') for part in s.split(',')]
+    else:
+        return to_type(s)
+
+
+def to_str(obj):
+    if type(obj) is list:
+        return ','.join([to_str(i) for i in obj])
+    if args.avoid_scientific_notation and type(obj) is float:
+        return format_float_positional(obj)
+    return str(obj)
+
 parser = argparse.ArgumentParser()
-parser.add_argument('input', type=argparse.FileType('r'), help="input file")
-parser.add_argument('output', type=argparse.FileType('wt'), help="output file")
-parser.add_argument('cond', nargs='?', type=str, help="expression")
-parser.add_argument('columns', nargs='?', type=int, help="number of columns")
-parser.add_argument('column_types', nargs='?', type=str, help="comma separated list of column types")
-parser.add_argument('--round', action="store_true",
-                    help="round result")
-parser.add_argument('--avoid_scientific_notation', action="store_true",
-                    help="avoid scientific notation")
-parser.add_argument('--header_new_column_name', default=None, type=str,
-                    help="First line of input is a header line with column "
-                         "names and this should become the name of the new "
-                         "column")
-parser.add_argument('--load_json', default=None, type=argparse.FileType('r'),
-                    help="overwrite parsed arguments from json file")
+parser.add_argument('input', type=str, help='input file')
+parser.add_argument('output', type=str, help='output file')
+parser.add_argument(
+    '-t', '--column-types', nargs='?', required=True,
+    help='A comma-separated list of column types in the input file'
+)
+parser.add_argument(
+    '--avoid-scientific-notation', action='store_true',
+    help='avoid scientific notation'
+)
+parser.add_argument(
+    '--header', action='store_true',
+    help='The input has a header line with column names. '
+         'Actions must specify names of newly calculated columns.'
+)
+parser.add_argument(
+    '--fail-on-non-existent-columns', action='store_true',
+    help='If an action references a column number that does not exist '
+         'when the expression gets computed, the default behavior is to treat '
+         'this as a case of rows for which the expression cannot be computed. '
+         'The behavior of the tool will then depend on which of the '
+         'non-computable switches is in effect. With this flag, in contrast, '
+         'the tool will fail directly upon encountering a non-existent column.'
+)
+non_computable = parser.add_mutually_exclusive_group()
+non_computable.add_argument('--fail-on-non-computable', action='store_true')
+non_computable.add_argument('--skip-non-computable', action='store_true')
+non_computable.add_argument('--keep-non-computable', action='store_true')
+non_computable.add_argument('--non-computable-blank', action='store_true')
+non_computable.add_argument('--non-computable-default')
+
+group = parser.add_mutually_exclusive_group(required=True)
+group.add_argument(
+    '-a', '--actions', nargs='*', type=str,
+    help='One or more action(s) of the format EXPR;[COL_ADD_SPEC];[COL_NAME]'
+)
+group.add_argument(
+    '-f', '--file', type=str,
+    help='File to read actions from (mutually exclusive with -a)'
+)
 
 args = parser.parse_args()
-argparse_dict = vars(args)
-if args.load_json:
-    json_dict = json.load(args.load_json)
-    argparse_dict.update(json_dict)
-
-fh = argparse_dict['input']
-out = argparse_dict['output']
-expr = argparse_dict['cond']
-round_result = argparse_dict['round']
-avoid_scientific_notation = argparse_dict['avoid_scientific_notation']
+if not args.column_types:
+    with open(args.input) as fh:
+        if not fh.readline():
+            # Generally, the input must have at least one column to be
+            # considered tabular, but empty files are ok and should produce
+            # empty output.
+            with open(args.output, 'w') as out:
+                pass
+            sys.exit()
+    sys.exit(
+        "Missing column types. "
+        "In Galaxy, click the pencil icon on the history item and "
+        "select the Auto-detect option to correct it. "
+        "This tool can only be used with tab-delimited data."
+    )
 
-if argparse_dict['header_new_column_name'] is not None:
-    header_line = fh.readline().strip('\n')
-    out.write(
-        '{0}\t{1}\n'.format(
-            header_line, argparse_dict['header_new_column_name']
-        )
-    )
-try:
-    in_columns = int(argparse_dict['columns'])
-    if in_columns < 1:
-        # To be considered tabular, data must have at least one column.
-        raise ValueError
-except Exception:
-    if not fh.readline():
-        # empty file content is ok and should produce empty output
-        out.close()
-        sys.exit()
-    sys.exit("Missing or invalid 'columns' metadata value, click the pencil icon in the history item and select the Auto-detect option to correct it. This tool can only be used with tab-delimited data.")
+in_column_types = [t.strip() for t in args.column_types.split(',')]
+in_columns = len(in_column_types)
+
+# Prepare initial column variable names and type cast representations
+# for column data types
+cols, type_casts = [], []
+for n, col_type in enumerate(in_column_types, start=1):
+    col_name = "c%d" % n
+    cols.append(col_name)
+col_str = ', '.join(cols)    # 'c1, c2, c3, c4'
+
+# Define lambda for type-casting of original row fields
 try:
-    in_column_types = argparse_dict['column_types'].split(',')
-except Exception:
-    sys.exit("Missing or invalid 'column_types' metadata value, click the pencil icon in the history item and select the Auto-detect option to correct it. This tool can only be used with tab-delimited data.")
-if len(in_column_types) != in_columns:
-    sys.exit("The 'columns' metadata setting does not conform to the 'column_types' metadata setting, click the pencil icon in the history item and select the Auto-detect option to correct it. This tool can only be used with tab-delimited data.")
+    cast_types = eval(
+        'lambda fields: [from_str(s, t) for s, t in zip(fields, [%s])]'
+        % args.column_types
+    )
+except Exception as e:
+    sys.exit(
+        'While parsing column types, the following problem occurred: "%s"'
+        % e
+    )
 
-operators = 'is|not|or|and'
-builtin_and_math_functions = 'abs|all|any|bin|chr|cmp|complex|divmod|float|bool|hex|int|len|long|max|min|oct|ord|pow|range|reversed|round|sorted|str|sum|type|unichr|unicode|log|log10|exp|sqrt|ceil|floor'
-string_and_list_methods = [name for name in dir('') + dir([]) if not name.startswith('_')]
-whitelist = r"^([c0-9\+\-\*\/\(\)\.\'\"><=,:! ]|%s|%s|%s)*$" % (operators, builtin_and_math_functions, '|'.join(string_and_list_methods))
-if not re.compile(whitelist).match(expr):
-    sys.exit("Invalid expression")
-if avoid_scientific_notation:
-    expr = "format_float_positional(%s)" % expr
+# Get and parse actions
+if args.file:
+    actions = []
+    with open(args.file) as i:
+        for line in i:
+            line = line.strip()
+            if line:
+                actions.append(line)
+else:
+    actions = args.actions
 
-# Prepare the column variable names and wrappers for column data types
-cols, type_casts = [], []
-for col in range(1, in_columns + 1):
-    col_name = "c%d" % col
-    cols.append(col_name)
-    col_type = in_column_types[col - 1].strip()
-    if not round_result and col_type == 'int':
-        col_type = 'float'
-    type_cast = "%s(%s)" % (col_type, col_name)
-    type_casts.append(type_cast)
+# each action must be a full data row manipulation instruction of the form:
+# EXPR;[COL_ADD_SPEC];[COL_NAME]
+# where EXPR is the actual expression to compute on the row,
+# COL_ADD_SPEC consists of a column index and a mode identifier for how the
+# new column should be added.
+# Examples: 3I (insert new col before current column 3),
+# 2R (replace current column 2 with new column);
+# a missing COL_ADD_SPEC is interpreted as mode A (append new column at the
+# end of the row).
+# COL_NAME is required with the --header option and specifies the name of the
+# new column; without --header, any COL_NAME gets ignored.
+operators = 'is|not|or|and'
+builtin_and_math_functions = (
+    'abs|all|any|ascii|bin|bool|chr|complex|divmod|float|format|hex|int|len|'
+    'list|map|max|min|oct|ord|pow|range|reversed|round|set|sorted|str|sum|type|'
+    'log|log10|exp|sqrt|ceil|floor'
+)
+imported_numpy_function = 'format_float_positional'
+string_and_list_methods = [
+    name for name in dir('') + dir([]) if not name.startswith('_')
+]
+whitelist = r"^([c0-9\+\-\*\/\(\)\.\'\"><=,:! ]|%s|%s|%s|%s)*$" % (
+    operators,
+    builtin_and_math_functions,
+    imported_numpy_function,
+    '|'.join(string_and_list_methods)
+)
+valid_pat = re.compile(whitelist)
+ops = []
+num_cols = in_columns
+for ac in actions:
+    try:
+        expr_string, col_add_spec, new_col_name = ac.split(';')
+    except ValueError:
+        sys.exit(
+            'Invalid Action: "%s". '
+            'Required format: EXPR;[COL_ADD_SPEC];[COL_NAME]' % ac
+        )
+    if not valid_pat.match(expr_string):
+        sys.exit('Invalid expression: "%s"' % expr_string)
+    try:
+        expr_lambda = eval('lambda %s: %s' % (col_str, expr_string))
+    except Exception as e:
+        if str(e).startswith('invalid syntax'):
+            sys.exit(
+                'Expression "%s" caused a syntax error during parsing.'
+                % expr_string
+            )
+        else:
+            sys.exit(
+                'While parsing expression "%s" the following problem occurred: '
+                '"%s"' % (expr_string, str(e))
+            )
+    try:
+        new_col_idx = int(col_add_spec[:-1] or '0') - 1
+    except ValueError:
+        sys.exit(
+            'COL_ADD_SPECS need to start with a (1-based) column index. '
+            'Could not parse a column index from "%s"' % col_add_spec
+        )
+    try:
+        mode = Mode(col_add_spec[-1:])
+    except ValueError:
+        sys.exit(
+            'COL_ADD_SPECS need to end in a single-character mode identifier '
+            '("I", or "R"), or be empty (for Append mode). '
+            'Could not parse a valid identifier from "%s"' % col_add_spec
+        )
+    if mode is Mode.REPLACE:
+        if new_col_idx < 0 or new_col_idx >= num_cols:
+            sys.exit(
+                'Cannot replace the contents of column %d as specified by '
+                'action "%s". No such column at this point of the '
+                'computation' % (new_col_idx + 1, ac)
+            )
+    if not new_col_name and args.header:
+        sys.exit(
+            'A name is required for any new columns when using an existing '
+            'header line (--header option), but found none in action: '
+            '"%s"' % ac
+        )
+    # Successfully parsed the instruction
+    # Store the expression lambda, the index and name of the new column, and
+    # the original string representation of the expression (for use in
+    # potential later error messages).
+    ops.append([expr_lambda, new_col_idx, mode, new_col_name, expr_string])
+    if mode is Mode.APPEND or mode is Mode.INSERT:
+        # If the current expression results in an additional column,
+        # we need to handle the new field in subsequent lambda functions.
+        num_cols += 1
+        col_str += ', c%d' % num_cols
 
-col_str = ', '.join(cols)    # 'c1, c2, c3, c4'
-type_cast_str = ', '.join(type_casts)  # 'str(c1), int(c2), int(c3), str(c4)'
-assign = "%s = line.split('\\t')" % col_str
-if len(cols) == 1:
-    # Single column, unpacking by assignment won't work
-    assign += '[0]'
-wrap = "%s = %s" % (col_str, type_cast_str)
+
+# ready to start parsing the input file
+print(
+    'Computing %d new columns with instructions %s'
+    % (num_cols - in_columns, actions)
+)
 
 skipped_lines = 0
 first_invalid_line = 0
 invalid_line = None
-lines_kept = 0
+lines_computed = 0
 total_lines = 0
+non_existent_col_pat = re.compile(r"name 'c\d+' is not defined")
+
+with open(args.input, encoding='utf-8') as fh, \
+        open(args.output, 'w', encoding='utf-8') as out:
+    if args.header:
+        # compute new header line from original
+        header_cols = fh.readline().strip('\n').split('\t')
+        for _, col_idx, mode, col_name, _ in ops:
+            if mode is Mode.INSERT:
+                header_cols.insert(col_idx, col_name)
+            elif mode is Mode.REPLACE:
+                header_cols[col_idx] = col_name
+            else:
+                header_cols.append(col_name)
+        out.write('\t'.join(header_cols) + '\n')
 
-# Read input file, skipping invalid lines, and perform computation that will result in a new column
-code = '''
-for i, line in enumerate(fh):
-    total_lines += 1
-    line = line.rstrip('\\r\\n')
-    if not line or line.startswith('#'):
-        skipped_lines += 1
-        if not invalid_line:
-            first_invalid_line = i + 1
-            invalid_line = line
-        continue
-    try:
-        %s
-        %s
-        new_val = %s
-        if round_result:
-            new_val = int(round(new_val))
-        new_line = line + '\\t' + str(new_val) + "\\n"
-        out.write(new_line)
-        lines_kept += 1
-    except Exception:
-        skipped_lines += 1
-        if not invalid_line:
-            first_invalid_line = i + 1
-            invalid_line = line
-fh.close()
-''' % (assign, wrap, expr)
+    # read data, skipping empty and comment lines, and perform computations
+    # that will result in new columns
+    for i, line in enumerate(fh):
+        total_lines += 1
+        line = line.rstrip('\n')
+        if not line or line.startswith('#'):
+            skipped_lines += 1
+            if not invalid_line:
+                first_invalid_line = i + 1
+                invalid_line = line
+            continue
+        fields = line.split('\t')
+        if len(fields) == in_columns:
+            try:
+                typed_fields = cast_types(fields)
+            except ValueError as e:
+                sys.exit(
+                    'Failed to convert some of the columns in line #%d to their '
+                    'expected types. The error was: "%s" for the line: "%s"'
+                    % (i, str(e), line)
+                )
+        else:
+            # A "suspicious" line with fewer or more fields than expected.
+            # Type-casting for it might fail or not, but it is pointless to
+            # even try because subsequent computation of any expression will
+            # fail anyway as expression lambdas expect a fixed number of
+            # arguments.
+            # Let's pass in a copy of the original string fields, let
+            # the computation of the first expression fail, then have that
+            # situation handled according to the non-computable settings in
+            # effect.
+            typed_fields = fields[:]
+        for fun, col_idx, mode, col_name, ex in ops:
+            try:
+                try:
+                    new_val = fun(*typed_fields)
+                except NameError as e:
+                    # Python 3.10+ would have the problematic name
+                    # available as e.name
+                    if non_existent_col_pat.fullmatch(str(e)) and (
+                        not args.fail_on_non_existent_columns
+                    ):
+                        # Looks like a reference to a non-existent column
+                        # and we are not supposed to fail on it directly.
+                        # Reraise and have it handled as a non-computable
+                        # row.
+                        raise
+                    # NameErrors are not row-specific, but indicate a
+                    # general problem with the user-supplied expression.
+                    sys.exit(
+                        'While parsing expression "%s" the following '
+                        'problem occurred: "%s"' % (ex, str(e))
+                    )
+            except Exception as e:
+                if args.skip_non_computable:
+                    # log that a line got skipped, then stop computing
+                    # for this line
+                    skipped_lines += 1
+                    if not invalid_line:
+                        first_invalid_line = i + 1
+                        invalid_line = line
+                    break
+                if args.keep_non_computable:
+                    # write the original line unchanged and stop computing
+                    # for this line
+                    out.write(line + '\n')
+                    break
+                if args.non_computable_blank:
+                    new_val = ''
+                elif args.non_computable_default is not None:
+                    new_val = args.non_computable_default
+                else:
+                    # --fail_on_non_computable
+                    # (which is default behavior, too)
+                    sys.exit(
+                        'Could not compute a new column value using "%s" on '
+                        'line #%d: "%s". Error was "%s"'
+                        % (ex, i, line, str(e))
+                    )
+            if mode is Mode.INSERT:
+                fields.insert(col_idx, new_val)
+                typed_fields.insert(col_idx, new_val)
+            elif mode is Mode.REPLACE:
+                if col_idx > len(fields):
+                    # Intentionally allow "replacing" one column beyond
+                    # current fields since this can be used to fix
+                    # short lines in the input.
+                    sys.exit(
+                        'Cannot replace column #%d in line with %d columns: '
+                        '"%s"' % (col_idx + 1, len(fields), line)
+                    )
+                fields[col_idx:col_idx + 1] = [new_val]
+                typed_fields[col_idx:col_idx + 1] = [new_val]
+            else:
+                fields.append(new_val)
+                typed_fields.append(new_val)
+        else:
+            fields = [to_str(field) for field in fields]
+            out.write('\t'.join(fields) + '\n')
+            lines_computed += 1
 
-valid_expr = True
-try:
-    exec(code)
-except Exception as e:
-    if str(e).startswith('invalid syntax'):
-        valid_expr = False
-        sys.exit('Expression "%s" likely invalid. See tool tips, syntax and examples.' % expr)
-    else:
-        sys.exit(str(e))
-finally:
-    out.close()
-if valid_expr:
-    valid_lines = total_lines - skipped_lines
-    print('Creating column %d with expression %s' % (in_columns + 1, expr))
-    if valid_lines > 0:
-        print('kept %4.2f%% of %d lines.' % (100.0 * lines_kept / valid_lines,
-                                             total_lines))
-    else:
-        print('Possible invalid expression "%s" or non-existent column referenced. See tool tips, syntax and examples.' % expr)
-    if skipped_lines > 0:
-        print('Skipped %d invalid lines starting at line #%d: "%s"' %
-              (skipped_lines, first_invalid_line, invalid_line))
+valid_lines = total_lines - skipped_lines
+if valid_lines > 0:
+    print(
+        'Computed new column values for %4.2f%% of %d lines written.'
+        % (100.0 * lines_computed / valid_lines, valid_lines)
+    )
+elif args.fail_on_non_existent_columns:
+    # Warn the user that there could be an issue with an expression.
+    print(
+        'Could not compute a new column for any input row! '
+        'Please check your expression(s) "%s" for problems.'
+        % actions
+    )
+else:
+    # Same, but the problem could also be a reference to a non-existent
+    # column.
+    print(
+        'Could not compute a new column for any input row! '
+        'Please check your expression(s) "%s" for references to non-existent '
+        'columns or other problems.'
+        % actions
+    )
+if skipped_lines > 0:
+    print('Skipped %d invalid lines starting at line #%d: "%s"' %
+          (skipped_lines, first_invalid_line, invalid_line))
+if lines_computed < valid_lines:
+    print(
+        'Rewrote %d lines unmodified because computation of a new value failed'
+        % (valid_lines - lines_computed)
+    )
diff -r 02026300aa45 -r 6595517c2dd8 column_maker.xml
--- a/column_maker.xml Tue Mar 09 18:33:10 2021 +0000
+++ b/column_maker.xml Thu Jul 28 15:28:30 2022 +0000
@@ -1,101 +1,161 @@
[The XML markup of the tool wrapper (macros, params, conditionals, tests) did not survive in this copy of the patch; only the fragments below and the help text further down are preserved. Recoverable fragments of this and the following hunks: the tool description changes from "an expression on every row" to "on rows"; the python requirement is kept while the numpy requirement line changes; and the new header parameter carries the help text "Select Yes to be able to specify names for new columns and have them added to the header line. If you select No, the first line will be treated as a regular line: If it is empty or starts with a # character it will be skipped, otherwise the tool will attempt to compute the specified expression on it."]
@@ -116,6 +175,122 @@
+You can also use this tool to evaluate expressions.
+For example, computing "c3 >= c2" for the input above will result in the following::
+
+    chr1 151077881 151077918 2 200 - True
+    chr1 151081985 151082078 3 500 + True
+
+Similarly, computing "type(c2) == type(c3)" will return::
+
+    chr1 151077881 151077918 2 200 - True
+    chr1 151081985 151082078 3 500 + True
+
-or computing "type(c2)==type('') for Input will return::
+-----
+
+**Error handling**
+
+The tool will always fail on syntax errors and other unrecoverable parsing
+errors in any of your expressions. For other problems, however, it offers
+control over how they should be handled:
-    chr1 151077881 151077918 2 200 - False
-    chr1 151081985 151082078 3 500 + False
+
+1. The default for "Autodetect column types" is "Yes", which means the tool
+   will evaluate each column value as the type that Galaxy assumes for the
+   column. This default behavior will allow you to write simpler expressions.
+   The arithmetic expression "c4 * c5" from the first simple example,
+   for instance, works only because Galaxy realizes that c4 and c5 are integer
+   columns. Occasionally, this autodetection can cause issues. A common
+   such situation is missing values in columns that Galaxy thinks are of
+   numeric type. If you're getting errors like "Failed to convert some of the
+   columns in line #X ...", a solution might be to turn off column type
+   autodetection. The price you will have to pay for doing so is that now you
+   will have to handle type conversions yourself. In the first example you
+   would now have to use the expression: "int(c4) * int(c5)".
+2. By default, if any expression references columns that do not exist at the
+   time the expression gets computed, the tool will fail, but you can uncheck
+   the "Fail on references to non-existent columns" option. If you do so, the
+   result will depend on your choice for "If an expression cannot be computed
+   for a row" (see 3.).
-The following built-in functions are available::
+3. The default for rows for which an expression cannot be computed is, again,
+   to fail the tool run, but you can also choose to:
+
+   - skip the row on output
+
+     This is a simple way to only keep lines conforming to an expected standard.
+     It is also easy to mask problems with your expressions with this option so
+     take a look at the results and try to understand what gets skipped and for
+     what reasons (the stdout of the tool will contain information about both).
+
+   - keep the row unchanged
-    abs | all | any | bin | bool | chr | ceil | cmp | complex
+
+     This can be a good solution if your input contains special separator lines
+     that don't follow the general tabular format of other lines and you would
+     like to keep those lines.
+
+   - produce an empty column value for the row
-    divmod | exp | float | log | log10 | floor | hex | int | len | long
+
+     This will use the empty string as a substitute for non-computable items.
+     Different from the "keep the row unchanged" option, the problematic line
+     will have a column added or changed.
+     This option is a good choice for inputs in which all rows have the same
+     tabular layout where you want to make sure that the same is true for the
+     output, i.e. that all output lines still have the same number of columns.
+
+   - fill in a replacement value
+
+     This option is very similar to the previous one, but lets you control the
+     replacement value.
+
+**Example**
+
+In the following input::
+
-    max | min | oct | ord | pow | range | reversed
+    chr1 151077881 151077918 2 200 -
+    chr1 151081985 151082078 3 500 +
+    chr1 151090031 151090938 4 700
+
+the last line does not have a strand column. This violates the bed file format
+specification, which says that unknown strand is to be encoded as ``.`` in the
+strand column.
+
+You can fix the file with the following tool run:
+
-    round | sorted | sqrt | str | sum | type | unichr | unicode |
+**Add expression**: `c6`
+
+**Mode of the operation**: `Replace`
+
+**Use new column to replace column number**: `6`
+
+**Fail on references to non-existent columns**: `No`
+
+**If an expression cannot be computed for a row**: `Fill in a replacement value`
+
+**Replacement value**: `.`
     ]]>
diff -r 02026300aa45 -r 6595517c2dd8 test-data/bed12.bed
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/bed12.bed Thu Jul 28 15:28:30 2022 +0000
@@ -0,0 +1,3 @@
+chr1 14756 15038 JUNC00000001 294 - 14756 15038 255,0,0 2 73,69 0,213
+chr1 14969 15836 JUNC00000002 144 - 14969 15836 255,0,0 2 69,41 0,826
+chr1 15905 16677 JUNC00000003 12 - 15905 16677 255,0,0 2 42,71 0,701
diff -r 02026300aa45 -r 6595517c2dd8 test-data/bed12_modified.bed
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/bed12_modified.bed Thu Jul 28 15:28:30 2022 +0000
@@ -0,0 +1,3 @@
+chr1 14756 15038 JUNC00000001 73 - 14756 15038 255,0,0 2 73,69 0,213
+chr1 14969 15836 JUNC00000002 69 - 14969 15836 255,0,0 2 69,41 0,826
+chr1 15905 16677 JUNC00000003 71 - 15905 16677 255,0,0 2 42,71 0,701
diff -r 02026300aa45 -r 6595517c2dd8 test-data/bed_from_chrom_pos_ref.bed
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/bed_from_chrom_pos_ref.bed Thu Jul 28 15:28:30 2022 +0000
@@ -0,0 +1,6 @@
+NC_045512.2 28361 28370
+NC_045512.2 28880 28881
+NC_045512.2 28881 28882
+NC_045512.2 28882 28883
+NC_045512.2 29509 29510
+NC_045512.2 29733 29759
diff -r 02026300aa45 -r 6595517c2dd8 test-data/chrom_pos_ref.tab
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/chrom_pos_ref.tab Thu Jul 28 15:28:30 2022 +0000
@@ -0,0 +1,6 @@
+NC_045512.2 28361 GGAGAACGCA
+NC_045512.2 28881 G
+NC_045512.2 28882 G
+NC_045512.2 28883 G
+NC_045512.2 29510 A
+NC_045512.2 29733 CGAGGCCACGCGGAGTACGATCGAGTG
diff -r 02026300aa45 -r 6595517c2dd8 test-data/olympics.tsv
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/olympics.tsv Thu Jul 28 15:28:30 2022 +0000
@@ -0,0 +1,7 @@
+athlete_id name sex birth_year birth_day birth_place height weight team noc games year season city sport event medal
+1 Jean-François Blanchy M 1886 12 December Bordeaux, Gironde (FRA) NA NA France FRA 1912 Summer Olympics 1912 Summer Stockholm Tennis Doubles, Men NA
+1 Jean-François Blanchy M 1886 12 December Bordeaux, Gironde (FRA) NA NA France FRA 1912 Summer Olympics 1912 Summer Stockholm Tennis Singles, Men NA
+1 Jean-François Blanchy M 1886 12 December Bordeaux, Gironde (FRA) NA NA France FRA 1920 Summer Olympics 1920 Summer Antwerpen Tennis Doubles, Men NA
+1 Jean-François Blanchy M 1886 12 December Bordeaux, Gironde (FRA) NA NA France FRA 1920 Summer Olympics 1920 Summer Antwerpen Tennis Doubles, Mixed NA
+1 Jean-François Blanchy M 1886 12 December Bordeaux, Gironde (FRA) NA NA France FRA 1920 Summer Olympics 1920 Summer Antwerpen Tennis Singles, Men NA
+2 Arnaud Boetsch M 1969 1 April Meulan, Yvelines (FRA) 183 76 France FRA 1996 Summer Olympics 1996 Summer Atlanta Tennis Doubles, Men NA
diff -r 02026300aa45 -r 6595517c2dd8 test-data/olympics_bmi_out.tab
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/olympics_bmi_out.tab Thu Jul 28 15:28:30 2022 +0000
@@ -0,0 +1,7 @@
+athlete_id name sex birth_year birth_day birth_place height weight team noc games year season city sport event medal BMI
+1 Jean-François Blanchy M 1886 12 December Bordeaux, Gironde (FRA) NA NA France FRA 1912 Summer Olympics 1912 Summer Stockholm Tennis Doubles, Men NA NA
+1 Jean-François Blanchy M 1886 12 December Bordeaux, Gironde (FRA) NA NA France FRA 1912 Summer Olympics 1912 Summer Stockholm Tennis Singles, Men NA NA
+1 Jean-François Blanchy M 1886 12 December Bordeaux, Gironde (FRA) NA NA France FRA 1920 Summer Olympics 1920 Summer Antwerpen Tennis Doubles, Men NA NA
+1 Jean-François Blanchy M 1886 12 December Bordeaux, Gironde (FRA) NA NA France FRA 1920 Summer Olympics 1920 Summer Antwerpen Tennis Doubles, Mixed NA NA
+1 Jean-François Blanchy M 1886 12 December Bordeaux, Gironde (FRA) NA NA France FRA 1920 Summer Olympics 1920 Summer Antwerpen Tennis Singles, Men NA NA
+2 Arnaud Boetsch M 1969 1 April Meulan, Yvelines (FRA) 183 76 France FRA 1996 Summer Olympics 1996 Summer Atlanta Tennis Doubles, Men NA 22.694018931589476
diff -r 02026300aa45 -r 6595517c2dd8 test-data/short_line_test.tab
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/short_line_test.tab Thu Jul 28 15:28:30 2022 +0000
@@ -0,0 +1,3 @@
+chr1 151077881 151077918 2 200 -
+chr1 151081985 151082078 3 500 +
+chr1 151090031 151090938 4 700
diff -r 02026300aa45 -r 6595517c2dd8 test-data/short_line_test_out.tab
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/short_line_test_out.tab Thu Jul 28 15:28:30 2022 +0000
@@ -0,0 +1,3 @@
+chr1 151077881 151077918 2 200 -
+chr1 151081985 151082078 3 500 +
+chr1 151090031 151090938 4 700 .
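
The short_line_test files above correspond to the strand-fixing example from the help section. Outside of Galaxy, a roughly equivalent call to the new column_maker.py command-line interface would look like the sketch below; the output file name fixed.bed and the explicit column-type string are assumptions, and inside Galaxy the XML wrapper assembles this command from the chosen options::

    python column_maker.py short_line_test.tab fixed.bed \
        -t 'str,int,int,int,int,str' \
        --non-computable-default '.' \
        -a 'c6;6R;'

Here the action 'c6;6R;' encodes the expression c6, the column spec 6R (replace column 6), and an empty column name (names are only required together with --header); rows on which c6 cannot be computed receive the replacement value '.' instead of failing the run.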