comparison column_maker.py @ 9:6595517c2dd8 draft
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/column_maker commit fe76077775aaca531f6a563fdfcbd73fbf1528e7
| author | iuc |
|---|---|
| date | Thu, 28 Jul 2022 15:28:30 +0000 |
| parents | 02026300aa45 |
| children | aff5135563c6 |
comparison
| 8:02026300aa45 | 9:6595517c2dd8 |
|---|---|
| 1 #!/usr/bin/env python | 1 #!/usr/bin/env python |
| 2 """ | 2 """ |
| 3 This tool takes a tab-delimited textfile as input and creates another column in | 3 This tool takes a tab-delimited textfile as input and creates new columns in |
| 4 the file which is the result of a computation performed on every row in the | 4 the file which are the result of a computation performed on every row in the |
| 5 original file. The tool will skip over invalid lines within the file, | 5 original file. The tool will skip over empty and comment (starting with a #) |
| 6 informing the user about the number of lines skipped. | 6 lines within the file. It does not change the formatting of any original, |
| 7 retained columns. | |
| 7 """ | 8 """ |
| 8 | 9 |
| 9 import argparse | 10 import argparse |
| 10 import json | 11 import enum |
| 11 import re | 12 import re |
| 12 import sys | 13 import sys |
| 13 # Functions that may be used in the compute expression | 14 # Functions that may be used in the compute expression |
| 14 from math import ( # noqa: F401 | 15 from math import ( # noqa: F401 |
| 15 ceil, | 16 ceil, |
| 18 log, | 19 log, |
| 19 log10, | 20 log10, |
| 20 sqrt, | 21 sqrt, |
| 21 ) | 22 ) |
| 22 | 23 |
| 23 from numpy import format_float_positional # noqa: F401 | 24 from numpy import format_float_positional |
| 25 | |
| 26 | |
| 27 class Mode(enum.Enum): | |
| 28 APPEND = '' | |
| 29 INSERT = 'I' | |
| 30 REPLACE = 'R' | |
| 31 | |
| 32 | |
| 33 def from_str(s, to_type): | |
| 34 if to_type is list: | |
| 35 return [part.strip(' ') for part in s.split(',')] | |
| 36 else: | |
| 37 return to_type(s) | |
| 38 | |
| 39 | |
| 40 def to_str(obj): | |
| 41 if type(obj) is list: | |
| 42 return ','.join([to_str(i) for i in obj]) | |
| 43 if args.avoid_scientific_notation and type(obj) is float: | |
| 44 return format_float_positional(obj) | |
| 45 return str(obj) | |
| 46 | |
| 24 | 47 |
| 25 parser = argparse.ArgumentParser() | 48 parser = argparse.ArgumentParser() |
| 26 parser.add_argument('input', type=argparse.FileType('r'), help="input file") | 49 parser.add_argument('input', type=str, help='input file') |
| 27 parser.add_argument('output', type=argparse.FileType('wt'), help="output file") | 50 parser.add_argument('output', type=str, help='output file') |
| 28 parser.add_argument('cond', nargs='?', type=str, help="expression") | 51 parser.add_argument( |
| 29 parser.add_argument('columns', nargs='?', type=int, help="number of columns") | 52 '-t', '--column-types', nargs='?', required=True, |
| 30 parser.add_argument('column_types', nargs='?', type=str, help="comma separated list of column types") | 53 help='A comma-separated list of column types in the input file' |
| 31 parser.add_argument('--round', action="store_true", | 54 ) |
| 32 help="round result") | 55 parser.add_argument( |
| 33 parser.add_argument('--avoid_scientific_notation', action="store_true", | 56 '--avoid-scientific-notation', action='store_true', |
| 34 help="avoid scientific notation") | 57 help='avoid scientific notation' |
| 35 parser.add_argument('--header_new_column_name', default=None, type=str, | 58 ) |
| 36 help="First line of input is a header line with column " | 59 parser.add_argument( |
| 37 "names and this should become the name of the new " | 60 '--header', action='store_true', |
| 38 "column") | 61 help='The input has a header line with column names. ' |
| 39 parser.add_argument('--load_json', default=None, type=argparse.FileType('r'), | 62 'Actions must specify names of newly calculated columns.' |
| 40 help="overwrite parsed arguments from json file") | 63 ) |
| 64 parser.add_argument( | |
| 65 '--fail-on-non-existent-columns', action='store_true', | |
| 66 help='If an action references a column number that does not exist ' | |
| 67 'when the expression gets computed, the default behavior is to treat ' | |
| 68 'this as a case of rows for which the expression cannot be computed. ' | |
| 69 'The behavior of the tool will then depend on which of the ' | |
| 70 'non-computable switches is in effect. With this flag, in contrast, ' | |
| 71 'the tool will fail directly upon encountering a non-existing column.' | |
| 72 ) | |
| 73 non_computable = parser.add_mutually_exclusive_group() | |
| 74 non_computable.add_argument('--fail-on-non-computable', action='store_true') | |
| 75 non_computable.add_argument('--skip-non-computable', action='store_true') | |
| 76 non_computable.add_argument('--keep-non-computable', action='store_true') | |
| 77 non_computable.add_argument('--non-computable-blank', action='store_true') | |
| 78 non_computable.add_argument('--non-computable-default') | |
| 79 | |
| 80 group = parser.add_mutually_exclusive_group(required=True) | |
| 81 group.add_argument( | |
| 82 '-a', '--actions', nargs='*', type=str, | |
| 83 help='One or more action(s) of the format EXPR;[COL_ADD_SPEC];[COL_NAME]' | |
| 84 ) | |
| 85 group.add_argument( | |
| 86 '-f', '--file', type=str, | |
| 87 help='File to read actions from (mutually exclusive with -a)' | |
| 88 ) | |
| 41 args = parser.parse_args() | 89 args = parser.parse_args() |
| 42 | 90 |
| 43 argparse_dict = vars(args) | 91 if not args.column_types: |
| 44 if args.load_json: | 92 with open(args.input) as fh: |
| 45 json_dict = json.load(args.load_json) | 93 if not fh.readline(): |
| 46 argparse_dict.update(json_dict) | 94 # Generally, the input must have at least one column to be |
| 47 | 95 # considered tabular, but empty files are ok and should produce |
| 48 fh = argparse_dict['input'] | 96 # empty output. |
| 49 out = argparse_dict['output'] | 97 with open(args.output, 'w') as out: |
| 50 expr = argparse_dict['cond'] | 98 pass |
| 51 round_result = argparse_dict['round'] | 99 sys.exit() |
| 52 avoid_scientific_notation = argparse_dict['avoid_scientific_notation'] | 100 sys.exit( |
| 53 | 101 "Missing column types. " |
| 54 if argparse_dict['header_new_column_name'] is not None: | 102 "In Galaxy, click the pencil icon on the history item and " |
| 55 header_line = fh.readline().strip('\n') | 103 "select the Auto-detect option to correct it. " |
| 56 out.write( | 104 "This tool can only be used with tab-delimited data." |
| 57 '{0}\t{1}\n'.format( | 105 ) |
| 58 header_line, argparse_dict['header_new_column_name'] | 106 |
| 59 ) | 107 in_column_types = [t.strip() for t in args.column_types.split(',')] |
| 60 ) | 108 in_columns = len(in_column_types) |
| 109 | |
| 110 # Prepare initial column variable names and type cast representations | |
| 111 # for column data types | |
| 112 cols, type_casts = [], [] | |
| 113 for n, col_type in enumerate(in_column_types, start=1): | |
| 114 col_name = "c%d" % n | |
| 115 cols.append(col_name) | |
| 116 col_str = ', '.join(cols) # 'c1, c2, c3, c4' | |
| 117 | |
| 118 # Define lambda for type-casting of original row fields | |
| 61 try: | 119 try: |
| 62 in_columns = int(argparse_dict['columns']) | 120 cast_types = eval( |
| 63 if in_columns < 1: | 121 'lambda fields: [from_str(s, t) for s, t in zip(fields, [%s])]' |
| 64 # To be considered tabular, data must have at least one column. | 122 % args.column_types |
| 65 raise ValueError | 123 ) |
| 66 except Exception: | 124 except Exception as e: |
| 67 if not fh.readline(): | 125 sys.exit( |
| 68 # empty file content is ok and should produce empty output | 126 'While parsing column types, the following problem occurred: "%s"' |
| 69 out.close() | 127 % e |
| 70 sys.exit() | 128 ) |
| 71 sys.exit("Missing or invalid 'columns' metadata value, click the pencil icon in the history item and select the Auto-detect option to correct it. This tool can only be used with tab-delimited data.") | 129 |
| 72 try: | 130 # Get and parse actions |
| 73 in_column_types = argparse_dict['column_types'].split(',') | 131 if args.file: |
| 74 except Exception: | 132 actions = [] |
| 75 sys.exit("Missing or invalid 'column_types' metadata value, click the pencil icon in the history item and select the Auto-detect option to correct it. This tool can only be used with tab-delimited data.") | 133 with open(args.file) as i: |
| 76 if len(in_column_types) != in_columns: | 134 for line in i: |
| 77 sys.exit("The 'columns' metadata setting does not conform to the 'column_types' metadata setting, click the pencil icon in the history item and select the Auto-detect option to correct it. This tool can only be used with tab-delimited data.") | 135 line = line.strip() |
| 78 | 136 if line: |
| 137 actions.append(line) | |
| 138 else: | |
| 139 actions = args.actions | |
| 140 | |
| 141 # each action must be a full data row manipulation instruction of the form: | |
| 142 # EXPR;[COL_ADD_SPEC];[COL_NAME] | |
| 143 # where EXPR is the actual expression to compute on the row, | |
| 144 # COL_ADD_SPEC consists of a column index and a mode identifier for how the | |
| 145 # new column should be added. | |
| 146 # Examples: 3I (insert new col before current column 3), | |
| 147 # 2R (replace current column 2 with new column); | |
| 148 # a missing COL_ADD_SPEC is interpreted as mode A (append new column at the | |
| 149 # end of the row). | |
| 150 # COL_NAME is required with the --header option and specifies the name of the | |
| 151 # new column; without --header, any COL_NAME gets ignored. | |
| 79 operators = 'is|not|or|and' | 152 operators = 'is|not|or|and' |
| 80 builtin_and_math_functions = 'abs|all|any|bin|chr|cmp|complex|divmod|float|bool|hex|int|len|long|max|min|oct|ord|pow|range|reversed|round|sorted|str|sum|type|unichr|unicode|log|log10|exp|sqrt|ceil|floor' | 153 builtin_and_math_functions = ( |
| 81 string_and_list_methods = [name for name in dir('') + dir([]) if not name.startswith('_')] | 154 'abs|all|any|ascii|bin|bool|chr|complex|divmod|float|format|hex|int|len|' |
| 82 whitelist = r"^([c0-9\+\-\*\/\(\)\.\'\"><=,:! ]|%s|%s|%s)*$" % (operators, builtin_and_math_functions, '|'.join(string_and_list_methods)) | 155 'list|map|max|min|oct|ord|pow|range|reversed|round|set|sorted|str|sum|type|' |
| 83 if not re.compile(whitelist).match(expr): | 156 'log|log10|exp|sqrt|ceil|floor' |
| 84 sys.exit("Invalid expression") | 157 ) |
| 85 if avoid_scientific_notation: | 158 imported_numpy_function = 'format_float_positional' |
| 86 expr = "format_float_positional(%s)" % expr | 159 string_and_list_methods = [ |
| 87 | 160 name for name in dir('') + dir([]) if not name.startswith('_') |
| 88 # Prepare the column variable names and wrappers for column data types | 161 ] |
| 89 cols, type_casts = [], [] | 162 whitelist = r"^([c0-9\+\-\*\/\(\)\.\'\"><=,:! ]|%s|%s|%s|%s)*$" % ( |
| 90 for col in range(1, in_columns + 1): | 163 operators, |
| 91 col_name = "c%d" % col | 164 builtin_and_math_functions, |
| 92 cols.append(col_name) | 165 imported_numpy_function, |
| 93 col_type = in_column_types[col - 1].strip() | 166 '|'.join(string_and_list_methods) |
| 94 if not round_result and col_type == 'int': | 167 ) |
| 95 col_type = 'float' | 168 valid_pat = re.compile(whitelist) |
| 96 type_cast = "%s(%s)" % (col_type, col_name) | 169 ops = [] |
| 97 type_casts.append(type_cast) | 170 num_cols = in_columns |
| 98 | 171 for ac in actions: |
| 99 col_str = ', '.join(cols) # 'c1, c2, c3, c4' | 172 try: |
| 100 type_cast_str = ', '.join(type_casts) # 'str(c1), int(c2), int(c3), str(c4)' | 173 expr_string, col_add_spec, new_col_name = ac.split(';') |
| 101 assign = "%s = line.split('\\t')" % col_str | 174 except ValueError: |
| 102 if len(cols) == 1: | 175 sys.exit( |
| 103 # Single column, unpacking by assignment won't work | 176 'Invalid Action: "%s". ' |
| 104 assign += '[0]' | 177 'Required format: EXPR;[COL_ADD_SPEC];[COL_NAME]' % ac |
| 105 wrap = "%s = %s" % (col_str, type_cast_str) | 178 ) |
| 179 if not valid_pat.match(expr_string): | |
| 180 sys.exit('Invalid expression: "%s"' % expr_string) | |
| 181 try: | |
| 182 expr_lambda = eval('lambda %s: %s' % (col_str, expr_string)) | |
| 183 except Exception as e: | |
| 184 if str(e).startswith('invalid syntax'): | |
| 185 sys.exit( | |
| 186 'Expression "%s" caused a syntax error during parsing.' | |
| 187 % expr_string | |
| 188 ) | |
| 189 else: | |
| 190 sys.exit( | |
| 191 'While parsing expression "%s" the following problem occurred: ' | |
| 192 '"%s"' % (expr_string, str(e)) | |
| 193 ) | |
| 194 try: | |
| 195 new_col_idx = int(col_add_spec[:-1] or '0') - 1 | |
| 196 except ValueError: | |
| 197 sys.exit( | |
| 198 'COL_ADD_SPECS need to start with a (1-based) column index. ' | |
| 199 'Could not parse a column index from "%s"' % col_add_spec | |
| 200 ) | |
| 201 try: | |
| 202 mode = Mode(col_add_spec[-1:]) | |
| 203 except ValueError: | |
| 204 sys.exit( | |
| 205 'COL_ADD_SPECS need to end in a single-character mode identifier ' | |
| 206 '("I", or "R"), or be empty (for Append mode). ' | |
| 207 'Could not parse a valid identifier from "%s"' % col_add_spec | |
| 208 ) | |
| 209 if mode is Mode.REPLACE: | |
| 210 if new_col_idx < 0 or new_col_idx >= num_cols: | |
| 211 sys.exit( | |
| 212 'Cannot replace the contents of column %d as specified by ' | |
| 213 'action "%s". No such column at this point of the ' | |
| 214 'computation' % (new_col_idx + 1, ac) | |
| 215 ) | |
| 216 if not new_col_name and args.header: | |
| 217 sys.exit( | |
| 218 'A name is required for any new columns when using an existing ' | |
| 219 'header line (--header option), but found none in action: ' | |
| 220 '"%s"' % ac | |
| 221 ) | |
| 222 # Successfully parsed the instruction | |
| 223 # Store the expression lambda, the index and name of the new column, and | |
| 224 # the original string representation of the expression (for use in | |
| 225 # potential later error messages). | |
| 226 ops.append([expr_lambda, new_col_idx, mode, new_col_name, expr_string]) | |
| 227 if mode is Mode.APPEND or mode is Mode.INSERT: | |
| 228 # If the current expression results in an additional column, | |
| 229 # we need to handle the new field in subsequent lambda functions. | |
| 230 num_cols += 1 | |
| 231 col_str += ', c%d' % num_cols | |
| 232 | |
| 233 | |
| 234 # ready to start parsing the input file | |
| 235 print( | |
| 236 'Computing %d new columns with instructions %s' | |
| 237 % (num_cols - in_columns, actions) | |
| 238 ) | |
| 106 skipped_lines = 0 | 239 skipped_lines = 0 |
| 107 first_invalid_line = 0 | 240 first_invalid_line = 0 |
| 108 invalid_line = None | 241 invalid_line = None |
| 109 lines_kept = 0 | 242 lines_computed = 0 |
| 110 total_lines = 0 | 243 total_lines = 0 |
| 111 | 244 non_existent_col_pat = re.compile(r"name 'c\d+' is not defined") |
| 112 # Read input file, skipping invalid lines, and perform computation that will result in a new column | 245 |
| 113 code = ''' | 246 with open(args.input, encoding='utf-8') as fh, \ |
| 114 for i, line in enumerate(fh): | 247 open(args.output, 'w', encoding='utf-8') as out: |
| 115 total_lines += 1 | 248 if args.header: |
| 116 line = line.rstrip('\\r\\n') | 249 # compute new header line from original |
| 117 if not line or line.startswith('#'): | 250 header_cols = fh.readline().strip('\n').split('\t') |
| 118 skipped_lines += 1 | 251 for _, col_idx, mode, col_name, _ in ops: |
| 119 if not invalid_line: | 252 if mode is Mode.INSERT: |
| 120 first_invalid_line = i + 1 | 253 header_cols.insert(col_idx, col_name) |
| 121 invalid_line = line | 254 elif mode is Mode.REPLACE: |
| 122 continue | 255 header_cols[col_idx] = col_name |
| 123 try: | 256 else: |
| 124 %s | 257 header_cols.append(col_name) |
| 125 %s | 258 out.write('\t'.join(header_cols) + '\n') |
| 126 new_val = %s | 259 |
| 127 if round_result: | 260 # read data, skipping empty and comment lines, and perform computations |
| 128 new_val = int(round(new_val)) | 261 # that will result in new columns |
| 129 new_line = line + '\\t' + str(new_val) + "\\n" | 262 for i, line in enumerate(fh): |
| 130 out.write(new_line) | 263 total_lines += 1 |
| 131 lines_kept += 1 | 264 line = line.rstrip('\n') |
| 132 except Exception: | 265 if not line or line.startswith('#'): |
| 133 skipped_lines += 1 | 266 skipped_lines += 1 |
| 134 if not invalid_line: | 267 if not invalid_line: |
| 135 first_invalid_line = i + 1 | 268 first_invalid_line = i + 1 |
| 136 invalid_line = line | 269 invalid_line = line |
| 137 fh.close() | 270 continue |
| 138 ''' % (assign, wrap, expr) | 271 fields = line.split('\t') |
| 139 | 272 if len(fields) == in_columns: |
| 140 valid_expr = True | 273 try: |
| 141 try: | 274 typed_fields = cast_types(fields) |
| 142 exec(code) | 275 except ValueError as e: |
| 143 except Exception as e: | 276 sys.exit( |
| 144 if str(e).startswith('invalid syntax'): | 277 'Failed to convert some of the columns in line #%d to their ' |
| 145 valid_expr = False | 278 'expected types. The error was: "%s" for the line: "%s"' |
| 146 sys.exit('Expression "%s" likely invalid. See tool tips, syntax and examples.' % expr) | 279 % (i, str(e), line) |
| 147 else: | 280 ) |
| 148 sys.exit(str(e)) | 281 else: |
| 149 finally: | 282 # A "suspicious" line with less or more fields than expected |
| 150 out.close() | 283 # Type-casting for it might fail or not, but it is pointless to |
| 151 | 284 # even try because subsequent computation of any expression will |
| 152 if valid_expr: | 285 # fail anyway as expression lambdas expect a fixed number of |
| 153 valid_lines = total_lines - skipped_lines | 286 # arguments. |
| 154 print('Creating column %d with expression %s' % (in_columns + 1, expr)) | 287 # Let's pass in a copy of the original string fields, let |
| 155 if valid_lines > 0: | 288 # the computation of the first expression fail, then have that |
| 156 print('kept %4.2f%% of %d lines.' % (100.0 * lines_kept / valid_lines, | 289 # situation handled according to the non-computable settings in |
| 157 total_lines)) | 290 # effect. |
| 158 else: | 291 typed_fields = fields[:] |
| 159 print('Possible invalid expression "%s" or non-existent column referenced. See tool tips, syntax and examples.' % expr) | 292 for fun, col_idx, mode, col_name, ex in ops: |
| 160 if skipped_lines > 0: | 293 try: |
| 161 print('Skipped %d invalid lines starting at line #%d: "%s"' % | 294 try: |
| 162 (skipped_lines, first_invalid_line, invalid_line)) | 295 new_val = fun(*typed_fields) |
| 296 except NameError as e: | |
| 297 # Python 3.10+ would have the problematic name | |
| 298 # available as e.name | |
| 299 if non_existent_col_pat.fullmatch(str(e)) and ( | |
| 300 not args.fail_on_non_existent_columns | |
| 301 ): | |
| 302 # Looks like a reference to a non-existent column | |
| 303 # and we are not supposed to fail on it directly. | |
| 304 # Reraise and have it handled as a non-computable | |
| 305 # row. | |
| 306 raise | |
| 307 # NameErrors are not row-specific, but indicate a | |
| 308 # general problem with the user-supplied expression. | |
| 309 sys.exit( | |
| 310 'While parsing expression "%s" the following ' | |
| 311 'problem occurred: "%s"' % (ex, str(e)) | |
| 312 ) | |
| 313 except Exception as e: | |
| 314 if args.skip_non_computable: | |
| 315 # log that a line got skipped, then stop computing | |
| 316 # for this line | |
| 317 skipped_lines += 1 | |
| 318 if not invalid_line: | |
| 319 first_invalid_line = i + 1 | |
| 320 invalid_line = line | |
| 321 break | |
| 322 if args.keep_non_computable: | |
| 323 # write the original line unchanged and stop computing | |
| 324 # for this line | |
| 325 out.write(line + '\n') | |
| 326 break | |
| 327 if args.non_computable_blank: | |
| 328 new_val = '' | |
| 329 elif args.non_computable_default is not None: | |
| 330 new_val = args.non_computable_default | |
| 331 else: | |
| 332 # --fail-on-non-computable | |
| 333 # (which is default behavior, too) | |
| 334 sys.exit( | |
| 335 'Could not compute a new column value using "%s" on ' | |
| 336 'line #%d: "%s". Error was "%s"' | |
| 337 % (ex, i, line, str(e)) | |
| 338 ) | |
| 339 if mode is Mode.INSERT: | |
| 340 fields.insert(col_idx, new_val) | |
| 341 typed_fields.insert(col_idx, new_val) | |
| 342 elif mode is Mode.REPLACE: | |
| 343 if col_idx > len(fields): | |
| 344 # Intentionally allow "replacing" one column beyond | |
| 345 # current fields since this can be used to fix | |
| 346 # short lines in the input. | |
| 347 sys.exit( | |
| 348 'Cannot replace column #%d in line with %d columns: ' | |
| 349 '"%s"' % (col_idx + 1, len(fields), line) | |
| 350 ) | |
| 351 fields[col_idx:col_idx + 1] = [new_val] | |
| 352 typed_fields[col_idx:col_idx + 1] = [new_val] | |
| 353 else: | |
| 354 fields.append(new_val) | |
| 355 typed_fields.append(new_val) | |
| 356 else: | |
| 357 fields = [to_str(field) for field in fields] | |
| 358 out.write('\t'.join(fields) + '\n') | |
| 359 lines_computed += 1 | |
| 360 | |
| 361 | |
| 362 valid_lines = total_lines - skipped_lines | |
| 363 if valid_lines > 0: | |
| 364 print( | |
| 365 'Computed new column values for %4.2f%% of %d lines written.' | |
| 366 % (100.0 * lines_computed / valid_lines, valid_lines) | |
| 367 ) | |
| 368 elif args.fail_on_non_existent_columns: | |
| 369 # Warn the user that there could be an issue with an expression. | |
| 370 print( | |
| 371 'Could not compute a new column for any input row! ' | |
| 372 'Please check your expression(s) "%s" for problems.' | |
| 373 % actions | |
| 374 ) | |
| 375 else: | |
| 376 # Same, but the problem could also be a reference to a non-existent | |
| 377 # column. | |
| 378 print( | |
| 379 'Could not compute a new column for any input row! ' | |
| 380 'Please check your expression(s) "%s" for references to non-existent ' | |
| 381 'columns or other problems.' | |
| 382 % actions | |
| 383 ) | |
| 384 if skipped_lines > 0: | |
| 385 print('Skipped %d invalid lines starting at line #%d: "%s"' % | |
| 386 (skipped_lines, first_invalid_line, invalid_line)) | |
| 387 if lines_computed < valid_lines: | |
| 388 print( | |
| 389 'Rewrote %d lines unmodified because computation of a new value failed' | |
| 390 % (valid_lines - lines_computed) | |
| 391 ) |
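
For readers who just want the gist of the new interface, the sketch below illustrates the `EXPR;[COL_ADD_SPEC];[COL_NAME]` action grammar introduced by this revision. It is not taken from the commit: the action string, the example row, and the simplified append/insert/replace handling are made-up illustrations, and the real script additionally whitelists the expression, type-casts fields according to `--column-types`, and applies the non-computable switches.

```python
# Minimal sketch of one action of the form EXPR;[COL_ADD_SPEC];[COL_NAME]
# (illustrative assumptions only, not the committed implementation).
action = 'float(c2) / float(c3);4I;ratio'      # hypothetical action string

expr_string, col_add_spec, new_col_name = action.split(';')
new_col_idx = int(col_add_spec[:-1] or '0') - 1   # '4' -> 0-based index 3
mode = col_add_spec[-1:]                          # 'I' insert, 'R' replace, '' append

row = ['chr1', '100', '200', 'geneA']             # one tab-split input line
# The real script only eval()s expressions that pass its whitelist check.
expr_lambda = eval('lambda c1, c2, c3, c4: %s' % expr_string)
new_val = str(expr_lambda(*row))                  # '0.5'

if mode == 'I':
    row.insert(new_col_idx, new_val)              # insert before column 4
elif mode == 'R':
    row[new_col_idx] = new_val                    # replace column 4
else:
    row.append(new_val)                           # append at the end of the row

print('\t'.join(row))                             # chr1  100  200  0.5  geneA
```

A matching invocation of the rewritten script might look like `python column_maker.py input.tsv output.tsv -t 'str,int,int,str' -a 'c2 / c3;4I;ratio'` (file names hypothetical); with the column types supplied, `c2` and `c3` reach the expression already cast to `int`, so the explicit `float()` calls from the sketch are not needed there.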
