comparison column_maker.py @ 3:be25c075ed54 draft

"planemo upload for repository https://github.com/galaxyproject/tools-devteam/tree/master/tools/column_maker commit 2b17bdfc47ca4d7f1a584216c4bd61a7050df7ea"
author devteam
date Thu, 04 Jun 2020 05:03:46 -0400
parents 464b9305180e
children 6e8d94597139
comparison
equal deleted inserted replaced
2:464b9305180e 3:be25c075ed54
1 #!/usr/bin/env python 1 #!/usr/bin/env python
2 # This tool takes a tab-delimited textfile as input and creates another column in the file which is the result of 2 """
3 # a computation performed on every row in the original file. The tool will skip over invalid lines within the file, 3 This tool takes a tab-delimited textfile as input and creates another column in
4 # informing the user about the number of lines skipped. 4 the file which is the result of a computation performed on every row in the
5 original file. The tool will skip over invalid lines within the file,
6 informing the user about the number of lines skipped.
7 """
8 from __future__ import print_function
5 9
6 import sys, re 10 import re
7 # These functions may be used in compute expression: 11 import sys
8 from math import log,exp,sqrt,ceil,floor
9 12
10 13 assert sys.version_info[:2] >= (2, 4)
11 assert sys.version_info[:2] >= ( 2, 4 )
12
13 def stop_err( msg ):
14 sys.stderr.write( msg )
15 sys.exit()
16 14
17 inp_file = sys.argv[1] 15 inp_file = sys.argv[1]
18 out_file = sys.argv[2] 16 out_file = sys.argv[2]
19 expr = sys.argv[3] 17 expr = sys.argv[3]
20 round_result = sys.argv[4] 18 round_result = sys.argv[4]
21 try: 19 try:
22 in_columns = int( sys.argv[5] ) 20 in_columns = int(sys.argv[5])
23 except: 21 except Exception:
24 stop_err( "Missing or invalid 'columns' metadata value, click the pencil icon in the history item and select the Auto-detect option to correct it. This tool can only be used with tab-delimited data." ) 22 exit("Missing or invalid 'columns' metadata value, click the pencil icon in the history item and select the Auto-detect option to correct it. This tool can only be used with tab-delimited data.")
25 if in_columns < 2: 23 if in_columns < 2:
26 # To be considered tabular, data must fulfill requirements of the sniff.is_column_based() method. 24 # To be considered tabular, data must fulfill requirements of the sniff.is_column_based() method.
27 stop_err( "Missing or invalid 'columns' metadata value, click the pencil icon in the history item and select the Auto-detect option to correct it. This tool can only be used with tab-delimited data." ) 25 exit("Missing or invalid 'columns' metadata value, click the pencil icon in the history item and select the Auto-detect option to correct it. This tool can only be used with tab-delimited data.")
28 try: 26 try:
29 in_column_types = sys.argv[6].split( ',' ) 27 in_column_types = sys.argv[6].split(',')
30 except: 28 except Exception:
31 stop_err( "Missing or invalid 'column_types' metadata value, click the pencil icon in the history item and select the Auto-detect option to correct it. This tool can only be used with tab-delimited data." ) 29 exit("Missing or invalid 'column_types' metadata value, click the pencil icon in the history item and select the Auto-detect option to correct it. This tool can only be used with tab-delimited data.")
32 if len( in_column_types ) != in_columns: 30 if len(in_column_types) != in_columns:
33 stop_err( "The 'columns' metadata setting does not conform to the 'column_types' metadata setting, click the pencil icon in the history item and select the Auto-detect option to correct it. This tool can only be used with tab-delimited data." ) 31 exit("The 'columns' metadata setting does not conform to the 'column_types' metadata setting, click the pencil icon in the history item and select the Auto-detect option to correct it. This tool can only be used with tab-delimited data.")
34 32 avoid_scientific_notation = sys.argv[7]
33
35 # Unescape if input has been escaped 34 # Unescape if input has been escaped
36 mapped_str = { 35 mapped_str = {
37 '__lt__': '<', 36 '__lt__': '<',
38 '__le__': '<=', 37 '__le__': '<=',
39 '__eq__': '==', 38 '__eq__': '==',
42 '__ge__': '>=', 41 '__ge__': '>=',
43 '__sq__': '\'', 42 '__sq__': '\'',
44 '__dq__': '"', 43 '__dq__': '"',
45 } 44 }
46 for key, value in mapped_str.items(): 45 for key, value in mapped_str.items():
47 expr = expr.replace( key, value ) 46 expr = expr.replace(key, value)
48 47
49 operators = 'is|not|or|and' 48 operators = 'is|not|or|and'
50 builtin_and_math_functions = 'abs|all|any|bin|chr|cmp|complex|divmod|float|bool|hex|int|len|long|max|min|oct|ord|pow|range|reversed|round|sorted|str|sum|type|unichr|unicode|log|exp|sqrt|ceil|floor' 49 builtin_and_math_functions = 'abs|all|any|bin|chr|cmp|complex|divmod|float|bool|hex|int|len|long|max|min|oct|ord|pow|range|reversed|round|sorted|str|sum|type|unichr|unicode|log|exp|sqrt|ceil|floor'
51 string_and_list_methods = [ name for name in dir('') + dir([]) if not name.startswith('_') ] 50 string_and_list_methods = [name for name in dir('') + dir([]) if not name.startswith('_')]
52 whitelist = "^([c0-9\+\-\*\/\(\)\.\'\"><=,:! ]|%s|%s|%s)*$" % (operators, builtin_and_math_functions, '|'.join(string_and_list_methods)) 51 whitelist = r"^([c0-9\+\-\*\/\(\)\.\'\"><=,:! ]|%s|%s|%s)*$" % (operators, builtin_and_math_functions, '|'.join(string_and_list_methods))
53 if not re.compile(whitelist).match(expr): 52 if not re.compile(whitelist).match(expr):
54 stop_err("Invalid expression") 53 exit("Invalid expression")
54 if avoid_scientific_notation == "yes":
55 expr = "format_float_positional(%s)" % expr
55 56
56 # Prepare the column variable names and wrappers for column data types 57 # Prepare the column variable names and wrappers for column data types
57 cols, type_casts = [], [] 58 cols, type_casts = [], []
58 for col in range( 1, in_columns + 1 ): 59 for col in range(1, in_columns + 1):
59 col_name = "c%d" % col 60 col_name = "c%d" % col
60 cols.append( col_name ) 61 cols.append(col_name)
61 col_type = in_column_types[ col - 1 ].strip() 62 col_type = in_column_types[col - 1].strip()
62 if round_result == 'no' and col_type == 'int': 63 if round_result == 'no' and col_type == 'int':
63 col_type = 'float' 64 col_type = 'float'
64 type_cast = "%s(%s)" % ( col_type, col_name ) 65 type_cast = "%s(%s)" % (col_type, col_name)
65 type_casts.append( type_cast ) 66 type_casts.append(type_cast)
66 67
67 col_str = ', '.join( cols ) # 'c1, c2, c3, c4' 68 col_str = ', '.join(cols) # 'c1, c2, c3, c4'
68 type_cast_str = ', '.join( type_casts ) # 'str(c1), int(c2), int(c3), str(c4)' 69 type_cast_str = ', '.join(type_casts) # 'str(c1), int(c2), int(c3), str(c4)'
69 assign = "%s = line.split( '\\t' )" % col_str 70 assign = "%s = line.split('\\t')" % col_str
70 wrap = "%s = %s" % ( col_str, type_cast_str ) 71 wrap = "%s = %s" % (col_str, type_cast_str)
71 skipped_lines = 0 72 skipped_lines = 0
72 first_invalid_line = 0 73 first_invalid_line = 0
73 invalid_line = None 74 invalid_line = None
74 lines_kept = 0 75 lines_kept = 0
75 total_lines = 0 76 total_lines = 0
76 out = open( out_file, 'wt' ) 77 out = open(out_file, 'wt')
77 78
78 # Read input file, skipping invalid lines, and perform computation that will result in a new column 79 # Read input file, skipping invalid lines, and perform computation that will result in a new column
79 code = ''' 80 code = '''
80 for i, line in enumerate( file( inp_file ) ): 81 # import here since flake8 complains otherwise
82 from math import (
83 ceil,
84 exp,
85 floor,
86 log,
87 sqrt
88 )
89 from numpy import format_float_positional
90
91 fh = open(inp_file)
92 for i, line in enumerate(fh):
81 total_lines += 1 93 total_lines += 1
82 line = line.rstrip( '\\r\\n' ) 94 line = line.rstrip('\\r\\n')
83 if not line or line.startswith( '#' ): 95 if not line or line.startswith('#'):
84 skipped_lines += 1 96 skipped_lines += 1
85 if not invalid_line: 97 if not invalid_line:
86 first_invalid_line = i + 1 98 first_invalid_line = i + 1
87 invalid_line = line 99 invalid_line = line
88 continue 100 continue
89 try: 101 try:
90 %s 102 %s
91 %s 103 %s
92 new_val = %s 104 new_val = %s
93 if round_result == "yes": 105 if round_result == "yes":
94 new_val = int( round( new_val ) ) 106 new_val = int(round(new_val))
95 new_line = line + '\\t' + str( new_val ) 107 new_line = line + '\\t' + str(new_val) + "\\n"
96 print >> out, new_line 108 out.write(new_line)
97 lines_kept += 1 109 lines_kept += 1
98 except: 110 except Exception:
99 skipped_lines += 1 111 skipped_lines += 1
100 if not invalid_line: 112 if not invalid_line:
101 first_invalid_line = i + 1 113 first_invalid_line = i + 1
102 invalid_line = line 114 invalid_line = line
103 ''' % ( assign, wrap, expr ) 115 fh.close()
116 ''' % (assign, wrap, expr)
104 117
105 valid_expr = True 118 valid_expr = True
106 try: 119 try:
107 exec code 120 exec(code)
108 except Exception, e: 121 except Exception as e:
109 out.close() 122 out.close()
110 if str( e ).startswith( 'invalid syntax' ): 123 if str(e).startswith('invalid syntax'):
111 valid_expr = False 124 valid_expr = False
112 stop_err( 'Expression "%s" likely invalid. See tool tips, syntax and examples.' % expr ) 125 exit('Expression "%s" likely invalid. See tool tips, syntax and examples.' % expr)
113 else: 126 else:
114 stop_err( str( e ) ) 127 exit(str(e))
115 128
116 if valid_expr: 129 if valid_expr:
117 out.close() 130 out.close()
118 valid_lines = total_lines - skipped_lines 131 valid_lines = total_lines - skipped_lines
119 print 'Creating column %d with expression %s' % ( in_columns + 1, expr ) 132 print('Creating column %d with expression %s' % (in_columns + 1, expr))
120 if valid_lines > 0: 133 if valid_lines > 0:
121 print 'kept %4.2f%% of %d lines.' % ( 100.0*lines_kept/valid_lines, total_lines ) 134 print('kept %4.2f%% of %d lines.' % (100.0 * lines_kept / valid_lines,
135 total_lines))
122 else: 136 else:
123 print 'Possible invalid expression "%s" or non-existent column referenced. See tool tips, syntax and examples.' % expr 137 print('Possible invalid expression "%s" or non-existent column referenced. See tool tips, syntax and examples.' % expr)
124 if skipped_lines > 0: 138 if skipped_lines > 0:
125 print 'Skipped %d invalid lines starting at line #%d: "%s"' % ( skipped_lines, first_invalid_line, invalid_line ) 139 print('Skipped %d invalid lines starting at line #%d: "%s"' %
140 (skipped_lines, first_invalid_line, invalid_line))