Mercurial > repos > iuc > table_compute
comparison scripts/safety.py @ 0:1b0f96ed73f2 draft
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/table_compute commit 1ee75135483d5db22c540bc043746cd986f85762"
| author | iuc |
|---|---|
| date | Sat, 17 Aug 2019 16:25:37 -0400 |
| parents | |
| children | dddadbbac949 |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:1b0f96ed73f2 |
|---|---|
| 1 import re | |
| 2 | |
| 3 | |
class Safety():
    """
    Class to safely evaluate mathematical expression on single
    or table data

    An expression is validated at construction time: it is split into
    tokens and every token must be either an allowed operator/keyword,
    an allowed qualified name (``module.attr`` or ``reference.attr``),
    a whitelisted reference name, or a plain numeric literal.
    A ValueError is raised if any other token is found.

    Instance attributes:
      expr               the validated expression string
      these              the whitelisted reference names
      allowed_qualified  mapping of qualifier -> allowed attribute names;
                         the key '_this' holds the attributes allowed on
                         whitelisted references
    """

    __allowed_tokens = (
        '(', ')', 'if', 'else', 'or', 'and', 'not', 'in',
        '+', '-', '*', '/', '%', ',', '!=', '==', '>', '>=', '<', '<=',
        'min', 'max', 'sum',
    )
    __allowed_ref_types = {
        'pd.DataFrame': {
            'abs', 'add', 'agg', 'aggregate', 'align', 'all', 'any', 'append',
            'apply', 'applymap', 'as_matrix', 'asfreq', 'at', 'axes', 'bool',
            'clip', 'clip_lower', 'clip_upper', 'columns', 'combine',
            'compound', 'corr', 'count', 'cov', 'cummax', 'cummin', 'cumprod',
            'cumsum', 'describe', 'div', 'divide', 'dot', 'drop',
            'drop_duplicates', 'droplevel', 'dropna', 'duplicated', 'empty',
            'eq', 'equals', 'expanding', 'ffill', 'fillna', 'filter', 'first',
            'first_valid_index', 'floordiv', 'ge', 'groupby', 'gt', 'head',
            'iat', 'iloc', 'index', 'insert', 'interpolate', 'isin', 'isna',
            'isnull', 'items', 'iteritems', 'iterrows', 'itertuples', 'ix',
            'join', 'keys', 'kurt', 'kurtosis', 'last', 'last_valid_index',
            'le', 'loc', 'lookup', 'lt', 'mad', 'mask', 'max', 'mean',
            'median', 'melt', 'merge', 'min', 'mod', 'mode', 'mul', 'multiply',
            'ndim', 'ne', 'nlargest', 'notna', 'notnull', 'nsmallest',
            'nunique', 'pct_change', 'pivot', 'pivot_table', 'pop', 'pow',
            'prod', 'product', 'quantile', 'radd', 'rank', 'rdiv', 'replace',
            'resample', 'rfloordiv', 'rmod', 'rmul', 'rolling', 'round',
            'rpow', 'rsub', 'rtruediv', 'sample', 'select',
            'sem', 'shape', 'shift', 'size', 'skew', 'slice_shift',
            'squeeze', 'stack', 'std', 'sub', 'subtract', 'sum', 'swapaxes',
            'swaplevel', 'T', 'tail', 'take', 'transform', 'transpose',
            'truediv', 'truncate', 'tshift', 'unstack', 'var', 'where',
        },
        'pd.Series': {
            'abs', 'add', 'agg', 'aggregate', 'align', 'all', 'any', 'append',
            'apply', 'argsort', 'as_matrix', 'asfreq', 'asof', 'astype', 'at',
            'at_time', 'autocorr', 'axes', 'between', 'between_time', 'bfill',
            'bool', 'cat', 'clip', 'clip_lower', 'clip_upper', 'combine',
            'combine_first', 'compound', 'corr', 'count', 'cov', 'cummax',
            'cummin', 'cumprod', 'cumsum', 'describe', 'diff', 'div', 'divide',
            'divmod', 'dot', 'drop', 'drop_duplicates', 'droplevel', 'dropna',
            'dt', 'dtype', 'dtypes', 'duplicated', 'empty', 'eq', 'equals',
            'ewm', 'expanding', 'factorize', 'ffill', 'fillna', 'filter',
            'first', 'first_valid_index', 'flags', 'floordiv', 'ge', 'groupby',
            'gt', 'hasnans', 'head', 'iat', 'idxmax', 'idxmin', 'iloc', 'imag',
            'index', 'interpolate', 'is_monotonic', 'is_monotonic_decreasing',
            'is_monotonic_increasing', 'is_unique', 'isin', 'isna', 'isnull',
            'item', 'items', 'iteritems', 'ix', 'keys', 'kurt', 'kurtosis',
            'last', 'last_valid_index', 'le', 'loc', 'lt', 'mad', 'map',
            'mask', 'max', 'mean', 'median', 'min', 'mod', 'mode', 'mul',
            'multiply', 'name', 'ndim', 'ne', 'nlargest', 'nonzero', 'notna',
            'notnull', 'nsmallest', 'nunique', 'pct_change', 'pop', 'pow',
            'prod', 'product', 'ptp', 'quantile', 'radd', 'rank', 'rdiv',
            'rdivmod', 'real', 'repeat', 'replace', 'resample', 'rfloordiv',
            'rmod', 'rmul', 'rolling', 'round', 'rpow', 'rsub', 'rtruediv',
            'sample', 'searchsorted', 'select', 'sem', 'shape', 'shift',
            'size', 'skew', 'slice_shift', 'sort_index', 'sort_values',
            'squeeze', 'std', 'sub', 'subtract', 'sum', 'swapaxes',
            'swaplevel', 'T', 'tail', 'take', 'transform', 'transpose',
            'truediv', 'truncate', 'tshift', 'unique', 'unstack',
            'value_counts', 'var', 'where', 'xs',
        },
    }

    __allowed_qualified = {
        # allowed numpy functionality
        'np': {
            'abs', 'add', 'all', 'any', 'append', 'array', 'bool', 'ceil',
            'complex', 'cos', 'cosh', 'cov', 'cumprod', 'cumsum', 'degrees',
            'divide', 'divmod', 'dot', 'e', 'empty', 'exp', 'float', 'floor',
            'hypot', 'inf', 'int', 'isfinite', 'isin', 'isinf', 'isnan', 'log',
            'log10', 'log2', 'max', 'mean', 'median', 'min', 'mod', 'multiply',
            'nan', 'ndim', 'pi', 'product', 'quantile', 'radians', 'rank',
            'remainder', 'round', 'sin', 'sinh', 'size', 'sqrt', 'squeeze',
            'stack', 'std', 'str', 'subtract', 'sum', 'swapaxes', 'take',
            'tan', 'tanh', 'transpose', 'unique', 'var', 'where',
        },
        # allowed math functionality
        'math': {
            'acos', 'acosh', 'asin', 'asinh', 'atan', 'atan2', 'atanh', 'ceil',
            'copysign', 'cos', 'cosh', 'degrees', 'e', 'erf', 'erfc', 'exp',
            'expm1', 'fabs', 'factorial', 'floor', 'fmod', 'frexp', 'fsum',
            'gamma', 'gcd', 'hypot', 'inf', 'isclose', 'isfinite', 'isinf',
            'isnan', 'ldexp', 'lgamma', 'log', 'log10', 'log1p', 'log2',
            'modf', 'nan', 'pi', 'pow', 'radians', 'remainder', 'sin', 'sinh',
            'sqrt', 'tan', 'tanh', 'tau', 'trunc',
        },
        # allowed pd functionality
        'pd': {
            'DataFrame', 'array', 'concat', 'cut', 'date_range', 'factorize',
            'interval_range', 'isna', 'isnull', 'melt', 'merge', 'notna',
            'notnull', 'period_range', 'pivot', 'pivot_table', 'unique',
            'value_counts', 'wide_to_long',
        },
    }

    # Tokenizer: runs of identifier/number characters (dots included, so
    # 'math.log' and '0.5' stay in one piece), runs of any other non-space
    # characters (operators), and single parentheses.
    __token_pattern = re.compile(
        r'([a-zA-Z0-9_.]+|[^a-zA-Z0-9_.() ]+|[()])'
    )

    def __init__(self, expression,
                 ref_whitelist=None, ref_type=None,
                 custom_qualified=None):
        """
        Validate `expression` and raise ValueError if it is not safe.

        expression        the user expression (a string)
        ref_whitelist     names of objects the expression may reference;
                          the first one becomes the parameter of the
                          function built by generateFunction()
        ref_type          'pd.DataFrame' or 'pd.Series' to allow the
                          corresponding methods/attributes on whitelisted
                          references; any other value allows none
        custom_qualified  optional mapping qualifier -> allowed names that
                          is merged over the default np/math/pd tables
        """
        # Shallow copy: per-instance keys can be added or replaced without
        # touching the class-level table (the value sets are never mutated).
        self.allowed_qualified = self.__allowed_qualified.copy()
        if ref_whitelist is None:
            self.these = []
        else:
            self.these = ref_whitelist
        if ref_type is None or ref_type not in self.__allowed_ref_types:
            self.allowed_qualified['_this'] = set()
        else:
            self.allowed_qualified[
                '_this'
            ] = self.__allowed_ref_types[ref_type]
        if custom_qualified is not None:
            self.allowed_qualified.update(custom_qualified)
        self.expr = expression
        self.__assertSafe()

    def generateFunction(self):
        "Generates a function to be evaluated outside the class"
        # Only the first whitelisted reference becomes a parameter.  With an
        # empty whitelist there is nothing to bind, so emit a zero-argument
        # function instead of failing with an IndexError.
        param = self.these[0] if len(self.these) > 0 else ''
        return "def fun(%s):\n\treturn(%s)" % (param, self.expr)

    def __assertSafe(self):
        "Raises ValueError (after printing an explanation) for unsafe expressions."
        indeed, problematic_token = self.__isSafeStatement()
        if not indeed:
            self.detailedExcuse(problematic_token)
            raise ValueError("Custom Expression is not safe.")

    @staticmethod
    def detailedExcuse(word):
        "Gives a verbose statement for why users should not use some specific operators."
        if word == "for":
            mess = "for loops and comprehensions are not allowed. Use numpy or pandas table operations instead."
        elif word == ":":
            mess = "Colons are not allowed. Use inline Python if/else statements."
        elif word == "=":
            mess = "Variable assignment is not allowed. Use object methods to substitute values."
        elif word in ("[", "]"):
            mess = "Direct indexing of arrays is not allowed. Use numpy or pandas functions/methods to address specific parts of tables."
        else:
            mess = "Not an allowed token in this operation"
        print("( '%s' ) %s" % (word, mess))

    def __isAllowedReference(self, token):
        "True for a whitelisted reference or an allowed 'qualifier.name' pair."
        parts = token.split('.')
        if len(parts) == 1:
            return parts[0] in self.these
        if len(parts) != 2:
            # deeper attribute chains (a.b.c) are never allowed
            return False
        qualifier, attribute = parts
        if qualifier in self.these:
            # whitelisted references share one table of allowed attributes
            qualifier = '_this'
        if qualifier not in self.allowed_qualified:
            return False
        return attribute in self.allowed_qualified[qualifier]

    @staticmethod
    def __isNumber(token):
        "True if token is a plain numeric literal such as 4096, 0.5 or 1e5."
        # float() alone also accepts the words 'nan', 'inf' and 'infinity'
        # (in any case), which are identifiers rather than literals; a
        # numeric literal always starts with an ASCII digit or a dot.
        if token[0] not in '0123456789.':
            return False
        try:
            float(token)
        except ValueError:
            return False
        return True

    def __isSafeStatement(self):
        """
        Determines if a user-expression is safe to evaluate.

        To be considered safe an expression may contain only:
        - standard Python operators and numbers
        - inline conditional expressions
        - select functions and objects
          by default, these come from the math, numpy and pandas
          libraries, and must be qualified with the modules' conventional
          names math, np, pd; can be overridden at the instance level
        - references to a whitelist of objects (pd.DataFrames by default)
          and their methods

        Returns a tuple (safe, token): (True, '') for a safe expression,
        otherwise (False, <first offending token>).
        """
        # examples of user-expressions
        # '-math.log(1 - elem/4096) * 4096 if elem != bn else elem - 0.5'
        # 'vec.median() + vec.sum()'

        # 1. Break expressions into tokens
        # e.g.,
        # [
        #     '-', 'math.log', '(', '1', '-', 'elem', '/', '4096', ')', '*',
        #     '4096', 'if', 'elem', '!=', 'bn', 'else', 'elem', '-', '0.5'
        # ]
        # or
        # ['vec.median', '(', ')', '+', 'vec.sum', '(', ')']
        tokens = [
            tok for tok in self.__token_pattern.split(self.expr)
            if tok.strip()
        ]

        for tok in tokens:
            # 2. allowed standard tokens
            if tok in self.__allowed_tokens:
                continue
            # 3. allowed qualified objects from allowed modules
            #    and whitelisted references and their attributes
            if self.__isAllowedReference(tok):
                continue
            # 4. everything else has to be a real number
            if self.__isNumber(tok):
                continue
            return False, tok

        return True, ''
