Mercurial > repos > iuc > table_compute
diff scripts/safety.py @ 0:1b0f96ed73f2 draft
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/table_compute commit 1ee75135483d5db22c540bc043746cd986f85762"
author | iuc |
---|---|
date | Sat, 17 Aug 2019 16:25:37 -0400 |
parents | |
children | dddadbbac949 |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/scripts/safety.py Sat Aug 17 16:25:37 2019 -0400 @@ -0,0 +1,211 @@ +import re + + +class Safety(): + """ + Class to safely evaluate mathematical expression on single + or table data + """ + + __allowed_tokens = ( + '(', ')', 'if', 'else', 'or', 'and', 'not', 'in', + '+', '-', '*', '/', '%', ',', '!=', '==', '>', '>=', '<', '<=', + 'min', 'max', 'sum', + ) + __allowed_ref_types = { + 'pd.DataFrame': { + 'abs', 'add', 'agg', 'aggregate', 'align', 'all', 'any', 'append', + 'apply', 'applymap', 'as_matrix', 'asfreq', 'at', 'axes', 'bool', + 'clip', 'clip_lower', 'clip_upper', 'columns', 'combine', + 'compound', 'corr', 'count', 'cov', 'cummax', 'cummin', 'cumprod', + 'cumsum', 'describe', 'div', 'divide', 'dot', 'drop', + 'drop_duplicates', 'droplevel', 'dropna', 'duplicated', 'empty', + 'eq', 'equals', 'expanding', 'ffill', 'fillna', 'filter', 'first', + 'first_valid_index', 'floordiv', 'ge', 'groupby', 'gt', 'head', + 'iat', 'iloc', 'index', 'insert', 'interpolate', 'isin', 'isna', + 'isnull', 'items', 'iteritems', 'iterrows', 'itertuples', 'ix', + 'join', 'keys', 'kurt', 'kurtosis', 'last', 'last_valid_index', + 'le', 'loc', 'lookup', 'lt', 'mad', 'mask', 'max', 'mean', + 'median', 'melt', 'merge', 'min', 'mod', 'mode', 'mul', 'multiply', + 'ndim', 'ne', 'nlargest', 'notna', 'notnull', 'nsmallest', + 'nunique', 'pct_change', 'pivot', 'pivot_table', 'pop', 'pow', + 'prod', 'product', 'quantile', 'radd', 'rank', 'rdiv', 'replace', + 'resample', 'rfloordiv', 'rmod', 'rmul', 'rolling', 'round', + 'rpow', 'rsub', 'rtruediv', 'sample', 'select', + 'sem', 'shape', 'shift', 'size', 'skew', 'slice_shift', + 'squeeze', 'stack', 'std', 'sub', 'subtract', 'sum', 'swapaxes', + 'swaplevel', 'T', 'tail', 'take', 'transform', 'transpose', + 'truediv', 'truncate', 'tshift', 'unstack', 'var', 'where', + }, + 'pd.Series': { + 'abs', 'add', 'agg', 'aggregate', 'align', 'all', 'any', 'append', + 'apply', 'argsort', 'as_matrix', 'asfreq', 'asof', 'astype', 'at', + 'at_time', 'autocorr', 'axes', 'between', 'between_time', 'bfill', + 'bool', 'cat', 'clip', 'clip_lower', 'clip_upper', 'combine', + 'combine_first', 'compound', 'corr', 'count', 'cov', 'cummax', + 'cummin', 'cumprod', 'cumsum', 'describe', 'diff', 'div', 'divide', + 'divmod', 'dot', 'drop', 'drop_duplicates', 'droplevel', 'dropna', + 'dt', 'dtype', 'dtypes', 'duplicated', 'empty', 'eq', 'equals', + 'ewm', 'expanding', 'factorize', 'ffill', 'fillna', 'filter', + 'first', 'first_valid_index', 'flags', 'floordiv', 'ge', 'groupby', + 'gt', 'hasnans', 'head', 'iat', 'idxmax', 'idxmin', 'iloc', 'imag', + 'index', 'interpolate', 'is_monotonic', 'is_monotonic_decreasing', + 'is_monotonic_increasing', 'is_unique', 'isin', 'isna', 'isnull', + 'item', 'items', 'iteritems', 'ix', 'keys', 'kurt', 'kurtosis', + 'last', 'last_valid_index', 'le', 'loc', 'lt', 'mad', 'map', + 'mask', 'max', 'mean', 'median', 'min', 'mod', 'mode', 'mul', + 'multiply', 'name', 'ndim', 'ne', 'nlargest', 'nonzero', 'notna', + 'notnull', 'nsmallest', 'nunique', 'pct_change', 'pop', 'pow', + 'prod', 'product', 'ptp', 'quantile', 'radd', 'rank', 'rdiv', + 'rdivmod', 'real', 'repeat', 'replace', 'resample', 'rfloordiv', + 'rmod', 'rmul', 'rolling', 'round', 'rpow', 'rsub', 'rtruediv', + 'sample', 'searchsorted', 'select', 'sem', 'shape', 'shift', + 'size', 'skew', 'slice_shift', 'sort_index', 'sort_values', + 'squeeze', 'std', 'sub', 'subtract', 'sum', 'swapaxes', + 'swaplevel', 'T', 'tail', 'take', 'transform', 'transpose', + 'truediv', 'truncate', 'tshift', 'unique', 'unstack', + 'value_counts', 'var', 'where', 'xs', + }, + } + + __allowed_qualified = { + # allowed numpy functionality + 'np': { + 'abs', 'add', 'all', 'any', 'append', 'array', 'bool', 'ceil', + 'complex', 'cos', 'cosh', 'cov', 'cumprod', 'cumsum', 'degrees', + 'divide', 'divmod', 'dot', 'e', 'empty', 'exp', 'float', 'floor', + 'hypot', 'inf', 'int', 'isfinite', 'isin', 'isinf', 'isnan', 'log', + 'log10', 'log2', 'max', 'mean', 'median', 'min', 'mod', 'multiply', + 'nan', 'ndim', 'pi', 'product', 'quantile', 'radians', 'rank', + 'remainder', 'round', 'sin', 'sinh', 'size', 'sqrt', 'squeeze', + 'stack', 'std', 'str', 'subtract', 'sum', 'swapaxes', 'take', + 'tan', 'tanh', 'transpose', 'unique', 'var', 'where', + }, + # allowed math functionality + 'math': { + 'acos', 'acosh', 'asin', 'asinh', 'atan', 'atan2', 'atanh', 'ceil', + 'copysign', 'cos', 'cosh', 'degrees', 'e', 'erf', 'erfc', 'exp', + 'expm1', 'fabs', 'factorial', 'floor', 'fmod', 'frexp', 'fsum', + 'gamma', 'gcd', 'hypot', 'inf', 'isclose', 'isfinite', 'isinf', + 'isnan', 'ldexp', 'lgamma', 'log', 'log10', 'log1p', 'log2', + 'modf', 'nan', 'pi', 'pow', 'radians', 'remainder', 'sin', 'sinh', + 'sqrt', 'tan', 'tanh', 'tau', 'trunc', + }, + # allowed pd functionality + 'pd': { + 'DataFrame', 'array', 'concat', 'cut', 'date_range', 'factorize', + 'interval_range', 'isna', 'isnull', 'melt', 'merge', 'notna', + 'notnull', 'period_range', 'pivot', 'pivot_table', 'unique', + 'value_counts', 'wide_to_long', + }, + } + + def __init__(self, expression, + ref_whitelist=None, ref_type=None, + custom_qualified=None): + self.allowed_qualified = self.__allowed_qualified.copy() + if ref_whitelist is None: + self.these = [] + else: + self.these = ref_whitelist + if ref_type is None or ref_type not in self.__allowed_ref_types: + self.allowed_qualified['_this'] = set() + else: + self.allowed_qualified[ + '_this' + ] = self.__allowed_ref_types[ref_type] + if custom_qualified is not None: + self.allowed_qualified.update(custom_qualified) + self.expr = expression + self.__assertSafe() + + def generateFunction(self): + "Generates a function to be evaluated outside the class" + cust_fun = "def fun(%s):\n\treturn(%s)" % (self.these[0], self.expr) + return cust_fun + + def __assertSafe(self): + indeed, problematic_token = self.__isSafeStatement() + if not indeed: + self.detailedExcuse(problematic_token) + raise ValueError("Custom Expression is not safe.") + + @staticmethod + def detailedExcuse(word): + "Gives a verbose statement for why users should not use some specific operators." + mess = None + if word == "for": + mess = "for loops and comprehensions are not allowed. Use numpy or pandas table operations instead." + elif word == ":": + mess = "Colons are not allowed. Use inline Python if/else statements." + elif word == "=": + mess = "Variable assignment is not allowed. Use object methods to substitute values." + elif word in ("[", "]"): + mess = "Direct indexing of arrays is not allowed. Use numpy or pandas functions/methods to address specific parts of tables." + else: + mess = "Not an allowed token in this operation" + print("( '%s' ) %s" % (word, mess)) + + def __isSafeStatement(self): + """ + Determines if a user-expression is safe to evaluate. + + To be considered safe an expression may contain only: + - standard Python operators and numbers + - inline conditional expressions + - select functions and objects + by default, these come from the math, numpy and pandas + libraries, and must be qualified with the modules' conventional + names math, np, pd; can be overridden at the instance level + - references to a whitelist of objects (pd.DataFrames by default) + and their methods + """ + + safe = True + # examples of user-expressions + # '-math.log(1 - elem/4096) * 4096 if elem != bn else elem - 0.5' + # 'vec.median() + vec.sum()' + + # 1. Break expressions into tokens + # e.g., + # [ + # '-', 'math.log', '(', '1', '-', 'elem', '/', '4096', ')', '*', + # '4096', 'if', 'elem', '!=', 'bn', 'else', 'elem', '-', '0.5' + # ] + # or + # ['vec.median', '(', ')', '+', 'vec.sum', '(', ')'] + tokens = [ + e for e in re.split( + r'([a-zA-Z0-9_.]+|[^a-zA-Z0-9_.() ]+|[()])', self.expr + ) if e.strip() + ] + + # 2. Subtract allowed standard tokens + rem = [e for e in tokens if e not in self.__allowed_tokens] + + # 3. Subtract allowed qualified objects from allowed modules + # and whitelisted references and their attributes + rem2 = [] + for e in rem: + parts = e.split('.') + if len(parts) == 1: + if parts[0] in self.these: + continue + if len(parts) == 2: + if parts[0] in self.these: + parts[0] = '_this' + if parts[0] in self.allowed_qualified: + if parts[1] in self.allowed_qualified[parts[0]]: + continue + rem2.append(e) + + # 4. Assert that rest are real numbers + e = '' + for e in rem2: + try: + _ = float(e) + except ValueError: + safe = False + break + + return safe, e