Galaxy |

Changeset 1:dddadbbac949 (2019-08-30)

Previous changeset 0:1b0f96ed73f2 (2019-08-17) Next changeset 2:02c3e335a695 (2019-09-13)

Commit message:
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/table_compute commit 6820ec9431a22576f3716c40feeb27f0b8cf5e83"

modified:
scripts/safety.py
scripts/table_compute.py
table_compute.xml

added:
test-data/skiplines.tsv

diff -r 1b0f96ed73f2 -r dddadbbac949 scripts/safety.py
--- a/scripts/safety.py Sat Aug 17 16:25:37 2019 -0400
+++ b/scripts/safety.py Fri Aug 30 05:28:18 2019 -0400

[

@@ -11,6 +11,7 @@
         '(', ')', 'if', 'else', 'or', 'and', 'not', 'in',
         '+', '-', '*', '/', '%', ',', '!=', '==', '>', '>=', '<', '<=',
         'min', 'max', 'sum',
+        'str', 'int', 'float'
     )
     __allowed_ref_types = {
         'pd.DataFrame': {
@@ -163,26 +164,25 @@

         safe = True
         # examples of user-expressions
-        # '-math.log(1 - elem/4096) * 4096 if elem != bn else elem - 0.5'
+        # '-math.log(1 - elem/4096) * 4096 if elem != 1 else elem - 0.5'
         # 'vec.median() +  vec.sum()'

         # 1. Break expressions into tokens
         # e.g.,
         # [
         #     '-', 'math.log', '(', '1', '-', 'elem', '/', '4096', ')', '*',
-        #     '4096', 'if', 'elem', '!=', 'bn', 'else', 'elem', '-', '0.5'
+        #     '4096', 'if', 'elem', '!=', '1', 'else', 'elem', '-', '0.5'
         # ]
         # or
         # ['vec.median', '(', ')', '+', 'vec.sum', '(', ')']
         tokens = [
             e for e in re.split(
-                r'([a-zA-Z0-9_.]+|[^a-zA-Z0-9_.() ]+|[()])', self.expr
+                r'("[a-zA-Z%0-9_.]+"|[a-zA-Z0-9_.]+|[^a-zA-Z0-9_.() ]+|[()])', self.expr
             ) if e.strip()
         ]

         # 2. Subtract allowed standard tokens
         rem = [e for e in tokens if e not in self.__allowed_tokens]
-
         # 3. Subtract allowed qualified objects from allowed modules
         #    and whitelisted references and their attributes
         rem2 = []
@@ -194,18 +194,32 @@
             if len(parts) == 2:
                 if parts[0] in self.these:
                     parts[0] = '_this'
+                elif parts[0] == "":
+                    # e.g. '.T' gives ['','.T']
+                    # Here we assume that the blank parts[0] refers to the
+                    # self.ref_type (e.g. "pd.DataFrame"), and that
+                    # the second part is a function of that type.
+                    if parts[1] in self.allowed_qualified['_this']:
+                        continue
+
                 if parts[0] in self.allowed_qualified:
                     if parts[1] in self.allowed_qualified[parts[0]]:
                         continue
+
             rem2.append(e)

-        # 4. Assert that rest are real numbers
+        # Debug
+        # for x in (tokens, rem, rem2):print(x)
+
+        # 4. Assert that rest are real numbers or strings
         e = ''
         for e in rem2:
             try:
                 _ = float(e)
             except ValueError:
-                safe = False
-                break
+                # e.g. '"TEXT"' is okay.
+                if not(e[0] == '"' and e[-1] == '"'):
+                    safe = False
+                    break

         return safe, e

diff -r 1b0f96ed73f2 -r dddadbbac949 scripts/table_compute.py
--- a/scripts/table_compute.py Sat Aug 17 16:25:37 2019 -0400
+++ b/scripts/table_compute.py Fri Aug 30 05:28:18 2019 -0400

[

b'@@ -3,7 +3,8 @@\n Table Compute tool - a wrapper around pandas with parameter input validation.\n """\n \n-__version__ = "0.8"\n+\n+__version__ = "0.9.1"\n \n import csv\n import math\n@@ -11,15 +12,17 @@\n \n import numpy as np\n import pandas as pd\n-import userconfig as uc\n from safety import Safety\n-# This should be generated in the same directory\n \n-# Version command should not need to copy the config\n if len(argv) == 2 and argv[1] == "--version":\n print(__version__)\n exit(-1)\n \n+# The import below should be generated in the same directory as\n+# the table_compute.py script.\n+# It is placed here so that the --version switch does not fail\n+import userconfig as uc # noqa: I100,I202\n+\n \n class Utils:\n @staticmethod\n@@ -37,12 +40,74 @@\n "Returns a valid two value DataFrame or Series operator"\n return getattr(type(pd_obj), "__" + op_name + "__")\n \n+ @staticmethod\n+ def readcsv(filedict, narm):\n+ data = pd.read_csv(\n+ filedict["file"],\n+ header=filedict["header"],\n+ index_col=filedict["row_names"],\n+ keep_default_na=narm,\n+ nrows=filedict["nrows"],\n+ skipfooter=filedict["skipfooter"],\n+ skip_blank_lines=filedict["skip_blank_lines"],\n+ sep=\'\\t\'\n+ )\n+ # Fix whitespace issues in index or column names\n+ data.columns = [col.strip() if type(col) is str else col\n+ for col in data.columns]\n+ data.index = [row.strip() if type(row) is str else row\n+ for row in data.index]\n+ return(data)\n \n-# Math is imported but not directly used because users\n-# may specify a "math.<function>" when inserting a custom\n-# function. To remove linting errors, which break CI testing\n-# we will just use an arbitrary math statement here.\n-__ = math.log\n+ @staticmethod\n+ def rangemaker(tab):\n+ # e.g. 
"1:3,2:-2" specifies "1,2,3,2,1,0,-1,-2" to give [0,1,2,1,0,-1,-2]\n+ # Positive indices are decremented by 1 to reference 0-base numbering\n+ # Negative indices are unaltered, so that -1 refers to the last column\n+ out = []\n+ err_mess = None\n+ for ranges in tab.split(","):\n+ nums = ranges.split(":")\n+ if len(nums) == 1:\n+ numb = int(nums[0])\n+ # Positive numbers get decremented.\n+ # i.e. column "3" refers to index 2\n+ # column "-1" still refers to index -1\n+ if numb != 0:\n+ out.append(numb if (numb < 0) else (numb - 1))\n+ else:\n+ err_mess = "Please do not use 0 as an index"\n+ elif len(nums) == 2:\n+ left, right = map(int, nums)\n+ if 0 in (left, right):\n+ err_mess = "Please do not use 0 as an index"\n+ elif left < right:\n+ if left > 0: # and right > 0 too\n+ # 1:3 to 0,1,2\n+ out.extend(range(left - 1, right))\n+ elif right < 0: # and left < 0 too\n+ # -3:-1 to -3,-2,-1\n+ out.extend(range(left, right + 1))\n+ elif left < 0 and right > 0:\n+ # -2:2 to -2,-1,0,1\n+ out.extend(range(left, 0))\n+ out.extend(range(0, right))\n+ elif right < left:\n+ if right > 0: # and left > 0\n+ # 3:1 to 2,1,0\n+ out.extend(range(left - 1, right - 2, -1))\n+ elif left < 0: # and right < 0\n+ # -1:-3 to -1,-2,-3\n+ out.extend(range(left, right - 1, -1))\n+ elif right < 0 and left > 0:\n+ # 2:-2 to 1,0,-1,-2\n+ out.extend(range(left - 1, right - 1, -1))\n+ else:\n+ err_mess = "%s should no'..b'c.Data["tables"][0]["reader_header"],\n- index_col=uc.Data["tables"][0]["reader_row_col"],\n- keep_default_na=uc.Default["narm"],\n- sep=\'\\t\'\n- )\n- # Fix whitespace issues in index or column names\n- data.columns = [col.strip() if type(col) is str else col\n- for col in data.columns]\n- data.index = [row.strip() if type(row) is str else row\n- for row in data.index]\n-\n+ data = Utils.readcsv(uc.Data["tables"][0], uc.Default["narm"])\n user_mode_single = params["user_mode_single"]\n \n if user_mode_single == "precision":\n@@ -79,9 +132,13 @@\n rows_specified = 
params["select_rows_wanted"]\n \n # Select all indexes if empty array of values\n- if not cols_specified:\n+ if cols_specified:\n+ cols_specified = Utils.rangemaker(cols_specified)\n+ else:\n cols_specified = range(len(data.columns))\n- if not rows_specified:\n+ if rows_specified:\n+ rows_specified = Utils.rangemaker(rows_specified)\n+ else:\n rows_specified = range(len(data))\n \n # do not use duplicate indexes\n@@ -161,16 +218,44 @@\n elif user_mode_single == "element":\n # lt, gt, ge, etc.\n operation = params["element_op"]\n+ bool_mat = None\n if operation is not None:\n- op = Utils.getTwoValuePandaOp(operation, data)\n- value = params["element_value"]\n- try:\n- # Could be numeric\n- value = float(value)\n- except ValueError:\n- pass\n- # generate filter matrix of True/False values\n- bool_mat = op(data, value)\n+ if operation == "rowcol":\n+ # Select all indexes if empty array of values\n+ if "element_cols" in params:\n+ cols_specified = Utils.rangemaker(params["element_cols"])\n+ else:\n+ cols_specified = range(len(data.columns))\n+ if "element_rows" in params:\n+ rows_specified = Utils.rangemaker(params["element_rows"])\n+ else:\n+ rows_specified = range(len(data))\n+\n+ # Inclusive selection:\n+ # - True: Giving a row or column will match all elements in that row or column\n+ # - False: Give a row or column will match only elements in both those rows or columns\n+ inclusive = params["element_inclusive"]\n+\n+ # Create a bool matrix (intialised to False) with selected\n+ # rows and columns set to True\n+ bool_mat = data.copy()\n+ bool_mat[:] = False\n+ if inclusive:\n+ bool_mat.iloc[rows_specified, :] = True\n+ bool_mat.iloc[:, cols_specified] = True\n+ else:\n+ bool_mat.iloc[rows_specified, cols_specified] = True\n+\n+ else:\n+ op = Utils.getTwoValuePandaOp(operation, data)\n+ value = params["element_value"]\n+ try:\n+ # Could be numeric\n+ value = float(value)\n+ except ValueError:\n+ pass\n+ # generate filter matrix of True/False values\n+ bool_mat = 
op(data, value)\n else:\n # implement no filtering through a filter matrix filled with\n # True values.\n@@ -265,13 +350,7 @@\n \n # Read and populate tables\n for x, t_sect in enumerate(table_sections):\n- tmp = pd.read_csv(\n- t_sect["file"],\n- header=t_sect["header"],\n- index_col=t_sect["row_names"],\n- keep_default_na=uc.Default["narm"],\n- sep="\\t"\n- )\n+ tmp = Utils.readcsv(t_sect, uc.Default["narm"])\n table.append(tmp)\n table_names.append("table" + str(x + 1))\n table_names_real.append("table[" + str(x) + "]")\n'

diff -r 1b0f96ed73f2 -r dddadbbac949 table_compute.xml
--- a/table_compute.xml Sat Aug 17 16:25:37 2019 -0400
+++ b/table_compute.xml Fri Aug 30 05:28:18 2019 -0400

[

b'@@ -1,7 +1,7 @@\n <tool id="table_compute" name="Table Compute" version="@VERSION@">\n <description>computes operations on table data</description>\n <macros>\n- <token name="@VERSION@">0.8</token>\n+ <token name="@VERSION@">0.9.1</token>\n <token name="@COPEN@"><![CDATA[<code>]]></token>\n <token name="@CCLOSE@"><![CDATA[</code>]]></token>\n <import>allowed_functions.xml</import>\n@@ -19,11 +19,11 @@\n <sanitizer sanitize="false" />\n </macro>\n <macro name="validator_index_ranges">\n- <validator type="regex" message="Specify a comma-separated list index numbers or ranges">^(?:\\d+(?::\\d)*(?:, *\\d+(?::\\d)*)*)?$</validator>\n+ <validator type="regex" message="Specify a comma-separated list index numbers or ranges">^(?:-?\\d+(?::-?\\d+)*(?:, *-?\\d+(?::-?\\d+)*)*)?$</validator>\n <sanitizer sanitize="false" />\n </macro>\n <macro name="validator_functiondef">\n- <validator type="regex" message="An expression is required and is allowed to contain only letters, numbers and the characters \'_ !-+=/*%.<>()\'">^[\\w !\\-+=/*%,.<>()]+$</validator>\n+ <validator type="regex" message="An expression is required and is allowed to contain only letters, numbers and the characters \'_ !-+=/*%.<>()\'">^[\'"\\w !\\-+=/*%,.<>()]+$</validator>\n <sanitizer sanitize="false" />\n </macro>\n \n@@ -36,6 +36,16 @@\n </conditional>\n </conditional>\n </macro>\n+ <macro name="test_inputs_ranges" >\n+ <conditional name="singtabop" >\n+ <param name="use_type" value="single" />\n+ <param name="input" value="examples.1.tsv" />\n+ <conditional name="user" >\n+ <param name="mode" value="select" />\n+ <yield />\n+ </conditional>\n+ </conditional>\n+ </macro>\n \n <macro name="umi2trans" >\n <yield />\n@@ -53,14 +63,31 @@\n <macro name="file_opts">\n <param name="input" type="data" format="tsv,tabular" label="Table" />\n <param name="col_row_names" type="select" display="checkboxes" multiple="true" optional="true"\n- label="This input data has">\n+ label="Input data has">\n <option 
value="has_col_names" selected="true">Column names on the first row</option>\n- <option value="has_row_names" selected="true">Row names on the first column"</option>\n+ <option value="has_row_names" selected="true">Row names on the first column</option>\n </param>\n+ <section name="adv" title="Advanced File Options " expanded="false" >\n+ <param name="header" type="integer" min="0" optional="true" label="Header begins at line N" help="All lines before line N will be skipped. If a value is set, this will override the above \'Column names on the first row\' parameter." />\n+ <param name="nrows" type="integer" min="0" optional="true" label="Read N lines only" help="Parses only N lines after the header line." />\n+ <param name="skipfooter" type="integer" min="0" optional="true" label="Skip N lines from bottom" help="Do not use this in conjunction with the \'Read N lines only\' parameter." />\n+ <param name="skip_blank_lines" type="boolean" checked="true" falsevalue="False" truevalue="True" label="Skip blank lines" help="Otherwise it will insert NaN values for every blank line detected." 
/>\n+ </section>\n </macro>\n \n- <macro name="elem_val_macro" >\n- <param name="element_value" type="text" optional="true" label="Filter value" help="This value is converted to numeric if possible, otherwise it is treated as a string" />\n+ <macro n'..b' <param name="element_op" value="rowcol" />\n+ <param name="select_cols_wanted" value="2" />\n+ <param name="select_rows_wanted" value="2,4" />\n+ <param name="inclusive_selection" value="True" />\n+ </conditional>\n+ </conditional>\n+ </conditional>\n+ <output name="table" >\n+ <assert_contents>\n+ <has_n_columns n="4" />\n+ <has_line_matching expression="^g2\\s+chr3\\s+chr6\\s+chr9$" />\n+ <has_line_matching expression="^g4\\s+chr81\\s+chr6\\s+chr3$" />\n+ </assert_contents>\n+ </output>\n+ </test>\n+ <test expect_num_outputs="1" >\n+ \n+ <conditional name="singtabop" >\n+ <param name="use_type" value="single" />\n+ <param name="input" value="examples.1.tsv" />\n+ <param name="col_row_names" value="has_col_names,has_row_names" />\n+ <conditional name="user" >\n+ <param name="mode" value="element" />\n+ <conditional name="element" >\n+ <param name="mode" value="custom" />\n+ <param name="custom_expr" value=""chr%.f" % elem" />\n+ </conditional>\n+ <conditional name="elem_val" >\n+ <param name="element_op" value="rowcol" />\n+ <param name="select_cols_wanted" value="2" />\n+ <param name="select_rows_wanted" value="2,4" />\n+ <param name="inclusive_selection" value="False" />\n+ </conditional>\n+ </conditional>\n+ </conditional>\n+ <output name="table" >\n+ <assert_contents>\n+ <has_n_columns n="4" />\n+ <has_line_matching expression="^g2\\s+3\\s+chr6\\s+9$" />\n+ <has_line_matching expression="^g4\\s+81\\s+chr6\\s+3$" />\n+ </assert_contents>\n+ </output>\n+ </test>\n </tests>\n <help><![CDATA[\n This tool computes table expressions on the element, row, and column basis. It can sub-select,\n@@ -1436,6 +1668,65 @@\n \n This splits the matrix using "foo" and "bar" using only the values from "baz". 
Header values may contain extra information.\n \n+\n+Example 9: Replacing text in specific rows or columns\n+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n+\n+We have the following table\n+\n+ === === === ===\n+ . c1 c2 c3\n+ === === === ===\n+ g1 10 20 30\n+ g2 3 3 9\n+ g3 4 8 12\n+ g4 81 6 3\n+ === === === ===\n+\n+and we want to add "chr" to the elements in column 2 AND rows 2 and 4:\n+\n+ === === ==== ===\n+ . c1 c2 c3\n+ === === ==== ===\n+ g1 10 20 30\n+ g2 3 chr3 9\n+ g3 4 8 12\n+ g4 81 chr6 3\n+ === === ==== ===\n+\n+In Galaxy we would select the following:\n+\n+ * *Input Single or Multiple Tables* \xe2\x86\x92 **Single Table**\n+ * *Column names on first row?* \xe2\x86\x92 **Yes**\n+ * *Row names on first column?* \xe2\x86\x92 **Yes**\n+\n+ * *Type of table operation* \xe2\x86\x92 **Manipulate selected table elements**\n+\n+ * *Operation to perform* \xe2\x86\x92 **Custom**\n+\n+ * *Custom Expression* \xe2\x86\x92 ::\n+\n+ "chr%.f" % elem\n+\n+ * *Operate on elements* \xe2\x86\x92 **Specific Rows and/or Columns**\n+ * *List of columns to select* \xe2\x86\x92 "2"\n+ * *List of rows to select* \xe2\x86\x92 "2,4"\n+ * *Inclusive Selection* \xe2\x86\x92 "No"\n+\n+\n+If we wanted to instead add "chr" to the ALL elements in column 2 and rows 2 and 4, we would repeat the steps above but set the *Inclusive Selection* to "Yes", to give:\n+\n+ === ===== ===== =====\n+ . c1 c2 c3\n+ === ===== ===== =====\n+ g1 10 chr20 30\n+ g2 chr3 chr3 chr9\n+ g3 4 8 12\n+ g4 chr81 chr6 chr3\n+ === ===== ===== =====\n+\n+\n+\n ]]></help>\n <citations></citations>\n </tool>\n'

diff -r 1b0f96ed73f2 -r dddadbbac949 test-data/skiplines.tsv
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/skiplines.tsv Fri Aug 30 05:28:18 2019 -0400

@@ -0,0 +1,10 @@
+A1 A6 A7 A8
+A2 A3 A4 A5
+ c1 c2 c3
+g1 10 20 30
+g2 3 6 9
+g3 4 8 12
+g4 81 6 3
+
+A1 A6 A7 A8
+A2 A3 A4 A5
\ No newline at end of file