Repository 'table_compute'
hg clone https://toolshed.g2.bx.psu.edu/repos/iuc/table_compute

Changeset 1:dddadbbac949 (2019-08-30)
Previous changeset 0:1b0f96ed73f2 (2019-08-17) Next changeset 2:02c3e335a695 (2019-09-13)
Commit message:
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/table_compute commit 6820ec9431a22576f3716c40feeb27f0b8cf5e83"
modified:
scripts/safety.py
scripts/table_compute.py
table_compute.xml
added:
test-data/skiplines.tsv
b
diff -r 1b0f96ed73f2 -r dddadbbac949 scripts/safety.py
--- a/scripts/safety.py Sat Aug 17 16:25:37 2019 -0400
+++ b/scripts/safety.py Fri Aug 30 05:28:18 2019 -0400
[
@@ -11,6 +11,7 @@
         '(', ')', 'if', 'else', 'or', 'and', 'not', 'in',
         '+', '-', '*', '/', '%', ',', '!=', '==', '>', '>=', '<', '<=',
         'min', 'max', 'sum',
+        'str', 'int', 'float'
     )
     __allowed_ref_types = {
         'pd.DataFrame': {
@@ -163,26 +164,25 @@
 
         safe = True
         # examples of user-expressions
-        # '-math.log(1 - elem/4096) * 4096 if elem != bn else elem - 0.5'
+        # '-math.log(1 - elem/4096) * 4096 if elem != 1 else elem - 0.5'
         # 'vec.median() +  vec.sum()'
 
         # 1. Break expressions into tokens
         # e.g.,
         # [
         #     '-', 'math.log', '(', '1', '-', 'elem', '/', '4096', ')', '*',
-        #     '4096', 'if', 'elem', '!=', 'bn', 'else', 'elem', '-', '0.5'
+        #     '4096', 'if', 'elem', '!=', '1', 'else', 'elem', '-', '0.5'
         # ]
         # or
         # ['vec.median', '(', ')', '+', 'vec.sum', '(', ')']
         tokens = [
             e for e in re.split(
-                r'([a-zA-Z0-9_.]+|[^a-zA-Z0-9_.() ]+|[()])', self.expr
+                r'("[a-zA-Z%0-9_.]+"|[a-zA-Z0-9_.]+|[^a-zA-Z0-9_.() ]+|[()])', self.expr
             ) if e.strip()
         ]
 
         # 2. Subtract allowed standard tokens
         rem = [e for e in tokens if e not in self.__allowed_tokens]
-
         # 3. Subtract allowed qualified objects from allowed modules
         #    and whitelisted references and their attributes
         rem2 = []
@@ -194,18 +194,32 @@
             if len(parts) == 2:
                 if parts[0] in self.these:
                     parts[0] = '_this'
+                elif parts[0] == "":
+                    # e.g. '.T' gives ['','.T']
+                    # Here we assume that the blank parts[0] refers to the
+                    # self.ref_type (e.g. "pd.DataFrame"), and that
+                    # the second part is a function of that type.
+                    if parts[1] in self.allowed_qualified['_this']:
+                        continue
+
                 if parts[0] in self.allowed_qualified:
                     if parts[1] in self.allowed_qualified[parts[0]]:
                         continue
+
             rem2.append(e)
 
-        # 4. Assert that rest are real numbers
+        # Debug
+        # for x in (tokens, rem, rem2):print(x)
+
+        # 4. Assert that rest are real numbers or strings
         e = ''
         for e in rem2:
             try:
                 _ = float(e)
             except ValueError:
-                safe = False
-                break
+                # e.g. '"TEXT"' is okay.
+                if not(e[0] == '"' and e[-1] == '"'):
+                    safe = False
+                    break
 
         return safe, e
b
diff -r 1b0f96ed73f2 -r dddadbbac949 scripts/table_compute.py
--- a/scripts/table_compute.py Sat Aug 17 16:25:37 2019 -0400
+++ b/scripts/table_compute.py Fri Aug 30 05:28:18 2019 -0400
[
b'@@ -3,7 +3,8 @@\n Table Compute tool - a wrapper around pandas with parameter input validation.\n """\n \n-__version__ = "0.8"\n+\n+__version__ = "0.9.1"\n \n import csv\n import math\n@@ -11,15 +12,17 @@\n \n import numpy as np\n import pandas as pd\n-import userconfig as uc\n from safety import Safety\n-# This should be generated in the same directory\n \n-# Version command should not need to copy the config\n if len(argv) == 2 and argv[1] == "--version":\n     print(__version__)\n     exit(-1)\n \n+# The import below should be generated in the same directory as\n+# the table_compute.py script.\n+# It is placed here so that the --version switch does not fail\n+import userconfig as uc  # noqa: I100,I202\n+\n \n class Utils:\n     @staticmethod\n@@ -37,12 +40,74 @@\n         "Returns a valid two value DataFrame or Series operator"\n         return getattr(type(pd_obj), "__" + op_name + "__")\n \n+    @staticmethod\n+    def readcsv(filedict, narm):\n+        data = pd.read_csv(\n+            filedict["file"],\n+            header=filedict["header"],\n+            index_col=filedict["row_names"],\n+            keep_default_na=narm,\n+            nrows=filedict["nrows"],\n+            skipfooter=filedict["skipfooter"],\n+            skip_blank_lines=filedict["skip_blank_lines"],\n+            sep=\'\\t\'\n+        )\n+        # Fix whitespace issues in index or column names\n+        data.columns = [col.strip() if type(col) is str else col\n+                        for col in data.columns]\n+        data.index = [row.strip() if type(row) is str else row\n+                      for row in data.index]\n+        return(data)\n \n-# Math is imported but not directly used because users\n-# may specify a "math.<function>" when inserting a custom\n-# function. To remove linting errors, which break CI testing\n-# we will just use an arbitrary math statement here.\n-__ = math.log\n+    @staticmethod\n+    def rangemaker(tab):\n+        # e.g. 
"1:3,2:-2" specifies "1,2,3,2,1,0,-1,-2" to give [0,1,2,1,0,-1,-2]\n+        # Positive indices are decremented by 1 to reference 0-base numbering\n+        # Negative indices are unaltered, so that -1 refers to the last column\n+        out = []\n+        err_mess = None\n+        for ranges in tab.split(","):\n+            nums = ranges.split(":")\n+            if len(nums) == 1:\n+                numb = int(nums[0])\n+                # Positive numbers get decremented.\n+                # i.e. column "3" refers to index 2\n+                #      column "-1" still refers to index -1\n+                if numb != 0:\n+                    out.append(numb if (numb < 0) else (numb - 1))\n+                else:\n+                    err_mess = "Please do not use 0 as an index"\n+            elif len(nums) == 2:\n+                left, right = map(int, nums)\n+                if 0 in (left, right):\n+                    err_mess = "Please do not use 0 as an index"\n+                elif left < right:\n+                    if left > 0:  # and right > 0 too\n+                        # 1:3 to 0,1,2\n+                        out.extend(range(left - 1, right))\n+                    elif right < 0:  # and left < 0 too\n+                        # -3:-1 to -3,-2,-1\n+                        out.extend(range(left, right + 1))\n+                    elif left < 0 and right > 0:\n+                        # -2:2 to -2,-1,0,1\n+                        out.extend(range(left, 0))\n+                        out.extend(range(0, right))\n+                elif right < left:\n+                    if right > 0:  # and left > 0\n+                        # 3:1 to 2,1,0\n+                        out.extend(range(left - 1, right - 2, -1))\n+                    elif left < 0:  # and right < 0\n+                        # -1:-3 to -1,-2,-3\n+                        out.extend(range(left, right - 1, -1))\n+                    elif right < 0 and left > 0:\n+                        # 2:-2 to 
1,0,-1,-2\n+                        out.extend(range(left - 1, right - 1, -1))\n+                else:\n+                    err_mess = "%s should no'..b'c.Data["tables"][0]["reader_header"],\n-        index_col=uc.Data["tables"][0]["reader_row_col"],\n-        keep_default_na=uc.Default["narm"],\n-        sep=\'\\t\'\n-    )\n-    # Fix whitespace issues in index or column names\n-    data.columns = [col.strip() if type(col) is str else col\n-                    for col in data.columns]\n-    data.index = [row.strip() if type(row) is str else row\n-                  for row in data.index]\n-\n+    data = Utils.readcsv(uc.Data["tables"][0], uc.Default["narm"])\n     user_mode_single = params["user_mode_single"]\n \n     if user_mode_single == "precision":\n@@ -79,9 +132,13 @@\n         rows_specified = params["select_rows_wanted"]\n \n         # Select all indexes if empty array of values\n-        if not cols_specified:\n+        if cols_specified:\n+            cols_specified = Utils.rangemaker(cols_specified)\n+        else:\n             cols_specified = range(len(data.columns))\n-        if not rows_specified:\n+        if rows_specified:\n+            rows_specified = Utils.rangemaker(rows_specified)\n+        else:\n             rows_specified = range(len(data))\n \n         # do not use duplicate indexes\n@@ -161,16 +218,44 @@\n     elif user_mode_single == "element":\n         # lt, gt, ge, etc.\n         operation = params["element_op"]\n+        bool_mat = None\n         if operation is not None:\n-            op = Utils.getTwoValuePandaOp(operation, data)\n-            value = params["element_value"]\n-            try:\n-                # Could be numeric\n-                value = float(value)\n-            except ValueError:\n-                pass\n-            # generate filter matrix of True/False values\n-            bool_mat = op(data, value)\n+            if operation == "rowcol":\n+                # Select all indexes if empty array of values\n+  
              if "element_cols" in params:\n+                    cols_specified = Utils.rangemaker(params["element_cols"])\n+                else:\n+                    cols_specified = range(len(data.columns))\n+                if "element_rows" in params:\n+                    rows_specified = Utils.rangemaker(params["element_rows"])\n+                else:\n+                    rows_specified = range(len(data))\n+\n+                # Inclusive selection:\n+                # - True: Giving a row or column will match all elements in that row or column\n+                # - False: Give a row or column will match only elements in both those rows or columns\n+                inclusive = params["element_inclusive"]\n+\n+                # Create a bool matrix (intialised to False) with selected\n+                # rows and columns set to True\n+                bool_mat = data.copy()\n+                bool_mat[:] = False\n+                if inclusive:\n+                    bool_mat.iloc[rows_specified, :] = True\n+                    bool_mat.iloc[:, cols_specified] = True\n+                else:\n+                    bool_mat.iloc[rows_specified, cols_specified] = True\n+\n+            else:\n+                op = Utils.getTwoValuePandaOp(operation, data)\n+                value = params["element_value"]\n+                try:\n+                    # Could be numeric\n+                    value = float(value)\n+                except ValueError:\n+                    pass\n+                # generate filter matrix of True/False values\n+                bool_mat = op(data, value)\n         else:\n             # implement no filtering through a filter matrix filled with\n             # True values.\n@@ -265,13 +350,7 @@\n \n     # Read and populate tables\n     for x, t_sect in enumerate(table_sections):\n-        tmp = pd.read_csv(\n-            t_sect["file"],\n-            header=t_sect["header"],\n-            index_col=t_sect["row_names"],\n-            
keep_default_na=uc.Default["narm"],\n-            sep="\\t"\n-        )\n+        tmp = Utils.readcsv(t_sect, uc.Default["narm"])\n         table.append(tmp)\n         table_names.append("table" + str(x + 1))\n         table_names_real.append("table[" + str(x) + "]")\n'
b
diff -r 1b0f96ed73f2 -r dddadbbac949 table_compute.xml
--- a/table_compute.xml Sat Aug 17 16:25:37 2019 -0400
+++ b/table_compute.xml Fri Aug 30 05:28:18 2019 -0400
[
b'@@ -1,7 +1,7 @@\n <tool id="table_compute" name="Table Compute" version="@VERSION@">\n     <description>computes operations on table data</description>\n     <macros>\n-        <token name="@VERSION@">0.8</token>\n+        <token name="@VERSION@">0.9.1</token>\n         <token name="@COPEN@"><![CDATA[<code>]]></token>\n         <token name="@CCLOSE@"><![CDATA[</code>]]></token>\n         <import>allowed_functions.xml</import>\n@@ -19,11 +19,11 @@\n             <sanitizer sanitize="false" />\n         </macro>\n         <macro name="validator_index_ranges">\n-            <validator type="regex" message="Specify a comma-separated list index numbers or ranges">^(?:\\d+(?::\\d)*(?:, *\\d+(?::\\d)*)*)?$</validator>\n+            <validator type="regex" message="Specify a comma-separated list index numbers or ranges">^(?:-?\\d+(?::-?\\d+)*(?:, *-?\\d+(?::-?\\d+)*)*)?$</validator>\n             <sanitizer sanitize="false" />\n         </macro>\n         <macro name="validator_functiondef">\n-            <validator type="regex" message="An expression is required and is allowed to contain only letters, numbers and the characters \'_ !-+=/*%.&lt;&gt;()\'">^[\\w !\\-+=/*%,.&lt;&gt;()]+$</validator>\n+            <validator type="regex" message="An expression is required and is allowed to contain only letters, numbers and the characters \'_ !-+=/*%.&lt;&gt;()\'">^[\'"\\w !\\-+=/*%,.&lt;&gt;()]+$</validator>\n             <sanitizer sanitize="false" />\n         </macro>\n         <!-- macro for main input tests -->\n@@ -36,6 +36,16 @@\n                 </conditional>\n             </conditional>\n         </macro>\n+        <macro name="test_inputs_ranges" >\n+            <conditional name="singtabop" >\n+                <param name="use_type" value="single" />\n+                <param name="input" value="examples.1.tsv" />\n+                <conditional name="user" >\n+                    <param name="mode" value="select" />\n+                    <yield />\n+                
</conditional>\n+            </conditional>\n+        </macro>\n         <!-- macro for umi to transcript tests -->\n         <macro name="umi2trans" >\n             <yield />\n@@ -53,14 +63,31 @@\n         <macro name="file_opts">\n             <param name="input" type="data" format="tsv,tabular" label="Table" />\n             <param name="col_row_names" type="select" display="checkboxes" multiple="true" optional="true"\n-            label="This input data has">\n+                   label="Input data has">\n                 <option value="has_col_names" selected="true">Column names on the first row</option>\n-                <option value="has_row_names" selected="true">Row names on the first column"</option>\n+                <option value="has_row_names" selected="true">Row names on the first column</option>\n             </param>\n+            <section name="adv" title="Advanced File Options " expanded="false" >\n+                <param name="header" type="integer" min="0" optional="true" label="Header begins at line N" help="All lines before line N will be skipped. If a value is set, this will override the above \'Column names on the first row\' parameter." />\n+                <param name="nrows" type="integer" min="0" optional="true" label="Read N lines only" help="Parses only N lines after the header line." />\n+                <param name="skipfooter" type="integer" min="0" optional="true" label="Skip N lines from bottom" help="Do not use this in conjunction with the \'Read N lines only\' parameter." />\n+                <param name="skip_blank_lines" type="boolean" checked="true" falsevalue="False" truevalue="True" label="Skip blank lines" help="Otherwise it will insert NaN values for every blank line detected." 
/>\n+            </section>\n         </macro>\n         <!-- element value macro -->\n-        <macro name="elem_val_macro" >\n-            <param name="element_value" type="text" optional="true" label="Filter value" help="This value is converted to numeric if possible, otherwise it is treated as a string" />\n+        <macro n'..b'                    <param name="element_op" value="rowcol" />\n+                        <param name="select_cols_wanted" value="2" />\n+                        <param name="select_rows_wanted" value="2,4" />\n+                        <param name="inclusive_selection" value="True" />\n+                    </conditional>\n+                </conditional>\n+            </conditional>\n+            <output name="table" >\n+                <assert_contents>\n+                    <has_n_columns n="4" />\n+                    <has_line_matching expression="^g2\\s+chr3\\s+chr6\\s+chr9$" />\n+                    <has_line_matching expression="^g4\\s+chr81\\s+chr6\\s+chr3$" />\n+                </assert_contents>\n+            </output>\n+        </test>\n+        <test expect_num_outputs="1" >\n+            <!-- Test 41: Row Col custom op #2 -->\n+            <conditional name="singtabop" >\n+                <param name="use_type" value="single" />\n+                <param name="input" value="examples.1.tsv" />\n+                <param name="col_row_names" value="has_col_names,has_row_names" />\n+                <conditional name="user" >\n+                    <param name="mode" value="element" />\n+                    <conditional name="element" >\n+                        <param name="mode" value="custom" />\n+                        <param name="custom_expr" value="&#34;chr%.f&#34; % elem" />\n+                    </conditional>\n+                    <conditional name="elem_val" >\n+                        <param name="element_op" value="rowcol" />\n+                        <param name="select_cols_wanted" value="2" />\n+                      
  <param name="select_rows_wanted" value="2,4" />\n+                        <param name="inclusive_selection" value="False" />\n+                    </conditional>\n+                </conditional>\n+            </conditional>\n+            <output name="table" >\n+                <assert_contents>\n+                    <has_n_columns n="4" />\n+                    <has_line_matching expression="^g2\\s+3\\s+chr6\\s+9$" />\n+                    <has_line_matching expression="^g4\\s+81\\s+chr6\\s+3$" />\n+                </assert_contents>\n+            </output>\n+        </test>\n     </tests>\n     <help><![CDATA[\n This tool computes table expressions on the element, row, and column basis. It can sub-select,\n@@ -1436,6 +1668,65 @@\n \n This splits the matrix using "foo" and "bar" using only the values from "baz". Header values may contain extra information.\n \n+\n+Example 9: Replacing text in specific rows or columns\n+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n+\n+We have the following table\n+\n+ === === === ===\n+  .  c1  c2  c3\n+ === === === ===\n+ g1  10  20  30\n+ g2   3   3   9\n+ g3   4   8  12\n+ g4  81   6   3\n+ === === === ===\n+\n+and we want to add "chr" to the elements in column 2 AND rows 2 and 4:\n+\n+ === === ==== ===\n+  .  
c1    c2  c3\n+ === === ==== ===\n+ g1  10    20  30\n+ g2   3  chr3   9\n+ g3   4     8  12\n+ g4  81  chr6   3\n+ === === ==== ===\n+\n+In Galaxy we would select the following:\n+\n+ * *Input Single or Multiple Tables* \xe2\x86\x92 **Single Table**\n+ * *Column names on first row?* \xe2\x86\x92 **Yes**\n+ * *Row names on first column?* \xe2\x86\x92 **Yes**\n+\n+ * *Type of table operation* \xe2\x86\x92  **Manipulate selected table elements**\n+\n+   * *Operation to perform* \xe2\x86\x92 **Custom**\n+\n+     * *Custom Expression* \xe2\x86\x92 ::\n+\n+         "chr%.f" % elem\n+\n+     * *Operate on elements* \xe2\x86\x92 **Specific Rows and/or Columns**\n+     * *List of columns to select* \xe2\x86\x92 "2"\n+     * *List of rows to select* \xe2\x86\x92 "2,4"\n+     * *Inclusive Selection* \xe2\x86\x92 "No"\n+\n+\n+If we wanted to instead add "chr" to the ALL elements in column 2 and rows 2 and 4, we would repeat the steps above but set the *Inclusive Selection* to "Yes", to give:\n+\n+ === =====  ===== =====\n+  .    c1     c2    c3\n+ === =====  ===== =====\n+ g1     10  chr20    30\n+ g2   chr3   chr3  chr9\n+ g3      4      8    12\n+ g4  chr81   chr6  chr3\n+ === =====  ===== =====\n+\n+\n+\n ]]></help>\n     <citations></citations>\n </tool>\n'
b
diff -r 1b0f96ed73f2 -r dddadbbac949 test-data/skiplines.tsv
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/skiplines.tsv Fri Aug 30 05:28:18 2019 -0400
b
@@ -0,0 +1,10 @@
+A1 A6 A7 A8
+A2 A3 A4 A5
+ c1 c2 c3
+g1 10 20 30
+g2 3 6 9
+g3 4 8 12
+g4 81 6 3
+            
+A1 A6 A7 A8
+A2 A3 A4 A5
\ No newline at end of file