Repository 'query_tabular'
hg clone https://toolshed.g2.bx.psu.edu/repos/jjohnson/query_tabular

Changeset 6:03842a4f71c6 (2017-02-17)
Previous changeset 5:19ae309ec53c (2016-04-20) Next changeset 7:72c32037fa1e (2017-02-18)
Commit message:
Uploaded
modified:
query_tabular.py
query_tabular.xml
added:
test-data/._IEDB.tsv
test-data/._netMHC_summary.tsv
test-data/._query_results.tsv
test-data/._regex_results.tsv
test-data/._sales_results.tsv
b
diff -r 19ae309ec53c -r 03842a4f71c6 query_tabular.py
--- a/query_tabular.py Wed Apr 20 15:46:04 2016 -0400
+++ b/query_tabular.py Fri Feb 17 15:20:24 2017 -0500
[
b'@@ -13,15 +13,24 @@\n TODO:\n - could read column names from comment lines, but issues with legal names\n - could add some transformations on tabular columns,\n+  filter - skip_regex\n   e.g. a regex to format date/time strings\n     format: {\n       c2 : re.sub(\'pat\', \'sub\', c2)\n       c3 : len(c3)\n-   }\n-   def format(colname,val, expr):\n+    }\n+    def format(colname,val, expr):\n+  normalize input list columns\n+    iterate over list values creating one row per iteration\n+      option for input line_num column\n+    create associated table \n+      fk, name, value  # e.g. PSM table with list of proteins containing peptide\n+      fk, name, value[, value] # if multiple columns similarly indexed, e.g. vcf\n - column_defs dict of columns to create from tabular input\n     column_defs : { \'name1\' : \'expr\', \'name2\' : \'expr\'}\n - allow multiple queries and outputs\n+  repeat min - max with up to max conditional outputs\n+\n - add a --json input for table definitions (or yaml)\n JSON config:\n { tables : [\n@@ -35,9 +44,11 @@\n     },\n     { file_path : \'/home/galaxy/dataset_102.dat\',\n             table_name : \'gff\',\n-            column_names : [\'seqname\',,,\'start\',\'end\']\n+            column_names : [\'seqname\',,\'date\',\'start\',\'end\']\n             comment_lines : 1\n             load_named_columns : True\n+            filters : [{\'filter\': \'regex\', \'pattern\': \'#peptide\', \'action\': \'exclude_match\'}, \n+                       {\'filter\': \'replace\', \'column\': 3, \'replace\': \'gi[|]\', \'pattern\': \'\'}]\n     },\n     { file_path : \'/home/galaxy/dataset_103.dat\',\n             table_name : \'test\',\n@@ -47,8 +58,71 @@\n }\n """\n \n-tables_query = \\\n-    "SELECT name, sql FROM sqlite_master WHERE type=\'table\' ORDER BY name"\n+\n+class LineFilter( object ):\n+    def __init__(self,source,filter_dict):\n+        self.source = source\n+        self.filter_dict = filter_dict\n+        print >> sys.stderr, \'LineFilter %s\' % filter_dict if filter_dict else \'NONE\'\n+        self.func = lambda l: l.rstrip(\'\\r\\n\') if l else None\n+        if not filter_dict:\n+            return\n+        if filter_dict[\'filter\'] == \'regex\':\n+            rgx = re.compile(filter_dict[\'pattern\'])\n+            if filter_dict[\'action\'] == \'exclude_match\':\n+                self.func = lambda l: l if not rgx.match(l) else None\n+            elif filter_dict[\'action\'] == \'include_match\':\n+                self.func = lambda l: l if rgx.match(l) else None\n+            elif filter_dict[\'action\'] == \'exclude_find\':\n+                self.func = lambda l: l if not rgx.search(l) else None\n+            elif filter_dict[\'action\'] == \'include_find\':\n+                self.func = lambda l: l if rgx.search(l) else None\n+        elif filter_dict[\'filter\'] == \'replace\':\n+            p = filter_dict[\'pattern\']\n+            r = filter_dict[\'replace\']\n+            c = int(filter_dict[\'column\']) - 1\n+            self.func = lambda l: \'\\t\'.join([x if i != c else re.sub(p,r,x) for i,x in enumerate(l.split(\'\\t\'))])\n+    def __iter__(self):\n+        return self\n+    def next(self):\n+        for i,next_line in enumerate(self.source):\n+            line = self.func(next_line)\n+            if line:\n+                return line\n+        raise StopIteration\n+\n+\n+class TabularReader:\n+    """\n+    Tabular file iterator. Returns a list \n+    """\n+    def __init__(self, file_path, skip=0, comment_char=None, col_idx=None, filters=None):\n+        self.skip = skip\n+        self.comment_char = comment_char\n+        self.col_idx = col_idx\n+        self.filters = filters\n+        self.tsv_file = open(file_path)\n+        if skip and skip > 0:\n+            for i in range(5): \n+                if not self.tsv_file.readline():\n+                    break\n+        source = LineFilter(self.tsv_file,None)\n+        if comment_char:\n+            source = LineFilter(source,{"filter": "regex", "pattern": comment_char, "action": "exclude_match"})\n+        if filters:\n+            for f in filters:\n+                source = LineFilter(source,f)\n+        self.source ='..b'                table_name,\n                 \'%s INTEGER PRIMARY KEY AUTOINCREMENT,\' % pkey_autoincr if pkey_autoincr else \'\',\n                 \', \\n    \'.join(col_def))\n     # print >> sys.stdout, table_def\n@@ -142,25 +213,23 @@\n             index_columns = index.split(\',\')\n             create_index(conn, table_name, index_name, index_columns)\n         c = conn.cursor()\n-        with open(file_path, "r") as fh:\n-            for linenum, line in enumerate(fh):\n-                if linenum < skip or line.startswith(comment_char):\n-                    continue\n-                data_lines += 1\n-                try:\n-                    fields = line.rstrip(\'\\r\\n\').split(\'\\t\')\n-                    if col_idx:\n-                        fields = [fields[i] for i in col_idx]\n-                    vals = [col_func[i](x) if x else None for i, x in enumerate(fields)]\n-                    c.execute(insert_stmt, vals)\n-                except Exception, e:\n-                    print >> sys.stderr, \'Failed at line: %d err: %s\' % (linenum, e)\n+        tr = TabularReader(file_path,skip=skip, comment_char=comment_char, col_idx=col_idx, filters=filters)\n+        for linenum, fields in enumerate(tr):\n+            data_lines += 1\n+            try:\n+                if col_idx:\n+                    fields = [fields[i] for i in col_idx]\n+                vals = [col_func[i](x) if x else None for i, x in enumerate(fields)]\n+                c.execute(insert_stmt, vals)\n+            except Exception, e:\n+                print >> sys.stderr, \'Failed at line: %d err: %s\' % (linenum, e)\n         conn.commit()\n         c.close()\n     except Exception, e:\n         print >> sys.stderr, \'Failed: %s\' % (e)\n         exit(1)\n \n+\n def create_index(conn, table_name, index_name, index_columns, unique=False):\n     index_def = "CREATE %s INDEX %s on %s(%s)" % (\'UNIQUE\' if unique else \'\', index_name, table_name, \',\'.join(index_columns))\n     c = conn.cursor()\n@@ -168,6 +237,7 @@\n     conn.commit()\n     c.close()\n \n+\n def regex_match(expr, item):\n     return re.match(expr, item) is not None\n \n@@ -237,6 +307,7 @@\n                     path = table[\'file_path\']\n                     table_name = table[\'table_name\'] if \'table_name\' in table else \'t%d\' % (ti + 1)\n                     comment_lines = table[\'comment_lines\'] if \'comment_lines\' in table else 0\n+                    comment_char = table[\'comment_char\'] if \'comment_char\' in table else None\n                     column_names = table[\'column_names\'] if \'column_names\' in table else None\n                     if column_names:\n                         load_named_columns = table[\'load_named_columns\'] if \'load_named_columns\' in table else False\n@@ -244,10 +315,11 @@\n                         load_named_columns = False\n                     unique_indexes = table[\'unique\'] if \'unique\' in table else []\n                     indexes = table[\'index\'] if \'index\' in table else []\n+                    filters = table[\'filters\'] if \'filters\' in table else None\n                     pkey_autoincr = table[\'pkey_autoincr\'] if \'pkey_autoincr\' in table else None\n                     create_table(conn, path, table_name, pkey_autoincr=pkey_autoincr, column_names=column_names, \n-                                 skip=comment_lines, load_named_columns=load_named_columns, \n-                                 unique_indexes=unique_indexes, indexes=indexes)\n+                                 skip=comment_lines, comment_char=comment_char, load_named_columns=load_named_columns, \n+                                 filters=filters,unique_indexes=unique_indexes, indexes=indexes)\n         except Exception, exc:\n             print >> sys.stderr, "Error: %s" % exc\n     conn.close()\n@@ -262,6 +334,8 @@\n         query = options.query\n \n     if (query is None):\n+        tables_query = \\\n+            "SELECT name, sql FROM sqlite_master WHERE type=\'table\' ORDER BY name"\n         try:\n             conn = get_connection(options.sqlitedb)\n             c = conn.cursor()\n'
b
diff -r 19ae309ec53c -r 03842a4f71c6 query_tabular.xml
--- a/query_tabular.xml Wed Apr 20 15:46:04 2016 -0400
+++ b/query_tabular.xml Fri Feb 17 15:20:24 2017 -0500
[
b'@@ -1,4 +1,4 @@\n-<tool id="query_tabular" name="Query Tabular" version="0.1.3">\n+<tool id="query_tabular" name="Query Tabular" version="2.0.0">\n     <description>using sqlite sql</description>\n \n     <requirements>\n@@ -6,8 +6,15 @@\n     <stdio>\n         <exit_code range="1:" />\n     </stdio>\n-    <command interpreter="python"><![CDATA[\n-        query_tabular.py \n+    <command><![CDATA[\n+        #if $add_to_database.withdb: \n+            #if $save_db:\n+                cp "$add_to_database.withdb" "$save_db" &&\n+            #else:\n+                cp "$add_to_database.withdb" "$workdb" &&\n+            #end if \n+        #end if\n+        python $__tool_directory__/query_tabular.py \n         #if $save_db\n         -s $sqlitedb\n         #else\n@@ -19,6 +26,8 @@\n           $no_header\n           -o $output\n         #end if\n+        && cat $query_file\n+        && cat $table_json\n     ]]></command>\n     <configfiles>\n         <configfile name="query_file">\n@@ -53,11 +62,6 @@\n   #set $col_names = \'\'\n   #end if\n   #set $jtbl[\'column_names\'] = $col_names\n-  #if str($tbl.tbl_opts.skip_lines) != \'\':\n-    #set $jtbl[\'comment_lines\'] = int($tbl.tbl_opts.skip_lines)\n-  #elif $tbl.table.metadata.comment_lines and $tbl.table.metadata.comment_lines > 0:\n-    #set $jtbl[\'comment_lines\'] = int($tbl.table.metadata.comment_lines)\n-  #end if\n   #set $idx_unique = []\n   #set $idx_non = []\n   #for $idx in $tbl.tbl_opts.indexes:\n@@ -73,6 +77,40 @@\n   #if len($idx_non) > 0:\n     #set $jtbl[\'index\'] = $idx_non\n   #end if\n+  #set $input_filters = []\n+  #for $fi in $tbl.input_opts.linefilters:\n+    #if $fi.filter.filter_type == \'skip\':\n+      #if str($tbl.tbl_opts.skip_lines) != \'\':\n+        #set $jtbl[\'comment_lines\'] = int($fi.filter.skip_lines)\n+      #elif $tbl.table.metadata.comment_lines and $tbl.table.metadata.comment_lines > 0:\n+        #set $jtbl[\'comment_lines\'] = int($tbl.table.metadata.comment_lines)\n+      #end if\n+    #elif $fi.filter.filter_type == \'comment\':\n+      #set $jtbl[\'comment_char\'] = str($fi.filter.comment_char)\n+    #elif $fi.filter.filter_type == \'regex\':\n+      #set $filter_dict = dict()\n+      #set $filter_dict[\'filter\'] = str($fi.filter.filter_type)\n+      #set $filter_dict[\'pattern\'] = str($fi.filter.regex_pattern)\n+      #set $filter_dict[\'action\'] = str($fi.filter.regex_action)\n+      #silent $input_filters.append($filter_dict)\n+    #elif $fi.filter.filter_type == \'replace\':\n+      #set $filter_dict = dict()\n+      #set $filter_dict[\'filter\'] = str($fi.filter.filter_type)\n+      #set $filter_dict[\'column\'] = int(str($fi.filter.column))\n+      #set $filter_dict[\'pattern\'] = str($fi.filter.regex_pattern)\n+      #set $filter_dict[\'replace\'] = str($fi.filter.regex_replace)\n+      #silent $input_filters.append($filter_dict)\n+    ## #elif $fi.filter.filter_type == \'normalize\':\n+    ##   #set $filter_dict = dict()\n+    ##   #set $filter_dict[\'filter\'] = str($fi.filter.filter_type)\n+    ##   #set $filter_dict[\'columns\'] = [int(str($ci)) for $ci in str($fi.filter.columns).split(\',\')]\n+    ##   #set $filter_dict[\'separator\'] = str($fi.filter.separator)\n+    ##   #silent $input_filters.append($filter_dict)\n+    #end if\n+  #end for\n+  #if $input_filters:\n+    #set $jtbl[\'filters\'] = $input_filters\n+  #end if\n   #set $jtbls += [$jtbl]\n #end for\n #echo $json.dumps($jtbldef)\n@@ -80,8 +118,66 @@\n     </configfiles>\n     <inputs>\n         <param name="workdb" type="hidden" value="workdb.sqlite" label=""/>\n+        <section name="add_to_database" expanded="false" title="Add tables to an existing database">\n+            <param name="withdb" type="data" format="sqlite" optional="true" label="Add tables to this Database" \n+               help="Make sure your added table names are not already in this database"/>\n+        </section>\n         <repeat name="tables" title="Database Table" min="1">\n             <param name="table" type="data" format="tabular" label="Tabular Dataset for Table"/>\n+            <section name="input_opts" expanded="false" ti'..b'                                <option value="include_find">include line if pattern found</option>\n+                            </param>\n+                        </when>\n+                        <when value="replace">\n+                            <param name="column" type="data_column" data_ref="table" label="Column to replace text"\n+                                   help=""/>\n+                            <param name="regex_pattern" type="text" value="" label="regex pattern">\n+                                <sanitizer sanitize="False"/>\n+                            </param>\n+                            <param name="regex_replace" type="text" value="" label="replacement expression">\n+                                <sanitizer sanitize="False"/>\n+                            </param>\n+                        </when>\n+                        <!--\n+                        <when value="normalize">\n+                            <param name="columns" type="data_column" data_ref="table" multiple="True" label="Columns to split"\n+                                   help=""/>\n+                            <param name="separator" type="text" value="," label="List item delimiter in column">\n+                                <sanitizer sanitize="False"/>\n+                                <validator type="regex" message="Anything but TAB or Newline">^[^\\t\\n\\r\\f\\v]+$</validator>\n+                            </param>\n+                        </when>\n+                        -->\n+                    </conditional>\n+                </repeat>\n+            </section>\n             <section name="tbl_opts" expanded="false" title="Table Options">\n                 <param name="table_name" type="text" value="" optional="true" label="Specify Name for Table">\n                     <help>By default, tables will be named: t1,t2,...,tn (table names must be unique)</help>\n@@ -97,7 +193,6 @@\n                        help="Only creates this additional column when a name is entered. (This can not be the same name as any of the other columns in this table.)">\n                         <validator type="regex" message="Column name">^([A-Za-z]\\w*)?$</validator>\n                 </param>\n-                <param name="skip_lines" type="integer" value="" min="0" optional="true" label="Skip lines" help="Leave blank to use the comment lines metadata for this dataset" />\n                 <repeat name="indexes" title="Table Index">\n                     <param name="unique" type="boolean" truevalue="yes" falsevalue="no" checked="False" label="This is a unique index"/>\n                     <param name="index_columns" type="text" value="" label="Index on Columns">\n@@ -108,7 +203,7 @@\n             </section>\n         </repeat>\n         <param name="save_db" type="boolean" truevalue="yes" falsevalue="no" checked="false" label="Save the sqlite database in your history"/>\n-        <param name="sqlquery" type="text" area="true" size="10x80" value="" optional="true" label="SQL Query to generate tabular output">\n+        <param name="sqlquery" type="text" area="true" size="20x80" value="" optional="true" label="SQL Query to generate tabular output">\n                 <help>By default: tables are named: t1,t2,...,tn and columns in each table: c1,c2,...,cn</help>\n                 <sanitizer sanitize="False"/>\n                 <validator type="regex" message="">^(?ims)\\s*select\\s+.*\\s+from\\s+.*$</validator>\n@@ -186,6 +281,9 @@\n \n   Loads tabular datasets into a SQLite_ data base.  \n \n+  An existing SQLite_ data base can be used as input, and any selected tabular datasets will be added as new tables in that data base.\n+\n+\n **Outputs**\n \n   The results of a SQL query are output to the history as a tabular file.\n@@ -195,7 +293,6 @@\n     *(The* **SQLite to tabular** *tool can run additional queries on this database.)*\n \n \n-\n For help in using SQLite_ see:  http://www.sqlite.org/docs.html\n \n **NOTE:** input for SQLite dates input field must be in the format: *YYYY-MM-DD* for example: 2015-09-30\n'
b
diff -r 19ae309ec53c -r 03842a4f71c6 test-data/._IEDB.tsv
b
Binary file test-data/._IEDB.tsv has changed
b
diff -r 19ae309ec53c -r 03842a4f71c6 test-data/._netMHC_summary.tsv
b
Binary file test-data/._netMHC_summary.tsv has changed
b
diff -r 19ae309ec53c -r 03842a4f71c6 test-data/._query_results.tsv
b
Binary file test-data/._query_results.tsv has changed
b
diff -r 19ae309ec53c -r 03842a4f71c6 test-data/._regex_results.tsv
b
Binary file test-data/._regex_results.tsv has changed
b
diff -r 19ae309ec53c -r 03842a4f71c6 test-data/._sales_results.tsv
b
Binary file test-data/._sales_results.tsv has changed