Repository 'filter_tabular'
hg clone https://toolshed.g2.bx.psu.edu/repos/iuc/filter_tabular

Changeset 0:6fbd9d25ceef (2017-07-18)
Next changeset 1:cd2a99849f8b (2017-08-18)
Commit message:
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/query_tabular commit 74915fc9cee746bbce1c4b507e13231259de177d
added:
filter_tabular.py
filter_tabular.xml
filters.py
load_db.py
macros.xml
query_db.py
query_tabular.py
sqlite_to_tabular.py
test-data/IEDB.tsv
test-data/add_to_db_results.tsv
test-data/customers.tsv
test-data/filtered_people_results.tsv
test-data/filtered_pets_results.tsv
test-data/netMHC_summary.tsv
test-data/pet_normalized_query_results.tsv
test-data/pets.tsv
test-data/query_results.tsv
test-data/regex_results.tsv
test-data/sales.tsv
test-data/sales_results.tsv
test-data/testdb.sqlite
b
diff -r 000000000000 -r 6fbd9d25ceef filter_tabular.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/filter_tabular.py Tue Jul 18 09:06:47 2017 -0400
[
@@ -0,0 +1,68 @@
+#!/usr/bin/env python
+
+from __future__ import print_function
+
+import json
+import optparse
+import os.path
+import sys
+
+from filters import filter_file
+
+
+def __main__():
+    # Parse Command Line
+    parser = optparse.OptionParser()
+    parser.add_option('-i', '--input', dest='input', default=None,
+                      help='Input file for filtering')
+    parser.add_option('-j', '--jsonfile', dest='jsonfile', default=None,
+                      help='JSON array of filter specifications')
+    parser.add_option('-o', '--output', dest='output', default=None,
+                      help='Output file for query results')
+    parser.add_option('-v', '--verbose', dest='verbose', default=False,
+                      action='store_true',
+                      help='verbose')
+    (options, args) = parser.parse_args()
+
+    if options.input is not None:
+        try:
+            inputPath = os.path.abspath(options.input)
+            inputFile = open(inputPath, 'r')
+        except Exception as e:
+            exit('Error: %s' % (e))
+    else:
+        inputFile = sys.stdin
+
+    if options.output is not None:
+        try:
+            outputPath = os.path.abspath(options.output)
+            outputFile = open(outputPath, 'w')
+        except Exception as e:
+            exit('Error: %s' % (e))
+    else:
+        outputFile = sys.stdout
+
+    filters = None
+    if options.jsonfile:
+        try:
+            with open(options.jsonfile) as fh:
+                filters = json.load(fh)
+        except Exception as e:
+            exit('Error: %s' % (e))
+
+    if options.verbose and filters:
+        for f in filters:
+            print('%s  %s' % (f['filter'],
+                  ', '.join(
+                  ['%s: %s' % (k, f[k])
+                   for k in set(f.keys()) - set(['filter'])])),
+                  file=sys.stdout)
+
+    try:
+        filter_file(inputFile, outputFile, filters=filters)
+    except Exception as e:
+        exit('Error: %s' % (e))
+
+
+if __name__ == "__main__":
+    __main__()
b
diff -r 000000000000 -r 6fbd9d25ceef filter_tabular.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/filter_tabular.xml Tue Jul 18 09:06:47 2017 -0400
[
@@ -0,0 +1,130 @@
+<tool id="filter_tabular" name="Filter Tabular" version="1.0.0">
+    <description></description>
+
+    <macros>
+         <import>macros.xml</import>
+    </macros>
+
+    <requirements>
+    </requirements>
+    <command detect_errors="exit_code"><![CDATA[
+        python '$__tool_directory__/filter_tabular.py'
+        -i '$input'
+        -j '$filter_json'
+        -o '$output'
+    ]]></command>
+    <configfiles>
+        <configfile name="filter_json">
+#import json
+@LINEFILTERS@
+#if $input_filters:
+#echo $json.dumps($input_filters)
+#end if
+        </configfile>
+    </configfiles>
+    <inputs>
+        <param name="input" type="data" format="tabular" label="Tabular Dataset to filter"/>
+        <expand macro="macro_line_filters" />
+    </inputs>
+    <outputs>
+        <data format="tabular" name="output" />
+    </outputs>
+    <tests>
+        <test>
+            <param name="input" ftype="tabular" value="pets.tsv"/>
+            <repeat name="linefilters">
+                <conditional name="filter">
+                    <param name="filter_type" value="regex"/>
+                    <param name="regex_pattern" value="^\d+"/>
+                    <param name="regex_action" value="include_find"/>
+                </conditional>
+            </repeat>
+            <repeat name="linefilters">
+                <conditional name="filter">
+                    <param name="filter_type" value="append_line_num"/>
+                </conditional>
+            </repeat>
+            <repeat name="linefilters">
+                <conditional name="filter">
+                    <param name="filter_type" value="select_columns"/>
+                    <param name="columns" value="7,2,3,4,1"/>
+                </conditional>
+            </repeat>
+            <repeat name="linefilters">
+                <conditional name="filter">
+                    <param name="filter_type" value="replace"/>
+                    <param name="column" value="c4"/>
+                    <param name="regex_pattern" value="(\d+)/(\d+)/(\d+)"/>
+                    <param name="regex_replace" value="19\3-\2-\1"/>
+                </conditional>
+            </repeat>
+            <output name="output" file="filtered_people_results.tsv"/>
+        </test>
+        <test>
+            <param name="input" ftype="tabular" value="pets.tsv"/>
+            <repeat name="linefilters">
+                <conditional name="filter">
+                    <param name="filter_type" value="comment"/>
+                    <param name="comment_char" value="35"/>
+                </conditional>
+            </repeat>
+            <repeat name="linefilters">
+                <conditional name="filter">
+                    <param name="filter_type" value="regex"/>
+                    <param name="regex_pattern" value="^\d+"/>
+                    <param name="regex_action" value="include_find"/>
+                </conditional>
+            </repeat>
+            <repeat name="linefilters">
+                <conditional name="filter">
+                    <param name="filter_type" value="append_line_num"/>
+                </conditional>
+            </repeat>
+            <repeat name="linefilters">
+                <conditional name="filter">
+                    <param name="filter_type" value="select_columns"/>
+                    <param name="columns" value="c7,c5,c6"/>
+                </conditional>
+            </repeat>
+            <repeat name="linefilters">
+                <conditional name="filter">
+                    <param name="filter_type" value="normalize"/>
+                    <param name="columns" value="c2,c3"/>
+                    <param name="separator" value=","/>
+                </conditional>
+            </repeat>
+            <repeat name="linefilters">
+                <conditional name="filter">
+                    <param name="filter_type" value="regex"/>
+                    <param name="regex_pattern" value="^\d+\t\t"/>
+                    <param name="regex_action" value="exclude_match"/>
+                </conditional>
+            </repeat>
+            <output name="output" file="filtered_pets_results.tsv"/>
+        </test>
+
+    </tests>
+    <help><![CDATA[
+==============
+Filter Tabular
+==============
+
+  Filter a tabular dataset by applying line filters as it is being read.
+  Multiple filters may be used with each filter using the result of the previous filter.  
+
+**Inputs**
+
+  A tabular dataset.
+
+
+**Outputs**
+
+  A filtered tabular dataset.
+
+
+@LINEFILTERS_HELP@
+
+@LINEFILTERS_HELP_EXAMPLE@
+
+    ]]></help>
+</tool>
b
diff -r 000000000000 -r 6fbd9d25ceef filters.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/filters.py Tue Jul 18 09:06:47 2017 -0400
[
@@ -0,0 +1,156 @@
+#!/usr/bin/env python
+
+from __future__ import print_function
+
+import re
+import sys
+
+
+class LineFilter(object):
+    def __init__(self, source, filter_dict):
+        self.source = source
+        self.filter_dict = filter_dict
+        self.func = lambda i, l: l.rstrip('\r\n') if l else None
+        self.src_lines = []
+        self.src_line_cnt = 0
+        if not filter_dict:
+            return
+        if filter_dict['filter'] == 'regex':
+            rgx = re.compile(filter_dict['pattern'])
+            if filter_dict['action'] == 'exclude_match':
+                self.func = lambda i, l: l if not rgx.match(l) else None
+            elif filter_dict['action'] == 'include_match':
+                self.func = lambda i, l: l if rgx.match(l) else None
+            elif filter_dict['action'] == 'exclude_find':
+                self.func = lambda i, l: l if not rgx.search(l) else None
+            elif filter_dict['action'] == 'include_find':
+                self.func = lambda i, l: l if rgx.search(l) else None
+        elif filter_dict['filter'] == 'select_columns':
+            cols = [int(c) - 1 for c in filter_dict['columns']]
+            self.func = lambda i, l: self.select_columns(l, cols)
+        elif filter_dict['filter'] == 'replace':
+            p = filter_dict['pattern']
+            r = filter_dict['replace']
+            c = int(filter_dict['column']) - 1
+            self.func = lambda i, l: '\t'.join(
+                [x if j != c else re.sub(p, r, x) for j, x in enumerate(l.split('\t'))])
+        elif filter_dict['filter'] == 'prepend_line_num':
+            self.func = lambda i, l: '%d\t%s' % (i, l)
+        elif filter_dict['filter'] == 'append_line_num':
+            self.func = lambda i, l: '%s\t%d' % (l.rstrip('\r\n'), i)
+        elif filter_dict['filter'] == 'prepend_text':
+            s = filter_dict['column_text']
+            self.func = lambda i, l: '%s\t%s' % (s, l)
+        elif filter_dict['filter'] == 'append_text':
+            s = filter_dict['column_text']
+            self.func = lambda i, l: '%s\t%s' % (l.rstrip('\r\n'), s)
+        elif filter_dict['filter'] == 'skip':
+            cnt = filter_dict['count']
+            self.func = lambda i, l: l if i > cnt else None
+        elif filter_dict['filter'] == 'normalize':
+            cols = [int(c) - 1 for c in filter_dict['columns']]
+            sep = filter_dict['separator']
+            self.func = lambda i, l: self.normalize(l, cols, sep)
+
+    def __iter__(self):
+        return self
+
+    def __next__(self):
+        if not self.src_lines:
+            self.get_lines()
+        if self.src_lines:
+            return self.src_lines.pop(0)
+        raise StopIteration
+
+    next = __next__
+
+    def select_columns(self, line, cols):
+        fields = line.split('\t')
+        return '\t'.join([fields[x] for x in cols])
+
+    def normalize(self, line, split_cols, sep):
+        lines = []
+        fields = line.rstrip('\r\n').split('\t')
+        split_fields = dict()
+        cnt = 0
+        for c in split_cols:
+            if c < len(fields):
+                split_fields[c] = fields[c].split(sep)
+                cnt = max(cnt, len(split_fields[c]))
+        if cnt == 0:
+            lines.append('\t'.join(fields))
+        else:
+            for n in range(0, cnt):
+                flds = [x if c not in split_cols else split_fields[c][n]
+                        if n < len(split_fields[c])
+                        else '' for (c, x) in enumerate(fields)]
+                lines.append('\t'.join(flds))
+        return lines
+
+    def get_lines(self):
+        for i, next_line in enumerate(self.source):
+            self.src_line_cnt += 1
+            line = self.func(self.src_line_cnt, next_line)
+            if line:
+                if isinstance(line, list):
+                    self.src_lines.extend(line)
+                else:
+                    self.src_lines.append(line)
+                return
+
+
+class TabularReader:
+    """
+    Tabular file iterator. Returns a list
+    """
+    def __init__(self, input_file, skip=0, comment_char=None, col_idx=None,
+                 filters=None):
+        self.skip = skip
+        self.comment_char = comment_char
+        self.col_idx = col_idx
+        self.filters = filters
+        self.tsv_file = \
+            input_file if hasattr(input_file, 'readline') else open(input_file)
+        if skip and skip > 0:
+            for i in range(skip):
+                if not self.tsv_file.readline():
+                    break
+        source = LineFilter(self.tsv_file, None)
+        if comment_char:
+            source = LineFilter(source,
+                                {"filter": "regex", "pattern": comment_char,
+                                 "action": "exclude_match"})
+        if filters:
+            for f in filters:
+                source = LineFilter(source, f)
+        self.source = source
+
+    def __iter__(self):
+        return self
+
+    def __next__(self):
+        ''' Iteration '''
+        for i, line in enumerate(self.source):
+            fields = line.rstrip('\r\n').split('\t')
+            if self.col_idx:
+                fields = [fields[i] for i in self.col_idx]
+            return fields
+        raise StopIteration
+
+    next = __next__
+
+
+def filter_file(input_file, output, skip=0, comment_char='#', filters=None):
+    data_lines = 0
+    try:
+        tr = TabularReader(input_file, skip=skip, comment_char=comment_char,
+                           filters=filters)
+        for linenum, fields in enumerate(tr):
+            data_lines += 1
+            try:
+                output.write('%s\n' % '\t'.join(fields))
+            except Exception as e:
+                print('Failed at line: %d err: %s' % (linenum, e),
+                      file=sys.stderr)
+    except Exception as e:
+        exit('Error: %s' % (e))
b
diff -r 000000000000 -r 6fbd9d25ceef load_db.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/load_db.py Tue Jul 18 09:06:47 2017 -0400
[
@@ -0,0 +1,135 @@
+#!/usr/bin/env python
+
+from __future__ import print_function
+
+import sys
+
+from filters import TabularReader
+
+
+def getValueType(val):
+    if val or 0. == val:
+        try:
+            int(val)
+            return 'INTEGER'
+        except:
+            try:
+                float(val)
+                return 'REAL'
+            except:
+                return 'TEXT'
+    return None
+
+
+def get_column_def(file_path, table_name, skip=0, comment_char='#',
+                   column_names=None, max_lines=100, load_named_columns=False,
+                   filters=None):
+    col_pref = ['TEXT', 'REAL', 'INTEGER', None]
+    col_types = []
+    col_idx = None
+    try:
+        tr = TabularReader(file_path, skip=skip, comment_char=comment_char,
+                           col_idx=None, filters=filters)
+        for linenum, fields in enumerate(tr):
+            if linenum > max_lines:
+                break
+            try:
+                while len(col_types) < len(fields):
+                    col_types.append(None)
+                for i, val in enumerate(fields):
+                    colType = getValueType(val)
+                    if col_pref.index(colType) < col_pref.index(col_types[i]):
+                        col_types[i] = colType
+            except Exception as e:
+                print('Failed at line: %d err: %s' % (linenum, e),
+                      file=sys.stderr)
+    except Exception as e:
+        print('Failed: %s' % (e), file=sys.stderr)
+    for i, col_type in enumerate(col_types):
+        if not col_type:
+            col_types[i] = 'TEXT'
+    if column_names:
+        col_names = []
+        if load_named_columns:
+            col_idx = []
+            for i, cname in enumerate(
+                    [cn.strip() for cn in column_names.split(',')]):
+                if cname != '':
+                    col_idx.append(i)
+                    col_names.append(cname)
+            col_types = [col_types[i] for i in col_idx]
+        else:
+            col_names = ['c%d' % i for i in range(1, len(col_types) + 1)]
+            for i, cname in enumerate(
+                    [cn.strip() for cn in column_names.split(',')]):
+                if cname and i < len(col_names):
+                    col_names[i] = cname
+    else:
+        col_names = ['c%d' % i for i in range(1, len(col_types) + 1)]
+    col_def = []
+    for i, col_name in enumerate(col_names):
+        col_def.append('%s %s' % (col_names[i], col_types[i]))
+    return col_names, col_types, col_def, col_idx
+
+
+def create_table(conn, file_path, table_name, skip=0, comment_char='#',
+                 pkey_autoincr=None, column_names=None,
+                 load_named_columns=False, filters=None,
+                 unique_indexes=[], indexes=[]):
+    col_names, col_types, col_def, col_idx = \
+        get_column_def(file_path, table_name, skip=skip,
+                       comment_char=comment_char, column_names=column_names,
+                       load_named_columns=load_named_columns, filters=filters)
+    col_func = [float if t == 'REAL' else int
+                if t == 'INTEGER' else str for t in col_types]
+    table_def = 'CREATE TABLE %s (\n    %s%s\n);' % (
+                table_name,
+                '%s INTEGER PRIMARY KEY AUTOINCREMENT,' %
+                pkey_autoincr if pkey_autoincr else '',
+                ', \n    '.join(col_def))
+    # print >> sys.stdout, table_def
+    insert_stmt = 'INSERT INTO %s(%s) VALUES(%s)' % (
+                  table_name, ','.join(col_names),
+                  ','.join(["?" for x in col_names]))
+    # print >> sys.stdout, insert_stmt
+    data_lines = 0
+    try:
+        c = conn.cursor()
+        c.execute(table_def)
+        conn.commit()
+        c.close()
+        for i, index in enumerate(unique_indexes):
+            index_name = 'idx_uniq_%s_%d' % (table_name, i)
+            index_columns = index.split(',')
+            create_index(conn, table_name, index_name, index_columns,
+                         unique=True)
+        for i, index in enumerate(indexes):
+            index_name = 'idx_%s_%d' % (table_name, i)
+            index_columns = index.split(',')
+            create_index(conn, table_name, index_name, index_columns)
+        c = conn.cursor()
+        tr = TabularReader(file_path, skip=skip, comment_char=comment_char,
+                           col_idx=col_idx, filters=filters)
+        for linenum, fields in enumerate(tr):
+            data_lines += 1
+            try:
+                vals = [col_func[i](x)
+                        if x else None for i, x in enumerate(fields)]
+                c.execute(insert_stmt, vals)
+            except Exception as e:
+                print('Failed at line: %d err: %s' % (linenum, e),
+                      file=sys.stderr)
+        conn.commit()
+        c.close()
+    except Exception as e:
+        exit('Error: %s' % (e))
+
+
+def create_index(conn, table_name, index_name, index_columns, unique=False):
+    index_def = "CREATE %s INDEX %s on %s(%s)" % (
+                'UNIQUE' if unique else '', index_name,
+                table_name, ','.join(index_columns))
+    c = conn.cursor()
+    c.execute(index_def)
+    conn.commit()
+    c.close()
b
diff -r 000000000000 -r 6fbd9d25ceef macros.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/macros.xml Tue Jul 18 09:06:47 2017 -0400
[
b'@@ -0,0 +1,383 @@\n+<macros>\n+  <token name="@LINEFILTERS@">\n+<![CDATA[\n+  ## set linefilters to the \n+  #set $input_filters = []\n+  #for $fi in $linefilters:\n+    #if $fi.filter.filter_type == \'skip\':\n+      #set $skip_lines = None\n+      #if str($fi.filter.skip_lines) != \'\':\n+        #set $skip_lines = int($fi.filter.skip_lines)\n+      #elif $tbl.table.metadata.comment_lines and $tbl.table.metadata.comment_lines > 0:\n+        #set $skip_lines = int($tbl.table.metadata.comment_lines)\n+      #end if\n+      #if $skip_lines is not None:\n+        #set $filter_dict = dict()\n+        #set $filter_dict[\'filter\'] = str($fi.filter.filter_type)\n+        #set $filter_dict[\'count\'] = $skip_lines\n+        #silent $input_filters.append($filter_dict)\n+      #end if\n+    #elif $fi.filter.filter_type == \'comment\':\n+      #set $filter_dict = dict()\n+      #set $filter_dict[\'filter\'] = \'regex\'\n+      #set $filter_dict[\'pattern\'] = \'^(%s).*$\' % \'|\'.join([chr(int(x)).replace(\'|\',\'[|]\') for x in (str($fi.filter.comment_char)).split(\',\')])\n+      #set $filter_dict[\'action\'] = \'exclude_match\'\n+      #silent $input_filters.append($filter_dict)\n+    #elif $fi.filter.filter_type == \'regex\':\n+      #set $filter_dict = dict()\n+      #set $filter_dict[\'filter\'] = str($fi.filter.filter_type)\n+      #set $filter_dict[\'pattern\'] = str($fi.filter.regex_pattern)\n+      #set $filter_dict[\'action\'] = str($fi.filter.regex_action)\n+      #silent $input_filters.append($filter_dict)\n+    #elif $fi.filter.filter_type == \'select_columns\':\n+      #set $filter_dict = dict()\n+      #set $filter_dict[\'filter\'] = str($fi.filter.filter_type)\n+      #set $filter_dict[\'columns\'] = [int(str($ci).replace(\'c\',\'\')) for $ci in str($fi.filter.columns).split(\',\')]\n+      #silent $input_filters.append($filter_dict)\n+    #elif $fi.filter.filter_type == \'replace\':\n+      #set $filter_dict = dict()\n+      #set $filter_dict[\'filter\'] 
= str($fi.filter.filter_type)\n+      #set $filter_dict[\'column\'] = int(str($fi.filter.column).replace(\'c\',\'\'))\n+      #set $filter_dict[\'pattern\'] = str($fi.filter.regex_pattern)\n+      #set $filter_dict[\'replace\'] = str($fi.filter.regex_replace)\n+      #silent $input_filters.append($filter_dict)\n+    #elif str($fi.filter.filter_type).endswith(\'pend_line_num\'):\n+      #set $filter_dict = dict()\n+      #set $filter_dict[\'filter\'] = str($fi.filter.filter_type)\n+      #silent $input_filters.append($filter_dict)\n+    #elif str($fi.filter.filter_type).endswith(\'pend_text\'):\n+      #set $filter_dict = dict()\n+      #set $filter_dict[\'filter\'] = str($fi.filter.filter_type)\n+      #set $filter_dict[\'column_text\'] = str($fi.filter.column_text)\n+      #silent $input_filters.append($filter_dict)\n+    #elif $fi.filter.filter_type == \'normalize\':\n+      #set $filter_dict = dict()\n+      #set $filter_dict[\'filter\'] = str($fi.filter.filter_type)\n+      #set $filter_dict[\'columns\'] = [int(str($ci).replace(\'c\',\'\')) for $ci in str($fi.filter.columns).split(\',\')]\n+      #set $filter_dict[\'separator\'] = str($fi.filter.separator)\n+      #silent $input_filters.append($filter_dict)\n+    #end if\n+  #end for\n+]]>\n+  </token>\n+  <xml name="macro_line_filters">\n+                <repeat name="linefilters" title="Filter Tabular Input Lines">\n+                    <conditional name="filter">\n+                        <param name="filter_type" type="select" label="Filter By">\n+                            <option value="skip">skip leading lines</option>\n+                            <option value="comment">comment char</option>\n+                            <option value="regex">by regex expression matching</option>\n+                            <option value="select_columns">select columns</option>\n+                            <option value="replace">regex replace value in column</option>\n+                            <option 
value="prepend_line_num">prepend a line number column</option>\n+                            <option value="append_line_num">append a line number column</option>\n+         '..b'==== ============\n+    #CustomerID FirstName  LastName   Email                 DOB        Phone\n+    =========== ========== ========== ===================== ========== ============\n+    1           John       Smith      John.Smith@yahoo.com  1968-02-04 626 222-2222\n+    2           Steven     Goldfish   goldfish@fishhere.net 1974-04-04 323 455-4545\n+    3           Paula      Brown      pb@herowndomain.org   1978-05-24 416 323-3232\n+    4           James      Smith      jim@supergig.co.uk    1980-10-20 416 323-8888\n+    =========== ========== ========== ===================== ========== ============\n+  \n+   Dataset *sales*\n+  \n+    Table name: "sales"\n+  \n+    Column names: "CustomerID,Date,SaleAmount"\n+  \n+    =============  ============  ============\n+      #CustomerID    Date          SaleAmount\n+    =============  ============  ============\n+               2    2004-05-06         100.22\n+               1    2004-05-07          99.95\n+               3    2004-05-07         122.95\n+               3    2004-05-13         100.00\n+               4    2004-05-22         555.55\n+    =============  ============  ============\n+  \n+  The query\n+  \n+  ::\n+  \n+    SELECT FirstName,LastName,sum(SaleAmount) as "TotalSales" \n+    FROM customers join sales on customers.CustomerID = sales.CustomerID \n+    GROUP BY customers.CustomerID ORDER BY TotalSales DESC;\n+  \n+  Produces this tabular output:\n+  \n+    ========== ======== ==========\n+    #FirstName LastName TotalSales\n+    ========== ======== ==========\n+    James      Smith    555.55\n+    Paula      Brown    222.95\n+    Steven     Goldfish 100.22\n+    John       Smith    99.95\n+    ========== ======== ==========\n+  \n+  \n+  If the optional Table name and Column names inputs are not used, the query would 
be:\n+  \n+  ::\n+  \n+    SELECT t1.c2 as "FirstName", t1.c3 as "LastName", sum(t2.c3) as "TotalSales" \n+    FROM t1 join t2 on t1.c1 = t2.c1 \n+    GROUP BY t1.c1 ORDER BY TotalSales DESC;\n+  \n+  You can selectively name columns, e.g. on the customers input you could just name columns 2,3, and 5: \n+  \n+    Column names: ,FirstName,LastName,,BirthDate\n+  \n+    Results in the following data base table\n+  \n+    =========== ========== ========== ===================== ========== ============\n+    #c1         FirstName  LastName   c4                    BirthDate  c6\n+    =========== ========== ========== ===================== ========== ============\n+    1           John       Smith      John.Smith@yahoo.com  1968-02-04 626 222-2222\n+    2           Steven     Goldfish   goldfish@fishhere.net 1974-04-04 323 455-4545\n+    3           Paula      Brown      pb@herowndomain.org   1978-05-24 416 323-3232\n+    4           James      Smith      jim@supergig.co.uk    1980-10-20 416 323-8888\n+    =========== ========== ========== ===================== ========== ============\n+\n+\n+  Regular_expression_ functions are included for: \n+\n+  ::\n+\n+    matching:      re_match(\'pattern\',column) \n+\n+    SELECT t1.FirstName, t1.LastName\n+    FROM t1\n+    WHERE re_match(\'^.*\\.(net|org)$\',c4)\n+\n+  Results:\n+\n+    =========== ==========\n+    #FirstName  LastName\n+    =========== ==========\n+    Steven      Goldfish\n+    Paula       Brown\n+    =========== ==========\n+\n+\n+  ::\n+\n+    searching:     re_search(\'pattern\',column)\n+    substituting:  re_sub(\'pattern\',\'replacement,column)\n+\n+    SELECT t1.FirstName, t1.LastName, re_sub(\'^\\d{2}(\\d{2})-(\\d\\d)-(\\d\\d)\',\'\\3/\\2/\\1\',BirthDate) as "DOB"\n+    FROM t1\n+    WHERE re_search(\'[hp]er\',c4)\n+\n+  Results:\n+\n+\n+    =========== ========== ==========\n+    #FirstName  LastName   DOB\n+    =========== ========== ==========\n+    Steven      Goldfish   04/04/74\n+    Paula       
Brown      24/05/78\n+    James       Smith      20/10/80\n+    =========== ========== ==========\n+\n+.. _Regular_expression: https://docs.python.org/release/2.7/library/re.html\n+.. _SQLite: http://www.sqlite.org/index.html\n+.. _SQLite_functions: http://www.sqlite.org/docs.html\n+\n+\n+]]>\n+  </token>\n+\n+</macros>\n+\n'
b
diff -r 000000000000 -r 6fbd9d25ceef query_db.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/query_db.py Tue Jul 18 09:06:47 2017 -0400
[
@@ -0,0 +1,67 @@
+#!/usr/bin/env python
+
+from __future__ import print_function
+
+import re
+import sqlite3 as sqlite
+import sys
+
+
+# Selects the name and sql columns of every 'table' entry in sqlite_master,
+# ordered by name.
+TABLE_QUERY = \
+    """
+    SELECT name, sql
+    FROM sqlite_master
+    WHERE type='table'
+    ORDER BY name
+    """
+
+
+def regex_match(expr, item):
+    return re.match(expr, item) is not None
+
+
+def regex_search(expr, item):
+    return re.search(expr, item) is not None
+
+
+def regex_sub(expr, replace, item):
+    return re.sub(expr, replace, item)
+
+
+def get_connection(sqlitedb_path, addfunctions=True):
+    conn = sqlite.connect(sqlitedb_path)
+    if addfunctions:
+        conn.create_function("re_match", 2, regex_match)
+        conn.create_function("re_search", 2, regex_search)
+        conn.create_function("re_sub", 3, regex_sub)
+    return conn
+
+
+def describe_tables(conn, outputFile):
+    try:
+        c = conn.cursor()
+        tables_query = TABLE_QUERY
+        rslt = c.execute(tables_query).fetchall()
+        for table, sql in rslt:
+            print("Table %s:" % table, file=outputFile)
+            try:
+                col_query = 'SELECT * FROM %s LIMIT 0' % table
+                cur = conn.cursor().execute(col_query)
+                cols = [col[0] for col in cur.description]
+                print(" Columns: %s" % cols, file=outputFile)
+            except Exception as exc:
+                print("Warning: %s" % exc, file=sys.stderr)
+    except Exception as e:
+        exit('Error: %s' % (e))
+    exit(0)
+
+
+def run_query(conn, query, outputFile, no_header=False):
+    cur = conn.cursor()
+    results = cur.execute(query)
+    if not no_header:
+        outputFile.write("#%s\n" % '\t'.join(
+            [str(col[0]) for col in cur.description]))
+    for i, row in enumerate(results):
+        outputFile.write("%s\n" % '\t'.join(
+            [str(val) if val is not None else '' for val in row]))
b
diff -r 000000000000 -r 6fbd9d25ceef query_tabular.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/query_tabular.py Tue Jul 18 09:06:47 2017 -0400
[
@@ -0,0 +1,137 @@
+#!/usr/bin/env python
+
+from __future__ import print_function
+
+import json
+import optparse
+import os.path
+import sys
+
+from load_db import create_table
+
+from query_db import describe_tables, get_connection, run_query
+
+
+"""
+JSON config:
+{ tables : [
+    { file_path : '/home/galaxy/dataset_101.dat',
+            table_name : 't1',
+            column_names : ['c1','c2','c3'],
+            pkey_autoincr : 'id',
+            comment_lines : 1,
+            unique: ['c1'],
+            index: ['c2', 'c3']
+    },
+    { file_path : '/home/galaxy/dataset_102.dat',
+            table_name : 'gff',
+            column_names : ['seqname',,'date','start','end'],
+            comment_lines : 1,
+            load_named_columns : True,
+            filters : [{'filter': 'regex', 'pattern': '#peptide',
+                        'action': 'exclude_match'},
+                       {'filter': 'replace', 'column': 3,
+                        'replace': 'gi[|]', 'pattern': ''}]
+    },
+    { file_path : '/home/galaxy/dataset_103.dat',
+            table_name : 'test',
+            column_names : ['c1', 'c2', 'c3']
+    }
+    ]
+}
+"""
+
+
+def __main__():
+    """Command-line entry point: load tabular files into SQLite and query.
+
+    When --jsonfile is given, each entry of its 'tables' list is loaded
+    into the --sqlitedb database with create_table.  The SQL read from
+    --query_file (checked first) or given by --query is then run and its
+    rows are written tab-separated to --output (stdout when omitted).
+    With no query, the tables of the database are described instead.
+    Failures while opening the output, loading tables or querying end the
+    process with an 'Error: ...' message.
+    """
+    # Parse Command Line
+    parser = optparse.OptionParser()
+    parser.add_option('-s', '--sqlitedb', dest='sqlitedb', default=None,
+                      help='The SQLite Database')
+    parser.add_option('-j', '--jsonfile', dest='jsonfile', default=None,
+                      help='JSON dict of table specifications')
+    parser.add_option('-q', '--query', dest='query', default=None,
+                      help='SQL query')
+    parser.add_option('-Q', '--query_file', dest='query_file', default=None,
+                      help='SQL query file')
+    # The flag suppresses the header, so the help text must say so.
+    parser.add_option('-n', '--no_header', dest='no_header', default=False,
+                      action='store_true',
+                      help='Omit the column headers line')
+    parser.add_option('-o', '--output', dest='output', default=None,
+                      help='Output file for query results')
+    (options, args) = parser.parse_args()
+
+    # determine output destination
+    if options.output is not None:
+        try:
+            outputPath = os.path.abspath(options.output)
+            outputFile = open(outputPath, 'w')
+        except Exception as e:
+            sys.exit('Error: %s' % (e))
+    else:
+        outputFile = sys.stdout
+
+    def _create_table(ti, table):
+        """Load one table specification; ti is its 0-based list position.
+
+        Only 'file_path' is required.  The table name defaults to
+        't<ti + 1>' and every other key falls back to the default below.
+        """
+        path = table['file_path']
+        column_names = table.get('column_names')
+        # load_named_columns is only honoured when column names are given
+        load_named_columns = \
+            table.get('load_named_columns', False) if column_names else False
+        create_table(get_connection(options.sqlitedb), path,
+                     table.get('table_name', 't%d' % (ti + 1)),
+                     pkey_autoincr=table.get('pkey_autoincr'),
+                     column_names=column_names,
+                     skip=table.get('comment_lines', 0),
+                     comment_char=table.get('comment_char'),
+                     load_named_columns=load_named_columns,
+                     filters=table.get('filters'),
+                     unique_indexes=table.get('unique', []),
+                     indexes=table.get('index', []))
+
+    if options.jsonfile:
+        try:
+            with open(options.jsonfile) as fh:
+                tdef = json.load(fh)
+            if 'tables' in tdef:
+                for ti, table in enumerate(tdef['tables']):
+                    _create_table(ti, table)
+        except Exception as e:
+            sys.exit('Error: %s' % (e))
+
+    query = None
+    if options.query_file is not None:
+        with open(options.query_file, 'r') as fh:
+            query = fh.read()
+    elif options.query is not None:
+        query = options.query
+
+    try:
+        if query is None:
+            describe_tables(get_connection(options.sqlitedb), outputFile)
+        else:
+            run_query(get_connection(options.sqlitedb), query, outputFile,
+                      no_header=options.no_header)
+    except Exception as e:
+        sys.exit('Error: %s' % (e))
+    finally:
+        # Runs on normal return and on SystemExit alike, so an output file
+        # opened above is always flushed and closed.
+        if outputFile is not sys.stdout:
+            outputFile.close()
+
+
+if __name__ == "__main__":
+    __main__()
b
diff -r 000000000000 -r 6fbd9d25ceef sqlite_to_tabular.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/sqlite_to_tabular.py Tue Jul 18 09:06:47 2017 -0400
b
@@ -0,0 +1,60 @@
+#!/usr/bin/env python
+
+from __future__ import print_function
+
+import optparse
+import os.path
+import sys
+
+from query_db import describe_tables, get_connection, run_query
+
+
+def __main__():
+    """Command-line entry point: export SQLite query results as TSV.
+
+    Runs the SQL read from --query_file (checked first) or given by
+    --query against the --sqlitedb database and writes the rows
+    tab-separated to --output (stdout when omitted).  With no query, the
+    tables of the database are described instead.  Failures while opening
+    the output or querying end the process with an 'Error: ...' message.
+    """
+    # Parse Command Line
+    parser = optparse.OptionParser()
+    parser.add_option('-s', '--sqlitedb', dest='sqlitedb', default=None,
+                      help='The SQLite Database')
+    parser.add_option('-q', '--query', dest='query', default=None,
+                      help='SQL query')
+    parser.add_option('-Q', '--query_file', dest='query_file', default=None,
+                      help='SQL query file')
+    # The flag suppresses the header, so the help text must say so.
+    parser.add_option('-n', '--no_header', dest='no_header', default=False,
+                      action='store_true',
+                      help='Omit the column headers line')
+    parser.add_option('-o', '--output', dest='output', default=None,
+                      help='Output file for query results')
+    (options, args) = parser.parse_args()
+
+    # determine output destination
+    if options.output is not None:
+        try:
+            outputPath = os.path.abspath(options.output)
+            outputFile = open(outputPath, 'w')
+        except Exception as e:
+            sys.exit('Error: %s' % (e))
+    else:
+        outputFile = sys.stdout
+
+    query = None
+    if options.query_file is not None:
+        with open(options.query_file, 'r') as fh:
+            query = fh.read()
+    elif options.query is not None:
+        query = options.query
+
+    try:
+        if query is None:
+            describe_tables(get_connection(options.sqlitedb), outputFile)
+            sys.exit(0)
+        else:
+            run_query(get_connection(options.sqlitedb), query, outputFile,
+                      no_header=options.no_header)
+    except Exception as e:
+        sys.exit('Error: %s' % (e))
+    finally:
+        # Runs on normal return and on SystemExit alike, so an output file
+        # opened above is always flushed and closed.
+        if outputFile is not sys.stdout:
+            outputFile.close()
+
+
+if __name__ == "__main__":
+    __main__()
b
diff -r 000000000000 -r 6fbd9d25ceef test-data/IEDB.tsv
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/IEDB.tsv Tue Jul 18 09:06:47 2017 -0400
b
@@ -0,0 +1,17 @@
+#ID allele seq_num start end length peptide method percentile_rank ann_ic50 ann_rank smm_ic50 smm_rank comblib_sidney2008_score comblib_sidney2008_rank netmhcpan_ic50 netmhcpan_rank
+PPAP2C HLA-A*02:01 1 3 11 9 GMYCMVFLV Consensus (ann/smm/comblib_sidney2008) 0.2 4 0.2 3.77 0.2 7.1e-06 0.5 - -
+PPAP2C HLA-A*23:01 1 1 9 9 SFGMYCMVF Consensus (ann/smm) 0.5 67 0.5 137.54 0.5 - - - -
+PPAP2C HLA-A*23:01 1 4 12 9 MYCMVFLVK Consensus (ann/smm) 0.65 146 0.7 160.11 0.6 - - - -
+PPAP2C HLA-A*02:01 1 2 10 9 FGMYCMVFL Consensus (ann/smm/comblib_sidney2008) 2.3 222 3.1 150.01 2.3 2.14e-05 1.3 - -
+PPAP2C HLA-A*23:01 1 3 11 9 GMYCMVFLV Consensus (ann/smm) 4.95 3256 4 2706.64 5.9 - - - -
+PPAP2C HLA-A*23:01 1 2 10 9 FGMYCMVFL Consensus (ann/smm) 6.55 4423 4.9 4144.10 8.2 - - - -
+PPAP2C HLA-A*02:01 1 1 9 9 SFGMYCMVF Consensus (ann/smm/comblib_sidney2008) 45 24390 45 44989.38 39 0.01 91 - -
+PPAP2C HLA-A*02:01 1 4 12 9 MYCMVFLVK Consensus (ann/smm/comblib_sidney2008) 54 23399 41 157801.09 54 0.01 86 - -
+ADAMTSL1 HLA-A*02:01 1 1 9 9 SLDMCISGL Consensus (ann/smm/comblib_sidney2008) 1 26 1 51.65 0.9 3.02e-05 1.7 - -
+ADAMTSL1 HLA-A*23:01 1 4 12 9 MCISGLCQL Consensus (ann/smm) 6.65 5781 5.9 3626.02 7.4 - - - -
+ADAMTSL1 HLA-A*02:01 1 4 12 9 MCISGLCQL Consensus (ann/smm/comblib_sidney2008) 14 1823 6.5 2612.82 14 0.00056 24 - -
+ADAMTSL1 HLA-A*23:01 1 1 9 9 SLDMCISGL Consensus (ann/smm) 30.5 27179 34 24684.82 27 - - - -
+ADAMTSL1 HLA-A*02:01 1 2 10 9 LDMCISGLC Consensus (ann/smm/comblib_sidney2008) 42 23677 42 53716.78 41 0.01 71 - -
+ADAMTSL1 HLA-A*23:01 1 3 11 9 DMCISGLCQ Consensus (ann/smm) 64.5 34451 73 118148.99 56 - - - -
+ADAMTSL1 HLA-A*23:01 1 2 10 9 LDMCISGLC Consensus (ann/smm) 76.0 33222 62 665932.18 90 - - - -
+ADAMTSL1 HLA-A*02:01 1 3 11 9 DMCISGLCQ Consensus (ann/smm/comblib_sidney2008) 97 31630 98 639896.89 71 0.03 97 - -
b
diff -r 000000000000 -r 6fbd9d25ceef test-data/add_to_db_results.tsv
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/add_to_db_results.tsv Tue Jul 18 09:06:47 2017 -0400
b
@@ -0,0 +1,3 @@
+#id first last pets quote
+1 Paula Brown 2 Time flies like and arrow.  Fruit flies like a banana.
+2 Steven Jones 1 I would have wrtten less if I had more time
b
diff -r 000000000000 -r 6fbd9d25ceef test-data/customers.tsv
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/customers.tsv Tue Jul 18 09:06:47 2017 -0400
b
@@ -0,0 +1,5 @@
+#CustomerID FirstName LastName Email DOB Phone
+1 John Smith John.Smith@yahoo.com 1968-02-04 626 222-2222
+2 Steven Goldfish goldfish@fishhere.net 1974-04-04 323 455-4545
+3 Paula Brown pb@herowndomain.org 1978-05-24 416 323-3232
+4 James Smith jim@supergig.co.uk 1980-10-20 416 323-8888
b
diff -r 000000000000 -r 6fbd9d25ceef test-data/filtered_people_results.tsv
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/filtered_people_results.tsv Tue Jul 18 09:06:47 2017 -0400
b
@@ -0,0 +1,4 @@
+1 Paula Brown 1978-05-24 2
+2 Steven Jones 1974-04-04 1
+3 Jane Doe 1978-05-24 0
+4 James Smith 1980-10-20 1
b
diff -r 000000000000 -r 6fbd9d25ceef test-data/filtered_pets_results.tsv
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/filtered_pets_results.tsv Tue Jul 18 09:06:47 2017 -0400
b
@@ -0,0 +1,4 @@
+1 Rex dog
+1 Fluff cat
+2 Allie cat
+4 Spot
b
diff -r 000000000000 -r 6fbd9d25ceef test-data/netMHC_summary.tsv
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/netMHC_summary.tsv Tue Jul 18 09:06:47 2017 -0400
b
@@ -0,0 +1,9 @@
+#pos peptide logscore affinity(nM) Bind Level Protein Name Allele
+2 GMYCMVFLV 0.858 4 SB PPAP2C HLA-A02:01
+1 FGMYCMVFL 0.501 222 WB PPAP2C HLA-A02:01
+3 MYCMVFLVK 0.070 23399 PPAP2C HLA-A02:01
+0 SFGMYCMVF 0.066 24390 PPAP2C HLA-A02:01
+0 SLDMCISGL 0.698 26 SB ADAMTSL1 HLA-A02:01
+3 MCISGLCQL 0.306 1823 ADAMTSL1 HLA-A02:01
+1 LDMCISGLC 0.069 23677 ADAMTSL1 HLA-A02:01
+2 DMCISGLCQ 0.042 31630 ADAMTSL1 HLA-A02:01
b
diff -r 000000000000 -r 6fbd9d25ceef test-data/pet_normalized_query_results.tsv
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/pet_normalized_query_results.tsv Tue Jul 18 09:06:47 2017 -0400
b
@@ -0,0 +1,3 @@
+#id first last dob name animal pets
+1 Paula Brown 1978-05-24 Fluff cat 2
+2 Steven Jones 1974-04-04 Allie cat 1
b
diff -r 000000000000 -r 6fbd9d25ceef test-data/pets.tsv
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/pets.tsv Tue Jul 18 09:06:47 2017 -0400
b
@@ -0,0 +1,7 @@
+#People with pets
+Pets FirstName LastName DOB PetNames PetType
+2 Paula Brown 24/05/78 Rex,Fluff dog,cat
+1 Steven Jones 04/04/74 Allie cat
+0 Jane Doe 24/05/78
+1 James Smith 20/10/80 Spot
+
b
diff -r 000000000000 -r 6fbd9d25ceef test-data/query_results.tsv
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/query_results.tsv Tue Jul 18 09:06:47 2017 -0400
b
@@ -0,0 +1,17 @@
+#ID peptide start end percentile_rank logscore affinity Bind_Level
+PPAP2C GMYCMVFLV 3 11 0.2 0.858 4 SB
+PPAP2C GMYCMVFLV 3 11 4.95 0.858 4 SB
+ADAMTSL1 SLDMCISGL 1 9 1.0 0.698 26 SB
+ADAMTSL1 SLDMCISGL 1 9 30.5 0.698 26 SB
+PPAP2C FGMYCMVFL 2 10 2.3 0.501 222 WB
+PPAP2C FGMYCMVFL 2 10 6.55 0.501 222 WB
+ADAMTSL1 MCISGLCQL 4 12 6.65 0.306 1823
+ADAMTSL1 MCISGLCQL 4 12 14.0 0.306 1823
+PPAP2C MYCMVFLVK 4 12 0.65 0.07 23399
+PPAP2C MYCMVFLVK 4 12 54.0 0.07 23399
+ADAMTSL1 LDMCISGLC 2 10 42.0 0.069 23677
+ADAMTSL1 LDMCISGLC 2 10 76.0 0.069 23677
+PPAP2C SFGMYCMVF 1 9 0.5 0.066 24390
+PPAP2C SFGMYCMVF 1 9 45.0 0.066 24390
+ADAMTSL1 DMCISGLCQ 3 11 64.5 0.042 31630
+ADAMTSL1 DMCISGLCQ 3 11 97.0 0.042 31630
b
diff -r 000000000000 -r 6fbd9d25ceef test-data/regex_results.tsv
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/regex_results.tsv Tue Jul 18 09:06:47 2017 -0400
b
@@ -0,0 +1,4 @@
+#FirstName LastName DOB
+Steven Goldfish 04/04/74
+Paula Brown 24/05/78
+James Smith 20/10/80
b
diff -r 000000000000 -r 6fbd9d25ceef test-data/sales.tsv
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/sales.tsv Tue Jul 18 09:06:47 2017 -0400
b
@@ -0,0 +1,6 @@
+#CustomerID Date SaleAmount
+2 2004-05-06 100.22
+1 2004-05-07 99.95
+3 2004-05-07 122.95
+3 2004-05-13 100.00
+4 2004-05-22 555.55
b
diff -r 000000000000 -r 6fbd9d25ceef test-data/sales_results.tsv
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/sales_results.tsv Tue Jul 18 09:06:47 2017 -0400
b
@@ -0,0 +1,5 @@
+#FirstName LastName TotalSales
+James Smith 555.55
+Paula Brown 222.95
+Steven Goldfish 100.22
+John Smith 99.95
b
diff -r 000000000000 -r 6fbd9d25ceef test-data/testdb.sqlite
b
Binary file test-data/testdb.sqlite has changed