changeset 0:3708ff0198b7 draft
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/query_tabular commit 74915fc9cee746bbce1c4b507e13231259de177d
author: iuc
date: Tue, 18 Jul 2017 09:07:07 -0400
parents:
children: 8a33b442ecd9
files: filter_tabular.py filters.py load_db.py macros.xml query_db.py query_tabular.py query_tabular.xml sqlite_to_tabular.py test-data/IEDB.tsv test-data/add_to_db_results.tsv test-data/customers.tsv test-data/filtered_people_results.tsv test-data/filtered_pets_results.tsv test-data/netMHC_summary.tsv test-data/pet_normalized_query_results.tsv test-data/pets.tsv test-data/query_results.tsv test-data/regex_results.tsv test-data/sales.tsv test-data/testdb.sqlite
diffstat: 21 files changed, 1600 insertions(+), 0 deletions(-)
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/filter_tabular.py	Tue Jul 18 09:07:07 2017 -0400
@@ -0,0 +1,68 @@
+#!/usr/bin/env python
+
+from __future__ import print_function
+
+import json
+import optparse
+import os.path
+import sys
+
+from filters import filter_file
+
+
+def __main__():
+    # Parse Command Line
+    parser = optparse.OptionParser()
+    parser.add_option('-i', '--input', dest='input', default=None,
+                      help='Input file for filtering')
+    parser.add_option('-j', '--jsonfile', dest='jsonfile', default=None,
+                      help='JSON array of filter specifications')
+    parser.add_option('-o', '--output', dest='output', default=None,
+                      help='Output file for query results')
+    parser.add_option('-v', '--verbose', dest='verbose', default=False,
+                      action='store_true',
+                      help='verbose')
+    (options, args) = parser.parse_args()
+
+    if options.input is not None:
+        try:
+            inputPath = os.path.abspath(options.input)
+            inputFile = open(inputPath, 'r')
+        except Exception as e:
+            exit('Error: %s' % (e))
+    else:
+        inputFile = sys.stdin
+
+    if options.output is not None:
+        try:
+            outputPath = os.path.abspath(options.output)
+            outputFile = open(outputPath, 'w')
+        except Exception as e:
+            exit('Error: %s' % (e))
+    else:
+        outputFile = sys.stdout
+
+    filters = None
+    if options.jsonfile:
+        try:
+            with open(options.jsonfile) as fh:
+                filters = json.load(fh)
+        except Exception as e:
+            exit('Error: %s' % (e))
+
+    if options.verbose and filters:
+        for f in filters:
+            print('%s %s' % (f['filter'],
+                             ', '.join(
+                                 ['%s: %s' % (k, f[k])
+                                  for k in set(f.keys()) - set(['filter'])])),
+                  file=sys.stdout)
+
+    try:
+        filter_file(inputFile, outputFile, filters=filters)
+    except Exception as e:
+        exit('Error: %s' % (e))
+
+
+if __name__ == "__main__":
+    __main__()
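For orientation, a minimal sketch of driving this script end to end; the file names (pets.tsv, filters.json, filtered.tsv) are hypothetical, but the -i/-j/-o options and the JSON array of filter specifications match what the script passes to filters.filter_file:

    import json
    import subprocess

    # hypothetical paths; the filter specs mirror the dicts filters.py accepts
    filters = [
        {"filter": "regex", "pattern": "^#", "action": "exclude_match"},
        {"filter": "append_line_num"},
    ]
    with open("filters.json", "w") as fh:
        json.dump(filters, fh)

    # equivalent to: python filter_tabular.py -i pets.tsv -j filters.json -o filtered.tsv
    subprocess.check_call(["python", "filter_tabular.py",
                           "-i", "pets.tsv", "-j", "filters.json",
                           "-o", "filtered.tsv"])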
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/filters.py	Tue Jul 18 09:07:07 2017 -0400
@@ -0,0 +1,156 @@
+#!/usr/bin/env python
+
+from __future__ import print_function
+
+import re
+import sys
+
+
+class LineFilter(object):
+    def __init__(self, source, filter_dict):
+        self.source = source
+        self.filter_dict = filter_dict
+        self.func = lambda i, l: l.rstrip('\r\n') if l else None
+        self.src_lines = []
+        self.src_line_cnt = 0
+        if not filter_dict:
+            return
+        if filter_dict['filter'] == 'regex':
+            rgx = re.compile(filter_dict['pattern'])
+            if filter_dict['action'] == 'exclude_match':
+                self.func = lambda i, l: l if not rgx.match(l) else None
+            elif filter_dict['action'] == 'include_match':
+                self.func = lambda i, l: l if rgx.match(l) else None
+            elif filter_dict['action'] == 'exclude_find':
+                self.func = lambda i, l: l if not rgx.search(l) else None
+            elif filter_dict['action'] == 'include_find':
+                self.func = lambda i, l: l if rgx.search(l) else None
+        elif filter_dict['filter'] == 'select_columns':
+            cols = [int(c) - 1 for c in filter_dict['columns']]
+            self.func = lambda i, l: self.select_columns(l, cols)
+        elif filter_dict['filter'] == 'replace':
+            p = filter_dict['pattern']
+            r = filter_dict['replace']
+            c = int(filter_dict['column']) - 1
+            self.func = lambda i, l: '\t'.join(
+                [x if j != c else re.sub(p, r, x)
+                 for j, x in enumerate(l.split('\t'))])
+        elif filter_dict['filter'] == 'prepend_line_num':
+            self.func = lambda i, l: '%d\t%s' % (i, l)
+        elif filter_dict['filter'] == 'append_line_num':
+            self.func = lambda i, l: '%s\t%d' % (l.rstrip('\r\n'), i)
+        elif filter_dict['filter'] == 'prepend_text':
+            s = filter_dict['column_text']
+            self.func = lambda i, l: '%s\t%s' % (s, l)
+        elif filter_dict['filter'] == 'append_text':
+            s = filter_dict['column_text']
+            self.func = lambda i, l: '%s\t%s' % (l.rstrip('\r\n'), s)
+        elif filter_dict['filter'] == 'skip':
+            cnt = filter_dict['count']
+            self.func = lambda i, l: l if i > cnt else None
+        elif filter_dict['filter'] == 'normalize':
+            cols = [int(c) - 1 for c in filter_dict['columns']]
+            sep = filter_dict['separator']
+            self.func = lambda i, l: self.normalize(l, cols, sep)
+
+    def __iter__(self):
+        return self
+
+    def __next__(self):
+        if not self.src_lines:
+            self.get_lines()
+        if self.src_lines:
+            return self.src_lines.pop(0)
+        raise StopIteration
+
+    next = __next__
+
+    def select_columns(self, line, cols):
+        fields = line.split('\t')
+        return '\t'.join([fields[x] for x in cols])
+
+    def normalize(self, line, split_cols, sep):
+        lines = []
+        fields = line.rstrip('\r\n').split('\t')
+        split_fields = dict()
+        cnt = 0
+        for c in split_cols:
+            if c < len(fields):
+                split_fields[c] = fields[c].split(sep)
+                cnt = max(cnt, len(split_fields[c]))
+        if cnt == 0:
+            lines.append('\t'.join(fields))
+        else:
+            for n in range(0, cnt):
+                flds = [x if c not in split_cols
+                        else split_fields[c][n]
+                        if n < len(split_fields[c])
+                        else ''
+                        for (c, x) in enumerate(fields)]
+                lines.append('\t'.join(flds))
+        return lines
+
+    def get_lines(self):
+        for i, next_line in enumerate(self.source):
+            self.src_line_cnt += 1
+            line = self.func(self.src_line_cnt, next_line)
+            if line:
+                if isinstance(line, list):
+                    self.src_lines.extend(line)
+                else:
+                    self.src_lines.append(line)
+                return
+
+
+class TabularReader:
+    """
+    Tabular file iterator.
+    Returns a list of fields for each line.
+    """
+    def __init__(self, input_file, skip=0, comment_char=None, col_idx=None,
+                 filters=None):
+        self.skip = skip
+        self.comment_char = comment_char
+        self.col_idx = col_idx
+        self.filters = filters
+        self.tsv_file = \
+            input_file if hasattr(input_file, 'readline') else open(input_file)
+        if skip and skip > 0:
+            for i in range(skip):
+                if not self.tsv_file.readline():
+                    break
+        source = LineFilter(self.tsv_file, None)
+        if comment_char:
+            source = LineFilter(source,
                                {"filter": "regex", "pattern": comment_char,
+                                 "action": "exclude_match"})
+        if filters:
+            for f in filters:
+                source = LineFilter(source, f)
+        self.source = source
+
+    def __iter__(self):
+        return self
+
+    def __next__(self):
+        ''' Iteration '''
+        for i, line in enumerate(self.source):
+            fields = line.rstrip('\r\n').split('\t')
+            if self.col_idx:
+                fields = [fields[i] for i in self.col_idx]
+            return fields
+        raise StopIteration
+
+    next = __next__
+
+
+def filter_file(input_file, output, skip=0, comment_char='#', filters=None):
+    data_lines = 0
+    try:
+        tr = TabularReader(input_file, skip=skip, comment_char=comment_char,
+                           filters=filters)
+        for linenum, fields in enumerate(tr):
+            data_lines += 1
+            try:
+                output.write('%s\n' % '\t'.join(fields))
+            except Exception as e:
+                print('Failed at line: %d err: %s' % (linenum, e),
+                      file=sys.stderr)
+    except Exception as e:
+        exit('Error: %s' % (e))
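The filter machinery can also be used as a library. A minimal sketch, assuming a hypothetical pets.tsv like the test-data file: TabularReader skips one leading line, drops '#' comment lines, and the 'normalize' spec replicates each row for every item of the comma-separated lists in columns 5 and 6 (column numbers are 1-based):

    from filters import TabularReader

    # hypothetical input file; each yielded row is a list of column strings
    reader = TabularReader('pets.tsv', skip=1, comment_char='#',
                           filters=[{'filter': 'normalize',
                                     'columns': [5, 6],
                                     'separator': ','}])
    for fields in reader:
        print('\t'.join(fields))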
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/load_db.py	Tue Jul 18 09:07:07 2017 -0400
@@ -0,0 +1,135 @@
+#!/usr/bin/env python
+
+from __future__ import print_function
+
+import sys
+
+from filters import TabularReader
+
+
+def getValueType(val):
+    if val or 0. == val:
+        try:
+            int(val)
+            return 'INTEGER'
+        except:
+            try:
+                float(val)
+                return 'REAL'
+            except:
+                return 'TEXT'
+    return None
+
+
+def get_column_def(file_path, table_name, skip=0, comment_char='#',
+                   column_names=None, max_lines=100,
+                   load_named_columns=False, filters=None):
+    col_pref = ['TEXT', 'REAL', 'INTEGER', None]
+    col_types = []
+    col_idx = None
+    try:
+        tr = TabularReader(file_path, skip=skip, comment_char=comment_char,
+                           col_idx=None, filters=filters)
+        for linenum, fields in enumerate(tr):
+            if linenum > max_lines:
+                break
+            try:
+                while len(col_types) < len(fields):
+                    col_types.append(None)
+                for i, val in enumerate(fields):
+                    colType = getValueType(val)
+                    if col_pref.index(colType) < col_pref.index(col_types[i]):
+                        col_types[i] = colType
+            except Exception as e:
+                print('Failed at line: %d err: %s' % (linenum, e),
+                      file=sys.stderr)
+    except Exception as e:
+        print('Failed: %s' % (e), file=sys.stderr)
+    for i, col_type in enumerate(col_types):
+        if not col_type:
+            col_types[i] = 'TEXT'
+    if column_names:
+        col_names = []
+        if load_named_columns:
+            col_idx = []
+            for i, cname in enumerate(
+                    [cn.strip() for cn in column_names.split(',')]):
+                if cname != '':
+                    col_idx.append(i)
+                    col_names.append(cname)
+            col_types = [col_types[i] for i in col_idx]
+        else:
+            col_names = ['c%d' % i for i in range(1, len(col_types) + 1)]
+            for i, cname in enumerate(
+                    [cn.strip() for cn in column_names.split(',')]):
+                if cname and i < len(col_names):
+                    col_names[i] = cname
+    else:
+        col_names = ['c%d' % i for i in range(1, len(col_types) + 1)]
+    col_def = []
+    for i, col_name in enumerate(col_names):
+        col_def.append('%s %s' % (col_names[i], col_types[i]))
+    return col_names, col_types, col_def, col_idx
+
+
+def create_table(conn, file_path, table_name, skip=0, comment_char='#',
+                 pkey_autoincr=None, column_names=None,
+                 load_named_columns=False, filters=None,
+                 unique_indexes=[], indexes=[]):
+    col_names, col_types, col_def, col_idx = \
+        get_column_def(file_path, table_name, skip=skip,
+                       comment_char=comment_char, column_names=column_names,
+                       load_named_columns=load_named_columns, filters=filters)
+    col_func = [float if t == 'REAL' else int
+                if t == 'INTEGER' else str for t in col_types]
+    table_def = 'CREATE TABLE %s (\n    %s%s\n);' % (
+        table_name,
+        '%s INTEGER PRIMARY KEY AUTOINCREMENT,' %
+        pkey_autoincr if pkey_autoincr else '',
+        ', \n    '.join(col_def))
+    # print >> sys.stdout, table_def
+    insert_stmt = 'INSERT INTO %s(%s) VALUES(%s)' % (
+        table_name, ','.join(col_names),
+        ','.join(["?" for x in col_names]))
+    # print >> sys.stdout, insert_stmt
+    data_lines = 0
+    try:
+        c = conn.cursor()
+        c.execute(table_def)
+        conn.commit()
+        c.close()
+        for i, index in enumerate(unique_indexes):
+            index_name = 'idx_uniq_%s_%d' % (table_name, i)
+            index_columns = index.split(',')
+            create_index(conn, table_name, index_name, index_columns,
+                         unique=True)
+        for i, index in enumerate(indexes):
+            index_name = 'idx_%s_%d' % (table_name, i)
+            index_columns = index.split(',')
+            create_index(conn, table_name, index_name, index_columns)
+        c = conn.cursor()
+        tr = TabularReader(file_path, skip=skip, comment_char=comment_char,
+                           col_idx=col_idx, filters=filters)
+        for linenum, fields in enumerate(tr):
+            data_lines += 1
+            try:
+                vals = [col_func[i](x)
+                        if x else None for i, x in enumerate(fields)]
+                c.execute(insert_stmt, vals)
+            except Exception as e:
+                print('Failed at line: %d err: %s' % (linenum, e),
+                      file=sys.stderr)
+        conn.commit()
+        c.close()
+    except Exception as e:
+        exit('Error: %s' % (e))
+
+
+def create_index(conn, table_name, index_name, index_columns, unique=False):
+    index_def = "CREATE %s INDEX %s on %s(%s)" % (
+        'UNIQUE' if unique else '', index_name,
+        table_name, ','.join(index_columns))
+    c = conn.cursor()
+    c.execute(index_def)
+    conn.commit()
+    c.close()
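A minimal sketch of loading one tabular file with create_table(); the database path and dataset are hypothetical, while the keyword arguments are those defined above (column_names is a comma-separated string, and each entry of indexes becomes a CREATE INDEX statement):

    import sqlite3

    from load_db import create_table

    # hypothetical database and input; '#' comment lines are excluded,
    # and column types (INTEGER/REAL/TEXT) are inferred from the first
    # 100 data lines by get_column_def()
    conn = sqlite3.connect('work.sqlite')
    create_table(conn, 'customers.tsv', 'customers',
                 comment_char='#',
                 column_names='CustomerID,FirstName,LastName,Email,DOB,Phone',
                 indexes=['LastName'])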
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/macros.xml Tue Jul 18 09:07:07 2017 -0400 @@ -0,0 +1,383 @@ +<macros> + <token name="@LINEFILTERS@"> +<![CDATA[ + ## set linefilters to the + #set $input_filters = [] + #for $fi in $linefilters: + #if $fi.filter.filter_type == 'skip': + #set $skip_lines = None + #if str($fi.filter.skip_lines) != '': + #set $skip_lines = int($fi.filter.skip_lines) + #elif $tbl.table.metadata.comment_lines and $tbl.table.metadata.comment_lines > 0: + #set $skip_lines = int($tbl.table.metadata.comment_lines) + #end if + #if $skip_lines is not None: + #set $filter_dict = dict() + #set $filter_dict['filter'] = str($fi.filter.filter_type) + #set $filter_dict['count'] = $skip_lines + #silent $input_filters.append($filter_dict) + #end if + #elif $fi.filter.filter_type == 'comment': + #set $filter_dict = dict() + #set $filter_dict['filter'] = 'regex' + #set $filter_dict['pattern'] = '^(%s).*$' % '|'.join([chr(int(x)).replace('|','[|]') for x in (str($fi.filter.comment_char)).split(',')]) + #set $filter_dict['action'] = 'exclude_match' + #silent $input_filters.append($filter_dict) + #elif $fi.filter.filter_type == 'regex': + #set $filter_dict = dict() + #set $filter_dict['filter'] = str($fi.filter.filter_type) + #set $filter_dict['pattern'] = str($fi.filter.regex_pattern) + #set $filter_dict['action'] = str($fi.filter.regex_action) + #silent $input_filters.append($filter_dict) + #elif $fi.filter.filter_type == 'select_columns': + #set $filter_dict = dict() + #set $filter_dict['filter'] = str($fi.filter.filter_type) + #set $filter_dict['columns'] = [int(str($ci).replace('c','')) for $ci in str($fi.filter.columns).split(',')] + #silent $input_filters.append($filter_dict) + #elif $fi.filter.filter_type == 'replace': + #set $filter_dict = dict() + #set $filter_dict['filter'] = str($fi.filter.filter_type) + #set $filter_dict['column'] = int(str($fi.filter.column).replace('c','')) + #set $filter_dict['pattern'] = str($fi.filter.regex_pattern) + #set $filter_dict['replace'] = str($fi.filter.regex_replace) + #silent $input_filters.append($filter_dict) + #elif str($fi.filter.filter_type).endswith('pend_line_num'): + #set $filter_dict = dict() + #set $filter_dict['filter'] = str($fi.filter.filter_type) + #silent $input_filters.append($filter_dict) + #elif str($fi.filter.filter_type).endswith('pend_text'): + #set $filter_dict = dict() + #set $filter_dict['filter'] = str($fi.filter.filter_type) + #set $filter_dict['column_text'] = str($fi.filter.column_text) + #silent $input_filters.append($filter_dict) + #elif $fi.filter.filter_type == 'normalize': + #set $filter_dict = dict() + #set $filter_dict['filter'] = str($fi.filter.filter_type) + #set $filter_dict['columns'] = [int(str($ci).replace('c','')) for $ci in str($fi.filter.columns).split(',')] + #set $filter_dict['separator'] = str($fi.filter.separator) + #silent $input_filters.append($filter_dict) + #end if + #end for +]]> + </token> + <xml name="macro_line_filters"> + <repeat name="linefilters" title="Filter Tabular Input Lines"> + <conditional name="filter"> + <param name="filter_type" type="select" label="Filter By"> + <option value="skip">skip leading lines</option> + <option value="comment">comment char</option> + <option value="regex">by regex expression matching</option> + <option value="select_columns">select columns</option> + <option value="replace">regex replace value in column</option> + <option value="prepend_line_num">prepend a line number column</option> + <option value="append_line_num">append a line number 
column</option> + <option value="prepend_text">prepend a column with the given text</option> + <option value="append_text">append a column with the given text</option> + <option value="normalize">normalize list columns, replicates row for each item in list</option> + </param> + <when value="skip"> + <param name="skip_lines" type="integer" value="" min="0" optional="true" label="Skip lines" + help="Leave blank to use the comment lines metadata for this dataset" /> + </when> + <when value="comment"> + <param name="comment_char" type="select" display="checkboxes" multiple="True" label="Ignore lines beginning with these characters" help="lines beginning with these are skipped"> + <option value="62">></option> + <option value="64">@</option> + <option value="43">+</option> + <option value="60"><</option> + <option value="42">*</option> + <option value="45">-</option> + <option value="61">=</option> + <option value="124">|</option> + <option value="63">?</option> + <option value="36">$</option> + <option value="46">.</option> + <option value="58">:</option> + <option value="38">&</option> + <option value="37">%</option> + <option value="94">^</option> + <option value="35">#</option> + <option value="33">!</option> + </param> + </when> + <when value="prepend_line_num"/> + <when value="append_line_num"/> + <when value="prepend_text"> + <param name="column_text" type="text" value="" label="text for column"> + </param> + </when> + <when value="append_text"> + <param name="column_text" type="text" value="" label="text for column"> + </param> + </when> + <when value="regex"> + <param name="regex_pattern" type="text" value="" label="regex pattern"> + <sanitizer sanitize="False"/> + </param> + <param name="regex_action" type="select" label="action for regex match"> + <option value="exclude_match">exclude line on pattern match</option> + <option value="include_match">include line on pattern match</option> + <option value="exclude_find">exclude line if pattern found</option> + <option value="include_find">include line if pattern found</option> + </param> + </when> + <when value="select_columns"> + <param name="columns" type="text" value="" label="enter column numbers to keep" + help="example: 1,4,2 or c1,c4,c2(selects the first,fourth, and second columns)"> + <validator type="regex" message="Column ordinal positions separated by commas">^(c?[1-9]\d*)(,c?[1-9]\d*)*$</validator> + </param> + </when> + <when value="replace"> + <param name="column" type="text" value="" label="enter column number to replace" + help="example: 1 or c1 (selects the first column)"> + <validator type="regex" message="Column ordinal position separated by commas">^(c?[1-9]\d*)$</validator> + </param> + <param name="regex_pattern" type="text" value="" label="regex pattern"> + <sanitizer sanitize="False"/> + </param> + <param name="regex_replace" type="text" value="" label="replacement expression"> + <sanitizer sanitize="False"/> + </param> + </when> + <when value="normalize"> + <param name="columns" type="text" value="" label="enter column numbers to normalize"> + <help><![CDATA[ + example: 2,4 or c2,c4 (selects the second, and fourth columns) + If multiple columns are selected, they should have the same length and separator on each line + ]]></help> + <validator type="regex" message="Column ordinal positions separated by commas">^(c?[1-9]\d*)(,c?[1-9]\d*)*$</validator> + </param> + <param name="separator" type="text" value="," label="List item delimiter in column"> + <sanitizer sanitize="False"/> + <validator type="regex" 
message="Anything but TAB or Newline">^[^\t\n\r\f\v]+$</validator> + </param> + </when> + </conditional> + </repeat> + </xml> + + <token name="@LINEFILTERS_HELP@"> +<![CDATA[ +**Input Line Filters** + + As a tabular file is being read, line filters may be applied. + + :: + + - skip leading lines skip the first *number* of lines + - comment char omit any lines that start with the specified comment character + - by regex expression matching *include/exclude* lines the match the regex expression + - select columns choose to include only selected columns in the order specified + - regex replace value in column replace a field in a column using a regex substitution (good for date reformatting) + - prepend a line number column each line has the ordinal value of the line read by this filter as the first column + - append a line number column each line has the ordinal value of the line read by this filter as the last column + - prepend a text column each line has the text string as the first column + - append a text column each line has the text string as the last column + - normalize list columns replicates the line for each item in the specified list *columns* +]]> + </token> + + <token name="@LINEFILTERS_HELP_EXAMPLE@"> +<![CDATA[ +**Line Filtering Example** + *(Six filters are applied as the following file is read)* + + :: + + Input Tabular File: + + #People with pets + Pets FirstName LastName DOB PetNames PetType + 2 Paula Brown 24/05/78 Rex,Fluff dog,cat + 1 Steven Jones 04/04/74 Allie cat + 0 Jane Doe 24/05/78 + 1 James Smith 20/10/80 Spot + + + Filter 1 - append a line number column: + + #People with pets 1 + Pets FirstName LastName DOB PetNames PetType 2 + 2 Paula Brown 24/05/78 Rex,Fluff dog,cat 3 + 1 Steven Jones 04/04/74 Allie cat 4 + 0 Jane Doe 24/05/78 5 + 1 James Smith 20/10/80 Spot 6 + + Filter 2 - by regex expression matching [include]: '^\d+' (include lines that start with a number) + + 2 Paula Brown 24/05/78 Rex,Fluff dog,cat 3 + 1 Steven Jones 04/04/74 Allie cat 4 + 0 Jane Doe 24/05/78 5 + 1 James Smith 20/10/80 Spot 6 + + Filter 3 - append a line number column: + + 2 Paula Brown 24/05/78 Rex,Fluff dog,cat 3 1 + 1 Steven Jones 04/04/74 Allie cat 4 2 + 0 Jane Doe 24/05/78 5 3 + 1 James Smith 20/10/80 Spot 6 4 + + Filter 4 - regex replace value in column[4]: '(\d+)/(\d+)/(\d+)' '19\3-\2-\1' (convert dates to sqlite format) + + 2 Paula Brown 1978-05-24 Rex,Fluff dog,cat 3 1 + 1 Steven Jones 1974-04-04 Allie cat 4 2 + 0 Jane Doe 1978-05-24 5 3 + 1 James Smith 1980-10-20 Spot 6 4 + + Filter 5 - normalize list columns[5,6]: + + 2 Paula Brown 1978-05-24 Rex dog 3 1 + 2 Paula Brown 1978-05-24 Fluff cat 3 1 + 1 Steven Jones 1974-04-04 Allie cat 4 2 + 0 Jane Doe 1978-05-24 5 3 + 1 James Smith 1980-10-20 Spot 6 4 + + Filter 6 - append a line number column: + + 2 Paula Brown 1978-05-24 Rex dog 3 1 1 + 2 Paula Brown 1978-05-24 Fluff cat 3 1 2 + 1 Steven Jones 1974-04-04 Allie cat 4 2 3 + 0 Jane Doe 1978-05-24 5 3 4 + 1 James Smith 1980-10-20 Spot 6 4 5 + +]]> + </token> + + <token name="@QUERY_HELP@"> +<![CDATA[ + +For help in using SQLite_ see: http://www.sqlite.org/docs.html + +**NOTE:** input for SQLite dates input field must be in the format: *YYYY-MM-DD* for example: 2015-09-30 + +See: http://www.sqlite.org/lang_datefunc.html + +**Example** + + Given 2 tabular datasets: *customers* and *sales* + + Dataset *customers* + + Table name: "customers" + + Column names: "CustomerID,FirstName,LastName,Email,DOB,Phone" + + =========== ========== ========== ===================== ========== 
============ + #CustomerID FirstName LastName Email DOB Phone + =========== ========== ========== ===================== ========== ============ + 1 John Smith John.Smith@yahoo.com 1968-02-04 626 222-2222 + 2 Steven Goldfish goldfish@fishhere.net 1974-04-04 323 455-4545 + 3 Paula Brown pb@herowndomain.org 1978-05-24 416 323-3232 + 4 James Smith jim@supergig.co.uk 1980-10-20 416 323-8888 + =========== ========== ========== ===================== ========== ============ + + Dataset *sales* + + Table name: "sales" + + Column names: "CustomerID,Date,SaleAmount" + + ============= ============ ============ + #CustomerID Date SaleAmount + ============= ============ ============ + 2 2004-05-06 100.22 + 1 2004-05-07 99.95 + 3 2004-05-07 122.95 + 3 2004-05-13 100.00 + 4 2004-05-22 555.55 + ============= ============ ============ + + The query + + :: + + SELECT FirstName,LastName,sum(SaleAmount) as "TotalSales" + FROM customers join sales on customers.CustomerID = sales.CustomerID + GROUP BY customers.CustomerID ORDER BY TotalSales DESC; + + Produces this tabular output: + + ========== ======== ========== + #FirstName LastName TotalSales + ========== ======== ========== + James Smith 555.55 + Paula Brown 222.95 + Steven Goldfish 100.22 + John Smith 99.95 + ========== ======== ========== + + + If the optional Table name and Column names inputs are not used, the query would be: + + :: + + SELECT t1.c2 as "FirstName", t1.c3 as "LastName", sum(t2.c3) as "TotalSales" + FROM t1 join t2 on t1.c1 = t2.c1 + GROUP BY t1.c1 ORDER BY TotalSales DESC; + + You can selectively name columns, e.g. on the customers input you could just name columns 2,3, and 5: + + Column names: ,FirstName,LastName,,BirthDate + + Results in the following data base table + + =========== ========== ========== ===================== ========== ============ + #c1 FirstName LastName c4 BirthDate c6 + =========== ========== ========== ===================== ========== ============ + 1 John Smith John.Smith@yahoo.com 1968-02-04 626 222-2222 + 2 Steven Goldfish goldfish@fishhere.net 1974-04-04 323 455-4545 + 3 Paula Brown pb@herowndomain.org 1978-05-24 416 323-3232 + 4 James Smith jim@supergig.co.uk 1980-10-20 416 323-8888 + =========== ========== ========== ===================== ========== ============ + + + Regular_expression_ functions are included for: + + :: + + matching: re_match('pattern',column) + + SELECT t1.FirstName, t1.LastName + FROM t1 + WHERE re_match('^.*\.(net|org)$',c4) + + Results: + + =========== ========== + #FirstName LastName + =========== ========== + Steven Goldfish + Paula Brown + =========== ========== + + + :: + + searching: re_search('pattern',column) + substituting: re_sub('pattern','replacement,column) + + SELECT t1.FirstName, t1.LastName, re_sub('^\d{2}(\d{2})-(\d\d)-(\d\d)','\3/\2/\1',BirthDate) as "DOB" + FROM t1 + WHERE re_search('[hp]er',c4) + + Results: + + + =========== ========== ========== + #FirstName LastName DOB + =========== ========== ========== + Steven Goldfish 04/04/74 + Paula Brown 24/05/78 + James Smith 20/10/80 + =========== ========== ========== + +.. _Regular_expression: https://docs.python.org/release/2.7/library/re.html +.. _SQLite: http://www.sqlite.org/index.html +.. _SQLite_functions: http://www.sqlite.org/docs.html + + +]]> + </token> + +</macros> +
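The help's "regex replace value in column" example translates directly to the filter-spec dict consumed by filters.py; a small sketch, with re.sub showing the substitution the filter applies to each field (column numbers are 1-based):

    import re

    # the date-reformatting filter from the help, as a spec dict
    date_fix = {'filter': 'replace', 'column': 4,
                'pattern': r'(\d+)/(\d+)/(\d+)', 'replace': r'19\3-\2-\1'}

    # the same substitution applied to one field value
    print(re.sub(date_fix['pattern'], date_fix['replace'], '24/05/78'))
    # -> 1978-05-24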
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/query_db.py	Tue Jul 18 09:07:07 2017 -0400
@@ -0,0 +1,67 @@
+#!/usr/bin/env python
+
+from __future__ import print_function
+
+import re
+import sqlite3 as sqlite
+import sys
+
+
+TABLE_QUERY = \
+    """
+    SELECT name, sql
+    FROM sqlite_master
+    WHERE type='table'
+    ORDER BY name
+    """
+
+
+def regex_match(expr, item):
+    return re.match(expr, item) is not None
+
+
+def regex_search(expr, item):
+    return re.search(expr, item) is not None
+
+
+def regex_sub(expr, replace, item):
+    return re.sub(expr, replace, item)
+
+
+def get_connection(sqlitedb_path, addfunctions=True):
+    conn = sqlite.connect(sqlitedb_path)
+    if addfunctions:
+        conn.create_function("re_match", 2, regex_match)
+        conn.create_function("re_search", 2, regex_search)
+        conn.create_function("re_sub", 3, regex_sub)
+    return conn
+
+
+def describe_tables(conn, outputFile):
+    try:
+        c = conn.cursor()
+        tables_query = TABLE_QUERY
+        rslt = c.execute(tables_query).fetchall()
+        for table, sql in rslt:
+            print("Table %s:" % table, file=outputFile)
+            try:
+                col_query = 'SELECT * FROM %s LIMIT 0' % table
+                cur = conn.cursor().execute(col_query)
+                cols = [col[0] for col in cur.description]
+                print("  Columns: %s" % cols, file=outputFile)
+            except Exception as exc:
+                print("Warning: %s" % exc, file=sys.stderr)
+    except Exception as e:
+        exit('Error: %s' % (e))
+    exit(0)
+
+
+def run_query(conn, query, outputFile, no_header=False):
+    cur = conn.cursor()
+    results = cur.execute(query)
+    if not no_header:
+        outputFile.write("#%s\n" % '\t'.join(
+            [str(col[0]) for col in cur.description]))
+    for i, row in enumerate(results):
+        outputFile.write("%s\n" % '\t'.join(
+            [str(val) if val is not None else '' for val in row]))
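A minimal sketch of querying a loaded database with these helpers; 'work.sqlite' and the customers table are hypothetical, and re_match is available in SQL because get_connection() registers the regex functions on the connection:

    import sys

    from query_db import get_connection, run_query

    conn = get_connection('work.sqlite')
    # header line is written first because no_header defaults to False
    run_query(conn,
              "SELECT FirstName, LastName FROM customers "
              "WHERE re_match('^.*\\.(net|org)$', Email)",
              sys.stdout)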
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/query_tabular.py	Tue Jul 18 09:07:07 2017 -0400
@@ -0,0 +1,137 @@
+#!/usr/bin/env python
+
+from __future__ import print_function
+
+import json
+import optparse
+import os.path
+import sys
+
+from load_db import create_table
+
+from query_db import describe_tables, get_connection, run_query
+
+
+"""
+JSON config:
+{ tables : [
+    { file_path : '/home/galaxy/dataset_101.dat',
+      table_name : 't1',
+      column_names : ['c1','c2','c3'],
+      pkey_autoincr : 'id',
+      comment_lines : 1,
+      unique : ['c1'],
+      index : ['c2', 'c3']
+    },
+    { file_path : '/home/galaxy/dataset_102.dat',
+      table_name : 'gff',
+      column_names : ['seqname',,'date','start','end'],
+      comment_lines : 1,
+      load_named_columns : True,
+      filters : [{'filter': 'regex', 'pattern': '#peptide',
+                  'action': 'exclude_match'},
+                 {'filter': 'replace', 'column': 3,
+                  'replace': 'gi[|]', 'pattern': ''}]
+    },
+    { file_path : '/home/galaxy/dataset_103.dat',
+      table_name : 'test',
+      column_names : ['c1', 'c2', 'c3']
+    }
+  ]
+}
+"""
+
+
+def __main__():
+    # Parse Command Line
+    parser = optparse.OptionParser()
+    parser.add_option('-s', '--sqlitedb', dest='sqlitedb', default=None,
+                      help='The SQLite Database')
+    parser.add_option('-j', '--jsonfile', dest='jsonfile', default=None,
+                      help='JSON dict of table specifications')
+    parser.add_option('-q', '--query', dest='query', default=None,
+                      help='SQL query')
+    parser.add_option('-Q', '--query_file', dest='query_file', default=None,
+                      help='SQL query file')
+    parser.add_option('-n', '--no_header', dest='no_header', default=False,
+                      action='store_true',
+                      help='Omit the column headers line')
+    parser.add_option('-o', '--output', dest='output', default=None,
+                      help='Output file for query results')
+    (options, args) = parser.parse_args()
+
+    # determine output destination
+    if options.output is not None:
+        try:
+            outputPath = os.path.abspath(options.output)
+            outputFile = open(outputPath, 'w')
+        except Exception as e:
+            exit('Error: %s' % (e))
+    else:
+        outputFile = sys.stdout
+
+    def _create_table(ti, table):
+        path = table['file_path']
+        table_name = \
+            table['table_name'] if 'table_name' in table else 't%d' % (ti + 1)
+        comment_lines = \
+            table['comment_lines'] if 'comment_lines' in table else 0
+        comment_char = \
+            table['comment_char'] if 'comment_char' in table else None
+        column_names = \
+            table['column_names'] if 'column_names' in table else None
+        if column_names:
+            load_named_columns = \
+                table['load_named_columns'] \
+                if 'load_named_columns' in table else False
+        else:
+            load_named_columns = False
+        unique_indexes = table['unique'] if 'unique' in table else []
+        indexes = table['index'] if 'index' in table else []
+        filters = table['filters'] if 'filters' in table else None
+        pkey_autoincr = \
+            table['pkey_autoincr'] if 'pkey_autoincr' in table else None
+        create_table(get_connection(options.sqlitedb), path, table_name,
+                     pkey_autoincr=pkey_autoincr,
+                     column_names=column_names,
+                     skip=comment_lines,
+                     comment_char=comment_char,
+                     load_named_columns=load_named_columns,
+                     filters=filters,
+                     unique_indexes=unique_indexes,
+                     indexes=indexes)
+
+    if options.jsonfile:
+        try:
+            with open(options.jsonfile) as fh:
+                tdef = json.load(fh)
+            if 'tables' in tdef:
+                for ti, table in enumerate(tdef['tables']):
+                    _create_table(ti, table)
+        except Exception as e:
+            exit('Error: %s' % (e))
+
+    query = None
+    if options.query_file is not None:
+        with open(options.query_file, 'r') as fh:
+            query = ''
+            for line in fh:
+                query += line
+    elif options.query is not None:
+        query = options.query
+
+    if query is None:
+        try:
+            describe_tables(get_connection(options.sqlitedb), outputFile)
+        except Exception as e:
+            exit('Error: %s' % (e))
+    else:
+        try:
+            run_query(get_connection(options.sqlitedb), query, outputFile,
+                      no_header=options.no_header)
+        except Exception as e:
+            exit('Error: %s' % (e))
+
+
+if __name__ == "__main__":
+    __main__()
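A minimal sketch of the -j table specification and a full invocation; the paths are hypothetical. Note that column_names is passed here as a comma-separated string, the form the XML wrapper emits and create_table() splits, rather than the list shown in the docstring above:

    import json
    import subprocess

    # hypothetical dataset and output paths
    config = {"tables": [
        {"file_path": "customers.tsv",
         "table_name": "customers",
         "comment_lines": 1,
         "column_names": "CustomerID,FirstName,LastName,Email,DOB,Phone"},
    ]}
    with open("tables.json", "w") as fh:
        json.dump(config, fh)

    subprocess.check_call(["python", "query_tabular.py",
                           "-s", "work.sqlite", "-j", "tables.json",
                           "-q", "SELECT FirstName, LastName FROM customers",
                           "-o", "results.tsv"])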
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/query_tabular.xml Tue Jul 18 09:07:07 2017 -0400 @@ -0,0 +1,510 @@ +<tool id="query_tabular" name="Query Tabular" version="1.0.0"> + <description>using sqlite sql</description> + + <macros> + <import>macros.xml</import> + </macros> + + <requirements> + </requirements> + + <command detect_errors="exit_code"><![CDATA[ + cat '$query_file' && + #if $add_to_database.withdb: + #if $save_db: + cp '$add_to_database.withdb' '$sqlitedb' && + #else: + cp '$add_to_database.withdb' '$workdb' && + #end if + #end if + python '$__tool_directory__/query_tabular.py' + #if $save_db + -s '$sqlitedb' + #else + -s '$workdb' + #end if + -j '$table_json' + #if $sqlquery: + -Q '$query_file' + $no_header + -o '$output' + #end if + ]]></command> + <configfiles> + <configfile name="query_file"> +$sqlquery + </configfile> + <configfile name="table_json"> +#import json +#set $jtbldef = dict() +#set $jtbls = [] +#set $jtbldef['tables'] = $jtbls +#for $i,$tbl in enumerate($tables): + #set $jtbl = dict() + #set $jtbl['file_path'] = str($tbl.table) + #if $tbl.tbl_opts.table_name: + #set $tname = str($tbl.tbl_opts.table_name) + #else + #set $tname = 't' + str($i + 1) + #end if + #set $jtbl['table_name'] = $tname + ## #if $tbl.tbl_opts.sel_cols: + ## #set $jtbl['sel_cols'] = $tbl.tbl_opts.sel_cols el_cols + ## #end if + #if $tbl.tbl_opts.pkey_autoincr: + #set $jtbl['pkey_autoincr'] = str($tbl.tbl_opts.pkey_autoincr) + #end if + #if $tbl.tbl_opts.col_names: + #set $col_names = str($tbl.tbl_opts.col_names) + #if $tbl.tbl_opts.load_named_columns: + #set $jtbl['load_named_columns'] = True + #end if + #else + #set $col_names = '' + #end if + #set $jtbl['column_names'] = $col_names + #set $idx_unique = [] + #set $idx_non = [] + #for $idx in $tbl.tbl_opts.indexes: + #if $idx.unique: + #silent $idx_unique.append(str($idx.index_columns)) + #else: + #silent $idx_non.append(str($idx.index_columns)) + #end if + #end for + #if len($idx_unique) > 0: + #set $jtbl['unique'] = $idx_unique + #end if + #if len($idx_non) > 0: + #set $jtbl['index'] = $idx_non + #end if + #set $linefilters = $tbl.input_opts.linefilters + @LINEFILTERS@ + #if $input_filters: + #set $jtbl['filters'] = $input_filters + #end if + #set $jtbls += [$jtbl] +#end for +#echo $json.dumps($jtbldef) + </configfile> + </configfiles> + <inputs> + <param name="workdb" type="hidden" value="workdb.sqlite" label=""/> + <section name="add_to_database" expanded="false" title="Add tables to an existing database"> + <param name="withdb" type="data" format="sqlite" optional="true" label="Add tables to this Database" + help="Make sure your added table names are not already in this database"/> + </section> + <repeat name="tables" title="Database Table" min="0"> + <param name="table" type="data" format="tabular" label="Tabular Dataset for Table"/> + <section name="input_opts" expanded="false" title="Filter Dataset Input"> + <expand macro="macro_line_filters" /> + </section> + <section name="tbl_opts" expanded="false" title="Table Options"> + <param name="table_name" type="text" value="" optional="true" label="Specify Name for Table"> + <help>By default, tables will be named: t1,t2,...,tn (table names must be unique)</help> + <validator type="regex" message="Table name should start with a letter and may contain additional letters, digits, and underscores">^[A-Za-z]\w*$</validator> + </param> + <param name="col_names" type="text" value="" optional="true" label="Specify Column Names (comma-separated list)"> + <help>By default, table columns 
will be named: c1,c2,c3,...,cn (column names for a table must be unique) + You can override the default names by entering a comma -separated list of names, e.g. ',name1,,,name2' would rename the second and fifth columns. + </help> + <sanitizer sanitize="False"/> + <validator type="regex" message="A List of names separated by commas: Column names should start with a letter and may contain additional letters, digits, and underscores. Otherwise, the name must be eclosed in: double quotes, back quotes, or square brackets.">^([A-Za-z]\w*|"\S+[^,"]*"|`\S+[^,`]*`|[[]\S+[^,"]*[]])?(,([A-Za-z]\w*|"\S+.*"|`\S+[^,`]*`|[[]\S+[^,"]*[]])?)*$</validator> + </param> + <param name="load_named_columns" type="boolean" truevalue="load_named_columns" falsevalue="" checked="false" label="Only load the columns you have named into database"/> + <param name="pkey_autoincr" type="text" value="" optional="true" label="Add an auto increment primary key column with this name" + help="Only creates this additional column when a name is entered. (This can not be the same name as any of the other columns in this table.)"> + <validator type="regex" message="Column name">^([A-Za-z]\w*)?$</validator> + </param> + <repeat name="indexes" title="Table Index"> + <param name="unique" type="boolean" truevalue="yes" falsevalue="no" checked="False" label="This is a unique index"/> + <param name="index_columns" type="text" value="" label="Index on Columns"> + <help>Create an index on the column names: e,g, c1 or c2,c4</help> + <validator type="regex" message="Column name, separated by commes if more than one">^([A-Za-z]\w*|"\S+[^,"]*"|`\S+[^,`]*`|[[]\S+[^,"]*[]])(,([A-Za-z]\w*|"\S+.*"|`\S+[^,`]*`|[[]\S+[^,"]*[]])?)*$</validator> + </param> + </repeat> + </section> + </repeat> + <param name="save_db" type="boolean" truevalue="yes" falsevalue="no" checked="false" label="Save the sqlite database in your history" + help="SQLite to tabular tool can run additional queries on this database"/> + <param name="sqlquery" type="text" area="true" size="20x80" value="" optional="true" label="SQL Query to generate tabular output"> + <help>By default: tables are named: t1,t2,...,tn and columns in each table: c1,c2,...,cn</help> + <sanitizer sanitize="False"/> + <validator type="regex" message="">^(?ims)\s*select\s+.*\s+from\s+.*$</validator> + </param> + <param name="no_header" type="boolean" truevalue="-n" falsevalue="" checked="False" label="Omit column headers from tabular output"/> + </inputs> + <outputs> + <data format="sqlite" name="sqlitedb" label="sqlite db of ${on_string}"> + <filter>save_db</filter> + </data> + <data format="tabular" name="output" label="query results on ${on_string}"> + <filter>not save_db or (sqlquery and len(sqlquery.strip()) > 0)</filter> + </data> + </outputs> + <tests> + <test> + <repeat name="tables"> + <param name="table" ftype="tabular" value="customers.tsv"/> + <section name="input_opts"> + <repeat name="linefilters"> + <conditional name="filter"> + <param name="filter_type" value="comment"/> + <param name="comment_char" value="35"/> + </conditional> + </repeat> + </section> + <section name="tbl_opts"> + <param name="table_name" value="customers"/> + <param name="col_names" value="CustomerID,FirstName,LastName,Email,DOB,Phone"/> + </section> + </repeat> + <repeat name="tables"> + <param name="table" ftype="tabular" value="sales.tsv"/> + <section name="input_opts"> + <repeat name="linefilters"> + <conditional name="filter"> + <param name="filter_type" value="comment"/> + <param name="comment_char" value="35"/> + 
</conditional> + </repeat> + </section> + <section name="tbl_opts"> + <param name="table_name" value="sales"/> + <param name="col_names" value="CustomerID,Date,SaleAmount"/> + </section> + </repeat> + <param name="sqlquery" value="SELECT FirstName,LastName,sum(SaleAmount) as "TotalSales" FROM customers join sales on customers.CustomerID = sales.CustomerID GROUP BY customers.CustomerID ORDER BY TotalSales DESC"/> + <output name="output" file="sales_results.tsv"/> + </test> + + <test> + <repeat name="tables"> + <param name="table" ftype="tabular" value="customers.tsv"/> + <section name="input_opts"> + <repeat name="linefilters"> + <conditional name="filter"> + <param name="filter_type" value="comment"/> + <param name="comment_char" value="35"/> + </conditional> + </repeat> + </section> + <section name="tbl_opts"> + <param name="col_names" value=",FirstName,LastName,,DOB,"/> + </section> + </repeat> + <repeat name="tables"> + <param name="table" ftype="tabular" value="sales.tsv"/> + <section name="input_opts"> + <repeat name="linefilters"> + <conditional name="filter"> + <param name="filter_type" value="skip"/> + <param name="skip_lines" value="1"/> + </conditional> + </repeat> + </section> + </repeat> + <param name="sqlquery" value="SELECT FirstName,LastName,sum(t2.c3) as "TotalSales" FROM t1 join t2 on t1.c1 = t2.c1 GROUP BY t1.c1 ORDER BY TotalSales DESC;"/> + <output name="output" file="sales_results.tsv"/> + </test> + + <test> + <repeat name="tables"> + <param name="table" ftype="tabular" value="customers.tsv"/> + <section name="input_opts"> + <repeat name="linefilters"> + <conditional name="filter"> + <param name="filter_type" value="skip"/> + <param name="skip_lines" value="1"/> + </conditional> + </repeat> + </section> + <section name="tbl_opts"> + <param name="col_names" value=",FirstName,LastName,,BirthDate,"/> + </section> + </repeat> + <param name="sqlquery" value="select FirstName,LastName,re_sub('^\d{2}(\d{2})-(\d\d)-(\d\d)','\3/\2/\1',BirthDate) as "DOB" from t1 WHERE re_search('[hp]er',c4)"/> + <output name="output" file="regex_results.tsv"/> + </test> + + <test> + <repeat name="tables"> + <param name="table" ftype="tabular" value="IEDB.tsv"/> + <section name="input_opts"> + <repeat name="linefilters"> + <conditional name="filter"> + <param name="filter_type" value="comment"/> + <param name="comment_char" value="35"/> + </conditional> + </repeat> + </section> + <section name="tbl_opts"> + <param name="table_name" value="iedb"/> + <param name="col_names" value="ID,allele,seq_num,start,end,length,peptide,method,percentile_rank,ann_ic50,ann_rank,smm_ic50,smm_rank,comblib_sidney2008_score,comblib_sidney2008_rank,netmhcpan_ic50,netmhcpan_rank"/> + </section> + </repeat> + <repeat name="tables"> + <param name="table" ftype="tabular" value="netMHC_summary.tsv"/> + <section name="input_opts"> + <repeat name="linefilters"> + <conditional name="filter"> + <param name="filter_type" value="skip"/> + <param name="skip_lines" value="1"/> + </conditional> + </repeat> + </section> + <section name="tbl_opts"> + <param name="table_name" value="mhc_summary"/> + <param name="col_names" value="pos,peptide,logscore,affinity,Bind_Level,Protein,Allele"/> + </section> + </repeat> + <param name="sqlquery" value="select iedb.ID,iedb.peptide,iedb.start,iedb.end,iedb.percentile_rank,mhc_summary.logscore,mhc_summary.affinity,mhc_summary.Bind_Level from iedb left outer join mhc_summary on iedb.peptide = mhc_summary.peptide order by affinity,Bind_Level,percentile_rank"/> + <output name="output" 
file="query_results.tsv"/> + </test> + + <test> + <section name="add_to_database"> + <param name="withdb" ftype="sqlite" value="testdb.sqlite"/> + </section> + <repeat name="tables"> + <param name="table" ftype="tabular" value="pets.tsv"/> + <section name="input_opts"> + <repeat name="linefilters"> + <conditional name="filter"> + <param name="filter_type" value="regex"/> + <param name="regex_pattern" value="^\d+"/> + <param name="regex_action" value="include_find"/> + </conditional> + </repeat> + <repeat name="linefilters"> + <conditional name="filter"> + <param name="filter_type" value="comment"/> + <param name="comment_char" value="35"/> + </conditional> + </repeat> + <repeat name="linefilters"> + <conditional name="filter"> + <param name="filter_type" value="append_line_num"/> + </conditional> + </repeat> + <repeat name="linefilters"> + <conditional name="filter"> + <param name="filter_type" value="select_columns"/> + <param name="columns" value="7,2,3,4,1"/> + </conditional> + </repeat> + <repeat name="linefilters"> + <conditional name="filter"> + <param name="filter_type" value="replace"/> + <param name="column" value="c4"/> + <param name="regex_pattern" value="(\d+)/(\d+)/(\d+)"/> + <param name="regex_replace" value="19\3-\2-\1"/> + </conditional> + </repeat> + </section> + <section name="tbl_opts"> + <param name="table_name" value="people"/> + <param name="col_names" value="id,first,last,dob,pets"/> + </section> + </repeat> + <param name="sqlquery" value="SELECT people.id,first,last,pets,quote FROM people JOIN contacts ON people.first = contacts.first_name"/> + <output name="output" file="add_to_db_results.tsv"/> + </test> + + <test> + <repeat name="tables"> + <param name="table" ftype="tabular" value="pets.tsv"/> + <section name="input_opts"> + <repeat name="linefilters"> + <conditional name="filter"> + <param name="filter_type" value="regex"/> + <param name="regex_pattern" value="^\d+"/> + <param name="regex_action" value="include_find"/> + </conditional> + </repeat> + <repeat name="linefilters"> + <conditional name="filter"> + <param name="filter_type" value="comment"/> + <param name="comment_char" value="35"/> + </conditional> + </repeat> + <repeat name="linefilters"> + <conditional name="filter"> + <param name="filter_type" value="append_line_num"/> + </conditional> + </repeat> + <repeat name="linefilters"> + <conditional name="filter"> + <param name="filter_type" value="select_columns"/> + <param name="columns" value="7,2,3,4,1"/> + </conditional> + </repeat> + <repeat name="linefilters"> + <conditional name="filter"> + <param name="filter_type" value="replace"/> + <param name="column" value="c4"/> + <param name="regex_pattern" value="(\d+)/(\d+)/(\d+)"/> + <param name="regex_replace" value="19\3-\2-\1"/> + </conditional> + </repeat> + </section> + <section name="tbl_opts"> + <param name="table_name" value="people"/> + <param name="col_names" value="id,first,last,dob,pets"/> + </section> + </repeat> + <repeat name="tables"> + <param name="table" ftype="tabular" value="pets.tsv"/> + <section name="input_opts"> + <repeat name="linefilters"> + <conditional name="filter"> + <param name="filter_type" value="regex"/> + <param name="regex_pattern" value="^\d+"/> + <param name="regex_action" value="include_find"/> + </conditional> + </repeat> + <repeat name="linefilters"> + <conditional name="filter"> + <param name="filter_type" value="append_line_num"/> + </conditional> + </repeat> + <repeat name="linefilters"> + <conditional name="filter"> + <param name="filter_type" 
value="select_columns"/> + <param name="columns" value="c7,c5,c6"/> + </conditional> + </repeat> + <repeat name="linefilters"> + <conditional name="filter"> + <param name="filter_type" value="normalize"/> + <param name="columns" value="c2,c3"/> + <param name="separator" value=","/> + </conditional> + </repeat> + </section> + <section name="tbl_opts"> + <param name="table_name" value="pet"/> + <param name="col_names" value="id,name,animal"/> + </section> + </repeat> + <param name="sqlquery" value="SELECT people.id,first,last,dob,name,animal,pets FROM people JOIN pet ON people.id = pet.id WHERE animal = 'cat'"/> + <output name="output" file="pet_normalized_query_results.tsv"/> + </test> + + </tests> + <help><![CDATA[ +============= +Query Tabular +============= + +**Inputs** + + Loads tabular datasets into a SQLite_ data base. + + An existing SQLite_ data base can be used as input, and any selected tabular datasets will be added as new tables in that data base. + + +@LINEFILTERS_HELP@ + + +**Outputs** + + The results of a SQL query are output to the history as a tabular file. + + The SQLite_ data base can also be saved and output as a dataset in the history. + + *(The* **SQLite to tabular** *tool can run additional queries on this database.)* + + +@QUERY_HELP@ + +@LINEFILTERS_HELP_EXAMPLE@ + + + Table name: pets + + Table columns: Pets,FirstName,LastName,Birthdate,PetNames,PetType,line_num,entry_num,row_num + + Query: SELECT * FROM pets + + Result: + + ====== ========== ======== ========== ========= ======== ========= ========== ======== + #Pets FirstName LastName BirthDate PetNames PetType line_num entry_num row_num + ====== ========== ======== ========== ========= ======== ========= ========== ======== + 2 Paula Brown 1978-05-24 Rex dog 3 1 1 + 2 Paula Brown 1978-05-24 Fluff cat 3 1 2 + 1 Steven Jones 1974-04-04 Allie cat 4 2 3 + 0 Jane Doe 1978-05-24 5 3 4 + 1 James Smith 1980-10-20 Spot 6 4 5 + ====== ========== ======== ========== ========= ======== ========= ========== ======== + + +**Normalizing by Line Filtering into 2 Tables** + +*Relational database opertions work with single-valued column entries. +To apply relational operations to tabular files that contain fields with lists of values, +we need to "normalize" those fields, duplicating lines for each item in the list. +In this example we create 2 tables, one for single-valued fields and a second with list-valued fields normalized. 
+Because we add a line number first for each table, we can join the 2 tables on the line number column.*
+https://en.wikipedia.org/wiki/First_normal_form
+
+ *People Table*
+
+ ::
+
+   Filter 1 - by regex expression matching [include]: '^\d+' (include lines that start with a number)
+   Filter 2 - append a line number column:
+   Filter 3 - regex replace value in column[4]: '(\d+)/(\d+)/(\d+)' '19\3-\2-\1' (convert dates to sqlite format)
+   Filter 4 - select columns 7,2,3,4,1
+
+ Table: People
+ Columns: id,FirstName,LastName,DOB,Pets
+
+ == ========= ======== ========== ====
+ id FirstName LastName DOB        Pets
+ == ========= ======== ========== ====
+ 1  Paula     Brown    1978-05-24 2
+ 2  Steven    Jones    1974-04-04 1
+ 3  Jane      Doe      1978-05-24 0
+ 4  James     Smith    1980-10-20 1
+ == ========= ======== ========== ====
+
+
+ *Pet Table*
+
+ ::
+
+   Filter 1 - by regex expression matching [include]: '^\d+' (include lines that start with a number)
+   Filter 2 - append a line number column:
+   Filter 3 - by regex expression matching [exclude]: '^0\t' (exclude lines with no pets)
+   Filter 4 - normalize list columns[5,6]:
+   Filter 5 - select columns 7,5,6
+
+ Table: Pet
+ Columns: id,PetName,PetType
+
+ == ======== ========
+ id PetName  PetType
+ == ======== ========
+ 1  Rex      dog
+ 1  Fluff    cat
+ 2  Allie    cat
+ 4  Spot
+ == ======== ========
+
+
+ Query: SELECT FirstName,LastName,PetName FROM People JOIN Pet ON People.id = Pet.id WHERE PetType = 'cat';
+
+ Result:
+
+ ========= ======== ========
+ FirstName LastName PetName
+ ========= ======== ========
+ Paula     Brown    Fluff
+ Steven    Jones    Allie
+ ========= ======== ========
+
+
+  ]]></help>
+</tool>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/sqlite_to_tabular.py	Tue Jul 18 09:07:07 2017 -0400
@@ -0,0 +1,60 @@
+#!/usr/bin/env python
+
+from __future__ import print_function
+
+import optparse
+import os.path
+import sys
+
+from query_db import describe_tables, get_connection, run_query
+
+
+def __main__():
+    # Parse Command Line
+    parser = optparse.OptionParser()
+    parser.add_option('-s', '--sqlitedb', dest='sqlitedb', default=None,
+                      help='The SQLite Database')
+    parser.add_option('-q', '--query', dest='query', default=None,
+                      help='SQL query')
+    parser.add_option('-Q', '--query_file', dest='query_file', default=None,
+                      help='SQL query file')
+    parser.add_option('-n', '--no_header', dest='no_header', default=False,
+                      action='store_true',
+                      help='Omit the column headers line')
+    parser.add_option('-o', '--output', dest='output', default=None,
+                      help='Output file for query results')
+    (options, args) = parser.parse_args()
+
+    # determine output destination
+    if options.output is not None:
+        try:
+            outputPath = os.path.abspath(options.output)
+            outputFile = open(outputPath, 'w')
+        except Exception as e:
+            exit('Error: %s' % (e))
+    else:
+        outputFile = sys.stdout
+
+    query = None
+    if options.query_file is not None:
+        with open(options.query_file, 'r') as fh:
+            query = fh.read()
+    elif options.query is not None:
+        query = options.query
+
+    if query is None:
+        try:
+            describe_tables(get_connection(options.sqlitedb), outputFile)
+        except Exception as e:
+            exit('Error: %s' % (e))
+        exit(0)
+    else:
+        try:
+            run_query(get_connection(options.sqlitedb), query, outputFile,
+                      no_header=options.no_header)
+        except Exception as e:
+            exit('Error: %s' % (e))
+
+
+if __name__ == "__main__":
+    __main__()
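A minimal sketch of both modes of this script, with a hypothetical database path: without -q/-Q it prints each table's name and columns; with a query it writes tabular results:

    import subprocess

    # no query: describe the tables in the database
    subprocess.check_call(["python", "sqlite_to_tabular.py",
                           "-s", "work.sqlite"])

    # with a query: results go to the -o output file (paths hypothetical)
    subprocess.check_call(["python", "sqlite_to_tabular.py",
                           "-s", "work.sqlite",
                           "-q", "SELECT * FROM customers",
                           "-o", "out.tsv"])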
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/IEDB.tsv	Tue Jul 18 09:07:07 2017 -0400
@@ -0,0 +1,17 @@
+#ID allele seq_num start end length peptide method percentile_rank ann_ic50 ann_rank smm_ic50 smm_rank comblib_sidney2008_score comblib_sidney2008_rank netmhcpan_ic50 netmhcpan_rank
+PPAP2C HLA-A*02:01 1 3 11 9 GMYCMVFLV Consensus (ann/smm/comblib_sidney2008) 0.2 4 0.2 3.77 0.2 7.1e-06 0.5 - -
+PPAP2C HLA-A*23:01 1 1 9 9 SFGMYCMVF Consensus (ann/smm) 0.5 67 0.5 137.54 0.5 - - - -
+PPAP2C HLA-A*23:01 1 4 12 9 MYCMVFLVK Consensus (ann/smm) 0.65 146 0.7 160.11 0.6 - - - -
+PPAP2C HLA-A*02:01 1 2 10 9 FGMYCMVFL Consensus (ann/smm/comblib_sidney2008) 2.3 222 3.1 150.01 2.3 2.14e-05 1.3 - -
+PPAP2C HLA-A*23:01 1 3 11 9 GMYCMVFLV Consensus (ann/smm) 4.95 3256 4 2706.64 5.9 - - - -
+PPAP2C HLA-A*23:01 1 2 10 9 FGMYCMVFL Consensus (ann/smm) 6.55 4423 4.9 4144.10 8.2 - - - -
+PPAP2C HLA-A*02:01 1 1 9 9 SFGMYCMVF Consensus (ann/smm/comblib_sidney2008) 45 24390 45 44989.38 39 0.01 91 - -
+PPAP2C HLA-A*02:01 1 4 12 9 MYCMVFLVK Consensus (ann/smm/comblib_sidney2008) 54 23399 41 157801.09 54 0.01 86 - -
+ADAMTSL1 HLA-A*02:01 1 1 9 9 SLDMCISGL Consensus (ann/smm/comblib_sidney2008) 1 26 1 51.65 0.9 3.02e-05 1.7 - -
+ADAMTSL1 HLA-A*23:01 1 4 12 9 MCISGLCQL Consensus (ann/smm) 6.65 5781 5.9 3626.02 7.4 - - - -
+ADAMTSL1 HLA-A*02:01 1 4 12 9 MCISGLCQL Consensus (ann/smm/comblib_sidney2008) 14 1823 6.5 2612.82 14 0.00056 24 - -
+ADAMTSL1 HLA-A*23:01 1 1 9 9 SLDMCISGL Consensus (ann/smm) 30.5 27179 34 24684.82 27 - - - -
+ADAMTSL1 HLA-A*02:01 1 2 10 9 LDMCISGLC Consensus (ann/smm/comblib_sidney2008) 42 23677 42 53716.78 41 0.01 71 - -
+ADAMTSL1 HLA-A*23:01 1 3 11 9 DMCISGLCQ Consensus (ann/smm) 64.5 34451 73 118148.99 56 - - - -
+ADAMTSL1 HLA-A*23:01 1 2 10 9 LDMCISGLC Consensus (ann/smm) 76.0 33222 62 665932.18 90 - - - -
+ADAMTSL1 HLA-A*02:01 1 3 11 9 DMCISGLCQ Consensus (ann/smm/comblib_sidney2008) 97 31630 98 639896.89 71 0.03 97 - -
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/add_to_db_results.tsv	Tue Jul 18 09:07:07 2017 -0400
@@ -0,0 +1,3 @@
+#id first last pets quote
+1 Paula Brown 2 Time flies like and arrow. Fruit flies like a banana.
+2 Steven Jones 1 I would have wrtten less if I had more time
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/customers.tsv	Tue Jul 18 09:07:07 2017 -0400
@@ -0,0 +1,5 @@
+#CustomerID FirstName LastName Email DOB Phone
+1 John Smith John.Smith@yahoo.com 1968-02-04 626 222-2222
+2 Steven Goldfish goldfish@fishhere.net 1974-04-04 323 455-4545
+3 Paula Brown pb@herowndomain.org 1978-05-24 416 323-3232
+4 James Smith jim@supergig.co.uk 1980-10-20 416 323-8888
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/filtered_people_results.tsv	Tue Jul 18 09:07:07 2017 -0400
@@ -0,0 +1,4 @@
+1 Paula Brown 1978-05-24 2
+2 Steven Jones 1974-04-04 1
+3 Jane Doe 1978-05-24 0
+4 James Smith 1980-10-20 1
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/filtered_pets_results.tsv	Tue Jul 18 09:07:07 2017 -0400
@@ -0,0 +1,4 @@
+1 Rex dog
+1 Fluff cat
+2 Allie cat
+4 Spot
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/netMHC_summary.tsv	Tue Jul 18 09:07:07 2017 -0400
@@ -0,0 +1,9 @@
+#pos peptide logscore affinity(nM) Bind Level Protein Name Allele
+2 GMYCMVFLV 0.858 4 SB PPAP2C HLA-A02:01
+1 FGMYCMVFL 0.501 222 WB PPAP2C HLA-A02:01
+3 MYCMVFLVK 0.070 23399 PPAP2C HLA-A02:01
+0 SFGMYCMVF 0.066 24390 PPAP2C HLA-A02:01
+0 SLDMCISGL 0.698 26 SB ADAMTSL1 HLA-A02:01
+3 MCISGLCQL 0.306 1823 ADAMTSL1 HLA-A02:01
+1 LDMCISGLC 0.069 23677 ADAMTSL1 HLA-A02:01
+2 DMCISGLCQ 0.042 31630 ADAMTSL1 HLA-A02:01
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/pet_normalized_query_results.tsv	Tue Jul 18 09:07:07 2017 -0400
@@ -0,0 +1,3 @@
+#id first last dob name animal pets
+1 Paula Brown 1978-05-24 Fluff cat 2
+2 Steven Jones 1974-04-04 Allie cat 1
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/pets.tsv	Tue Jul 18 09:07:07 2017 -0400
@@ -0,0 +1,7 @@
+#People with pets
+Pets FirstName LastName DOB PetNames PetType
+2 Paula Brown 24/05/78 Rex,Fluff dog,cat
+1 Steven Jones 04/04/74 Allie cat
+0 Jane Doe 24/05/78
+1 James Smith 20/10/80 Spot
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/query_results.tsv	Tue Jul 18 09:07:07 2017 -0400
@@ -0,0 +1,17 @@
+#ID peptide start end percentile_rank logscore affinity Bind_Level
+PPAP2C GMYCMVFLV 3 11 0.2 0.858 4 SB
+PPAP2C GMYCMVFLV 3 11 4.95 0.858 4 SB
+ADAMTSL1 SLDMCISGL 1 9 1.0 0.698 26 SB
+ADAMTSL1 SLDMCISGL 1 9 30.5 0.698 26 SB
+PPAP2C FGMYCMVFL 2 10 2.3 0.501 222 WB
+PPAP2C FGMYCMVFL 2 10 6.55 0.501 222 WB
+ADAMTSL1 MCISGLCQL 4 12 6.65 0.306 1823
+ADAMTSL1 MCISGLCQL 4 12 14.0 0.306 1823
+PPAP2C MYCMVFLVK 4 12 0.65 0.07 23399
+PPAP2C MYCMVFLVK 4 12 54.0 0.07 23399
+ADAMTSL1 LDMCISGLC 2 10 42.0 0.069 23677
+ADAMTSL1 LDMCISGLC 2 10 76.0 0.069 23677
+PPAP2C SFGMYCMVF 1 9 0.5 0.066 24390
+PPAP2C SFGMYCMVF 1 9 45.0 0.066 24390
+ADAMTSL1 DMCISGLCQ 3 11 64.5 0.042 31630
+ADAMTSL1 DMCISGLCQ 3 11 97.0 0.042 31630
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/regex_results.tsv	Tue Jul 18 09:07:07 2017 -0400
@@ -0,0 +1,4 @@
+#FirstName LastName DOB
+Steven Goldfish 04/04/74
+Paula Brown 24/05/78
+James Smith 20/10/80
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/sales.tsv	Tue Jul 18 09:07:07 2017 -0400
@@ -0,0 +1,6 @@
+#CustomerID Date SaleAmount
+2 2004-05-06 100.22
+1 2004-05-07 99.95
+3 2004-05-07 122.95
+3 2004-05-13 100.00
+4 2004-05-22 555.55