Mercurial > repos > iuc > query_tabular
changeset 0:3708ff0198b7 draft
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/query_tabular commit 74915fc9cee746bbce1c4b507e13231259de177d
| author | iuc | 
|---|---|
| date | Tue, 18 Jul 2017 09:07:07 -0400 | 
| parents | |
| children | 8a33b442ecd9 | 
| files | filter_tabular.py filters.py load_db.py macros.xml query_db.py query_tabular.py query_tabular.xml sqlite_to_tabular.py test-data/IEDB.tsv test-data/add_to_db_results.tsv test-data/customers.tsv test-data/filtered_people_results.tsv test-data/filtered_pets_results.tsv test-data/netMHC_summary.tsv test-data/pet_normalized_query_results.tsv test-data/pets.tsv test-data/query_results.tsv test-data/regex_results.tsv test-data/sales.tsv test-data/sales_results.tsv test-data/testdb.sqlite | 
| diffstat | 21 files changed, 1600 insertions(+), 0 deletions(-) [+] | 
line wrap: on
 line diff
#!/usr/bin/env python
# Source: b/filter_tabular.py (new file in this changeset).
#
# Command-line wrapper around filters.filter_file(): reads a tabular file
# (or stdin), applies the line filters described in a JSON file, and writes
# the surviving lines to the output file (or stdout).

from __future__ import print_function

import json
import optparse
import os.path
import sys

from filters import filter_file


def __main__():
    """Parse the command line and run the filters over the input.

    Options:
      -i/--input     path of the tabular input (default: stdin)
      -j/--jsonfile  path of a JSON array of filter specifications
      -o/--output    path of the filtered output (default: stdout)
      -v/--verbose   print each filter specification to stdout before running

    Any failure terminates the process through ``sys.exit`` with an
    ``Error: ...`` message (exit status 1).
    """
    # Parse Command Line
    parser = optparse.OptionParser()
    parser.add_option('-i', '--input', dest='input', default=None,
                      help='Input file for filtering')
    parser.add_option('-j', '--jsonfile', dest='jsonfile', default=None,
                      help='JSON array of filter specifications')
    parser.add_option('-o', '--output', dest='output', default=None,
                      help='Output file for query results')
    parser.add_option('-v', '--verbose', dest='verbose', default=False,
                      action='store_true',
                      help='verbose')
    (options, args) = parser.parse_args()

    # Files opened here are tracked so they can be closed on every exit path;
    # the standard streams are never closed.
    opened = []
    try:
        if options.input is not None:
            try:
                inputFile = open(os.path.abspath(options.input), 'r')
                opened.append(inputFile)
            except Exception as e:
                sys.exit('Error: %s' % (e))
        else:
            inputFile = sys.stdin

        if options.output is not None:
            try:
                outputFile = open(os.path.abspath(options.output), 'w')
                opened.append(outputFile)
            except Exception as e:
                sys.exit('Error: %s' % (e))
        else:
            outputFile = sys.stdout

        filters = None
        if options.jsonfile:
            try:
                with open(options.jsonfile) as fh:
                    filters = json.load(fh)
            except Exception as e:
                sys.exit('Error: %s' % (e))

        if options.verbose and filters:
            for f in filters:
                # sorted() keeps the report stable from run to run.
                details = ', '.join(
                    ['%s: %s' % (k, f[k])
                     for k in sorted(set(f.keys()) - set(['filter']))])
                print('%s %s' % (f['filter'], details), file=sys.stdout)

        try:
            filter_file(inputFile, outputFile, filters=filters)
        except Exception as e:
            sys.exit('Error: %s' % (e))
    finally:
        for fh in opened:
            fh.close()


if __name__ == "__main__":
    __main__()
#!/usr/bin/env python
# Source: b/filters.py (new file in this changeset).
#
# Line filters for tab-separated text.  A LineFilter wraps any iterator of
# lines and applies one transformation; filters are chained by wrapping one
# LineFilter in another.  TabularReader builds such a chain over a file and
# yields each surviving line as a list of fields.

from __future__ import print_function

import re
import sys
from collections import deque


class LineFilter(object):
    """Iterator that applies one filter to every line of ``source``.

    ``source`` is any iterable of lines.  ``filter_dict`` selects the filter
    through its ``'filter'`` key:

    - ``regex``: keys ``pattern`` and ``action`` (``exclude_match``,
      ``include_match``, ``exclude_find``, ``include_find``)
    - ``select_columns``: key ``columns`` (1-based column numbers)
    - ``replace``: keys ``column`` (1-based), ``pattern``, ``replace``
    - ``prepend_line_num`` / ``append_line_num``
    - ``prepend_text`` / ``append_text``: key ``column_text``
    - ``skip``: key ``count`` (number of leading lines to drop)
    - ``normalize``: keys ``columns`` (1-based) and ``separator``

    With an empty or ``None`` ``filter_dict`` the filter only strips the
    line terminator.  A filter returning a false value (``None`` or an empty
    string) drops the line; a filter returning a list emits several lines.
    Line numbers passed to the filter count the lines read by *this* filter,
    starting at 1.
    """

    def __init__(self, source, filter_dict):
        self.source = source
        self.filter_dict = filter_dict
        self.func = lambda i, line: line.rstrip('\r\n') if line else None
        # Queue of produced-but-not-yet-returned lines; a deque makes taking
        # from the front O(1).
        self.src_lines = deque()
        self.src_line_cnt = 0
        if not filter_dict:
            return
        kind = filter_dict['filter']
        if kind == 'regex':
            rgx = re.compile(filter_dict['pattern'])
            action = filter_dict['action']
            if action == 'exclude_match':
                self.func = \
                    lambda i, line: line if not rgx.match(line) else None
            elif action == 'include_match':
                self.func = lambda i, line: line if rgx.match(line) else None
            elif action == 'exclude_find':
                self.func = \
                    lambda i, line: line if not rgx.search(line) else None
            elif action == 'include_find':
                self.func = lambda i, line: line if rgx.search(line) else None
        elif kind == 'select_columns':
            cols = [int(c) - 1 for c in filter_dict['columns']]
            self.func = lambda i, line: self.select_columns(line, cols)
        elif kind == 'replace':
            # Compile once instead of once per field of every line.
            pat = re.compile(filter_dict['pattern'])
            rep = filter_dict['replace']
            col = int(filter_dict['column']) - 1
            self.func = lambda i, line: '\t'.join(
                [pat.sub(rep, x) if j == col else x
                 for j, x in enumerate(line.split('\t'))])
        elif kind == 'prepend_line_num':
            self.func = lambda i, line: '%d\t%s' % (i, line)
        elif kind == 'append_line_num':
            self.func = lambda i, line: '%s\t%d' % (line.rstrip('\r\n'), i)
        elif kind == 'prepend_text':
            s = filter_dict['column_text']
            self.func = lambda i, line: '%s\t%s' % (s, line)
        elif kind == 'append_text':
            s = filter_dict['column_text']
            self.func = lambda i, line: '%s\t%s' % (line.rstrip('\r\n'), s)
        elif kind == 'skip':
            # int() tolerates a count supplied as a string in the JSON.
            cnt = int(filter_dict['count'])
            self.func = lambda i, line: line if i > cnt else None
        elif kind == 'normalize':
            cols = [int(c) - 1 for c in filter_dict['columns']]
            sep = filter_dict['separator']
            self.func = lambda i, line: self.normalize(line, cols, sep)

    def __iter__(self):
        return self

    def __next__(self):
        """Return the next filtered line, refilling the queue as needed."""
        if not self.src_lines:
            self.get_lines()
        if self.src_lines:
            return self.src_lines.popleft()
        raise StopIteration

    next = __next__  # Python 2 iterator protocol

    def select_columns(self, line, cols):
        """Return ``line`` reduced to the 0-based columns ``cols``, in order.

        Raises IndexError when the line has fewer columns than requested.
        """
        fields = line.split('\t')
        return '\t'.join([fields[x] for x in cols])

    def normalize(self, line, split_cols, sep):
        """Expand list-valued columns into one output line per list item.

        Each 0-based column in ``split_cols`` is split on ``sep``; the line
        is replicated once per item of the longest list, and shorter lists
        are padded with empty strings.  Returns a list of lines.
        """
        lines = []
        fields = line.rstrip('\r\n').split('\t')
        split_fields = dict()
        cnt = 0
        for c in split_cols:
            if c < len(fields):
                split_fields[c] = fields[c].split(sep)
                cnt = max(cnt, len(split_fields[c]))
        if cnt == 0:
            lines.append('\t'.join(fields))
        else:
            for n in range(0, cnt):
                flds = [x if c not in split_cols else split_fields[c][n]
                        if n < len(split_fields[c])
                        else '' for (c, x) in enumerate(fields)]
                lines.append('\t'.join(flds))
        return lines

    def get_lines(self):
        """Read from the source until one line survives the filter.

        Lines for which the filter returns a false value are skipped; the
        first surviving result (a string or a list of strings) is queued.
        Returns without queuing anything when the source is exhausted.
        """
        for next_line in self.source:
            self.src_line_cnt += 1
            line = self.func(self.src_line_cnt, next_line)
            if line:
                if isinstance(line, list):
                    self.src_lines.extend(line)
                else:
                    self.src_lines.append(line)
                return


class TabularReader(object):
    """
    Tabular file iterator. Returns a list

    ``input_file`` is an open file-like object (anything with ``readline``)
    or a path, which is opened here and closed once iteration finishes.
    The first ``skip`` lines are discarded, lines matching ``comment_char``
    (used as a regex anchored at the line start) are dropped, and each dict
    in ``filters`` is applied in order.  When ``col_idx`` is given, only
    those 0-based fields are returned, in that order.
    """
    def __init__(self, input_file, skip=0, comment_char=None, col_idx=None,
                 filters=None):
        self.skip = skip
        self.comment_char = comment_char
        self.col_idx = col_idx
        self.filters = filters
        # Only close what this reader opened itself.
        self._owns_file = not hasattr(input_file, 'readline')
        self.tsv_file = open(input_file) if self._owns_file else input_file
        if skip and skip > 0:
            for _ in range(skip):
                if not self.tsv_file.readline():
                    break
        source = LineFilter(self.tsv_file, None)
        if comment_char:
            source = LineFilter(source,
                                {"filter": "regex", "pattern": comment_char,
                                 "action": "exclude_match"})
        if filters:
            for f in filters:
                source = LineFilter(source, f)
        self.source = source

    def __iter__(self):
        return self

    def close(self):
        """Close the underlying file if this reader opened it."""
        if self._owns_file:
            self.tsv_file.close()

    def __next__(self):
        ''' Iteration '''
        try:
            line = next(self.source)
        except StopIteration:
            self.close()
            raise
        fields = line.rstrip('\r\n').split('\t')
        if self.col_idx:
            fields = [fields[c] for c in self.col_idx]
        return fields

    next = __next__  # Python 2 iterator protocol


def filter_file(input_file, output, skip=0, comment_char='#', filters=None):
    """Filter ``input_file`` and write the surviving lines to ``output``.

    ``input_file`` is a path or an open file; ``output`` needs a ``write``
    method.  A failure while writing one line is reported on stderr and the
    remaining lines are still processed; any other failure terminates the
    process through ``sys.exit`` with an ``Error: ...`` message.
    """
    try:
        tr = TabularReader(input_file, skip=skip, comment_char=comment_char,
                           filters=filters)
        for linenum, fields in enumerate(tr):
            try:
                output.write('%s\n' % '\t'.join(fields))
            except Exception as e:
                print('Failed at line: %d err: %s' % (linenum, e),
                      file=sys.stderr)
    except Exception as e:
        sys.exit('Error: %s' % (e))
#!/usr/bin/env python
# Source: b/load_db.py (new file in this changeset).
#
# Loads a tabular file into a SQLite table: infers column types from the
# leading rows, creates the table and any requested indexes, then inserts
# every row produced by filters.TabularReader.

from __future__ import print_function

import sys

from filters import TabularReader


def getValueType(val):
    """Return the SQLite type name that can hold ``val``.

    Returns ``'INTEGER'``, ``'REAL'`` or ``'TEXT'`` for a non-empty value
    (or a numeric zero) and ``None`` for an empty/missing value.
    """
    if val or 0. == val:
        try:
            int(val)
            return 'INTEGER'
        except (TypeError, ValueError, OverflowError):
            try:
                float(val)
                return 'REAL'
            except (TypeError, ValueError, OverflowError):
                return 'TEXT'
    return None


def get_column_def(file_path, table_name, skip=0, comment_char='#',
                   column_names=None, max_lines=100, load_named_columns=False,
                   filters=None):
    """Infer column names and types for the table to be created.

    The first ``max_lines`` rows of ``file_path`` (after ``skip``,
    ``comment_char`` and ``filters`` are applied) are sampled; each column
    gets the most general type seen (TEXT over REAL over INTEGER), and a
    column with no values becomes TEXT.

    ``column_names`` is a comma-separated string.  Empty entries keep the
    default name ``c<N>``; with ``load_named_columns`` only the named
    positions are kept.

    Returns ``(col_names, col_types, col_def, col_idx)`` where ``col_def``
    holds ``'<name> <type>'`` strings and ``col_idx`` is the list of 0-based
    source columns to load, or ``None`` to load all of them.  Read errors
    are reported on stderr and do not raise.
    """
    # Lower index == more general type; a column only moves toward TEXT.
    col_pref = ['TEXT', 'REAL', 'INTEGER', None]
    col_types = []
    col_idx = None
    try:
        tr = TabularReader(file_path, skip=skip, comment_char=comment_char,
                           col_idx=None, filters=filters)
        for linenum, fields in enumerate(tr):
            # linenum is 0-based, so >= samples exactly max_lines rows.
            if linenum >= max_lines:
                break
            try:
                while len(col_types) < len(fields):
                    col_types.append(None)
                for i, val in enumerate(fields):
                    colType = getValueType(val)
                    if col_pref.index(colType) < col_pref.index(col_types[i]):
                        col_types[i] = colType
            except Exception as e:
                print('Failed at line: %d err: %s' % (linenum, e),
                      file=sys.stderr)
    except Exception as e:
        print('Failed: %s' % (e), file=sys.stderr)
    col_types = [col_type if col_type else 'TEXT' for col_type in col_types]
    default_names = ['c%d' % i for i in range(1, len(col_types) + 1)]
    if column_names:
        names = [cn.strip() for cn in column_names.split(',')]
        if load_named_columns:
            col_idx = [i for i, cname in enumerate(names) if cname != '']
            col_names = [names[i] for i in col_idx]
            # A named column beyond the sampled width has no inferred type;
            # fall back to TEXT instead of raising IndexError.
            col_types = [col_types[i] if i < len(col_types) else 'TEXT'
                         for i in col_idx]
        else:
            col_names = default_names
            for i, cname in enumerate(names):
                if cname and i < len(col_names):
                    col_names[i] = cname
    else:
        col_names = default_names
    col_def = ['%s %s' % (col_name, col_type)
               for col_name, col_type in zip(col_names, col_types)]
    return col_names, col_types, col_def, col_idx


def create_table(conn, file_path, table_name, skip=0, comment_char='#',
                 pkey_autoincr=None, column_names=None,
                 load_named_columns=False, filters=None,
                 unique_indexes=None, indexes=None):
    """Create ``table_name`` in ``conn`` and load ``file_path`` into it.

    ``pkey_autoincr`` names an extra ``INTEGER PRIMARY KEY AUTOINCREMENT``
    column.  ``unique_indexes`` and ``indexes`` are lists of comma-separated
    column-name strings, one index per entry.  Empty fields are stored as
    NULL.  A row that cannot be converted or inserted is reported on stderr
    and skipped; any other failure terminates the process through
    ``sys.exit`` with an ``Error: ...`` message.

    NOTE: table and column names are interpolated into the SQL text (SQL
    parameters cannot stand for identifiers), so callers must validate them.
    """
    unique_indexes = [] if unique_indexes is None else unique_indexes
    indexes = [] if indexes is None else indexes
    col_names, col_types, col_def, col_idx = \
        get_column_def(file_path, table_name, skip=skip,
                       comment_char=comment_char, column_names=column_names,
                       load_named_columns=load_named_columns, filters=filters)
    col_func = [float if t == 'REAL' else int
                if t == 'INTEGER' else str for t in col_types]
    table_def = 'CREATE TABLE %s (\n %s%s\n);' % (
        table_name,
        '%s INTEGER PRIMARY KEY AUTOINCREMENT,' %
        pkey_autoincr if pkey_autoincr else '',
        ', \n '.join(col_def))
    insert_stmt = 'INSERT INTO %s(%s) VALUES(%s)' % (
        table_name, ','.join(col_names),
        ','.join(["?" for x in col_names]))
    try:
        c = conn.cursor()
        c.execute(table_def)
        conn.commit()
        c.close()
        for i, index in enumerate(unique_indexes):
            index_name = 'idx_uniq_%s_%d' % (table_name, i)
            index_columns = index.split(',')
            create_index(conn, table_name, index_name, index_columns,
                         unique=True)
        for i, index in enumerate(indexes):
            index_name = 'idx_%s_%d' % (table_name, i)
            index_columns = index.split(',')
            create_index(conn, table_name, index_name, index_columns)
        c = conn.cursor()
        tr = TabularReader(file_path, skip=skip, comment_char=comment_char,
                           col_idx=col_idx, filters=filters)
        for linenum, fields in enumerate(tr):
            try:
                vals = [col_func[i](x)
                        if x else None for i, x in enumerate(fields)]
                c.execute(insert_stmt, vals)
            except Exception as e:
                print('Failed at line: %d err: %s' % (linenum, e),
                      file=sys.stderr)
        conn.commit()
        c.close()
    except Exception as e:
        sys.exit('Error: %s' % (e))


def create_index(conn, table_name, index_name, index_columns, unique=False):
    """Create (and commit) an index on ``index_columns`` of ``table_name``.

    ``index_columns`` is a list of column names; ``unique`` makes it a
    UNIQUE index.  Database errors propagate to the caller.
    """
    index_def = "CREATE %s INDEX %s on %s(%s)" % (
        'UNIQUE' if unique else '', index_name,
        table_name, ','.join(index_columns))
    c = conn.cursor()
    c.execute(index_def)
    conn.commit()
    c.close()
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/macros.xml Tue Jul 18 09:07:07 2017 -0400 @@ -0,0 +1,383 @@ +<macros> + <token name="@LINEFILTERS@"> +<![CDATA[ + ## set linefilters to the + #set $input_filters = [] + #for $fi in $linefilters: + #if $fi.filter.filter_type == 'skip': + #set $skip_lines = None + #if str($fi.filter.skip_lines) != '': + #set $skip_lines = int($fi.filter.skip_lines) + #elif $tbl.table.metadata.comment_lines and $tbl.table.metadata.comment_lines > 0: + #set $skip_lines = int($tbl.table.metadata.comment_lines) + #end if + #if $skip_lines is not None: + #set $filter_dict = dict() + #set $filter_dict['filter'] = str($fi.filter.filter_type) + #set $filter_dict['count'] = $skip_lines + #silent $input_filters.append($filter_dict) + #end if + #elif $fi.filter.filter_type == 'comment': + #set $filter_dict = dict() + #set $filter_dict['filter'] = 'regex' + #set $filter_dict['pattern'] = '^(%s).*$' % '|'.join([chr(int(x)).replace('|','[|]') for x in (str($fi.filter.comment_char)).split(',')]) + #set $filter_dict['action'] = 'exclude_match' + #silent $input_filters.append($filter_dict) + #elif $fi.filter.filter_type == 'regex': + #set $filter_dict = dict() + #set $filter_dict['filter'] = str($fi.filter.filter_type) + #set $filter_dict['pattern'] = str($fi.filter.regex_pattern) + #set $filter_dict['action'] = str($fi.filter.regex_action) + #silent $input_filters.append($filter_dict) + #elif $fi.filter.filter_type == 'select_columns': + #set $filter_dict = dict() + #set $filter_dict['filter'] = str($fi.filter.filter_type) + #set $filter_dict['columns'] = [int(str($ci).replace('c','')) for $ci in str($fi.filter.columns).split(',')] + #silent $input_filters.append($filter_dict) + #elif $fi.filter.filter_type == 'replace': + #set $filter_dict = dict() + #set $filter_dict['filter'] = str($fi.filter.filter_type) + #set $filter_dict['column'] = int(str($fi.filter.column).replace('c','')) + #set $filter_dict['pattern'] = str($fi.filter.regex_pattern) + 
#set $filter_dict['replace'] = str($fi.filter.regex_replace) + #silent $input_filters.append($filter_dict) + #elif str($fi.filter.filter_type).endswith('pend_line_num'): + #set $filter_dict = dict() + #set $filter_dict['filter'] = str($fi.filter.filter_type) + #silent $input_filters.append($filter_dict) + #elif str($fi.filter.filter_type).endswith('pend_text'): + #set $filter_dict = dict() + #set $filter_dict['filter'] = str($fi.filter.filter_type) + #set $filter_dict['column_text'] = str($fi.filter.column_text) + #silent $input_filters.append($filter_dict) + #elif $fi.filter.filter_type == 'normalize': + #set $filter_dict = dict() + #set $filter_dict['filter'] = str($fi.filter.filter_type) + #set $filter_dict['columns'] = [int(str($ci).replace('c','')) for $ci in str($fi.filter.columns).split(',')] + #set $filter_dict['separator'] = str($fi.filter.separator) + #silent $input_filters.append($filter_dict) + #end if + #end for +]]> + </token> + <xml name="macro_line_filters"> + <repeat name="linefilters" title="Filter Tabular Input Lines"> + <conditional name="filter"> + <param name="filter_type" type="select" label="Filter By"> + <option value="skip">skip leading lines</option> + <option value="comment">comment char</option> + <option value="regex">by regex expression matching</option> + <option value="select_columns">select columns</option> + <option value="replace">regex replace value in column</option> + <option value="prepend_line_num">prepend a line number column</option> + <option value="append_line_num">append a line number column</option> + <option value="prepend_text">prepend a column with the given text</option> + <option value="append_text">append a column with the given text</option> + <option value="normalize">normalize list columns, replicates row for each item in list</option> + </param> + <when value="skip"> + <param name="skip_lines" type="integer" value="" min="0" optional="true" label="Skip lines" + help="Leave blank to use the comment lines 
metadata for this dataset" /> + </when> + <when value="comment"> + <param name="comment_char" type="select" display="checkboxes" multiple="True" label="Ignore lines beginning with these characters" help="lines beginning with these are skipped"> + <option value="62">></option> + <option value="64">@</option> + <option value="43">+</option> + <option value="60"><</option> + <option value="42">*</option> + <option value="45">-</option> + <option value="61">=</option> + <option value="124">|</option> + <option value="63">?</option> + <option value="36">$</option> + <option value="46">.</option> + <option value="58">:</option> + <option value="38">&</option> + <option value="37">%</option> + <option value="94">^</option> + <option value="35">#</option> + <option value="33">!</option> + </param> + </when> + <when value="prepend_line_num"/> + <when value="append_line_num"/> + <when value="prepend_text"> + <param name="column_text" type="text" value="" label="text for column"> + </param> + </when> + <when value="append_text"> + <param name="column_text" type="text" value="" label="text for column"> + </param> + </when> + <when value="regex"> + <param name="regex_pattern" type="text" value="" label="regex pattern"> + <sanitizer sanitize="False"/> + </param> + <param name="regex_action" type="select" label="action for regex match"> + <option value="exclude_match">exclude line on pattern match</option> + <option value="include_match">include line on pattern match</option> + <option value="exclude_find">exclude line if pattern found</option> + <option value="include_find">include line if pattern found</option> + </param> + </when> + <when value="select_columns"> + <param name="columns" type="text" value="" label="enter column numbers to keep" + help="example: 1,4,2 or c1,c4,c2(selects the first,fourth, and second columns)"> + <validator type="regex" message="Column ordinal positions separated by commas">^(c?[1-9]\d*)(,c?[1-9]\d*)*$</validator> + </param> + </when> + <when 
value="replace"> + <param name="column" type="text" value="" label="enter column number to replace" + help="example: 1 or c1 (selects the first column)"> + <validator type="regex" message="Column ordinal position separated by commas">^(c?[1-9]\d*)$</validator> + </param> + <param name="regex_pattern" type="text" value="" label="regex pattern"> + <sanitizer sanitize="False"/> + </param> + <param name="regex_replace" type="text" value="" label="replacement expression"> + <sanitizer sanitize="False"/> + </param> + </when> + <when value="normalize"> + <param name="columns" type="text" value="" label="enter column numbers to normalize"> + <help><![CDATA[ + example: 2,4 or c2,c4 (selects the second, and fourth columns) + If multiple columns are selected, they should have the same length and separator on each line + ]]></help> + <validator type="regex" message="Column ordinal positions separated by commas">^(c?[1-9]\d*)(,c?[1-9]\d*)*$</validator> + </param> + <param name="separator" type="text" value="," label="List item delimiter in column"> + <sanitizer sanitize="False"/> + <validator type="regex" message="Anything but TAB or Newline">^[^\t\n\r\f\v]+$</validator> + </param> + </when> + </conditional> + </repeat> + </xml> + + <token name="@LINEFILTERS_HELP@"> +<![CDATA[ +**Input Line Filters** + + As a tabular file is being read, line filters may be applied. 
+ + :: + + - skip leading lines skip the first *number* of lines + - comment char omit any lines that start with the specified comment character + - by regex expression matching *include/exclude* lines that match the regular expression + - select columns choose to include only selected columns in the order specified + - regex replace value in column replace a field in a column using a regex substitution (good for date reformatting) + - prepend a line number column each line has the ordinal value of the line read by this filter as the first column + - append a line number column each line has the ordinal value of the line read by this filter as the last column + - prepend a text column each line has the text string as the first column + - append a text column each line has the text string as the last column + - normalize list columns replicates the line for each item in the specified list *columns* +]]> + </token> + + <token name="@LINEFILTERS_HELP_EXAMPLE@"> +<![CDATA[ +**Line Filtering Example** + *(Six filters are applied as the following file is read)* + + :: + + Input Tabular File: + + #People with pets + Pets FirstName LastName DOB PetNames PetType + 2 Paula Brown 24/05/78 Rex,Fluff dog,cat + 1 Steven Jones 04/04/74 Allie cat + 0 Jane Doe 24/05/78 + 1 James Smith 20/10/80 Spot + + + Filter 1 - append a line number column: + + #People with pets 1 + Pets FirstName LastName DOB PetNames PetType 2 + 2 Paula Brown 24/05/78 Rex,Fluff dog,cat 3 + 1 Steven Jones 04/04/74 Allie cat 4 + 0 Jane Doe 24/05/78 5 + 1 James Smith 20/10/80 Spot 6 + + Filter 2 - by regex expression matching [include]: '^\d+' (include lines that start with a number) + + 2 Paula Brown 24/05/78 Rex,Fluff dog,cat 3 + 1 Steven Jones 04/04/74 Allie cat 4 + 0 Jane Doe 24/05/78 5 + 1 James Smith 20/10/80 Spot 6 + + Filter 3 - append a line number column: + + 2 Paula Brown 24/05/78 Rex,Fluff dog,cat 3 1 + 1 Steven Jones 04/04/74 Allie cat 4 2 + 0 Jane Doe 24/05/78 5 3 + 1 James Smith 20/10/80 Spot 6 4 + + 
Filter 4 - regex replace value in column[4]: '(\d+)/(\d+)/(\d+)' '19\3-\2-\1' (convert dates to sqlite format) + + 2 Paula Brown 1978-05-24 Rex,Fluff dog,cat 3 1 + 1 Steven Jones 1974-04-04 Allie cat 4 2 + 0 Jane Doe 1978-05-24 5 3 + 1 James Smith 1980-10-20 Spot 6 4 + + Filter 5 - normalize list columns[5,6]: + + 2 Paula Brown 1978-05-24 Rex dog 3 1 + 2 Paula Brown 1978-05-24 Fluff cat 3 1 + 1 Steven Jones 1974-04-04 Allie cat 4 2 + 0 Jane Doe 1978-05-24 5 3 + 1 James Smith 1980-10-20 Spot 6 4 + + Filter 6 - append a line number column: + + 2 Paula Brown 1978-05-24 Rex dog 3 1 1 + 2 Paula Brown 1978-05-24 Fluff cat 3 1 2 + 1 Steven Jones 1974-04-04 Allie cat 4 2 3 + 0 Jane Doe 1978-05-24 5 3 4 + 1 James Smith 1980-10-20 Spot 6 4 5 + +]]> + </token> + + <token name="@QUERY_HELP@"> +<![CDATA[ + +For help in using SQLite_ see: http://www.sqlite.org/docs.html + +**NOTE:** input for SQLite dates input field must be in the format: *YYYY-MM-DD* for example: 2015-09-30 + +See: http://www.sqlite.org/lang_datefunc.html + +**Example** + + Given 2 tabular datasets: *customers* and *sales* + + Dataset *customers* + + Table name: "customers" + + Column names: "CustomerID,FirstName,LastName,Email,DOB,Phone" + + =========== ========== ========== ===================== ========== ============ + #CustomerID FirstName LastName Email DOB Phone + =========== ========== ========== ===================== ========== ============ + 1 John Smith John.Smith@yahoo.com 1968-02-04 626 222-2222 + 2 Steven Goldfish goldfish@fishhere.net 1974-04-04 323 455-4545 + 3 Paula Brown pb@herowndomain.org 1978-05-24 416 323-3232 + 4 James Smith jim@supergig.co.uk 1980-10-20 416 323-8888 + =========== ========== ========== ===================== ========== ============ + + Dataset *sales* + + Table name: "sales" + + Column names: "CustomerID,Date,SaleAmount" + + ============= ============ ============ + #CustomerID Date SaleAmount + ============= ============ ============ + 2 2004-05-06 100.22 + 1 2004-05-07 
99.95 + 3 2004-05-07 122.95 + 3 2004-05-13 100.00 + 4 2004-05-22 555.55 + ============= ============ ============ + + The query + + :: + + SELECT FirstName,LastName,sum(SaleAmount) as "TotalSales" + FROM customers join sales on customers.CustomerID = sales.CustomerID + GROUP BY customers.CustomerID ORDER BY TotalSales DESC; + + Produces this tabular output: + + ========== ======== ========== + #FirstName LastName TotalSales + ========== ======== ========== + James Smith 555.55 + Paula Brown 222.95 + Steven Goldfish 100.22 + John Smith 99.95 + ========== ======== ========== + + + If the optional Table name and Column names inputs are not used, the query would be: + + :: + + SELECT t1.c2 as "FirstName", t1.c3 as "LastName", sum(t2.c3) as "TotalSales" + FROM t1 join t2 on t1.c1 = t2.c1 + GROUP BY t1.c1 ORDER BY TotalSales DESC; + + You can selectively name columns, e.g. on the customers input you could just name columns 2,3, and 5: + + Column names: ,FirstName,LastName,,BirthDate + + Results in the following data base table + + =========== ========== ========== ===================== ========== ============ + #c1 FirstName LastName c4 BirthDate c6 + =========== ========== ========== ===================== ========== ============ + 1 John Smith John.Smith@yahoo.com 1968-02-04 626 222-2222 + 2 Steven Goldfish goldfish@fishhere.net 1974-04-04 323 455-4545 + 3 Paula Brown pb@herowndomain.org 1978-05-24 416 323-3232 + 4 James Smith jim@supergig.co.uk 1980-10-20 416 323-8888 + =========== ========== ========== ===================== ========== ============ + + + Regular_expression_ functions are included for: + + :: + + matching: re_match('pattern',column) + + SELECT t1.FirstName, t1.LastName + FROM t1 + WHERE re_match('^.*\.(net|org)$',c4) + + Results: + + =========== ========== + #FirstName LastName + =========== ========== + Steven Goldfish + Paula Brown + =========== ========== + + + :: + + searching: re_search('pattern',column) + substituting: 
re_sub('pattern','replacement',column) + + SELECT t1.FirstName, t1.LastName, re_sub('^\d{2}(\d{2})-(\d\d)-(\d\d)','\3/\2/\1',BirthDate) as "DOB" + FROM t1 + WHERE re_search('[hp]er',c4) + + Results: + + + =========== ========== ========== + #FirstName LastName DOB + =========== ========== ========== + Steven Goldfish 04/04/74 + Paula Brown 24/05/78 + James Smith 20/10/80 + =========== ========== ========== + +.. _Regular_expression: https://docs.python.org/release/2.7/library/re.html +.. _SQLite: http://www.sqlite.org/index.html +.. _SQLite_functions: http://www.sqlite.org/docs.html + + +]]> + </token> + +</macros> +
#!/usr/bin/env python
# Source: b/query_db.py (new file in this changeset).
#
# SQLite helpers: opens a connection with regular-expression SQL functions
# registered, lists the tables of a database, and writes query results as
# tab-separated text.

from __future__ import print_function

import re
import sqlite3 as sqlite
import sys


TABLE_QUERY = \
    """
    SELECT name, sql
    FROM sqlite_master
    WHERE type='table'
    ORDER BY name
    """

try:
    _TEXT_TYPES = (basestring,)  # noqa: F821 - Python 2
except NameError:
    _TEXT_TYPES = (str,)


def _as_text(item):
    """Return a non-NULL SQLite value as text the ``re`` module accepts.

    Text passes through, BLOBs are decoded as UTF-8 and numbers are
    rendered with ``str`` so a regex can be applied to any column type.
    """
    if isinstance(item, _TEXT_TYPES):
        return item
    if isinstance(item, bytes):
        return item.decode('utf-8', 'replace')
    return str(item)


def regex_match(expr, item):
    """SQL ``re_match(pattern, value)``: true if pattern matches at start.

    Returns ``None`` (SQL NULL) when ``value`` is NULL.
    """
    if item is None:
        return None
    return re.match(expr, _as_text(item)) is not None


def regex_search(expr, item):
    """SQL ``re_search(pattern, value)``: true if pattern occurs anywhere.

    Returns ``None`` (SQL NULL) when ``value`` is NULL.
    """
    if item is None:
        return None
    return re.search(expr, _as_text(item)) is not None


def regex_sub(expr, replace, item):
    """SQL ``re_sub(pattern, replacement, value)``: regex substitution.

    Returns ``None`` (SQL NULL) when ``value`` is NULL.
    """
    if item is None:
        return None
    return re.sub(expr, replace, _as_text(item))


def get_connection(sqlitedb_path, addfunctions=True):
    """Open ``sqlitedb_path``; register the ``re_*`` SQL functions if asked.

    Returns the ``sqlite3`` connection; closing it is the caller's job.
    """
    conn = sqlite.connect(sqlitedb_path)
    if addfunctions:
        conn.create_function("re_match", 2, regex_match)
        conn.create_function("re_search", 2, regex_search)
        conn.create_function("re_sub", 3, regex_sub)
    return conn


def describe_tables(conn, outputFile):
    """Print every table name and its column names to ``outputFile``.

    A table whose columns cannot be read gets a warning on stderr.  This
    function always terminates the process: ``sys.exit(0)`` on success, or
    ``sys.exit`` with an ``Error: ...`` message when the table list cannot
    be read.
    """
    try:
        c = conn.cursor()
        rslt = c.execute(TABLE_QUERY).fetchall()
        for table, sql in rslt:
            print("Table %s:" % table, file=outputFile)
            try:
                # Quote the identifier so names with spaces, punctuation or
                # SQL keywords can still be described.
                col_query = 'SELECT * FROM "%s" LIMIT 0' % \
                    table.replace('"', '""')
                cur = conn.cursor().execute(col_query)
                cols = [col[0] for col in cur.description]
                print(" Columns: %s" % cols, file=outputFile)
            except Exception as exc:
                print("Warning: %s" % exc, file=sys.stderr)
    except Exception as e:
        sys.exit('Error: %s' % (e))
    sys.exit(0)


def run_query(conn, query, outputFile, no_header=False):
    """Execute ``query`` and write the rows to ``outputFile`` as TSV.

    Unless ``no_header`` is set, the first line is ``#`` followed by the
    tab-separated column names.  NULL values are written as empty fields.
    Database errors propagate to the caller.
    """
    cur = conn.cursor()
    results = cur.execute(query)
    if not no_header:
        outputFile.write("#%s\n" % '\t'.join(
            [str(col[0]) for col in cur.description]))
    for row in results:
        outputFile.write("%s\n" % '\t'.join(
            [str(val) if val is not None else '' for val in row]))
#!/usr/bin/env python
# Source: b/query_tabular.py (new file in this changeset).
#
# Loads tabular files into a SQLite database as described by a JSON config
# and then either runs a SQL query (writing TSV) or lists the tables.

from __future__ import print_function

import json
import optparse
import os.path
import sys
from contextlib import closing

from load_db import create_table

from query_db import describe_tables, get_connection, run_query


"""
JSON config:
{ tables : [
    { file_path : '/home/galaxy/dataset_101.dat',
      table_name : 't1',
      column_names : 'c1,c2,c3',
      pkey_autoincr : 'id',
      comment_lines : 1,
      unique: ['c1'],
      index: ['c2', 'c3']
    },
    { file_path : '/home/galaxy/dataset_102.dat',
      table_name : 'gff',
      column_names : 'seqname,,date,start,end',
      comment_lines : 1,
      load_named_columns : True,
      filters : [{'filter': 'regex', 'pattern': '#peptide',
                  'action': 'exclude_match'},
                 {'filter': 'replace', 'column': 3,
                  'pattern': 'gi[|]', 'replace': ''}]
    },
    { file_path : '/home/galaxy/dataset_103.dat',
      table_name : 'test',
      column_names : 'c1,c2,c3'
    }
    ]
}

column_names is a comma-separated string (empty entries keep the default
name); each entry of unique/index is a comma-separated list of column names
that forms one index.
"""


def __main__():
    """Load the configured tables, then run the query or list the tables.

    Options:
      -s/--sqlitedb    path of the SQLite database to create or extend
      -j/--jsonfile    JSON dict of table specifications (see above)
      -q/--query       SQL query text
      -Q/--query_file  file holding the SQL query (wins over --query)
      -n/--no_header   omit the ``#``-prefixed column header line
      -o/--output      output path (default: stdout)

    Without a query the tables of the database are described instead.  Any
    failure terminates the process through ``sys.exit`` with an
    ``Error: ...`` message.
    """
    # Parse Command Line
    parser = optparse.OptionParser()
    parser.add_option('-s', '--sqlitedb', dest='sqlitedb', default=None,
                      help='The SQLite Database')
    parser.add_option('-j', '--jsonfile', dest='jsonfile', default=None,
                      help='JSON dict of table specifications')
    parser.add_option('-q', '--query', dest='query', default=None,
                      help='SQL query')
    parser.add_option('-Q', '--query_file', dest='query_file', default=None,
                      help='SQL query file')
    parser.add_option('-n', '--no_header', dest='no_header', default=False,
                      action='store_true',
                      help='Omit the column headers line')
    parser.add_option('-o', '--output', dest='output', default=None,
                      help='Output file for query results')
    (options, args) = parser.parse_args()

    # determine output destination
    outputFile = sys.stdout
    if options.output is not None:
        try:
            outputFile = open(os.path.abspath(options.output), 'w')
        except Exception as e:
            sys.exit('Error: %s' % (e))

    def _create_table(ti, table):
        """Load one table spec; ``ti`` is its 0-based position."""
        column_names = table.get('column_names')
        # load_named_columns only makes sense when names were supplied.
        load_named_columns = \
            table.get('load_named_columns', False) if column_names else False
        with closing(get_connection(options.sqlitedb)) as conn:
            create_table(conn, table['file_path'],
                         table.get('table_name', 't%d' % (ti + 1)),
                         pkey_autoincr=table.get('pkey_autoincr'),
                         column_names=column_names,
                         skip=table.get('comment_lines', 0),
                         comment_char=table.get('comment_char'),
                         load_named_columns=load_named_columns,
                         filters=table.get('filters'),
                         unique_indexes=table.get('unique', []),
                         indexes=table.get('index', []))

    try:
        if options.jsonfile:
            try:
                with open(options.jsonfile) as fh:
                    tdef = json.load(fh)
                if 'tables' in tdef:
                    for ti, table in enumerate(tdef['tables']):
                        _create_table(ti, table)
            except Exception as e:
                sys.exit('Error: %s' % (e))

        query = None
        if options.query_file is not None:
            try:
                with open(options.query_file, 'r') as fh:
                    query = fh.read()
            except Exception as e:
                sys.exit('Error: %s' % (e))
        elif options.query is not None:
            query = options.query

        try:
            with closing(get_connection(options.sqlitedb)) as conn:
                if query is None:
                    # describe_tables() ends the process itself (exit 0).
                    describe_tables(conn, outputFile)
                else:
                    run_query(conn, query, outputFile,
                              no_header=options.no_header)
        except Exception as e:
            sys.exit('Error: %s' % (e))
    finally:
        # Runs on SystemExit too, so buffered output is always flushed.
        if outputFile is not sys.stdout:
            outputFile.close()


if __name__ == "__main__":
    __main__()
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/query_tabular.xml Tue Jul 18 09:07:07 2017 -0400 @@ -0,0 +1,510 @@ +<tool id="query_tabular" name="Query Tabular" version="1.0.0"> + <description>using sqlite sql</description> + + <macros> + <import>macros.xml</import> + </macros> + + <requirements> + </requirements> + + <command detect_errors="exit_code"><![CDATA[ + cat '$query_file' && + #if $add_to_database.withdb: + #if $save_db: + cp '$add_to_database.withdb' '$sqlitedb' && + #else: + cp '$add_to_database.withdb' '$workdb' && + #end if + #end if + python '$__tool_directory__/query_tabular.py' + #if $save_db + -s '$sqlitedb' + #else + -s '$workdb' + #end if + -j '$table_json' + #if $sqlquery: + -Q '$query_file' + $no_header + -o '$output' + #end if + ]]></command> + <configfiles> + <configfile name="query_file"> +$sqlquery + </configfile> + <configfile name="table_json"> +#import json +#set $jtbldef = dict() +#set $jtbls = [] +#set $jtbldef['tables'] = $jtbls +#for $i,$tbl in enumerate($tables): + #set $jtbl = dict() + #set $jtbl['file_path'] = str($tbl.table) + #if $tbl.tbl_opts.table_name: + #set $tname = str($tbl.tbl_opts.table_name) + #else + #set $tname = 't' + str($i + 1) + #end if + #set $jtbl['table_name'] = $tname + ## #if $tbl.tbl_opts.sel_cols: + ## #set $jtbl['sel_cols'] = $tbl.tbl_opts.sel_cols el_cols + ## #end if + #if $tbl.tbl_opts.pkey_autoincr: + #set $jtbl['pkey_autoincr'] = str($tbl.tbl_opts.pkey_autoincr) + #end if + #if $tbl.tbl_opts.col_names: + #set $col_names = str($tbl.tbl_opts.col_names) + #if $tbl.tbl_opts.load_named_columns: + #set $jtbl['load_named_columns'] = True + #end if + #else + #set $col_names = '' + #end if + #set $jtbl['column_names'] = $col_names + #set $idx_unique = [] + #set $idx_non = [] + #for $idx in $tbl.tbl_opts.indexes: + #if $idx.unique: + #silent $idx_unique.append(str($idx.index_columns)) + #else: + #silent $idx_non.append(str($idx.index_columns)) + #end if + #end for + #if len($idx_unique) > 0: + #set 
$jtbl['unique'] = $idx_unique + #end if + #if len($idx_non) > 0: + #set $jtbl['index'] = $idx_non + #end if + #set $linefilters = $tbl.input_opts.linefilters + @LINEFILTERS@ + #if $input_filters: + #set $jtbl['filters'] = $input_filters + #end if + #set $jtbls += [$jtbl] +#end for +#echo $json.dumps($jtbldef) + </configfile> + </configfiles> + <inputs> + <param name="workdb" type="hidden" value="workdb.sqlite" label=""/> + <section name="add_to_database" expanded="false" title="Add tables to an existing database"> + <param name="withdb" type="data" format="sqlite" optional="true" label="Add tables to this Database" + help="Make sure your added table names are not already in this database"/> + </section> + <repeat name="tables" title="Database Table" min="0"> + <param name="table" type="data" format="tabular" label="Tabular Dataset for Table"/> + <section name="input_opts" expanded="false" title="Filter Dataset Input"> + <expand macro="macro_line_filters" /> + </section> + <section name="tbl_opts" expanded="false" title="Table Options"> + <param name="table_name" type="text" value="" optional="true" label="Specify Name for Table"> + <help>By default, tables will be named: t1,t2,...,tn (table names must be unique)</help> + <validator type="regex" message="Table name should start with a letter and may contain additional letters, digits, and underscores">^[A-Za-z]\w*$</validator> + </param> + <param name="col_names" type="text" value="" optional="true" label="Specify Column Names (comma-separated list)"> + <help>By default, table columns will be named: c1,c2,c3,...,cn (column names for a table must be unique) + You can override the default names by entering a comma -separated list of names, e.g. ',name1,,,name2' would rename the second and fifth columns. + </help> + <sanitizer sanitize="False"/> + <validator type="regex" message="A List of names separated by commas: Column names should start with a letter and may contain additional letters, digits, and underscores. 
Otherwise, the name must be enclosed in: double quotes, back quotes, or square brackets.">^([A-Za-z]\w*|"\S+[^,"]*"|`\S+[^,`]*`|[[]\S+[^,"]*[]])?(,([A-Za-z]\w*|"\S+.*"|`\S+[^,`]*`|[[]\S+[^,"]*[]])?)*$</validator> + </param> + <param name="load_named_columns" type="boolean" truevalue="load_named_columns" falsevalue="" checked="false" label="Only load the columns you have named into database"/> + <param name="pkey_autoincr" type="text" value="" optional="true" label="Add an auto increment primary key column with this name" + help="Only creates this additional column when a name is entered. (This can not be the same name as any of the other columns in this table.)"> + <validator type="regex" message="Column name">^([A-Za-z]\w*)?$</validator> + </param> + <repeat name="indexes" title="Table Index"> + <param name="unique" type="boolean" truevalue="yes" falsevalue="no" checked="False" label="This is a unique index"/> + <param name="index_columns" type="text" value="" label="Index on Columns"> + <help>Create an index on the column names: e.g. c1 or c2,c4</help> + <validator type="regex" message="Column name, separated by commas if more than one">^([A-Za-z]\w*|"\S+[^,"]*"|`\S+[^,`]*`|[[]\S+[^,"]*[]])(,([A-Za-z]\w*|"\S+.*"|`\S+[^,`]*`|[[]\S+[^,"]*[]])?)*$</validator> + </param> + </repeat> + </section> + </repeat> + <param name="save_db" type="boolean" truevalue="yes" falsevalue="no" checked="false" label="Save the sqlite database in your history" + help="SQLite to tabular tool can run additional queries on this database"/> + <param name="sqlquery" type="text" area="true" size="20x80" value="" optional="true" label="SQL Query to generate tabular output"> + <help>By default: tables are named: t1,t2,...,tn and columns in each table: c1,c2,...,cn</help> + <sanitizer sanitize="False"/> + <validator type="regex" message="">^(?ims)\s*select\s+.*\s+from\s+.*$</validator> + </param> + <param name="no_header" type="boolean" truevalue="-n" falsevalue="" checked="False" label="Omit 
column headers from tabular output"/> + </inputs> + <outputs> + <data format="sqlite" name="sqlitedb" label="sqlite db of ${on_string}"> + <filter>save_db</filter> + </data> + <data format="tabular" name="output" label="query results on ${on_string}"> + <filter>not save_db or (sqlquery and len(sqlquery.strip()) > 0)</filter> + </data> + </outputs> + <tests> + <test> + <repeat name="tables"> + <param name="table" ftype="tabular" value="customers.tsv"/> + <section name="input_opts"> + <repeat name="linefilters"> + <conditional name="filter"> + <param name="filter_type" value="comment"/> + <param name="comment_char" value="35"/> + </conditional> + </repeat> + </section> + <section name="tbl_opts"> + <param name="table_name" value="customers"/> + <param name="col_names" value="CustomerID,FirstName,LastName,Email,DOB,Phone"/> + </section> + </repeat> + <repeat name="tables"> + <param name="table" ftype="tabular" value="sales.tsv"/> + <section name="input_opts"> + <repeat name="linefilters"> + <conditional name="filter"> + <param name="filter_type" value="comment"/> + <param name="comment_char" value="35"/> + </conditional> + </repeat> + </section> + <section name="tbl_opts"> + <param name="table_name" value="sales"/> + <param name="col_names" value="CustomerID,Date,SaleAmount"/> + </section> + </repeat> + <param name="sqlquery" value="SELECT FirstName,LastName,sum(SaleAmount) as "TotalSales" FROM customers join sales on customers.CustomerID = sales.CustomerID GROUP BY customers.CustomerID ORDER BY TotalSales DESC"/> + <output name="output" file="sales_results.tsv"/> + </test> + + <test> + <repeat name="tables"> + <param name="table" ftype="tabular" value="customers.tsv"/> + <section name="input_opts"> + <repeat name="linefilters"> + <conditional name="filter"> + <param name="filter_type" value="comment"/> + <param name="comment_char" value="35"/> + </conditional> + </repeat> + </section> + <section name="tbl_opts"> + <param name="col_names" 
value=",FirstName,LastName,,DOB,"/> + </section> + </repeat> + <repeat name="tables"> + <param name="table" ftype="tabular" value="sales.tsv"/> + <section name="input_opts"> + <repeat name="linefilters"> + <conditional name="filter"> + <param name="filter_type" value="skip"/> + <param name="skip_lines" value="1"/> + </conditional> + </repeat> + </section> + </repeat> + <param name="sqlquery" value="SELECT FirstName,LastName,sum(t2.c3) as "TotalSales" FROM t1 join t2 on t1.c1 = t2.c1 GROUP BY t1.c1 ORDER BY TotalSales DESC;"/> + <output name="output" file="sales_results.tsv"/> + </test> + + <test> + <repeat name="tables"> + <param name="table" ftype="tabular" value="customers.tsv"/> + <section name="input_opts"> + <repeat name="linefilters"> + <conditional name="filter"> + <param name="filter_type" value="skip"/> + <param name="skip_lines" value="1"/> + </conditional> + </repeat> + </section> + <section name="tbl_opts"> + <param name="col_names" value=",FirstName,LastName,,BirthDate,"/> + </section> + </repeat> + <param name="sqlquery" value="select FirstName,LastName,re_sub('^\d{2}(\d{2})-(\d\d)-(\d\d)','\3/\2/\1',BirthDate) as "DOB" from t1 WHERE re_search('[hp]er',c4)"/> + <output name="output" file="regex_results.tsv"/> + </test> + + <test> + <repeat name="tables"> + <param name="table" ftype="tabular" value="IEDB.tsv"/> + <section name="input_opts"> + <repeat name="linefilters"> + <conditional name="filter"> + <param name="filter_type" value="comment"/> + <param name="comment_char" value="35"/> + </conditional> + </repeat> + </section> + <section name="tbl_opts"> + <param name="table_name" value="iedb"/> + <param name="col_names" value="ID,allele,seq_num,start,end,length,peptide,method,percentile_rank,ann_ic50,ann_rank,smm_ic50,smm_rank,comblib_sidney2008_score,comblib_sidney2008_rank,netmhcpan_ic50,netmhcpan_rank"/> + </section> + </repeat> + <repeat name="tables"> + <param name="table" ftype="tabular" value="netMHC_summary.tsv"/> + <section name="input_opts"> 
+ <repeat name="linefilters"> + <conditional name="filter"> + <param name="filter_type" value="skip"/> + <param name="skip_lines" value="1"/> + </conditional> + </repeat> + </section> + <section name="tbl_opts"> + <param name="table_name" value="mhc_summary"/> + <param name="col_names" value="pos,peptide,logscore,affinity,Bind_Level,Protein,Allele"/> + </section> + </repeat> + <param name="sqlquery" value="select iedb.ID,iedb.peptide,iedb.start,iedb.end,iedb.percentile_rank,mhc_summary.logscore,mhc_summary.affinity,mhc_summary.Bind_Level from iedb left outer join mhc_summary on iedb.peptide = mhc_summary.peptide order by affinity,Bind_Level,percentile_rank"/> + <output name="output" file="query_results.tsv"/> + </test> + + <test> + <section name="add_to_database"> + <param name="withdb" ftype="sqlite" value="testdb.sqlite"/> + </section> + <repeat name="tables"> + <param name="table" ftype="tabular" value="pets.tsv"/> + <section name="input_opts"> + <repeat name="linefilters"> + <conditional name="filter"> + <param name="filter_type" value="regex"/> + <param name="regex_pattern" value="^\d+"/> + <param name="regex_action" value="include_find"/> + </conditional> + </repeat> + <repeat name="linefilters"> + <conditional name="filter"> + <param name="filter_type" value="comment"/> + <param name="comment_char" value="35"/> + </conditional> + </repeat> + <repeat name="linefilters"> + <conditional name="filter"> + <param name="filter_type" value="append_line_num"/> + </conditional> + </repeat> + <repeat name="linefilters"> + <conditional name="filter"> + <param name="filter_type" value="select_columns"/> + <param name="columns" value="7,2,3,4,1"/> + </conditional> + </repeat> + <repeat name="linefilters"> + <conditional name="filter"> + <param name="filter_type" value="replace"/> + <param name="column" value="c4"/> + <param name="regex_pattern" value="(\d+)/(\d+)/(\d+)"/> + <param name="regex_replace" value="19\3-\2-\1"/> + </conditional> + </repeat> + </section> + 
<section name="tbl_opts"> + <param name="table_name" value="people"/> + <param name="col_names" value="id,first,last,dob,pets"/> + </section> + </repeat> + <param name="sqlquery" value="SELECT people.id,first,last,pets,quote FROM people JOIN contacts ON people.first = contacts.first_name"/> + <output name="output" file="add_to_db_results.tsv"/> + </test> + + <test> + <repeat name="tables"> + <param name="table" ftype="tabular" value="pets.tsv"/> + <section name="input_opts"> + <repeat name="linefilters"> + <conditional name="filter"> + <param name="filter_type" value="regex"/> + <param name="regex_pattern" value="^\d+"/> + <param name="regex_action" value="include_find"/> + </conditional> + </repeat> + <repeat name="linefilters"> + <conditional name="filter"> + <param name="filter_type" value="comment"/> + <param name="comment_char" value="35"/> + </conditional> + </repeat> + <repeat name="linefilters"> + <conditional name="filter"> + <param name="filter_type" value="append_line_num"/> + </conditional> + </repeat> + <repeat name="linefilters"> + <conditional name="filter"> + <param name="filter_type" value="select_columns"/> + <param name="columns" value="7,2,3,4,1"/> + </conditional> + </repeat> + <repeat name="linefilters"> + <conditional name="filter"> + <param name="filter_type" value="replace"/> + <param name="column" value="c4"/> + <param name="regex_pattern" value="(\d+)/(\d+)/(\d+)"/> + <param name="regex_replace" value="19\3-\2-\1"/> + </conditional> + </repeat> + </section> + <section name="tbl_opts"> + <param name="table_name" value="people"/> + <param name="col_names" value="id,first,last,dob,pets"/> + </section> + </repeat> + <repeat name="tables"> + <param name="table" ftype="tabular" value="pets.tsv"/> + <section name="input_opts"> + <repeat name="linefilters"> + <conditional name="filter"> + <param name="filter_type" value="regex"/> + <param name="regex_pattern" value="^\d+"/> + <param name="regex_action" value="include_find"/> + </conditional> + 
</repeat> + <repeat name="linefilters"> + <conditional name="filter"> + <param name="filter_type" value="append_line_num"/> + </conditional> + </repeat> + <repeat name="linefilters"> + <conditional name="filter"> + <param name="filter_type" value="select_columns"/> + <param name="columns" value="c7,c5,c6"/> + </conditional> + </repeat> + <repeat name="linefilters"> + <conditional name="filter"> + <param name="filter_type" value="normalize"/> + <param name="columns" value="c2,c3"/> + <param name="separator" value=","/> + </conditional> + </repeat> + </section> + <section name="tbl_opts"> + <param name="table_name" value="pet"/> + <param name="col_names" value="id,name,animal"/> + </section> + </repeat> + <param name="sqlquery" value="SELECT people.id,first,last,dob,name,animal,pets FROM people JOIN pet ON people.id = pet.id WHERE animal = 'cat'"/> + <output name="output" file="pet_normalized_query_results.tsv"/> + </test> + + </tests> + <help><![CDATA[ +============= +Query Tabular +============= + +**Inputs** + + Loads tabular datasets into a SQLite_ data base. + + An existing SQLite_ data base can be used as input, and any selected tabular datasets will be added as new tables in that data base. + + +@LINEFILTERS_HELP@ + + +**Outputs** + + The results of a SQL query are output to the history as a tabular file. + + The SQLite_ data base can also be saved and output as a dataset in the history. 
+ + *(The* **SQLite to tabular** *tool can run additional queries on this database.)* + + +@QUERY_HELP@ + +@LINEFILTERS_HELP_EXAMPLE@ + + + Table name: pets + + Table columns: Pets,FirstName,LastName,Birthdate,PetNames,PetType,line_num,entry_num,row_num + + Query: SELECT * FROM pets + + Result: + + ====== ========== ======== ========== ========= ======== ========= ========== ======== + #Pets FirstName LastName BirthDate PetNames PetType line_num entry_num row_num + ====== ========== ======== ========== ========= ======== ========= ========== ======== + 2 Paula Brown 1978-05-24 Rex dog 3 1 1 + 2 Paula Brown 1978-05-24 Fluff cat 3 1 2 + 1 Steven Jones 1974-04-04 Allie cat 4 2 3 + 0 Jane Doe 1978-05-24 5 3 4 + 1 James Smith 1980-10-20 Spot 6 4 5 + ====== ========== ======== ========== ========= ======== ========= ========== ======== + + +**Normalizing by Line Filtering into 2 Tables** + +*Relational database operations work with single-valued column entries. +To apply relational operations to tabular files that contain fields with lists of values, +we need to "normalize" those fields, duplicating lines for each item in the list. +In this example we create 2 tables, one for single-valued fields and a second with list-valued fields normalized. 
+Because we add a line number first for each table, we can join the 2 tables on the line number column.* +https://en.wikipedia.org/wiki/First_normal_form + + *People Table* + + :: + + Filter 1 - by regex expression matching [include]: '^\d+' (include lines that start with a number) + Filter 2 - append a line number column: + Filter 3 - regex replace value in column[4]: '(\d+)/(\d+)/(\d+)' '19\3-\2-\1' (convert dates to sqlite format) + Filter 4 - select columns 7,2,3,4,1 + + Table: People + Columns: id,FirstName,LastName,DOB,Pets + + == ========= ======== ========== ==== + id FirstName LastName DOB Pets + == ========= ======== ========== ==== + 1 Paula Brown 1978-05-24 2 + 2 Steven Jones 1974-04-04 1 + 3 Jane Doe 1978-05-24 0 + 4 James Smith 1980-10-20 1 + == ========= ======== ========== ==== + + + *Pet Table* + + :: + + Filter 1 - by regex expression matching [include]: '^\d+' (include lines that start with a number) + Filter 2 - append a line number column: + Filter 3 - by regex expression matching [exclude]: '^0\t' (exclude lines with no pets) + Filter 4 - normalize list columns[5,6]: + Filter 5 - select columns 7,5,6 + + Table: Pet + Columns: id,PetName,PetType + + == ======== ======== + id PetName PetType + == ======== ======== + 1 Rex dog + 1 Fluff cat + 2 Allie cat + 4 Spot + == ======== ======== + + + Query: SELECT FirstName,LastName,PetName FROM People JOIN Pet ON People.id = Pet.id WHERE PetType = 'cat'; + + Result: + + ========= ======== ======== + FirstName LastName PetName + ========= ======== ======== + Paula Brown Fluff + Steven Jones Allie + ========= ======== ======== + + + ]]></help> +</tool>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/sqlite_to_tabular.py Tue Jul 18 09:07:07 2017 -0400 @@ -0,0 +1,60 @@ +#!/usr/bin/env python + +from __future__ import print_function + +import optparse +import os.path +import sys + +from query_db import describe_tables, get_connection, run_query + + +def __main__(): + # Parse Command Line + parser = optparse.OptionParser() + parser.add_option('-s', '--sqlitedb', dest='sqlitedb', default=None, + help='The SQLite Database') + parser.add_option('-q', '--query', dest='query', default=None, + help='SQL query') + parser.add_option('-Q', '--query_file', dest='query_file', default=None, + help='SQL query file') + parser.add_option('-n', '--no_header', dest='no_header', default=False, + action='store_true', + help='Include a column headers line') + parser.add_option('-o', '--output', dest='output', default=None, + help='Output file for query results') + (options, args) = parser.parse_args() + + # determine output destination + if options.output is not None: + try: + outputPath = os.path.abspath(options.output) + outputFile = open(outputPath, 'w') + except Exception as e: + exit('Error: %s' % (e)) + else: + outputFile = sys.stdout + + query = None + if options.query_file is not None: + with open(options.query_file, 'r') as fh: + query = fh.read() + elif options.query is not None: + query = options.query + + if query is None: + try: + describe_tables(get_connection(options.sqlitedb), outputFile) + except Exception as e: + exit('Error: %s' % (e)) + exit(0) + else: + try: + run_query(get_connection(options.sqlitedb), query, outputFile, + no_header=options.no_header) + except Exception as e: + exit('Error: %s' % (e)) + + +if __name__ == "__main__": + __main__()
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/IEDB.tsv Tue Jul 18 09:07:07 2017 -0400 @@ -0,0 +1,17 @@ +#ID allele seq_num start end length peptide method percentile_rank ann_ic50 ann_rank smm_ic50 smm_rank comblib_sidney2008_score comblib_sidney2008_rank netmhcpan_ic50 netmhcpan_rank +PPAP2C HLA-A*02:01 1 3 11 9 GMYCMVFLV Consensus (ann/smm/comblib_sidney2008) 0.2 4 0.2 3.77 0.2 7.1e-06 0.5 - - +PPAP2C HLA-A*23:01 1 1 9 9 SFGMYCMVF Consensus (ann/smm) 0.5 67 0.5 137.54 0.5 - - - - +PPAP2C HLA-A*23:01 1 4 12 9 MYCMVFLVK Consensus (ann/smm) 0.65 146 0.7 160.11 0.6 - - - - +PPAP2C HLA-A*02:01 1 2 10 9 FGMYCMVFL Consensus (ann/smm/comblib_sidney2008) 2.3 222 3.1 150.01 2.3 2.14e-05 1.3 - - +PPAP2C HLA-A*23:01 1 3 11 9 GMYCMVFLV Consensus (ann/smm) 4.95 3256 4 2706.64 5.9 - - - - +PPAP2C HLA-A*23:01 1 2 10 9 FGMYCMVFL Consensus (ann/smm) 6.55 4423 4.9 4144.10 8.2 - - - - +PPAP2C HLA-A*02:01 1 1 9 9 SFGMYCMVF Consensus (ann/smm/comblib_sidney2008) 45 24390 45 44989.38 39 0.01 91 - - +PPAP2C HLA-A*02:01 1 4 12 9 MYCMVFLVK Consensus (ann/smm/comblib_sidney2008) 54 23399 41 157801.09 54 0.01 86 - - +ADAMTSL1 HLA-A*02:01 1 1 9 9 SLDMCISGL Consensus (ann/smm/comblib_sidney2008) 1 26 1 51.65 0.9 3.02e-05 1.7 - - +ADAMTSL1 HLA-A*23:01 1 4 12 9 MCISGLCQL Consensus (ann/smm) 6.65 5781 5.9 3626.02 7.4 - - - - +ADAMTSL1 HLA-A*02:01 1 4 12 9 MCISGLCQL Consensus (ann/smm/comblib_sidney2008) 14 1823 6.5 2612.82 14 0.00056 24 - - +ADAMTSL1 HLA-A*23:01 1 1 9 9 SLDMCISGL Consensus (ann/smm) 30.5 27179 34 24684.82 27 - - - - +ADAMTSL1 HLA-A*02:01 1 2 10 9 LDMCISGLC Consensus (ann/smm/comblib_sidney2008) 42 23677 42 53716.78 41 0.01 71 - - +ADAMTSL1 HLA-A*23:01 1 3 11 9 DMCISGLCQ Consensus (ann/smm) 64.5 34451 73 118148.99 56 - - - - +ADAMTSL1 HLA-A*23:01 1 2 10 9 LDMCISGLC Consensus (ann/smm) 76.0 33222 62 665932.18 90 - - - - +ADAMTSL1 HLA-A*02:01 1 3 11 9 DMCISGLCQ Consensus (ann/smm/comblib_sidney2008) 97 31630 98 639896.89 71 0.03 97 - -
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/add_to_db_results.tsv Tue Jul 18 09:07:07 2017 -0400 @@ -0,0 +1,3 @@ +#id first last pets quote +1 Paula Brown 2 Time flies like and arrow. Fruit flies like a banana. +2 Steven Jones 1 I would have wrtten less if I had more time
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/customers.tsv Tue Jul 18 09:07:07 2017 -0400 @@ -0,0 +1,5 @@ +#CustomerID FirstName LastName Email DOB Phone +1 John Smith John.Smith@yahoo.com 1968-02-04 626 222-2222 +2 Steven Goldfish goldfish@fishhere.net 1974-04-04 323 455-4545 +3 Paula Brown pb@herowndomain.org 1978-05-24 416 323-3232 +4 James Smith jim@supergig.co.uk 1980-10-20 416 323-8888
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/filtered_people_results.tsv Tue Jul 18 09:07:07 2017 -0400 @@ -0,0 +1,4 @@ +1 Paula Brown 1978-05-24 2 +2 Steven Jones 1974-04-04 1 +3 Jane Doe 1978-05-24 0 +4 James Smith 1980-10-20 1
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/filtered_pets_results.tsv Tue Jul 18 09:07:07 2017 -0400 @@ -0,0 +1,4 @@ +1 Rex dog +1 Fluff cat +2 Allie cat +4 Spot
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/netMHC_summary.tsv Tue Jul 18 09:07:07 2017 -0400 @@ -0,0 +1,9 @@ +#pos peptide logscore affinity(nM) Bind Level Protein Name Allele +2 GMYCMVFLV 0.858 4 SB PPAP2C HLA-A02:01 +1 FGMYCMVFL 0.501 222 WB PPAP2C HLA-A02:01 +3 MYCMVFLVK 0.070 23399 PPAP2C HLA-A02:01 +0 SFGMYCMVF 0.066 24390 PPAP2C HLA-A02:01 +0 SLDMCISGL 0.698 26 SB ADAMTSL1 HLA-A02:01 +3 MCISGLCQL 0.306 1823 ADAMTSL1 HLA-A02:01 +1 LDMCISGLC 0.069 23677 ADAMTSL1 HLA-A02:01 +2 DMCISGLCQ 0.042 31630 ADAMTSL1 HLA-A02:01
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/pet_normalized_query_results.tsv Tue Jul 18 09:07:07 2017 -0400 @@ -0,0 +1,3 @@ +#id first last dob name animal pets +1 Paula Brown 1978-05-24 Fluff cat 2 +2 Steven Jones 1974-04-04 Allie cat 1
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/pets.tsv Tue Jul 18 09:07:07 2017 -0400 @@ -0,0 +1,7 @@ +#People with pets +Pets FirstName LastName DOB PetNames PetType +2 Paula Brown 24/05/78 Rex,Fluff dog,cat +1 Steven Jones 04/04/74 Allie cat +0 Jane Doe 24/05/78 +1 James Smith 20/10/80 Spot +
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/query_results.tsv Tue Jul 18 09:07:07 2017 -0400 @@ -0,0 +1,17 @@ +#ID peptide start end percentile_rank logscore affinity Bind_Level +PPAP2C GMYCMVFLV 3 11 0.2 0.858 4 SB +PPAP2C GMYCMVFLV 3 11 4.95 0.858 4 SB +ADAMTSL1 SLDMCISGL 1 9 1.0 0.698 26 SB +ADAMTSL1 SLDMCISGL 1 9 30.5 0.698 26 SB +PPAP2C FGMYCMVFL 2 10 2.3 0.501 222 WB +PPAP2C FGMYCMVFL 2 10 6.55 0.501 222 WB +ADAMTSL1 MCISGLCQL 4 12 6.65 0.306 1823 +ADAMTSL1 MCISGLCQL 4 12 14.0 0.306 1823 +PPAP2C MYCMVFLVK 4 12 0.65 0.07 23399 +PPAP2C MYCMVFLVK 4 12 54.0 0.07 23399 +ADAMTSL1 LDMCISGLC 2 10 42.0 0.069 23677 +ADAMTSL1 LDMCISGLC 2 10 76.0 0.069 23677 +PPAP2C SFGMYCMVF 1 9 0.5 0.066 24390 +PPAP2C SFGMYCMVF 1 9 45.0 0.066 24390 +ADAMTSL1 DMCISGLCQ 3 11 64.5 0.042 31630 +ADAMTSL1 DMCISGLCQ 3 11 97.0 0.042 31630
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/regex_results.tsv Tue Jul 18 09:07:07 2017 -0400 @@ -0,0 +1,4 @@ +#FirstName LastName DOB +Steven Goldfish 04/04/74 +Paula Brown 24/05/78 +James Smith 20/10/80
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/sales.tsv Tue Jul 18 09:07:07 2017 -0400 @@ -0,0 +1,6 @@ +#CustomerID Date SaleAmount +2 2004-05-06 100.22 +1 2004-05-07 99.95 +3 2004-05-07 122.95 +3 2004-05-13 100.00 +4 2004-05-22 555.55
