Next changeset: 1:8a33b442ecd9 (2017-08-18)

Commit message:
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/query_tabular commit 74915fc9cee746bbce1c4b507e13231259de177d

added:
filter_tabular.py
filters.py
load_db.py
macros.xml
query_db.py
query_tabular.py
query_tabular.xml
sqlite_to_tabular.py
test-data/IEDB.tsv
test-data/add_to_db_results.tsv
test-data/customers.tsv
test-data/filtered_people_results.tsv
test-data/filtered_pets_results.tsv
test-data/netMHC_summary.tsv
test-data/pet_normalized_query_results.tsv
test-data/pets.tsv
test-data/query_results.tsv
test-data/regex_results.tsv
test-data/sales.tsv
test-data/sales_results.tsv
test-data/testdb.sqlite
diff -r 000000000000 -r 3708ff0198b7 filter_tabular.py
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/filter_tabular.py	Tue Jul 18 09:07:07 2017 -0400
@@ -0,0 +1,68 @@
+#!/usr/bin/env python
+
+from __future__ import print_function
+
+import json
+import optparse
+import os.path
+import sys
+
+from filters import filter_file
+
+
+def __main__():
+    # Parse Command Line
+    parser = optparse.OptionParser()
+    parser.add_option('-i', '--input', dest='input', default=None,
+                      help='Input file for filtering')
+    parser.add_option('-j', '--jsonfile', dest='jsonfile', default=None,
+                      help='JSON array of filter specifications')
+    parser.add_option('-o', '--output', dest='output', default=None,
+                      help='Output file for query results')
+    parser.add_option('-v', '--verbose', dest='verbose', default=False,
+                      action='store_true',
+                      help='verbose')
+    (options, args) = parser.parse_args()
+
+    if options.input is not None:
+        try:
+            inputPath = os.path.abspath(options.input)
+            inputFile = open(inputPath, 'r')
+        except Exception as e:
+            exit('Error: %s' % (e))
+    else:
+        inputFile = sys.stdin
+
+    if options.output is not None:
+        try:
+            outputPath = os.path.abspath(options.output)
+            outputFile = open(outputPath, 'w')
+        except Exception as e:
+            exit('Error: %s' % (e))
+    else:
+        outputFile = sys.stdout
+
+    filters = None
+    if options.jsonfile:
+        try:
+            with open(options.jsonfile) as fh:
+                filters = json.load(fh)
+        except Exception as e:
+            exit('Error: %s' % (e))
+
+    if options.verbose and filters:
+        for f in filters:
+            print('%s %s' % (f['filter'],
+                  ', '.join(
+                      ['%s: %s' % (k, f[k])
+                       for k in set(f.keys()) - set(['filter'])])),
+                  file=sys.stdout)
+
+    try:
+        filter_file(inputFile, outputFile, filters=filters)
+    except Exception as e:
+        exit('Error: %s' % (e))
+
+
+if __name__ == "__main__":
+    __main__()
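
Editor's note, for orientation (not part of this changeset): a minimal sketch of the filter specification that -j/--jsonfile expects — a JSON array of dicts whose keys match the handlers in filters.py below. The file names here are placeholders.

    import json

    # Drop comment lines, then keep only columns 3, 1, 2 (in that order).
    filters = [
        {'filter': 'regex', 'pattern': '^#', 'action': 'exclude_match'},
        {'filter': 'select_columns', 'columns': [3, 1, 2]},
    ]
    with open('filters.json', 'w') as fh:
        json.dump(filters, fh)
    # then: python filter_tabular.py -i in.tsv -j filters.json -o out.tsv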
diff -r 000000000000 -r 3708ff0198b7 filters.py
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/filters.py	Tue Jul 18 09:07:07 2017 -0400
@@ -0,0 +1,156 @@
+#!/usr/bin/env python
+
+from __future__ import print_function
+
+import re
+import sys
+
+
+class LineFilter(object):
+    def __init__(self, source, filter_dict):
+        self.source = source
+        self.filter_dict = filter_dict
+        self.func = lambda i, l: l.rstrip('\r\n') if l else None
+        self.src_lines = []
+        self.src_line_cnt = 0
+        if not filter_dict:
+            return
+        if filter_dict['filter'] == 'regex':
+            rgx = re.compile(filter_dict['pattern'])
+            if filter_dict['action'] == 'exclude_match':
+                self.func = lambda i, l: l if not rgx.match(l) else None
+            elif filter_dict['action'] == 'include_match':
+                self.func = lambda i, l: l if rgx.match(l) else None
+            elif filter_dict['action'] == 'exclude_find':
+                self.func = lambda i, l: l if not rgx.search(l) else None
+            elif filter_dict['action'] == 'include_find':
+                self.func = lambda i, l: l if rgx.search(l) else None
+        elif filter_dict['filter'] == 'select_columns':
+            cols = [int(c) - 1 for c in filter_dict['columns']]
+            self.func = lambda i, l: self.select_columns(l, cols)
+        elif filter_dict['filter'] == 'replace':
+            p = filter_dict['pattern']
+            r = filter_dict['replace']
+            c = int(filter_dict['column']) - 1
+            self.func = lambda i, l: '\t'.join(
+                [x if j != c else re.sub(p, r, x) for j, x in enumerate(l.split('\t'))])
+        elif filter_dict['filter'] == 'prepend_line_num':
+            self.func = lambda i, l: '%d\t%s' % (i, l)
+        elif filter_dict['filter'] == 'append_line_num':
+            self.func = lambda i, l: '%s\t%d' % (l.rstrip('\r\n'), i)
+        elif filter_dict['filter'] == 'prepend_text':
+            s = filter_dict['column_text']
+            self.func = lambda i, l: '%s\t%s' % (s, l)
+        elif filter_dict['filter'] == 'append_text':
+            s = filter_dict['column_text']
+            self.func = lambda i, l: '%s\t%s' % (l.rstrip('\r\n'), s)
+        elif filter_dict['filter'] == 'skip':
+            cnt = filter_dict['count']
+            self.func = lambda i, l: l if i > cnt else None
+        elif filter_dict['filter'] == 'normalize':
+            cols = [int(c) - 1 for c in filter_dict['columns']]
+            sep = filter_dict['separator']
+            self.func = lambda i, l: self.normalize(l, cols, sep)
+
+    def __iter__(self):
+        return self
+
+    def __next__(self):
+        if not self.src_lines:
+            self.get_lines()
+        if self.src_lines:
+            return self.src_lines.pop(0)
+        raise StopIteration
+
+    next = __next__
+
+    def select_columns(self, line, cols):
+        fields = line.split('\t')
+        return '\t'.join([fields[x] for x in cols])
+
+    def normalize(self, line, split_cols, sep):
+        lines = []
+        fields = line.rstrip('\r\n').split('\t')
+        split_fields = dict()
+        cnt = 0
+        for c in split_cols:
+            if c < len(fields):
+                split_fields[c] = fields[c].split(sep)
+                cnt = max(cnt, len(split_fields[c]))
+        if cnt == 0:
+            lines.append('\t'.join(fields))
+        else:
+            for n in range(0, cnt):
+                flds = [x if c not in split_cols else split_fields[c][n]
+                        if n < len(split_fields[c])
+                        else '' for (c, x) in enumerate(fields)]
+                lines.append('\t'.join(flds))
+        return lines
+
+    def get_lines(self):
+        for i, next_line in enumerate(self.source):
+            self.src_line_cnt += 1
+            line = self.func(self.src_line_cnt, next_line)
+            if line:
+                if isinstance(line, list):
+                    self.src_lines.extend(line)
+                else:
+                    self.src_lines.append(line)
+                return
+
+
+class TabularReader:
+    """
+    Tabular file iterator. Returns a list
+    """
+    def __init__(self, input_file, skip=0, comment_char=None, col_idx=None,
+                 filters=None):
+        self.skip = skip
+        self.comment_char = comment_char
+        self.col_idx = col_idx
+        self.filters = filters
+        self.tsv_file = \
+            input_file if hasattr(input_file, 'readline') else open(input_file)
+        if skip and skip > 0:
+            for i in range(skip):
+                if not self.tsv_file.readline():
+                    break
+        source = LineFilter(self.tsv_file, None)
+        if comment_char:
+            source = LineFilter(source,
+                                {"filter": "regex", "pattern": comment_char,
+                                 "action": "exclude_match"})
+        if filters:
+            for f in filters:
+                source = LineFilter(source, f)
+        self.source = source
+
+    def __iter__(self):
+        return self
+
+    def __next__(self):
+        ''' Iteration '''
+        for i, line in enumerate(self.source):
+            fields = line.rstrip('\r\n').split('\t')
+            if self.col_idx:
+                fields = [fields[i] for i in self.col_idx]
+            return fields
+        raise StopIteration
+
+    next = __next__
+
+
+def filter_file(input_file, output, skip=0, comment_char='#', filters=None):
+    data_lines = 0
+    try:
+        tr = TabularReader(input_file, skip=skip, comment_char=comment_char,
+                           filters=filters)
+        for linenum, fields in enumerate(tr):
+            data_lines += 1
+            try:
+                output.write('%s\n' % '\t'.join(fields))
+            except Exception as e:
+                print('Failed at line: %d err: %s' % (linenum, e),
+                      file=sys.stderr)
+    except Exception as e:
+        exit('Error: %s' % (e))
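
Editor's note, a quick sketch (not from the commit) of the central abstraction: LineFilter wraps any line iterator, so TabularReader stacks one LineFilter per filter dict. Here the 'normalize' filter fans a list-valued column out into one row per item; the sample data is invented.

    import io

    from filters import TabularReader

    tsv = io.StringIO('Name\tPetNames\nPaula\tRex,Fluff\nJane\t\n')
    reader = TabularReader(tsv, skip=1,
                           filters=[{'filter': 'normalize', 'columns': [2],
                                     'separator': ','}])
    for fields in reader:
        print(fields)
    # ['Paula', 'Rex'] then ['Paula', 'Fluff'] then ['Jane', '']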
diff -r 000000000000 -r 3708ff0198b7 load_db.py
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/load_db.py	Tue Jul 18 09:07:07 2017 -0400
@@ -0,0 +1,135 @@
+#!/usr/bin/env python
+
+from __future__ import print_function
+
+import sys
+
+from filters import TabularReader
+
+
+def getValueType(val):
+    if val or 0. == val:
+        try:
+            int(val)
+            return 'INTEGER'
+        except:
+            try:
+                float(val)
+                return 'REAL'
+            except:
+                return 'TEXT'
+    return None
+
+
+def get_column_def(file_path, table_name, skip=0, comment_char='#',
+                   column_names=None, max_lines=100, load_named_columns=False,
+                   filters=None):
+    col_pref = ['TEXT', 'REAL', 'INTEGER', None]
+    col_types = []
+    col_idx = None
+    try:
+        tr = TabularReader(file_path, skip=skip, comment_char=comment_char,
+                           col_idx=None, filters=filters)
+        for linenum, fields in enumerate(tr):
+            if linenum > max_lines:
+                break
+            try:
+                while len(col_types) < len(fields):
+                    col_types.append(None)
+                for i, val in enumerate(fields):
+                    colType = getValueType(val)
+                    if col_pref.index(colType) < col_pref.index(col_types[i]):
+                        col_types[i] = colType
+            except Exception as e:
+                print('Failed at line: %d err: %s' % (linenum, e),
+                      file=sys.stderr)
+    except Exception as e:
+        print('Failed: %s' % (e), file=sys.stderr)
+    for i, col_type in enumerate(col_types):
+        if not col_type:
+            col_types[i] = 'TEXT'
+    if column_names:
+        col_names = []
+        if load_named_columns:
+            col_idx = []
+            for i, cname in enumerate(
+                    [cn.strip() for cn in column_names.split(',')]):
+                if cname != '':
+                    col_idx.append(i)
+                    col_names.append(cname)
+            col_types = [col_types[i] for i in col_idx]
+        else:
+            col_names = ['c%d' % i for i in range(1, len(col_types) + 1)]
+            for i, cname in enumerate(
+                    [cn.strip() for cn in column_names.split(',')]):
+                if cname and i < len(col_names):
+                    col_names[i] = cname
+    else:
+        col_names = ['c%d' % i for i in range(1, len(col_types) + 1)]
+    col_def = []
+    for i, col_name in enumerate(col_names):
+        col_def.append('%s %s' % (col_names[i], col_types[i]))
+    return col_names, col_types, col_def, col_idx
+
+
+def create_table(conn, file_path, table_name, skip=0, comment_char='#',
+                 pkey_autoincr=None, column_names=None,
+                 load_named_columns=False, filters=None,
+                 unique_indexes=[], indexes=[]):
+    col_names, col_types, col_def, col_idx = \
+        get_column_def(file_path, table_name, skip=skip,
+                       comment_char=comment_char, column_names=column_names,
+                       load_named_columns=load_named_columns, filters=filters)
+    col_func = [float if t == 'REAL' else int
+                if t == 'INTEGER' else str for t in col_types]
+    table_def = 'CREATE TABLE %s (\n    %s%s\n);' % (
+        table_name,
+        '%s INTEGER PRIMARY KEY AUTOINCREMENT,' %
+        pkey_autoincr if pkey_autoincr else '',
+        ', \n    '.join(col_def))
+    # print >> sys.stdout, table_def
+    insert_stmt = 'INSERT INTO %s(%s) VALUES(%s)' % (
+        table_name, ','.join(col_names),
+        ','.join(["?" for x in col_names]))
+    # print >> sys.stdout, insert_stmt
+    data_lines = 0
+    try:
+        c = conn.cursor()
+        c.execute(table_def)
+        conn.commit()
+        c.close()
+        for i, index in enumerate(unique_indexes):
+            index_name = 'idx_uniq_%s_%d' % (table_name, i)
+            index_columns = index.split(',')
+            create_index(conn, table_name, index_name, index_columns,
+                         unique=True)
+        for i, index in enumerate(indexes):
+            index_name = 'idx_%s_%d' % (table_name, i)
+            index_columns = index.split(',')
+            create_index(conn, table_name, index_name, index_columns)
+        c = conn.cursor()
+        tr = TabularReader(file_path, skip=skip, comment_char=comment_char,
+                           col_idx=col_idx, filters=filters)
+        for linenum, fields in enumerate(tr):
+            data_lines += 1
+            try:
+                vals = [col_func[i](x)
+                        if x else None for i, x in enumerate(fields)]
+                c.execute(insert_stmt, vals)
+            except Exception as e:
+                print('Failed at line: %d err: %s' % (linenum, e),
+                      file=sys.stderr)
+        conn.commit()
+        c.close()
+    except Exception as e:
+        exit('Error: %s' % (e))
+
+
+def create_index(conn, table_name, index_name, index_columns, unique=False):
+    index_def = "CREATE %s INDEX %s on %s(%s)" % (
+        'UNIQUE' if unique else '', index_name,
+        table_name, ','.join(index_columns))
+    c = conn.cursor()
+    c.execute(index_def)
+    conn.commit()
+    c.close()
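
Editor's note, a usage sketch (not in the commit): load a tab-separated file into SQLite with an inferred schema. The path people.tsv is a placeholder; the rest is the module's own API.

    import sqlite3

    from load_db import create_table

    conn = sqlite3.connect(':memory:')
    # Types come from getValueType() sampling the first ~100 data lines:
    # INTEGER if every value parses as int, else REAL, else TEXT.
    create_table(conn, 'people.tsv', 'people', comment_char='#',
                 column_names='id,first,last', indexes=['last,first'])
    # indexes=['last,first'] builds one composite index named idx_people_0.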
diff -r 000000000000 -r 3708ff0198b7 macros.xml
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/macros.xml	Tue Jul 18 09:07:07 2017 -0400
@@ -0,0 +1,383 @@
+<macros>
+    <token name="@LINEFILTERS@">
+<![CDATA[
+    ## set linefilters to the
+    #set $input_filters = []
+    #for $fi in $linefilters:
+        #if $fi.filter.filter_type == 'skip':
+            #set $skip_lines = None
+            #if str($fi.filter.skip_lines) != '':
+                #set $skip_lines = int($fi.filter.skip_lines)
+            #elif $tbl.table.metadata.comment_lines and $tbl.table.metadata.comment_lines > 0:
+                #set $skip_lines = int($tbl.table.metadata.comment_lines)
+            #end if
+            #if $skip_lines is not None:
+                #set $filter_dict = dict()
+                #set $filter_dict['filter'] = str($fi.filter.filter_type)
+                #set $filter_dict['count'] = $skip_lines
+                #silent $input_filters.append($filter_dict)
+            #end if
+        #elif $fi.filter.filter_type == 'comment':
+            #set $filter_dict = dict()
+            #set $filter_dict['filter'] = 'regex'
+            #set $filter_dict['pattern'] = '^(%s).*$' % '|'.join([chr(int(x)).replace('|','[|]') for x in (str($fi.filter.comment_char)).split(',')])
+            #set $filter_dict['action'] = 'exclude_match'
+            #silent $input_filters.append($filter_dict)
+        #elif $fi.filter.filter_type == 'regex':
+            #set $filter_dict = dict()
+            #set $filter_dict['filter'] = str($fi.filter.filter_type)
+            #set $filter_dict['pattern'] = str($fi.filter.regex_pattern)
+            #set $filter_dict['action'] = str($fi.filter.regex_action)
+            #silent $input_filters.append($filter_dict)
+        #elif $fi.filter.filter_type == 'select_columns':
+            #set $filter_dict = dict()
+            #set $filter_dict['filter'] = str($fi.filter.filter_type)
+            #set $filter_dict['columns'] = [int(str($ci).replace('c','')) for $ci in str($fi.filter.columns).split(',')]
+            #silent $input_filters.append($filter_dict)
+        #elif $fi.filter.filter_type == 'replace':
+            #set $filter_dict = dict()
+            #set $filter_dict['filter'] = str($fi.filter.filter_type)
+            #set $filter_dict['column'] = int(str($fi.filter.column).replace('c',''))
+            #set $filter_dict['pattern'] = str($fi.filter.regex_pattern)
+            #set $filter_dict['replace'] = str($fi.filter.regex_replace)
+            #silent $input_filters.append($filter_dict)
+        #elif str($fi.filter.filter_type).endswith('pend_line_num'):
+            #set $filter_dict = dict()
+            #set $filter_dict['filter'] = str($fi.filter.filter_type)
+            #silent $input_filters.append($filter_dict)
+        #elif str($fi.filter.filter_type).endswith('pend_text'):
+            #set $filter_dict = dict()
+            #set $filter_dict['filter'] = str($fi.filter.filter_type)
+            #set $filter_dict['column_text'] = str($fi.filter.column_text)
+            #silent $input_filters.append($filter_dict)
+        #elif $fi.filter.filter_type == 'normalize':
+            #set $filter_dict = dict()
+            #set $filter_dict['filter'] = str($fi.filter.filter_type)
+            #set $filter_dict['columns'] = [int(str($ci).replace('c','')) for $ci in str($fi.filter.columns).split(',')]
+            #set $filter_dict['separator'] = str($fi.filter.separator)
+            #silent $input_filters.append($filter_dict)
+        #end if
+    #end for
+]]>
+    </token>
+    <xml name="macro_line_filters">
+        <repeat name="linefilters" title="Filter Tabular Input Lines">
+            <conditional name="filter">
+                <param name="filter_type" type="select" label="Filter By">
+                    <option value="skip">skip leading lines</option>
+                    <option value="comment">comment char</option>
+                    <option value="regex">by regex expression matching</option>
+                    <option value="select_columns">select columns</option>
+                    <option value="replace">regex replace value in column</option>
+                    <option value="prepend_line_num">prepend a line number column</option>
+                    <option value="append_line_num">append a line number column</option>
[... middle of the file elided by the changeset viewer ...]
+    =========== ========== ========== ===================== ========== ============
+    #CustomerID FirstName  LastName   Email                 DOB        Phone
+    =========== ========== ========== ===================== ========== ============
+    1           John       Smith      John.Smith@yahoo.com  1968-02-04 626 222-2222
+    2           Steven     Goldfish   goldfish@fishhere.net 1974-04-04 323 455-4545
+    3           Paula      Brown      pb@herowndomain.org   1978-05-24 416 323-3232
+    4           James      Smith      jim@supergig.co.uk    1980-10-20 416 323-8888
+    =========== ========== ========== ===================== ========== ============
+
+    Dataset *sales*
+
+    Table name: "sales"
+
+    Column names: "CustomerID,Date,SaleAmount"
+
+    ============= ============ ============
+    #CustomerID   Date         SaleAmount
+    ============= ============ ============
+    2             2004-05-06   100.22
+    1             2004-05-07   99.95
+    3             2004-05-07   122.95
+    3             2004-05-13   100.00
+    4             2004-05-22   555.55
+    ============= ============ ============
+
+    The query
+
+    ::
+
+        SELECT FirstName,LastName,sum(SaleAmount) as "TotalSales"
+        FROM customers join sales on customers.CustomerID = sales.CustomerID
+        GROUP BY customers.CustomerID ORDER BY TotalSales DESC;
+
+    Produces this tabular output:
+
+    ========== ======== ==========
+    #FirstName LastName TotalSales
+    ========== ======== ==========
+    James      Smith    555.55
+    Paula      Brown    222.95
+    Steven     Goldfish 100.22
+    John       Smith    99.95
+    ========== ======== ==========
+
+    If the optional Table name and Column names inputs are not used, the query would be:
+
+    ::
+
+        SELECT t1.c2 as "FirstName", t1.c3 as "LastName", sum(t2.c3) as "TotalSales"
+        FROM t1 join t2 on t1.c1 = t2.c1
+        GROUP BY t1.c1 ORDER BY TotalSales DESC;
+
+    You can selectively name columns, e.g. on the customers input you could just name columns 2, 3, and 5:
+
+    Column names: ,FirstName,LastName,,BirthDate
+
+    Results in the following database table:
+
+    =========== ========== ========== ===================== ========== ============
+    #c1         FirstName  LastName   c4                    BirthDate  c6
+    =========== ========== ========== ===================== ========== ============
+    1           John       Smith      John.Smith@yahoo.com  1968-02-04 626 222-2222
+    2           Steven     Goldfish   goldfish@fishhere.net 1974-04-04 323 455-4545
+    3           Paula      Brown      pb@herowndomain.org   1978-05-24 416 323-3232
+    4           James      Smith      jim@supergig.co.uk    1980-10-20 416 323-8888
+    =========== ========== ========== ===================== ========== ============
+
+    Regular_expression_ functions are included for:
+
+    ::
+
+        matching: re_match('pattern',column)
+
+        SELECT t1.FirstName, t1.LastName
+        FROM t1
+        WHERE re_match('^.*\.(net|org)$',c4)
+
+    Results:
+
+    =========== ==========
+    #FirstName  LastName
+    =========== ==========
+    Steven      Goldfish
+    Paula       Brown
+    =========== ==========
+
+    ::
+
+        searching: re_search('pattern',column)
+        substituting: re_sub('pattern','replacement',column)
+
+        SELECT t1.FirstName, t1.LastName, re_sub('^\d{2}(\d{2})-(\d\d)-(\d\d)','\3/\2/\1',BirthDate) as "DOB"
+        FROM t1
+        WHERE re_search('[hp]er',c4)
+
+    Results:
+
+    =========== ========== ==========
+    #FirstName  LastName   DOB
+    =========== ========== ==========
+    Steven      Goldfish   04/04/74
+    Paula       Brown      24/05/78
+    James       Smith      20/10/80
+    =========== ========== ==========
+
+.. _Regular_expression: https://docs.python.org/release/2.7/library/re.html
+.. _SQLite: http://www.sqlite.org/index.html
+.. _SQLite_functions: http://www.sqlite.org/docs.html
+
+]]>
+    </token>
+
+</macros>
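
Editor's note on a non-obvious detail in the @LINEFILTERS@ token above: the 'comment' branch builds its regex from character *codes*, as the chr(int(x)) implies, so the Galaxy form apparently passes comment characters as comma-separated ASCII codes. A sketch (assuming that convention) of the filter dict produced for '#' (code 35):

    comment_char = '35'  # assumed form value for '#'
    pattern = '^(%s).*$' % '|'.join(
        [chr(int(x)).replace('|', '[|]') for x in comment_char.split(',')])
    filter_dict = {'filter': 'regex', 'pattern': pattern,
                   'action': 'exclude_match'}
    print(filter_dict)  # {'filter': 'regex', 'pattern': '^(#).*$', ...}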
diff -r 000000000000 -r 3708ff0198b7 query_db.py
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/query_db.py	Tue Jul 18 09:07:07 2017 -0400
@@ -0,0 +1,67 @@
+#!/usr/bin/env python
+
+from __future__ import print_function
+
+import re
+import sqlite3 as sqlite
+import sys
+
+
+TABLE_QUERY = \
+    """
+    SELECT name, sql
+    FROM sqlite_master
+    WHERE type='table'
+    ORDER BY name
+    """
+
+
+def regex_match(expr, item):
+    return re.match(expr, item) is not None
+
+
+def regex_search(expr, item):
+    return re.search(expr, item) is not None
+
+
+def regex_sub(expr, replace, item):
+    return re.sub(expr, replace, item)
+
+
+def get_connection(sqlitedb_path, addfunctions=True):
+    conn = sqlite.connect(sqlitedb_path)
+    if addfunctions:
+        conn.create_function("re_match", 2, regex_match)
+        conn.create_function("re_search", 2, regex_search)
+        conn.create_function("re_sub", 3, regex_sub)
+    return conn
+
+
+def describe_tables(conn, outputFile):
+    try:
+        c = conn.cursor()
+        tables_query = TABLE_QUERY
+        rslt = c.execute(tables_query).fetchall()
+        for table, sql in rslt:
+            print("Table %s:" % table, file=outputFile)
+            try:
+                col_query = 'SELECT * FROM %s LIMIT 0' % table
+                cur = conn.cursor().execute(col_query)
+                cols = [col[0] for col in cur.description]
+                print("    Columns: %s" % cols, file=outputFile)
+            except Exception as exc:
+                print("Warning: %s" % exc, file=sys.stderr)
+    except Exception as e:
+        exit('Error: %s' % (e))
+    exit(0)
+
+
+def run_query(conn, query, outputFile, no_header=False):
+    cur = conn.cursor()
+    results = cur.execute(query)
+    if not no_header:
+        outputFile.write("#%s\n" % '\t'.join(
+            [str(col[0]) for col in cur.description]))
+    for i, row in enumerate(results):
+        outputFile.write("%s\n" % '\t'.join(
+            [str(val) if val is not None else '' for val in row]))
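
Editor's note: the re_* functions are plain Python callables registered on each connection via sqlite3's create_function, so they can be used anywhere in a query. A small self-contained demonstration (table and data invented, not from the test data):

    from query_db import get_connection

    conn = get_connection(':memory:')
    conn.execute('CREATE TABLE t1 (email TEXT)')
    conn.execute("INSERT INTO t1 VALUES ('goldfish@fishhere.net')")
    for row in conn.execute(
            "SELECT re_sub('@.*$', '', email) FROM t1 "
            "WHERE re_search('net$', email)"):
        print(row)  # ('goldfish',)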
diff -r 000000000000 -r 3708ff0198b7 query_tabular.py
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/query_tabular.py	Tue Jul 18 09:07:07 2017 -0400
@@ -0,0 +1,137 @@
+#!/usr/bin/env python
+
+from __future__ import print_function
+
+import json
+import optparse
+import os.path
+import sys
+
+from load_db import create_table
+
+from query_db import describe_tables, get_connection, run_query
+
+
+"""
+JSON config:
+{ tables : [
+    { file_path : '/home/galaxy/dataset_101.dat',
+      table_name : 't1',
+      column_names : ['c1','c2','c3'],
+      pkey_autoincr : 'id'
+      comment_lines : 1
+      unique: ['c1'],
+      index: ['c2', 'c3']
+    },
+    { file_path : '/home/galaxy/dataset_102.dat',
+      table_name : 'gff',
+      column_names : ['seqname',,'date','start','end']
+      comment_lines : 1
+      load_named_columns : True
+      filters : [{'filter': 'regex', 'pattern': '#peptide',
+                  'action': 'exclude_match'},
+                 {'filter': 'replace', 'column': 3,
+                  'replace': 'gi[|]', 'pattern': ''}]
+    },
+    { file_path : '/home/galaxy/dataset_103.dat',
+      table_name : 'test',
+      column_names : ['c1', 'c2', 'c3']
+    }
+  ]
+}
+"""
+
+
+def __main__():
+    # Parse Command Line
+    parser = optparse.OptionParser()
+    parser.add_option('-s', '--sqlitedb', dest='sqlitedb', default=None,
+                      help='The SQLite Database')
+    parser.add_option('-j', '--jsonfile', dest='jsonfile', default=None,
+                      help='JSON dict of table specifications')
+    parser.add_option('-q', '--query', dest='query', default=None,
+                      help='SQL query')
+    parser.add_option('-Q', '--query_file', dest='query_file', default=None,
+                      help='SQL query file')
+    parser.add_option('-n', '--no_header', dest='no_header', default=False,
+                      action='store_true',
+                      help='Omit the column headers line')
+    parser.add_option('-o', '--output', dest='output', default=None,
+                      help='Output file for query results')
+    (options, args) = parser.parse_args()
+
+    # determine output destination
+    if options.output is not None:
+        try:
+            outputPath = os.path.abspath(options.output)
+            outputFile = open(outputPath, 'w')
+        except Exception as e:
+            exit('Error: %s' % (e))
+    else:
+        outputFile = sys.stdout
+
+    def _create_table(ti, table):
+        path = table['file_path']
+        table_name = \
+            table['table_name'] if 'table_name' in table else 't%d' % (ti + 1)
+        comment_lines = \
+            table['comment_lines'] if 'comment_lines' in table else 0
+        comment_char = \
+            table['comment_char'] if 'comment_char' in table else None
+        column_names = \
+            table['column_names'] if 'column_names' in table else None
+        if column_names:
+            load_named_columns = \
+                table['load_named_columns'] \
+                if 'load_named_columns' in table else False
+        else:
+            load_named_columns = False
+        unique_indexes = table['unique'] if 'unique' in table else []
+        indexes = table['index'] if 'index' in table else []
+        filters = table['filters'] if 'filters' in table else None
+        pkey_autoincr = \
+            table['pkey_autoincr'] if 'pkey_autoincr' in table else None
+        create_table(get_connection(options.sqlitedb), path, table_name,
+                     pkey_autoincr=pkey_autoincr,
+                     column_names=column_names,
+                     skip=comment_lines,
+                     comment_char=comment_char,
+                     load_named_columns=load_named_columns,
+                     filters=filters,
+                     unique_indexes=unique_indexes,
+                     indexes=indexes)
+
+    if options.jsonfile:
+        try:
+            with open(options.jsonfile) as fh:
+                tdef = json.load(fh)
+                if 'tables' in tdef:
+                    for ti, table in enumerate(tdef['tables']):
+                        _create_table(ti, table)
+        except Exception as e:
+            exit('Error: %s' % (e))
+
+    query = None
+    if options.query_file is not None:
+        with open(options.query_file, 'r') as fh:
+            query = ''
+            for line in fh:
+                query += line
+    elif options.query is not None:
+        query = options.query
+
+    if query is None:
+        try:
+            describe_tables(get_connection(options.sqlitedb), outputFile)
+        except Exception as e:
+            exit('Error: %s' % (e))
+    else:
+        try:
+            run_query(get_connection(options.sqlitedb), query, outputFile,
+                      no_header=options.no_header)
+        except Exception as e:
+            exit('Error: %s' % (e))
+
+
+if __name__ == "__main__":
+    __main__()
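
Editor's note: the module docstring above is illustrative pseudo-JSON; an actual --jsonfile must be valid JSON, and load_db.get_column_def() splits column_names as a comma-separated string rather than the list shown. A sketch with placeholder paths:

    import json

    config = {
        'tables': [
            {'file_path': 'dataset_101.dat',
             'table_name': 't1',
             'column_names': 'c1,c2,c3',
             'comment_lines': 1,
             'unique': ['c1'],
             'index': ['c2,c3']}  # one composite index over c2 and c3
        ]
    }
    with open('tables.json', 'w') as fh:
        json.dump(config, fh)
    # then: python query_tabular.py -s work.sqlite -j tables.json
    #           -q 'SELECT * FROM t1' -o results.tsv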
diff -r 000000000000 -r 3708ff0198b7 query_tabular.xml
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/query_tabular.xml	Tue Jul 18 09:07:07 2017 -0400
@@ -0,0 +1,510 @@
+<tool id="query_tabular" name="Query Tabular" version="1.0.0">
+    <description>using sqlite sql</description>
+
+    <macros>
+        <import>macros.xml</import>
+    </macros>
+
+    <requirements>
+    </requirements>
+
+    <command detect_errors="exit_code"><![CDATA[
+        cat '$query_file' &&
+        #if $add_to_database.withdb:
+            #if $save_db:
+                cp '$add_to_database.withdb' '$sqlitedb' &&
+            #else:
+                cp '$add_to_database.withdb' '$workdb' &&
+            #end if
+        #end if
+        python '$__tool_directory__/query_tabular.py'
+        #if $save_db
+            -s '$sqlitedb'
+        #else
+            -s '$workdb'
+        #end if
+        -j '$table_json'
+        #if $sqlquery:
+            -Q '$query_file'
+            $no_header
+            -o '$output'
+        #end if
+    ]]></command>
+    <configfiles>
+        <configfile name="query_file">
+$sqlquery
+        </configfile>
+        <configfile name="table_json">
+#import json
+#set $jtbldef = dict()
+#set $jtbls = []
+#set $jtbldef['tables'] = $jtbls
+#for $i,$tbl in enumerate($tables):
+    #set $jtbl = dict()
+    #set $jtbl['file_path'] = str($tbl.table)
+    #if $tbl.tbl_opts.table_name:
+        #set $tname = str($tbl.tbl_opts.table_name)
+    #else
+        #set $tname = 't' + str($i + 1)
+    #end if
+    #set $jtbl['table_name'] = $tname
+    ## #if $tbl.tbl_opts.sel_cols:
+    ##     #set $jtbl['sel_cols'] = $tbl.tbl_opts.sel_cols el_cols
+    ## #end if
+    #if $tbl.tbl_opts.pkey_autoincr:
+        #set $jtbl['pkey_autoincr'] = str($tbl.tbl_opts.pkey_autoincr)
+    #end if
+    #if $tbl.tbl_opts.col_names:
+        #set $col_names = str($tbl.tbl_opts.col_names)
+        #if $tbl.tbl_opts.load_named_columns:
+            #set $jtbl['load_named_columns'] = True
+        #end if
+    #else
+        #set $col_names = ''
+    #end if
+    #set $jtbl['column_names'] = $col_names
+    #set $idx_unique = []
+    #set $idx_non = []
+    #for $idx in $tbl.tbl_opts.indexes:
+        #if $idx.unique:
+            #silent $idx_unique.append(str($idx.index_columns))
+        #else:
+            #silent $idx_non.append(str($idx.index_columns))
+        #end if
+    #end for
+    #if len($idx_unique) > 0:
+        #set $jtbl['unique'] = $idx_unique
+    #end if
+    #if len($idx_non) > 0:
+        #set $jtbl['index'] = $idx_non
+    #end if
+    #set $linefilters = $tbl.input_opts.linefilters
+    @LINEFILTERS@
+    #if $input_filters:
+        #set $jtbl['filters'] = $input_filters
+    #end if
+    #set $jtbls += [$jtbl]
+#end for
+#echo $json.dumps($jtbldef)
+        </configfile>
+    </configfiles>
+    <inputs>
+        <param name="workdb" type="hidden" value="workdb.sqlite" label=""/>
+        <section name="add_to_database" expanded="false" title="Add tables to an existing database">
+            <param name="withdb" type="data" format="sqlite" optional="true" label="Add tables to this Database"
+                   help="Make sure your added table names are not already in this database"/>
+        </section>
+        <repeat name="tables" title="Database Table" min="0">
+            <param name="table" type="data" format="tabular" label="Tabular Dataset for Table"/>
+            <section name="input_opts" expanded="false" title="Filter Dataset Input">
+                <expand macro="macro_line_filters" />
+            </section>
+            <section name="tbl_opts" expanded="false" title="Table Options">
+                <param name="table_name" type="text" value="" optional="true" label="Specify Name for Table">
+                    <help>By default, tables will be named: t1,t2,...,tn (table names must be unique)</help>
+                    <validator type="regex" message="Table name should start with a letter and may contain additional letters, digits, and underscores">^[A-Za-z]\w*$</validator>
+                </param>
+                <param name="col_names" type="text" value="" optional="true" label="Specify Column Names (comma-separated list)">
+                    <help>By default, table columns will be nam
[... middle of the file elided by the changeset viewer ...]
+    ... database can be used as input, and any selected tabular datasets will be added as new tables in that database.
+
+
+@LINEFILTERS_HELP@
+
+
+**Outputs**
+
+    The results of a SQL query are output to the history as a tabular file.
+
+    The SQLite_ database can also be saved and output as a dataset in the history.
+
+    *(The* **SQLite to tabular** *tool can run additional queries on this database.)*
+
+
+@QUERY_HELP@
+
+@LINEFILTERS_HELP_EXAMPLE@
+
+
+    Table name: pets
+
+    Table columns: Pets,FirstName,LastName,Birthdate,PetNames,PetType,line_num,entry_num,row_num
+
+    Query: SELECT * FROM pets
+
+    Result:
+
+    ====== ========== ======== ========== ========= ======== ========= ========== ========
+    #Pets  FirstName  LastName BirthDate  PetNames  PetType  line_num  entry_num  row_num
+    ====== ========== ======== ========== ========= ======== ========= ========== ========
+    2      Paula      Brown    1978-05-24 Rex       dog      3         1          1
+    2      Paula      Brown    1978-05-24 Fluff     cat      3         1          2
+    1      Steven     Jones    1974-04-04 Allie     cat      4         2          3
+    0      Jane       Doe      1978-05-24                    5         3          4
+    1      James      Smith    1980-10-20 Spot               6         4          5
+    ====== ========== ======== ========== ========= ======== ========= ========== ========
+
+
+**Normalizing by Line Filtering into 2 Tables**
+
+*Relational database operations work with single-valued column entries.
+To apply relational operations to tabular files that contain fields with lists of values,
+we need to "normalize" those fields, duplicating lines for each item in the list.
+In this example we create 2 tables, one for single-valued fields and a second with list-valued fields normalized.
+Because we add a line number first for each table, we can join the 2 tables on the line number column.*
+https://en.wikipedia.org/wiki/First_normal_form
+
+    *People Table*
+
+    ::
+
+        Filter 1 - by regex expression matching [include]: '^\d+' (include lines that start with a number)
+        Filter 2 - append a line number column:
+        Filter 3 - regex replace value in column[4]: '(\d+)/(\d+)/(\d+)' '19\3-\2-\1' (convert dates to sqlite format)
+        Filter 4 - select columns 7,2,3,4,1
+
+    Table: People
+    Columns: id,FirstName,LastName,DOB,Pets
+
+    == ========= ======== ========== ====
+    id FirstName LastName DOB        Pets
+    == ========= ======== ========== ====
+    1  Paula     Brown    1978-05-24 2
+    2  Steven    Jones    1974-04-04 1
+    3  Jane      Doe      1978-05-24 0
+    4  James     Smith    1980-10-20 1
+    == ========= ======== ========== ====
+
+    *Pet Table*
+
+    ::
+
+        Filter 1 - by regex expression matching [include]: '^\d+' (include lines that start with a number)
+        Filter 2 - append a line number column:
+        Filter 3 - by regex expression matching [exclude]: '^0\t' (exclude lines with no pets)
+        Filter 4 - normalize list columns[5,6]:
+        Filter 5 - select columns 7,5,6
+
+    Table: Pet
+    Columns: id,PetName,PetType
+
+    == ======== ========
+    id PetName  PetType
+    == ======== ========
+    1  Rex      dog
+    1  Fluff    cat
+    2  Allie    cat
+    4  Spot
+    == ======== ========
+
+    Query: SELECT FirstName,LastName,PetName FROM People JOIN Pet ON People.id = Pet.id WHERE PetType = 'cat';
+
+    Result:
+
+    ========= ======== ========
+    FirstName LastName PetName
+    ========= ======== ========
+    Paula     Brown    Fluff
+    Steven    Jones    Allie
+    ========= ======== ========
+
+    ]]></help>
+</tool>
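
Editor's note, tracing the table_json configfile above: the JSON handed to query_tabular.py -j for a single dataset with one "skip leading lines" filter would look roughly like this (a sketch only; the dataset path is a placeholder and key order is illustrative):

    table_json = {
        "tables": [
            {"file_path": "/galaxy/files/dataset_1.dat",  # placeholder path
             "table_name": "t1",
             "column_names": "",
             "filters": [{"filter": "skip", "count": 1}]}
        ]
    }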
diff -r 000000000000 -r 3708ff0198b7 sqlite_to_tabular.py
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/sqlite_to_tabular.py	Tue Jul 18 09:07:07 2017 -0400
@@ -0,0 +1,60 @@
+#!/usr/bin/env python
+
+from __future__ import print_function
+
+import optparse
+import os.path
+import sys
+
+from query_db import describe_tables, get_connection, run_query
+
+
+def __main__():
+    # Parse Command Line
+    parser = optparse.OptionParser()
+    parser.add_option('-s', '--sqlitedb', dest='sqlitedb', default=None,
+                      help='The SQLite Database')
+    parser.add_option('-q', '--query', dest='query', default=None,
+                      help='SQL query')
+    parser.add_option('-Q', '--query_file', dest='query_file', default=None,
+                      help='SQL query file')
+    parser.add_option('-n', '--no_header', dest='no_header', default=False,
+                      action='store_true',
+                      help='Omit the column headers line')
+    parser.add_option('-o', '--output', dest='output', default=None,
+                      help='Output file for query results')
+    (options, args) = parser.parse_args()
+
+    # determine output destination
+    if options.output is not None:
+        try:
+            outputPath = os.path.abspath(options.output)
+            outputFile = open(outputPath, 'w')
+        except Exception as e:
+            exit('Error: %s' % (e))
+    else:
+        outputFile = sys.stdout
+
+    query = None
+    if options.query_file is not None:
+        with open(options.query_file, 'r') as fh:
+            query = fh.read()
+    elif options.query is not None:
+        query = options.query
+
+    if query is None:
+        try:
+            describe_tables(get_connection(options.sqlitedb), outputFile)
+        except Exception as e:
+            exit('Error: %s' % (e))
+        exit(0)
+    else:
+        try:
+            run_query(get_connection(options.sqlitedb), query, outputFile,
+                      no_header=options.no_header)
+        except Exception as e:
+            exit('Error: %s' % (e))
+
+
+if __name__ == "__main__":
+    __main__()
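
Editor's note: run without -q/-Q, the script falls through to describe_tables(), which lists each table with its columns. The equivalent direct call against the test database shipped in this changeset would be (a sketch; note describe_tables() itself calls exit(0) after printing):

    import sys

    from query_db import describe_tables, get_connection

    describe_tables(get_connection('test-data/testdb.sqlite'), sys.stdout)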
diff -r 000000000000 -r 3708ff0198b7 test-data/IEDB.tsv
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/IEDB.tsv	Tue Jul 18 09:07:07 2017 -0400
@@ -0,0 +1,17 @@
+#ID	allele	seq_num	start	end	length	peptide	method	percentile_rank	ann_ic50	ann_rank	smm_ic50	smm_rank	comblib_sidney2008_score	comblib_sidney2008_rank	netmhcpan_ic50	netmhcpan_rank
+PPAP2C	HLA-A*02:01	1	3	11	9	GMYCMVFLV	Consensus (ann/smm/comblib_sidney2008)	0.2	4	0.2	3.77	0.2	7.1e-06	0.5	-	-
+PPAP2C	HLA-A*23:01	1	1	9	9	SFGMYCMVF	Consensus (ann/smm)	0.5	67	0.5	137.54	0.5	-	-	-	-
+PPAP2C	HLA-A*23:01	1	4	12	9	MYCMVFLVK	Consensus (ann/smm)	0.65	146	0.7	160.11	0.6	-	-	-	-
+PPAP2C	HLA-A*02:01	1	2	10	9	FGMYCMVFL	Consensus (ann/smm/comblib_sidney2008)	2.3	222	3.1	150.01	2.3	2.14e-05	1.3	-	-
+PPAP2C	HLA-A*23:01	1	3	11	9	GMYCMVFLV	Consensus (ann/smm)	4.95	3256	4	2706.64	5.9	-	-	-	-
+PPAP2C	HLA-A*23:01	1	2	10	9	FGMYCMVFL	Consensus (ann/smm)	6.55	4423	4.9	4144.10	8.2	-	-	-	-
+PPAP2C	HLA-A*02:01	1	1	9	9	SFGMYCMVF	Consensus (ann/smm/comblib_sidney2008)	45	24390	45	44989.38	39	0.01	91	-	-
+PPAP2C	HLA-A*02:01	1	4	12	9	MYCMVFLVK	Consensus (ann/smm/comblib_sidney2008)	54	23399	41	157801.09	54	0.01	86	-	-
+ADAMTSL1	HLA-A*02:01	1	1	9	9	SLDMCISGL	Consensus (ann/smm/comblib_sidney2008)	1	26	1	51.65	0.9	3.02e-05	1.7	-	-
+ADAMTSL1	HLA-A*23:01	1	4	12	9	MCISGLCQL	Consensus (ann/smm)	6.65	5781	5.9	3626.02	7.4	-	-	-	-
+ADAMTSL1	HLA-A*02:01	1	4	12	9	MCISGLCQL	Consensus (ann/smm/comblib_sidney2008)	14	1823	6.5	2612.82	14	0.00056	24	-	-
+ADAMTSL1	HLA-A*23:01	1	1	9	9	SLDMCISGL	Consensus (ann/smm)	30.5	27179	34	24684.82	27	-	-	-	-
+ADAMTSL1	HLA-A*02:01	1	2	10	9	LDMCISGLC	Consensus (ann/smm/comblib_sidney2008)	42	23677	42	53716.78	41	0.01	71	-	-
+ADAMTSL1	HLA-A*23:01	1	3	11	9	DMCISGLCQ	Consensus (ann/smm)	64.5	34451	73	118148.99	56	-	-	-	-
+ADAMTSL1	HLA-A*23:01	1	2	10	9	LDMCISGLC	Consensus (ann/smm)	76.0	33222	62	665932.18	90	-	-	-	-
+ADAMTSL1	HLA-A*02:01	1	3	11	9	DMCISGLCQ	Consensus (ann/smm/comblib_sidney2008)	97	31630	98	639896.89	71	0.03	97	-	-
diff -r 000000000000 -r 3708ff0198b7 test-data/add_to_db_results.tsv
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/add_to_db_results.tsv	Tue Jul 18 09:07:07 2017 -0400
@@ -0,0 +1,3 @@
+#id	first	last	pets	quote
+1	Paula	Brown	2	Time flies like and arrow. Fruit flies like a banana.
+2	Steven	Jones	1	I would have wrtten less if I had more time
diff -r 000000000000 -r 3708ff0198b7 test-data/customers.tsv
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/customers.tsv	Tue Jul 18 09:07:07 2017 -0400
@@ -0,0 +1,5 @@
+#CustomerID	FirstName	LastName	Email	DOB	Phone
+1	John	Smith	John.Smith@yahoo.com	1968-02-04	626 222-2222
+2	Steven	Goldfish	goldfish@fishhere.net	1974-04-04	323 455-4545
+3	Paula	Brown	pb@herowndomain.org	1978-05-24	416 323-3232
+4	James	Smith	jim@supergig.co.uk	1980-10-20	416 323-8888
diff -r 000000000000 -r 3708ff0198b7 test-data/filtered_people_results.tsv
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/filtered_people_results.tsv	Tue Jul 18 09:07:07 2017 -0400
@@ -0,0 +1,4 @@
+1	Paula	Brown	1978-05-24	2
+2	Steven	Jones	1974-04-04	1
+3	Jane	Doe	1978-05-24	0
+4	James	Smith	1980-10-20	1
diff -r 000000000000 -r 3708ff0198b7 test-data/filtered_pets_results.tsv
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/filtered_pets_results.tsv	Tue Jul 18 09:07:07 2017 -0400
@@ -0,0 +1,4 @@
+1	Rex	dog
+1	Fluff	cat
+2	Allie	cat
+4	Spot	
diff -r 000000000000 -r 3708ff0198b7 test-data/netMHC_summary.tsv
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/netMHC_summary.tsv	Tue Jul 18 09:07:07 2017 -0400
@@ -0,0 +1,9 @@
+#pos	peptide	logscore	affinity(nM)	Bind Level	Protein Name	Allele
+2	GMYCMVFLV	0.858	4	SB	PPAP2C	HLA-A02:01
+1	FGMYCMVFL	0.501	222	WB	PPAP2C	HLA-A02:01
+3	MYCMVFLVK	0.070	23399		PPAP2C	HLA-A02:01
+0	SFGMYCMVF	0.066	24390		PPAP2C	HLA-A02:01
+0	SLDMCISGL	0.698	26	SB	ADAMTSL1	HLA-A02:01
+3	MCISGLCQL	0.306	1823		ADAMTSL1	HLA-A02:01
+1	LDMCISGLC	0.069	23677		ADAMTSL1	HLA-A02:01
+2	DMCISGLCQ	0.042	31630		ADAMTSL1	HLA-A02:01
diff -r 000000000000 -r 3708ff0198b7 test-data/pet_normalized_query_results.tsv
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/pet_normalized_query_results.tsv	Tue Jul 18 09:07:07 2017 -0400
@@ -0,0 +1,3 @@
+#id	first	last	dob	name	animal	pets
+1	Paula	Brown	1978-05-24	Fluff	cat	2
+2	Steven	Jones	1974-04-04	Allie	cat	1
diff -r 000000000000 -r 3708ff0198b7 test-data/pets.tsv
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/pets.tsv	Tue Jul 18 09:07:07 2017 -0400
@@ -0,0 +1,7 @@
+#People with pets
+Pets	FirstName	LastName	DOB	PetNames	PetType
+2	Paula	Brown	24/05/78	Rex,Fluff	dog,cat
+1	Steven	Jones	04/04/74	Allie	cat
+0	Jane	Doe	24/05/78
+1	James	Smith	20/10/80	Spot
+
diff -r 000000000000 -r 3708ff0198b7 test-data/query_results.tsv
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/query_results.tsv	Tue Jul 18 09:07:07 2017 -0400
@@ -0,0 +1,17 @@
+#ID	peptide	start	end	percentile_rank	logscore	affinity	Bind_Level
+PPAP2C	GMYCMVFLV	3	11	0.2	0.858	4	SB
+PPAP2C	GMYCMVFLV	3	11	4.95	0.858	4	SB
+ADAMTSL1	SLDMCISGL	1	9	1.0	0.698	26	SB
+ADAMTSL1	SLDMCISGL	1	9	30.5	0.698	26	SB
+PPAP2C	FGMYCMVFL	2	10	2.3	0.501	222	WB
+PPAP2C	FGMYCMVFL	2	10	6.55	0.501	222	WB
+ADAMTSL1	MCISGLCQL	4	12	6.65	0.306	1823	
+ADAMTSL1	MCISGLCQL	4	12	14.0	0.306	1823	
+PPAP2C	MYCMVFLVK	4	12	0.65	0.07	23399	
+PPAP2C	MYCMVFLVK	4	12	54.0	0.07	23399	
+ADAMTSL1	LDMCISGLC	2	10	42.0	0.069	23677	
+ADAMTSL1	LDMCISGLC	2	10	76.0	0.069	23677	
+PPAP2C	SFGMYCMVF	1	9	0.5	0.066	24390	
+PPAP2C	SFGMYCMVF	1	9	45.0	0.066	24390	
+ADAMTSL1	DMCISGLCQ	3	11	64.5	0.042	31630	
+ADAMTSL1	DMCISGLCQ	3	11	97.0	0.042	31630	
diff -r 000000000000 -r 3708ff0198b7 test-data/regex_results.tsv
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/regex_results.tsv	Tue Jul 18 09:07:07 2017 -0400
@@ -0,0 +1,4 @@
+#FirstName	LastName	DOB
+Steven	Goldfish	04/04/74
+Paula	Brown	24/05/78
+James	Smith	20/10/80
diff -r 000000000000 -r 3708ff0198b7 test-data/sales.tsv
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/sales.tsv	Tue Jul 18 09:07:07 2017 -0400
@@ -0,0 +1,6 @@
+#CustomerID	Date	SaleAmount
+2	2004-05-06	100.22
+1	2004-05-07	99.95
+3	2004-05-07	122.95
+3	2004-05-13	100.00
+4	2004-05-22	555.55
diff -r 000000000000 -r 3708ff0198b7 test-data/sales_results.tsv
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/sales_results.tsv	Tue Jul 18 09:07:07 2017 -0400
@@ -0,0 +1,5 @@
+#FirstName	LastName	TotalSales
+James	Smith	555.55
+Paula	Brown	222.95
+Steven	Goldfish	100.22
+John	Smith	99.95
diff -r 000000000000 -r 3708ff0198b7 test-data/testdb.sqlite
Binary file test-data/testdb.sqlite has changed |