Next changeset 1:cd2a99849f8b (2017-08-18) |
Commit message:
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/query_tabular commit 74915fc9cee746bbce1c4b507e13231259de177d |
added:
filter_tabular.py filter_tabular.xml filters.py load_db.py macros.xml query_db.py query_tabular.py sqlite_to_tabular.py test-data/IEDB.tsv test-data/add_to_db_results.tsv test-data/customers.tsv test-data/filtered_people_results.tsv test-data/filtered_pets_results.tsv test-data/netMHC_summary.tsv test-data/pet_normalized_query_results.tsv test-data/pets.tsv test-data/query_results.tsv test-data/regex_results.tsv test-data/sales.tsv test-data/sales_results.tsv test-data/testdb.sqlite |
b |
diff -r 000000000000 -r 6fbd9d25ceef filter_tabular.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/filter_tabular.py Tue Jul 18 09:06:47 2017 -0400 |
[ |
#!/usr/bin/env python
"""Command-line entry point: apply line filters to a tabular file."""

from __future__ import print_function

import json
import optparse
import os.path
import sys

from filters import filter_file


def __main__():
    """Parse the command line, load the filter list and filter the input.

    Options:
      -i/--input     input path (stdin when omitted)
      -j/--jsonfile  path of a JSON array of filter specifications
      -o/--output    output path (stdout when omitted)
      -v/--verbose   print every loaded filter specification to stdout

    Any failure terminates the process with an ``Error: ...`` message.
    Files opened here are closed on every exit path.
    """
    # Parse Command Line
    parser = optparse.OptionParser()
    parser.add_option('-i', '--input', dest='input', default=None,
                      help='Input file for filtering')
    parser.add_option('-j', '--jsonfile', dest='jsonfile', default=None,
                      help='JSON array of filter specifications')
    parser.add_option('-o', '--output', dest='output', default=None,
                      help='Output file for query results')
    parser.add_option('-v', '--verbose', dest='verbose', default=False,
                      action='store_true',
                      help='verbose')
    (options, args) = parser.parse_args()

    input_file = sys.stdin
    output_file = sys.stdout
    try:
        if options.input is not None:
            input_file = open(os.path.abspath(options.input), 'r')
        if options.output is not None:
            output_file = open(os.path.abspath(options.output), 'w')

        filters = None
        if options.jsonfile:
            with open(options.jsonfile) as fh:
                filters = json.load(fh)

        if options.verbose and filters:
            for f in filters:
                # Sort the keys so the verbose listing is reproducible.
                details = ', '.join(
                    ['%s: %s' % (k, f[k])
                     for k in sorted(k for k in f if k != 'filter')])
                print('%s %s' % (f['filter'], details), file=sys.stdout)

        filter_file(input_file, output_file, filters=filters)
    except Exception as e:
        sys.exit('Error: %s' % (e))
    finally:
        # Runs for SystemExit too, so handles are released even on errors.
        if input_file is not sys.stdin:
            input_file.close()
        if output_file is not sys.stdout:
            output_file.close()


if __name__ == "__main__":
    __main__()
b |
diff -r 000000000000 -r 6fbd9d25ceef filter_tabular.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/filter_tabular.xml Tue Jul 18 09:06:47 2017 -0400 |
[ |
<!-- Tool definition: runs filter_tabular.py over one tabular dataset,
     passing the configured line filters as a JSON config file. -->
<tool id="filter_tabular" name="Filter Tabular" version="1.0.0">
    <description></description>

    <!-- @LINEFILTERS@, macro_line_filters and the help tokens come from macros.xml -->
    <macros>
        <import>macros.xml</import>
    </macros>

    <requirements>
    </requirements>
    <command detect_errors="exit_code"><![CDATA[
        python '$__tool_directory__/filter_tabular.py'
        -i '$input'
        -j '$filter_json'
        -o '$output'
    ]]></command>
    <!-- filter_json: the list built by @LINEFILTERS@, dumped as JSON when non-empty -->
    <configfiles>
        <configfile name="filter_json">
#import json
@LINEFILTERS@
#if $input_filters:
#echo $json.dumps($input_filters)
#end if
        </configfile>
    </configfiles>
    <inputs>
        <param name="input" type="data" format="tabular" label="Tabular Dataset to filter"/>
        <expand macro="macro_line_filters" />
    </inputs>
    <outputs>
        <data format="tabular" name="output" />
    </outputs>
    <tests>
        <!-- Test 1: regex include, append line number, select columns, regex replace -->
        <test>
            <param name="input" ftype="tabular" value="pets.tsv"/>
            <repeat name="linefilters">
                <conditional name="filter">
                    <param name="filter_type" value="regex"/>
                    <param name="regex_pattern" value="^\d+"/>
                    <param name="regex_action" value="include_find"/>
                </conditional>
            </repeat>
            <repeat name="linefilters">
                <conditional name="filter">
                    <param name="filter_type" value="append_line_num"/>
                </conditional>
            </repeat>
            <repeat name="linefilters">
                <conditional name="filter">
                    <param name="filter_type" value="select_columns"/>
                    <param name="columns" value="7,2,3,4,1"/>
                </conditional>
            </repeat>
            <repeat name="linefilters">
                <conditional name="filter">
                    <param name="filter_type" value="replace"/>
                    <param name="column" value="c4"/>
                    <param name="regex_pattern" value="(\d+)/(\d+)/(\d+)"/>
                    <param name="regex_replace" value="19\3-\2-\1"/>
                </conditional>
            </repeat>
            <output name="output" file="filtered_people_results.tsv"/>
        </test>
        <!-- Test 2: comment filter, regex include, append line number,
             select columns, normalize, regex exclude -->
        <test>
            <param name="input" ftype="tabular" value="pets.tsv"/>
            <repeat name="linefilters">
                <conditional name="filter">
                    <param name="filter_type" value="comment"/>
                    <param name="comment_char" value="35"/>
                </conditional>
            </repeat>
            <repeat name="linefilters">
                <conditional name="filter">
                    <param name="filter_type" value="regex"/>
                    <param name="regex_pattern" value="^\d+"/>
                    <param name="regex_action" value="include_find"/>
                </conditional>
            </repeat>
            <repeat name="linefilters">
                <conditional name="filter">
                    <param name="filter_type" value="append_line_num"/>
                </conditional>
            </repeat>
            <repeat name="linefilters">
                <conditional name="filter">
                    <param name="filter_type" value="select_columns"/>
                    <param name="columns" value="c7,c5,c6"/>
                </conditional>
            </repeat>
            <repeat name="linefilters">
                <conditional name="filter">
                    <param name="filter_type" value="normalize"/>
                    <param name="columns" value="c2,c3"/>
                    <param name="separator" value=","/>
                </conditional>
            </repeat>
            <repeat name="linefilters">
                <conditional name="filter">
                    <param name="filter_type" value="regex"/>
                    <param name="regex_pattern" value="^\d+\t\t"/>
                    <param name="regex_action" value="exclude_match"/>
                </conditional>
            </repeat>
            <output name="output" file="filtered_pets_results.tsv"/>
        </test>

    </tests>
    <help><![CDATA[
==============
Filter Tabular
==============

  Filter a tabular dataset by applying line filters as it is being read.
  Multiple filters may be used with each filter using the result of the previous filter.

**Inputs**

  A tabular dataset.


**Outputs**

  A filtered tabular dataset.


@LINEFILTERS_HELP@

@LINEFILTERS_HELP_EXAMPLE@

    ]]></help>
</tool>
b |
diff -r 000000000000 -r 6fbd9d25ceef filters.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/filters.py Tue Jul 18 09:06:47 2017 -0400 |
[ |
#!/usr/bin/env python
"""Chainable line filters and a reader for tab-separated text."""

from __future__ import print_function

import re
import sys
from collections import deque


class LineFilter(object):
    """Iterator applying one filter function to each line of ``source``.

    ``source`` is any iterable of lines (a file or another LineFilter),
    so filters can be stacked.  ``filter_dict`` selects the behaviour via
    its ``'filter'`` key: ``regex``, ``select_columns``, ``replace``,
    ``prepend_line_num``, ``append_line_num``, ``prepend_text``,
    ``append_text``, ``skip`` or ``normalize``.  A falsy ``filter_dict``
    (or an unrecognised filter/action) only strips the line terminator.

    The filter function receives ``(line_number, line)`` where the number
    is the 1-based count of lines this filter has read from its source.
    It returns a string, a list of strings (all emitted), or a falsy
    value, in which case the line is dropped.
    """

    def __init__(self, source, filter_dict):
        self.source = source
        self.filter_dict = filter_dict
        self.func = lambda i, line: line.rstrip('\r\n') if line else None
        # deque: lines are consumed from the left, which is O(1) here
        # (list.pop(0) is O(n)).
        self.src_lines = deque()
        self.src_line_cnt = 0
        if not filter_dict:
            return
        filter_type = filter_dict['filter']
        if filter_type == 'regex':
            rgx = re.compile(filter_dict['pattern'])
            action = filter_dict['action']
            if action == 'exclude_match':
                self.func = \
                    lambda i, line: line if not rgx.match(line) else None
            elif action == 'include_match':
                self.func = lambda i, line: line if rgx.match(line) else None
            elif action == 'exclude_find':
                self.func = \
                    lambda i, line: line if not rgx.search(line) else None
            elif action == 'include_find':
                self.func = lambda i, line: line if rgx.search(line) else None
        elif filter_type == 'select_columns':
            # Column numbers in the specification are 1-based.
            cols = [int(c) - 1 for c in filter_dict['columns']]
            self.func = lambda i, line: self.select_columns(line, cols)
        elif filter_type == 'replace':
            # Compile once instead of handing the raw pattern to re.sub
            # for every field of every line.
            rgx = re.compile(filter_dict['pattern'])
            repl = filter_dict['replace']
            col = int(filter_dict['column']) - 1
            self.func = lambda i, line: '\t'.join(
                [x if j != col else rgx.sub(repl, x)
                 for j, x in enumerate(line.split('\t'))])
        elif filter_type == 'prepend_line_num':
            self.func = lambda i, line: '%d\t%s' % (i, line)
        elif filter_type == 'append_line_num':
            self.func = \
                lambda i, line: '%s\t%d' % (line.rstrip('\r\n'), i)
        elif filter_type == 'prepend_text':
            text = filter_dict['column_text']
            self.func = lambda i, line: '%s\t%s' % (text, line)
        elif filter_type == 'append_text':
            text = filter_dict['column_text']
            self.func = \
                lambda i, line: '%s\t%s' % (line.rstrip('\r\n'), text)
        elif filter_type == 'skip':
            cnt = filter_dict['count']
            self.func = lambda i, line: line if i > cnt else None
        elif filter_type == 'normalize':
            cols = [int(c) - 1 for c in filter_dict['columns']]
            sep = filter_dict['separator']
            self.func = lambda i, line: self.normalize(line, cols, sep)

    def __iter__(self):
        return self

    def __next__(self):
        """Return the next filtered line, refilling the buffer on demand."""
        if not self.src_lines:
            self.get_lines()
        if self.src_lines:
            return self.src_lines.popleft()
        raise StopIteration

    next = __next__  # Python 2 iterator protocol

    def select_columns(self, line, cols):
        """Return ``line`` reduced to the 0-based columns in ``cols``.

        Raises IndexError when the line has fewer fields than requested.
        """
        fields = line.split('\t')
        return '\t'.join([fields[x] for x in cols])

    def normalize(self, line, split_cols, sep):
        """Expand list-valued columns into one output line per list item.

        Every 0-based column in ``split_cols`` is split on ``sep``.  The
        result has as many lines as the longest split; shorter splits are
        padded with empty strings and all other columns are repeated.
        """
        fields = line.rstrip('\r\n').split('\t')
        split_fields = dict()
        cnt = 0
        for c in split_cols:
            if c < len(fields):
                split_fields[c] = fields[c].split(sep)
                cnt = max(cnt, len(split_fields[c]))
        if cnt == 0:
            return ['\t'.join(fields)]
        lines = []
        for n in range(cnt):
            flds = []
            for c, x in enumerate(fields):
                if c in split_fields:
                    parts = split_fields[c]
                    flds.append(parts[n] if n < len(parts) else '')
                else:
                    flds.append(x)
            lines.append('\t'.join(flds))
        return lines

    def get_lines(self):
        """Read from the source until one line survives the filter.

        The surviving line(s) are appended to ``self.src_lines``.  Nothing
        is appended once the source is exhausted.
        """
        for next_line in self.source:
            self.src_line_cnt += 1
            line = self.func(self.src_line_cnt, next_line)
            if line:
                if isinstance(line, list):
                    self.src_lines.extend(line)
                else:
                    self.src_lines.append(line)
                return


class TabularReader(object):
    """
    Tabular file iterator. Returns a list of fields for each line.

    ``input_file`` is an open file-like object (anything with
    ``readline``) or a path.  A path is opened here and closed again when
    the iteration is exhausted or ``close()`` is called.  ``skip`` leading
    lines are discarded, lines matching the ``comment_char`` regular
    expression at their start are excluded, then ``filters`` (a list of
    LineFilter specifications) are applied in order.  ``col_idx``, when
    given, lists the 0-based fields to keep.
    """
    def __init__(self, input_file, skip=0, comment_char=None, col_idx=None,
                 filters=None):
        self.skip = skip
        self.comment_char = comment_char
        self.col_idx = col_idx
        self.filters = filters
        # Only close what we opened; caller-supplied handles stay open.
        self._owns_file = not hasattr(input_file, 'readline')
        self.tsv_file = \
            open(input_file) if self._owns_file else input_file
        if skip and skip > 0:
            for _ in range(skip):
                if not self.tsv_file.readline():
                    break
        source = LineFilter(self.tsv_file, None)
        if comment_char:
            source = LineFilter(source,
                                {"filter": "regex", "pattern": comment_char,
                                 "action": "exclude_match"})
        if filters:
            for f in filters:
                source = LineFilter(source, f)
        self.source = source

    def __iter__(self):
        return self

    def __next__(self):
        ''' Iteration '''
        try:
            line = next(self.source)
        except StopIteration:
            self.close()
            raise
        fields = line.rstrip('\r\n').split('\t')
        if self.col_idx:
            fields = [fields[i] for i in self.col_idx]
        return fields

    next = __next__  # Python 2 iterator protocol

    def close(self):
        """Close the underlying file if this reader opened it."""
        if self._owns_file and not self.tsv_file.closed:
            self.tsv_file.close()


def filter_file(input_file, output, skip=0, comment_char='#', filters=None):
    """Write the filtered lines of ``input_file`` to ``output``.

    ``input_file`` is a path or file-like object and ``output`` needs a
    ``write`` method.  Remaining arguments are passed to TabularReader.
    A failed write is reported on stderr and processing continues; any
    other error terminates the process with an ``Error: ...`` message.
    """
    try:
        tr = TabularReader(input_file, skip=skip, comment_char=comment_char,
                           filters=filters)
        for linenum, fields in enumerate(tr):
            try:
                output.write('%s\n' % '\t'.join(fields))
            except Exception as e:
                print('Failed at line: %d err: %s' % (linenum, e),
                      file=sys.stderr)
    except Exception as e:
        sys.exit('Error: %s' % (e))
b |
diff -r 000000000000 -r 6fbd9d25ceef load_db.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/load_db.py Tue Jul 18 09:06:47 2017 -0400 |
[ |
#!/usr/bin/env python
"""Load tabular files into SQLite tables with inferred column types."""

from __future__ import print_function

import sys

from filters import TabularReader


def getValueType(val):
    """Return the SQLite type name that fits ``val``.

    Returns 'INTEGER' if ``int(val)`` succeeds, else 'REAL' if
    ``float(val)`` succeeds, else 'TEXT'.  Empty/falsy values other than
    zero yield None (no type information).
    """
    if val or 0. == val:
        try:
            int(val)
            return 'INTEGER'
        except (TypeError, ValueError, OverflowError):
            try:
                float(val)
                return 'REAL'
            except (TypeError, ValueError):
                return 'TEXT'
    return None


def _split_column_names(column_names):
    """Return stripped column names from a comma-separated string or list."""
    if isinstance(column_names, (list, tuple)):
        names = column_names
    else:
        names = column_names.split(',')
    return [(cn or '').strip() for cn in names]


def _close_reader(reader, file_path):
    """Close the file a TabularReader opened itself from a path."""
    if reader is None or hasattr(file_path, 'readline'):
        # Nothing was opened, or the handle belongs to the caller.
        return
    tsv_file = getattr(reader, 'tsv_file', None)
    if tsv_file is not None:
        tsv_file.close()


def get_column_def(file_path, table_name, skip=0, comment_char='#',
                   column_names=None, max_lines=100, load_named_columns=False,
                   filters=None):
    """Infer column names and SQLite types by sampling the input.

    The first lines of ``file_path`` (bounded by ``max_lines``) are read
    through a TabularReader; each column gets the most general type seen
    (TEXT over REAL over INTEGER), defaulting to TEXT.

    ``column_names`` is a comma-separated string (or a list) of names.
    Empty entries keep the default name ``c<N>``.  With
    ``load_named_columns`` only the named columns are kept.

    Returns ``(col_names, col_types, col_def, col_idx)`` where ``col_def``
    holds '<name> <type>' strings and ``col_idx`` is the list of kept
    0-based columns, or None when all columns are loaded.
    Read errors are reported on stderr rather than raised.
    """
    # Lower index = more general type; a column is only ever generalised.
    col_pref = ['TEXT', 'REAL', 'INTEGER', None]
    col_types = []
    col_idx = None
    tr = None
    try:
        tr = TabularReader(file_path, skip=skip, comment_char=comment_char,
                           col_idx=None, filters=filters)
        for linenum, fields in enumerate(tr):
            if linenum > max_lines:
                break
            try:
                while len(col_types) < len(fields):
                    col_types.append(None)
                for i, val in enumerate(fields):
                    colType = getValueType(val)
                    if col_pref.index(colType) < col_pref.index(col_types[i]):
                        col_types[i] = colType
            except Exception as e:
                print('Failed at line: %d err: %s' % (linenum, e),
                      file=sys.stderr)
    except Exception as e:
        print('Failed: %s' % (e), file=sys.stderr)
    finally:
        # The sampling loop may stop early, so release the file here.
        _close_reader(tr, file_path)
    for i, col_type in enumerate(col_types):
        if not col_type:
            col_types[i] = 'TEXT'
    if column_names:
        names = _split_column_names(column_names)
        col_names = []
        if load_named_columns:
            col_idx = []
            for i, cname in enumerate(names):
                if cname != '':
                    col_idx.append(i)
                    col_names.append(cname)
            col_types = [col_types[i] for i in col_idx]
        else:
            col_names = ['c%d' % i for i in range(1, len(col_types) + 1)]
            for i, cname in enumerate(names):
                if cname and i < len(col_names):
                    col_names[i] = cname
    else:
        col_names = ['c%d' % i for i in range(1, len(col_types) + 1)]
    col_def = ['%s %s' % (name, col_types[i])
               for i, name in enumerate(col_names)]
    return col_names, col_types, col_def, col_idx


def create_table(conn, file_path, table_name, skip=0, comment_char='#',
                 pkey_autoincr=None, column_names=None,
                 load_named_columns=False, filters=None,
                 unique_indexes=None, indexes=None):
    """Create ``table_name`` in ``conn`` and load it from ``file_path``.

    Column definitions come from get_column_def().  ``pkey_autoincr``
    names an optional INTEGER PRIMARY KEY AUTOINCREMENT column.
    ``unique_indexes`` and ``indexes`` are lists of comma-separated
    column-name strings, each producing one index.  Empty fields are
    inserted as NULL.  A row that fails to convert or insert is reported
    on stderr and skipped; any other error terminates the process.

    NOTE: table and column names are interpolated into the SQL text, so
    they must come from a trusted specification.
    """
    # None defaults avoid sharing one mutable list between calls.
    unique_indexes = [] if unique_indexes is None else unique_indexes
    indexes = [] if indexes is None else indexes
    col_names, col_types, col_def, col_idx = \
        get_column_def(file_path, table_name, skip=skip,
                       comment_char=comment_char, column_names=column_names,
                       load_named_columns=load_named_columns, filters=filters)
    col_func = [float if t == 'REAL' else int
                if t == 'INTEGER' else str for t in col_types]
    table_def = 'CREATE TABLE %s (\n %s%s\n);' % (
        table_name,
        '%s INTEGER PRIMARY KEY AUTOINCREMENT,' %
        pkey_autoincr if pkey_autoincr else '',
        ', \n '.join(col_def))
    insert_stmt = 'INSERT INTO %s(%s) VALUES(%s)' % (
        table_name, ','.join(col_names),
        ','.join(["?" for x in col_names]))
    tr = None
    try:
        c = conn.cursor()
        c.execute(table_def)
        conn.commit()
        c.close()
        for i, index in enumerate(unique_indexes):
            index_name = 'idx_uniq_%s_%d' % (table_name, i)
            index_columns = index.split(',')
            create_index(conn, table_name, index_name, index_columns,
                         unique=True)
        for i, index in enumerate(indexes):
            index_name = 'idx_%s_%d' % (table_name, i)
            index_columns = index.split(',')
            create_index(conn, table_name, index_name, index_columns)
        c = conn.cursor()
        tr = TabularReader(file_path, skip=skip, comment_char=comment_char,
                           col_idx=col_idx, filters=filters)
        for linenum, fields in enumerate(tr):
            try:
                vals = [col_func[i](x)
                        if x else None for i, x in enumerate(fields)]
                c.execute(insert_stmt, vals)
            except Exception as e:
                print('Failed at line: %d err: %s' % (linenum, e),
                      file=sys.stderr)
        conn.commit()
        c.close()
    except Exception as e:
        sys.exit('Error: %s' % (e))
    finally:
        _close_reader(tr, file_path)


def create_index(conn, table_name, index_name, index_columns, unique=False):
    """Create (optionally UNIQUE) index ``index_name`` on the given columns.

    ``index_columns`` is a list of column names of ``table_name``.
    """
    index_def = "CREATE %s INDEX %s on %s(%s)" % (
        'UNIQUE' if unique else '', index_name,
        table_name, ','.join(index_columns))
    c = conn.cursor()
    c.execute(index_def)
    conn.commit()
    c.close()
b |
diff -r 000000000000 -r 6fbd9d25ceef macros.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/macros.xml Tue Jul 18 09:06:47 2017 -0400 |
[ |
b'@@ -0,0 +1,383 @@\n+<macros>\n+ <token name="@LINEFILTERS@">\n+<![CDATA[\n+ ## set linefilters to the \n+ #set $input_filters = []\n+ #for $fi in $linefilters:\n+ #if $fi.filter.filter_type == \'skip\':\n+ #set $skip_lines = None\n+ #if str($fi.filter.skip_lines) != \'\':\n+ #set $skip_lines = int($fi.filter.skip_lines)\n+ #elif $tbl.table.metadata.comment_lines and $tbl.table.metadata.comment_lines > 0:\n+ #set $skip_lines = int($tbl.table.metadata.comment_lines)\n+ #end if\n+ #if $skip_lines is not None:\n+ #set $filter_dict = dict()\n+ #set $filter_dict[\'filter\'] = str($fi.filter.filter_type)\n+ #set $filter_dict[\'count\'] = $skip_lines\n+ #silent $input_filters.append($filter_dict)\n+ #end if\n+ #elif $fi.filter.filter_type == \'comment\':\n+ #set $filter_dict = dict()\n+ #set $filter_dict[\'filter\'] = \'regex\'\n+ #set $filter_dict[\'pattern\'] = \'^(%s).*$\' % \'|\'.join([chr(int(x)).replace(\'|\',\'[|]\') for x in (str($fi.filter.comment_char)).split(\',\')])\n+ #set $filter_dict[\'action\'] = \'exclude_match\'\n+ #silent $input_filters.append($filter_dict)\n+ #elif $fi.filter.filter_type == \'regex\':\n+ #set $filter_dict = dict()\n+ #set $filter_dict[\'filter\'] = str($fi.filter.filter_type)\n+ #set $filter_dict[\'pattern\'] = str($fi.filter.regex_pattern)\n+ #set $filter_dict[\'action\'] = str($fi.filter.regex_action)\n+ #silent $input_filters.append($filter_dict)\n+ #elif $fi.filter.filter_type == \'select_columns\':\n+ #set $filter_dict = dict()\n+ #set $filter_dict[\'filter\'] = str($fi.filter.filter_type)\n+ #set $filter_dict[\'columns\'] = [int(str($ci).replace(\'c\',\'\')) for $ci in str($fi.filter.columns).split(\',\')]\n+ #silent $input_filters.append($filter_dict)\n+ #elif $fi.filter.filter_type == \'replace\':\n+ #set $filter_dict = dict()\n+ #set $filter_dict[\'filter\'] = str($fi.filter.filter_type)\n+ #set $filter_dict[\'column\'] = int(str($fi.filter.column).replace(\'c\',\'\'))\n+ #set $filter_dict[\'pattern\'] = 
str($fi.filter.regex_pattern)\n+ #set $filter_dict[\'replace\'] = str($fi.filter.regex_replace)\n+ #silent $input_filters.append($filter_dict)\n+ #elif str($fi.filter.filter_type).endswith(\'pend_line_num\'):\n+ #set $filter_dict = dict()\n+ #set $filter_dict[\'filter\'] = str($fi.filter.filter_type)\n+ #silent $input_filters.append($filter_dict)\n+ #elif str($fi.filter.filter_type).endswith(\'pend_text\'):\n+ #set $filter_dict = dict()\n+ #set $filter_dict[\'filter\'] = str($fi.filter.filter_type)\n+ #set $filter_dict[\'column_text\'] = str($fi.filter.column_text)\n+ #silent $input_filters.append($filter_dict)\n+ #elif $fi.filter.filter_type == \'normalize\':\n+ #set $filter_dict = dict()\n+ #set $filter_dict[\'filter\'] = str($fi.filter.filter_type)\n+ #set $filter_dict[\'columns\'] = [int(str($ci).replace(\'c\',\'\')) for $ci in str($fi.filter.columns).split(\',\')]\n+ #set $filter_dict[\'separator\'] = str($fi.filter.separator)\n+ #silent $input_filters.append($filter_dict)\n+ #end if\n+ #end for\n+]]>\n+ </token>\n+ <xml name="macro_line_filters">\n+ <repeat name="linefilters" title="Filter Tabular Input Lines">\n+ <conditional name="filter">\n+ <param name="filter_type" type="select" label="Filter By">\n+ <option value="skip">skip leading lines</option>\n+ <option value="comment">comment char</option>\n+ <option value="regex">by regex expression matching</option>\n+ <option value="select_columns">select columns</option>\n+ <option value="replace">regex replace value in column</option>\n+ <option value="prepend_line_num">prepend a line number column</option>\n+ <option value="append_line_num">append a line number column</option>\n+ '..b'==== ============\n+ #CustomerID FirstName LastName Email DOB Phone\n+ =========== ========== ========== ===================== ========== ============\n+ 1 John Smith John.Smith@yahoo.com 1968-02-04 626 222-2222\n+ 2 Steven Goldfish goldfish@fishhere.net 1974-04-04 323 455-4545\n+ 3 Paula Brown pb@herowndomain.org 1978-05-24 
416 323-3232\n+ 4 James Smith jim@supergig.co.uk 1980-10-20 416 323-8888\n+ =========== ========== ========== ===================== ========== ============\n+ \n+ Dataset *sales*\n+ \n+ Table name: "sales"\n+ \n+ Column names: "CustomerID,Date,SaleAmount"\n+ \n+ ============= ============ ============\n+ #CustomerID Date SaleAmount\n+ ============= ============ ============\n+ 2 2004-05-06 100.22\n+ 1 2004-05-07 99.95\n+ 3 2004-05-07 122.95\n+ 3 2004-05-13 100.00\n+ 4 2004-05-22 555.55\n+ ============= ============ ============\n+ \n+ The query\n+ \n+ ::\n+ \n+ SELECT FirstName,LastName,sum(SaleAmount) as "TotalSales" \n+ FROM customers join sales on customers.CustomerID = sales.CustomerID \n+ GROUP BY customers.CustomerID ORDER BY TotalSales DESC;\n+ \n+ Produces this tabular output:\n+ \n+ ========== ======== ==========\n+ #FirstName LastName TotalSales\n+ ========== ======== ==========\n+ James Smith 555.55\n+ Paula Brown 222.95\n+ Steven Goldfish 100.22\n+ John Smith 99.95\n+ ========== ======== ==========\n+ \n+ \n+ If the optional Table name and Column names inputs are not used, the query would be:\n+ \n+ ::\n+ \n+ SELECT t1.c2 as "FirstName", t1.c3 as "LastName", sum(t2.c3) as "TotalSales" \n+ FROM t1 join t2 on t1.c1 = t2.c1 \n+ GROUP BY t1.c1 ORDER BY TotalSales DESC;\n+ \n+ You can selectively name columns, e.g. 
on the customers input you could just name columns 2,3, and 5: \n+ \n+ Column names: ,FirstName,LastName,,BirthDate\n+ \n+ Results in the following data base table\n+ \n+ =========== ========== ========== ===================== ========== ============\n+ #c1 FirstName LastName c4 BirthDate c6\n+ =========== ========== ========== ===================== ========== ============\n+ 1 John Smith John.Smith@yahoo.com 1968-02-04 626 222-2222\n+ 2 Steven Goldfish goldfish@fishhere.net 1974-04-04 323 455-4545\n+ 3 Paula Brown pb@herowndomain.org 1978-05-24 416 323-3232\n+ 4 James Smith jim@supergig.co.uk 1980-10-20 416 323-8888\n+ =========== ========== ========== ===================== ========== ============\n+\n+\n+ Regular_expression_ functions are included for: \n+\n+ ::\n+\n+ matching: re_match(\'pattern\',column) \n+\n+ SELECT t1.FirstName, t1.LastName\n+ FROM t1\n+ WHERE re_match(\'^.*\\.(net|org)$\',c4)\n+\n+ Results:\n+\n+ =========== ==========\n+ #FirstName LastName\n+ =========== ==========\n+ Steven Goldfish\n+ Paula Brown\n+ =========== ==========\n+\n+\n+ ::\n+\n+ searching: re_search(\'pattern\',column)\n+ substituting: re_sub(\'pattern\',\'replacement,column)\n+\n+ SELECT t1.FirstName, t1.LastName, re_sub(\'^\\d{2}(\\d{2})-(\\d\\d)-(\\d\\d)\',\'\\3/\\2/\\1\',BirthDate) as "DOB"\n+ FROM t1\n+ WHERE re_search(\'[hp]er\',c4)\n+\n+ Results:\n+\n+\n+ =========== ========== ==========\n+ #FirstName LastName DOB\n+ =========== ========== ==========\n+ Steven Goldfish 04/04/74\n+ Paula Brown 24/05/78\n+ James Smith 20/10/80\n+ =========== ========== ==========\n+\n+.. _Regular_expression: https://docs.python.org/release/2.7/library/re.html\n+.. _SQLite: http://www.sqlite.org/index.html\n+.. _SQLite_functions: http://www.sqlite.org/docs.html\n+\n+\n+]]>\n+ </token>\n+\n+</macros>\n+\n' |
b |
diff -r 000000000000 -r 6fbd9d25ceef query_db.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/query_db.py Tue Jul 18 09:06:47 2017 -0400 |
[ |
#!/usr/bin/env python
"""SQLite helpers: regex SQL functions, table listing and query output."""

from __future__ import print_function

import re
import sqlite3 as sqlite
import sys


TABLE_QUERY = \
    """
    SELECT name, sql
    FROM sqlite_master
    WHERE type='table'
    ORDER BY name
    """


def _regex_subject(item):
    """Return ``item`` in a form ``re`` accepts (numbers become text)."""
    if isinstance(item, (int, float)):
        return str(item)
    return item


def regex_match(expr, item):
    """SQL ``re_match``: does ``expr`` match at the start of ``item``?

    Returns None (SQL NULL) for a NULL ``item``, as LIKE/GLOB do, instead
    of raising and aborting the whole query.
    """
    if item is None:
        return None
    return re.match(expr, _regex_subject(item)) is not None


def regex_search(expr, item):
    """SQL ``re_search``: does ``expr`` match anywhere in ``item``?

    Returns None (SQL NULL) for a NULL ``item``.
    """
    if item is None:
        return None
    return re.search(expr, _regex_subject(item)) is not None


def regex_sub(expr, replace, item):
    """SQL ``re_sub``: replace matches of ``expr`` in ``item``.

    Returns None (SQL NULL) for a NULL ``item``.
    """
    if item is None:
        return None
    return re.sub(expr, replace, _regex_subject(item))


def get_connection(sqlitedb_path, addfunctions=True):
    """Open ``sqlitedb_path`` and return the connection.

    With ``addfunctions`` the SQL functions ``re_match(pattern, value)``,
    ``re_search(pattern, value)`` and ``re_sub(pattern, repl, value)``
    are registered on the connection.
    """
    conn = sqlite.connect(sqlitedb_path)
    if addfunctions:
        conn.create_function("re_match", 2, regex_match)
        conn.create_function("re_search", 2, regex_search)
        conn.create_function("re_sub", 3, regex_sub)
    return conn


def describe_tables(conn, outputFile):
    """Print every table and its column names to ``outputFile``, then exit.

    A table whose columns cannot be read produces a warning on stderr.
    This function always terminates the process: status 0 on success, an
    ``Error: ...`` message otherwise.
    """
    try:
        c = conn.cursor()
        tables_query = TABLE_QUERY
        rslt = c.execute(tables_query).fetchall()
        for table, sql in rslt:
            print("Table %s:" % table, file=outputFile)
            try:
                # Quote the identifier so names containing spaces,
                # keywords or quotes can still be described.
                col_query = 'SELECT * FROM "%s" LIMIT 0' % \
                    table.replace('"', '""')
                cur = conn.cursor().execute(col_query)
                cols = [col[0] for col in cur.description]
                print(" Columns: %s" % cols, file=outputFile)
            except Exception as exc:
                print("Warning: %s" % exc, file=sys.stderr)
    except Exception as e:
        sys.exit('Error: %s' % (e))
    sys.exit(0)


def run_query(conn, query, outputFile, no_header=False):
    """Run ``query`` and write the rows tab-separated to ``outputFile``.

    Unless ``no_header`` is set, a first line of ``#`` followed by the
    column names is written.  NULL values are written as empty strings.
    Errors raised by SQLite propagate to the caller.
    """
    cur = conn.cursor()
    results = cur.execute(query)
    if not no_header:
        outputFile.write("#%s\n" % '\t'.join(
            [str(col[0]) for col in cur.description]))
    for row in results:
        outputFile.write("%s\n" % '\t'.join(
            [str(val) if val is not None else '' for val in row]))
b |
diff -r 000000000000 -r 6fbd9d25ceef query_tabular.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/query_tabular.py Tue Jul 18 09:06:47 2017 -0400 |
[ |
#!/usr/bin/env python

from __future__ import print_function

import json
import optparse
import os.path
import sys

from load_db import create_table

from query_db import describe_tables, get_connection, run_query


"""
JSON config:
{ tables : [
    { file_path : '/home/galaxy/dataset_101.dat',
            table_name : 't1',
            column_names : ['c1','c2','c3'],
            pkey_autoincr : 'id'
            comment_lines : 1
            unique: ['c1'],
            index: ['c2', 'c3']
    },
    { file_path : '/home/galaxy/dataset_102.dat',
            table_name : 'gff',
            column_names : ['seqname',,'date','start','end']
            comment_lines : 1
            load_named_columns : True
            filters : [{'filter': 'regex', 'pattern': '#peptide',
                        'action': 'exclude_match'},
                       {'filter': 'replace', 'column': 3,
                        'replace': 'gi[|]', 'pattern': ''}]
    },
    { file_path : '/home/galaxy/dataset_103.dat',
            table_name : 'test',
            column_names : ['c1', 'c2', 'c3']
    }
    ]
}
"""


def _create_table(conn, ti, table):
    """Load one table specification (a dict from the JSON config).

    ``ti`` is the 0-based position of the table, used for the default
    table name ``t<ti+1>``.  ``load_named_columns`` is only honoured when
    ``column_names`` is given.
    """
    column_names = table.get('column_names')
    load_named_columns = \
        table.get('load_named_columns', False) if column_names else False
    create_table(conn, table['file_path'],
                 table.get('table_name', 't%d' % (ti + 1)),
                 pkey_autoincr=table.get('pkey_autoincr'),
                 column_names=column_names,
                 skip=table.get('comment_lines', 0),
                 comment_char=table.get('comment_char'),
                 load_named_columns=load_named_columns,
                 filters=table.get('filters'),
                 unique_indexes=table.get('unique', []),
                 indexes=table.get('index', []))


def __main__():
    """Load the configured tables into SQLite, then query or describe.

    The tables listed in the ``--jsonfile`` specification are loaded into
    the ``--sqlitedb`` database.  The query given by ``--query_file`` or
    ``--query`` is then run and written to ``--output`` (stdout when
    omitted).  Without a query the tables are described instead.

    One connection is shared by all steps and closed on every exit path,
    together with the output file.  Any failure terminates the process
    with an ``Error: ...`` message.
    """
    # Parse Command Line
    parser = optparse.OptionParser()
    parser.add_option('-s', '--sqlitedb', dest='sqlitedb', default=None,
                      help='The SQLite Database')
    parser.add_option('-j', '--jsonfile', dest='jsonfile', default=None,
                      help='JSON dict of table specifications')
    parser.add_option('-q', '--query', dest='query', default=None,
                      help='SQL query')
    parser.add_option('-Q', '--query_file', dest='query_file', default=None,
                      help='SQL query file')
    parser.add_option('-n', '--no_header', dest='no_header', default=False,
                      action='store_true',
                      help='Omit the column headers line')
    parser.add_option('-o', '--output', dest='output', default=None,
                      help='Output file for query results')
    (options, args) = parser.parse_args()

    output_file = sys.stdout
    conn = None
    try:
        # determine output destination
        if options.output is not None:
            output_file = open(os.path.abspath(options.output), 'w')

        # A single connection for loading and querying: separate
        # connections would each see their own empty ':memory:' database.
        conn = get_connection(options.sqlitedb)

        if options.jsonfile:
            with open(options.jsonfile) as fh:
                tdef = json.load(fh)
            if 'tables' in tdef:
                for ti, table in enumerate(tdef['tables']):
                    _create_table(conn, ti, table)

        query = None
        if options.query_file is not None:
            with open(options.query_file, 'r') as fh:
                query = fh.read()
        elif options.query is not None:
            query = options.query

        if query is None:
            # describe_tables() exits the process itself when done.
            describe_tables(conn, output_file)
        else:
            run_query(conn, query, output_file,
                      no_header=options.no_header)
    except Exception as e:
        sys.exit('Error: %s' % (e))
    finally:
        # Also runs on SystemExit, so nothing is left open.
        if conn is not None:
            conn.close()
        if output_file is not sys.stdout:
            output_file.close()


if __name__ == "__main__":
    __main__()
b |
diff -r 000000000000 -r 6fbd9d25ceef sqlite_to_tabular.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/sqlite_to_tabular.py Tue Jul 18 09:06:47 2017 -0400 |
b |
#!/usr/bin/env python
"""Command-line entry point: run a query against a SQLite database."""

from __future__ import print_function

import optparse
import os.path
import sys

from query_db import describe_tables, get_connection, run_query


def __main__():
    """Write the result of a SQL query as tab-separated text.

    The query comes from ``--query_file`` or ``--query`` and runs against
    ``--sqlitedb``.  Rows go to ``--output`` (stdout when omitted).
    Without a query the tables of the database are described instead.

    The connection and the output file are closed on every exit path.
    Any failure terminates the process with an ``Error: ...`` message.
    """
    # Parse Command Line
    parser = optparse.OptionParser()
    parser.add_option('-s', '--sqlitedb', dest='sqlitedb', default=None,
                      help='The SQLite Database')
    parser.add_option('-q', '--query', dest='query', default=None,
                      help='SQL query')
    parser.add_option('-Q', '--query_file', dest='query_file', default=None,
                      help='SQL query file')
    parser.add_option('-n', '--no_header', dest='no_header', default=False,
                      action='store_true',
                      help='Omit the column headers line')
    parser.add_option('-o', '--output', dest='output', default=None,
                      help='Output file for query results')
    (options, args) = parser.parse_args()

    output_file = sys.stdout
    conn = None
    try:
        # determine output destination
        if options.output is not None:
            output_file = open(os.path.abspath(options.output), 'w')

        query = None
        if options.query_file is not None:
            with open(options.query_file, 'r') as fh:
                query = fh.read()
        elif options.query is not None:
            query = options.query

        conn = get_connection(options.sqlitedb)
        if query is None:
            # describe_tables() normally exits on its own; the explicit
            # exit keeps the behaviour should it ever return.
            describe_tables(conn, output_file)
            sys.exit(0)
        else:
            run_query(conn, query, output_file,
                      no_header=options.no_header)
    except Exception as e:
        sys.exit('Error: %s' % (e))
    finally:
        # Also runs on SystemExit, so nothing is left open.
        if conn is not None:
            conn.close()
        if output_file is not sys.stdout:
            output_file.close()


if __name__ == "__main__":
    __main__()
b |
diff -r 000000000000 -r 6fbd9d25ceef test-data/IEDB.tsv --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/IEDB.tsv Tue Jul 18 09:06:47 2017 -0400 |
b |
@@ -0,0 +1,17 @@ +#ID allele seq_num start end length peptide method percentile_rank ann_ic50 ann_rank smm_ic50 smm_rank comblib_sidney2008_score comblib_sidney2008_rank netmhcpan_ic50 netmhcpan_rank +PPAP2C HLA-A*02:01 1 3 11 9 GMYCMVFLV Consensus (ann/smm/comblib_sidney2008) 0.2 4 0.2 3.77 0.2 7.1e-06 0.5 - - +PPAP2C HLA-A*23:01 1 1 9 9 SFGMYCMVF Consensus (ann/smm) 0.5 67 0.5 137.54 0.5 - - - - +PPAP2C HLA-A*23:01 1 4 12 9 MYCMVFLVK Consensus (ann/smm) 0.65 146 0.7 160.11 0.6 - - - - +PPAP2C HLA-A*02:01 1 2 10 9 FGMYCMVFL Consensus (ann/smm/comblib_sidney2008) 2.3 222 3.1 150.01 2.3 2.14e-05 1.3 - - +PPAP2C HLA-A*23:01 1 3 11 9 GMYCMVFLV Consensus (ann/smm) 4.95 3256 4 2706.64 5.9 - - - - +PPAP2C HLA-A*23:01 1 2 10 9 FGMYCMVFL Consensus (ann/smm) 6.55 4423 4.9 4144.10 8.2 - - - - +PPAP2C HLA-A*02:01 1 1 9 9 SFGMYCMVF Consensus (ann/smm/comblib_sidney2008) 45 24390 45 44989.38 39 0.01 91 - - +PPAP2C HLA-A*02:01 1 4 12 9 MYCMVFLVK Consensus (ann/smm/comblib_sidney2008) 54 23399 41 157801.09 54 0.01 86 - - +ADAMTSL1 HLA-A*02:01 1 1 9 9 SLDMCISGL Consensus (ann/smm/comblib_sidney2008) 1 26 1 51.65 0.9 3.02e-05 1.7 - - +ADAMTSL1 HLA-A*23:01 1 4 12 9 MCISGLCQL Consensus (ann/smm) 6.65 5781 5.9 3626.02 7.4 - - - - +ADAMTSL1 HLA-A*02:01 1 4 12 9 MCISGLCQL Consensus (ann/smm/comblib_sidney2008) 14 1823 6.5 2612.82 14 0.00056 24 - - +ADAMTSL1 HLA-A*23:01 1 1 9 9 SLDMCISGL Consensus (ann/smm) 30.5 27179 34 24684.82 27 - - - - +ADAMTSL1 HLA-A*02:01 1 2 10 9 LDMCISGLC Consensus (ann/smm/comblib_sidney2008) 42 23677 42 53716.78 41 0.01 71 - - +ADAMTSL1 HLA-A*23:01 1 3 11 9 DMCISGLCQ Consensus (ann/smm) 64.5 34451 73 118148.99 56 - - - - +ADAMTSL1 HLA-A*23:01 1 2 10 9 LDMCISGLC Consensus (ann/smm) 76.0 33222 62 665932.18 90 - - - - +ADAMTSL1 HLA-A*02:01 1 3 11 9 DMCISGLCQ Consensus (ann/smm/comblib_sidney2008) 97 31630 98 639896.89 71 0.03 97 - - |
b |
diff -r 000000000000 -r 6fbd9d25ceef test-data/add_to_db_results.tsv --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/add_to_db_results.tsv Tue Jul 18 09:06:47 2017 -0400 |
b |
@@ -0,0 +1,3 @@ +#id first last pets quote +1 Paula Brown 2 Time flies like and arrow. Fruit flies like a banana. +2 Steven Jones 1 I would have wrtten less if I had more time |
b |
diff -r 000000000000 -r 6fbd9d25ceef test-data/customers.tsv --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/customers.tsv Tue Jul 18 09:06:47 2017 -0400 |
b |
@@ -0,0 +1,5 @@ +#CustomerID FirstName LastName Email DOB Phone +1 John Smith John.Smith@yahoo.com 1968-02-04 626 222-2222 +2 Steven Goldfish goldfish@fishhere.net 1974-04-04 323 455-4545 +3 Paula Brown pb@herowndomain.org 1978-05-24 416 323-3232 +4 James Smith jim@supergig.co.uk 1980-10-20 416 323-8888 |
b |
diff -r 000000000000 -r 6fbd9d25ceef test-data/filtered_people_results.tsv --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/filtered_people_results.tsv Tue Jul 18 09:06:47 2017 -0400 |
b |
@@ -0,0 +1,4 @@ +1 Paula Brown 1978-05-24 2 +2 Steven Jones 1974-04-04 1 +3 Jane Doe 1978-05-24 0 +4 James Smith 1980-10-20 1 |
b |
diff -r 000000000000 -r 6fbd9d25ceef test-data/filtered_pets_results.tsv --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/filtered_pets_results.tsv Tue Jul 18 09:06:47 2017 -0400 |
b |
@@ -0,0 +1,4 @@ +1 Rex dog +1 Fluff cat +2 Allie cat +4 Spot |
b |
diff -r 000000000000 -r 6fbd9d25ceef test-data/netMHC_summary.tsv --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/netMHC_summary.tsv Tue Jul 18 09:06:47 2017 -0400 |
b |
@@ -0,0 +1,9 @@ +#pos peptide logscore affinity(nM) Bind Level Protein Name Allele +2 GMYCMVFLV 0.858 4 SB PPAP2C HLA-A02:01 +1 FGMYCMVFL 0.501 222 WB PPAP2C HLA-A02:01 +3 MYCMVFLVK 0.070 23399 PPAP2C HLA-A02:01 +0 SFGMYCMVF 0.066 24390 PPAP2C HLA-A02:01 +0 SLDMCISGL 0.698 26 SB ADAMTSL1 HLA-A02:01 +3 MCISGLCQL 0.306 1823 ADAMTSL1 HLA-A02:01 +1 LDMCISGLC 0.069 23677 ADAMTSL1 HLA-A02:01 +2 DMCISGLCQ 0.042 31630 ADAMTSL1 HLA-A02:01 |
b |
diff -r 000000000000 -r 6fbd9d25ceef test-data/pet_normalized_query_results.tsv --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/pet_normalized_query_results.tsv Tue Jul 18 09:06:47 2017 -0400 |
b |
@@ -0,0 +1,3 @@ +#id first last dob name animal pets +1 Paula Brown 1978-05-24 Fluff cat 2 +2 Steven Jones 1974-04-04 Allie cat 1 |
b |
diff -r 000000000000 -r 6fbd9d25ceef test-data/pets.tsv --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/pets.tsv Tue Jul 18 09:06:47 2017 -0400 |
b |
@@ -0,0 +1,7 @@ +#People with pets +Pets FirstName LastName DOB PetNames PetType +2 Paula Brown 24/05/78 Rex,Fluff dog,cat +1 Steven Jones 04/04/74 Allie cat +0 Jane Doe 24/05/78 +1 James Smith 20/10/80 Spot + |
b |
diff -r 000000000000 -r 6fbd9d25ceef test-data/query_results.tsv --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/query_results.tsv Tue Jul 18 09:06:47 2017 -0400 |
b |
@@ -0,0 +1,17 @@ +#ID peptide start end percentile_rank logscore affinity Bind_Level +PPAP2C GMYCMVFLV 3 11 0.2 0.858 4 SB +PPAP2C GMYCMVFLV 3 11 4.95 0.858 4 SB +ADAMTSL1 SLDMCISGL 1 9 1.0 0.698 26 SB +ADAMTSL1 SLDMCISGL 1 9 30.5 0.698 26 SB +PPAP2C FGMYCMVFL 2 10 2.3 0.501 222 WB +PPAP2C FGMYCMVFL 2 10 6.55 0.501 222 WB +ADAMTSL1 MCISGLCQL 4 12 6.65 0.306 1823 +ADAMTSL1 MCISGLCQL 4 12 14.0 0.306 1823 +PPAP2C MYCMVFLVK 4 12 0.65 0.07 23399 +PPAP2C MYCMVFLVK 4 12 54.0 0.07 23399 +ADAMTSL1 LDMCISGLC 2 10 42.0 0.069 23677 +ADAMTSL1 LDMCISGLC 2 10 76.0 0.069 23677 +PPAP2C SFGMYCMVF 1 9 0.5 0.066 24390 +PPAP2C SFGMYCMVF 1 9 45.0 0.066 24390 +ADAMTSL1 DMCISGLCQ 3 11 64.5 0.042 31630 +ADAMTSL1 DMCISGLCQ 3 11 97.0 0.042 31630 |
b |
diff -r 000000000000 -r 6fbd9d25ceef test-data/regex_results.tsv --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/regex_results.tsv Tue Jul 18 09:06:47 2017 -0400 |
b |
@@ -0,0 +1,4 @@ +#FirstName LastName DOB +Steven Goldfish 04/04/74 +Paula Brown 24/05/78 +James Smith 20/10/80 |
b |
diff -r 000000000000 -r 6fbd9d25ceef test-data/sales.tsv --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/sales.tsv Tue Jul 18 09:06:47 2017 -0400 |
b |
@@ -0,0 +1,6 @@ +#CustomerID Date SaleAmount +2 2004-05-06 100.22 +1 2004-05-07 99.95 +3 2004-05-07 122.95 +3 2004-05-13 100.00 +4 2004-05-22 555.55 |
b |
diff -r 000000000000 -r 6fbd9d25ceef test-data/sales_results.tsv --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/sales_results.tsv Tue Jul 18 09:06:47 2017 -0400 |
b |
@@ -0,0 +1,5 @@ +#FirstName LastName TotalSales +James Smith 555.55 +Paula Brown 222.95 +Steven Goldfish 100.22 +John Smith 99.95 |
b |
diff -r 000000000000 -r 6fbd9d25ceef test-data/testdb.sqlite |
b |
Binary file test-data/testdb.sqlite has changed |