Mercurial > repos > jjohnson > query_tabular
diff query_tabular.xml @ 20:ab27c4bd14b9 draft
Uploaded
author | jjohnson |
---|---|
date | Fri, 14 Jul 2017 11:39:27 -0400 |
parents | b9f797bf4f38 |
children | 357fe86f245d |
line wrap: on
line diff
--- a/query_tabular.xml Wed Jul 05 11:51:50 2017 -0400 +++ b/query_tabular.xml Fri Jul 14 11:39:27 2017 -0400 @@ -1,6 +1,10 @@ -<tool id="query_tabular" name="Query Tabular" version="4.0.0"> +<tool id="query_tabular" name="Query Tabular" version="5.0.0"> <description>using sqlite sql</description> + <macros> + <import>macros.xml</import> + </macros> + <requirements> </requirements> <stdio> @@ -76,62 +80,8 @@ #if len($idx_non) > 0: #set $jtbl['index'] = $idx_non #end if - #set $input_filters = [] - #for $fi in $tbl.input_opts.linefilters: - #if $fi.filter.filter_type == 'skip': - #set $skip_lines = None - #if str($fi.filter.skip_lines) != '': - #set $skip_lines = int($fi.filter.skip_lines) - #elif $tbl.table.metadata.comment_lines and $tbl.table.metadata.comment_lines > 0: - #set $skip_lines = int($tbl.table.metadata.comment_lines) - #end if - #if $skip_lines is not None: - #set $filter_dict = dict() - #set $filter_dict['filter'] = str($fi.filter.filter_type) - #set $filter_dict['count'] = $skip_lines - #silent $input_filters.append($filter_dict) - #end if - #elif $fi.filter.filter_type == 'comment': - #set $filter_dict = dict() - #set $filter_dict['filter'] = 'regex' - #set $filter_dict['pattern'] = '^(%s).*$' % '|'.join([chr(int(x)).replace('|','[|]') for x in (str($fi.filter.comment_char)).split(',')]) - #set $filter_dict['action'] = 'exclude_match' - #silent $input_filters.append($filter_dict) - #elif $fi.filter.filter_type == 'regex': - #set $filter_dict = dict() - #set $filter_dict['filter'] = str($fi.filter.filter_type) - #set $filter_dict['pattern'] = str($fi.filter.regex_pattern) - #set $filter_dict['action'] = str($fi.filter.regex_action) - #silent $input_filters.append($filter_dict) - #elif $fi.filter.filter_type == 'select_columns': - #set $filter_dict = dict() - #set $filter_dict['filter'] = str($fi.filter.filter_type) - #set $filter_dict['columns'] = [int(str($ci).replace('c','')) for $ci in str($fi.filter.columns).split(',')] - #silent 
$input_filters.append($filter_dict) - #elif $fi.filter.filter_type == 'replace': - #set $filter_dict = dict() - #set $filter_dict['filter'] = str($fi.filter.filter_type) - #set $filter_dict['column'] = int(str($fi.filter.column).replace('c','')) - #set $filter_dict['pattern'] = str($fi.filter.regex_pattern) - #set $filter_dict['replace'] = str($fi.filter.regex_replace) - #silent $input_filters.append($filter_dict) - #elif str($fi.filter.filter_type).endswith('pend_line_num'): - #set $filter_dict = dict() - #set $filter_dict['filter'] = str($fi.filter.filter_type) - #silent $input_filters.append($filter_dict) - #elif str($fi.filter.filter_type).endswith('pend_text'): - #set $filter_dict = dict() - #set $filter_dict['filter'] = str($fi.filter.filter_type) - #set $filter_dict['column_text'] = str($fi.filter.column_text) - #silent $input_filters.append($filter_dict) - #elif $fi.filter.filter_type == 'normalize': - #set $filter_dict = dict() - #set $filter_dict['filter'] = str($fi.filter.filter_type) - #set $filter_dict['columns'] = [int(str($ci).replace('c','')) for $ci in str($fi.filter.columns).split(',')] - #set $filter_dict['separator'] = str($fi.filter.separator) - #silent $input_filters.append($filter_dict) - #end if - #end for + #set $linefilters = $tbl.input_opts.linefilters + @LINEFILTERS@ #if $input_filters: #set $jtbl['filters'] = $input_filters #end if @@ -149,99 +99,7 @@ <repeat name="tables" title="Database Table" min="0"> <param name="table" type="data" format="tabular" label="Tabular Dataset for Table"/> <section name="input_opts" expanded="false" title="Filter Dataset Input"> - <repeat name="linefilters" title="Filter Tabular Input Lines"> - <conditional name="filter"> - <param name="filter_type" type="select" label="Filter By"> - <option value="skip">skip leading lines</option> - <option value="comment">comment char</option> - <option value="regex">by regex expression matching</option> - <option value="select_columns">select columns</option> - <option 
value="replace">regex replace value in column</option> - <option value="prepend_line_num">prepend a line number column</option> - <option value="append_line_num">append a line number column</option> - <option value="prepend_text">prepend a column with the given text</option> - <option value="append_text">append a column with the given text</option> - <option value="normalize">normalize list columns, replicates row for each item in list</option> - </param> - <when value="skip"> - <param name="skip_lines" type="integer" value="" min="0" optional="true" label="Skip lines" - help="Leave blank to use the comment lines metadata for this dataset" /> - </when> - <when value="comment"> - <param name="comment_char" type="select" display="checkboxes" multiple="True" label="Ignore lines beginning with these characters" help="lines beginning with these are skipped"> - <option value="62">></option> - <option value="64">@</option> - <option value="43">+</option> - <option value="60"><</option> - <option value="42">*</option> - <option value="45">-</option> - <option value="61">=</option> - <option value="124">|</option> - <option value="63">?</option> - <option value="36">$</option> - <option value="46">.</option> - <option value="58">:</option> - <option value="38">&</option> - <option value="37">%</option> - <option value="94">^</option> - <option value="35">#</option> - <option value="33">!</option> - </param> - </when> - <when value="prepend_line_num"/> - <when value="append_line_num"/> - <when value="prepend_text"> - <param name="column_text" type="text" value="" label="text for column"> - </param> - </when> - <when value="append_text"> - <param name="column_text" type="text" value="" label="text for column"> - </param> - </when> - <when value="regex"> - <param name="regex_pattern" type="text" value="" label="regex pattern"> - <sanitizer sanitize="False"/> - </param> - <param name="regex_action" type="select" label="action for regex match"> - <option 
value="exclude_match">exclude line on pattern match</option> - <option value="include_match">include line on pattern match</option> - <option value="exclude_find">exclude line if pattern found</option> - <option value="include_find">include line if pattern found</option> - </param> - </when> - <when value="select_columns"> - <param name="columns" type="text" value="" label="enter column numbers to keep" - help="example: 1,4,2 or c1,c4,c2(selects the first,fourth, and second columns)"> - <validator type="regex" message="Column ordinal positions separated by commas">^(c?[1-9]\d*)(,c?[1-9]\d*)*$</validator> - </param> - </when> - <when value="replace"> - <param name="column" type="text" value="" label="enter column number to replace" - help="example: 1 or c1 (selects the first column)"> - <validator type="regex" message="Column ordinal position separated by commas">^(c?[1-9]\d*)$</validator> - </param> - <param name="regex_pattern" type="text" value="" label="regex pattern"> - <sanitizer sanitize="False"/> - </param> - <param name="regex_replace" type="text" value="" label="replacement expression"> - <sanitizer sanitize="False"/> - </param> - </when> - <when value="normalize"> - <param name="columns" type="text" value="" label="enter column numbers to normalize"> - <help><![CDATA[ - example: 2,4 or c2,c4 (selects the second, and fourth columns) - If multiple columns are selected, they should have the same length and separator on each line - ]]></help> - <validator type="regex" message="Column ordinal positions separated by commas">^(c?[1-9]\d*)(,c?[1-9]\d*)*$</validator> - </param> - <param name="separator" type="text" value="," label="List item delimiter in column"> - <sanitizer sanitize="False"/> - <validator type="regex" message="Anything but TAB or Newline">^[^\t\n\r\f\v]+$</validator> - </param> - </when> - </conditional> - </repeat> + <expand macro="macro_line_filters" /> </section> <section name="tbl_opts" expanded="false" title="Table Options"> <param 
name="table_name" type="text" value="" optional="true" label="Specify Name for Table"> @@ -269,7 +127,8 @@ </repeat> </section> </repeat> - <param name="save_db" type="boolean" truevalue="yes" falsevalue="no" checked="false" label="Save the sqlite database in your history"/> + <param name="save_db" type="boolean" truevalue="yes" falsevalue="no" checked="false" label="Save the sqlite database in your history" + help="SQLite to tabular tool can run additional queries on this database"/> <param name="sqlquery" type="text" area="true" size="20x80" value="" optional="true" label="SQL Query to generate tabular output"> <help>By default: tables are named: t1,t2,...,tn and columns in each table: c1,c2,...,cn</help> <sanitizer sanitize="False"/> @@ -279,10 +138,10 @@ </inputs> <outputs> <data format="sqlite" name="sqlitedb" label="sqlite db of ${on_string}"> - <filter>save_db or not (sqlquery and len(sqlquery) > 0)</filter> + <filter>save_db</filter> </data> <data format="tabular" name="output" label="query results on ${on_string}"> - <filter>sqlquery and len(sqlquery) > 0</filter> + <filter>not save_db or (sqlquery and len(sqlquery.strip()) > 0)</filter> </data> </outputs> <tests> @@ -400,20 +259,7 @@ An existing SQLite_ data base can be used as input, and any selected tabular datasets will be added as new tables in that data base. -**Input Line Filters** - - As a tabular file is being read, line filters may be applied. 
- - :: - - - skip leading lines skip the first *number* of lines - - comment char omit any lines that start with the specified comment character - - by regex expression matching *include/exclude* lines the match the regex expression - - select columns choose to include only selected columns in the order specified - - regex replace value in column replace a field in a column using a regex substitution (good for date reformatting) - - prepend a line number column each line has the ordinal value of the line read by this filter as the first column - - append a line number column each line has the ordinal value of the line read by this filter as the last column - - normalize list columns replicates the line for each item in the specified list *columns* +@LINEFILTERS_HELP@ **Outputs** @@ -425,192 +271,9 @@ *(The* **SQLite to tabular** *tool can run additional queries on this database.)* -For help in using SQLite_ see: http://www.sqlite.org/docs.html - -**NOTE:** input for SQLite dates input field must be in the format: *YYYY-MM-DD* for example: 2015-09-30 - -See: http://www.sqlite.org/lang_datefunc.html - -**Example** - - Given 2 tabular datasets: *customers* and *sales* - - Dataset *customers* - - Table name: "customers" - - Column names: "CustomerID,FirstName,LastName,Email,DOB,Phone" - - =========== ========== ========== ===================== ========== ============ - #CustomerID FirstName LastName Email DOB Phone - =========== ========== ========== ===================== ========== ============ - 1 John Smith John.Smith@yahoo.com 1968-02-04 626 222-2222 - 2 Steven Goldfish goldfish@fishhere.net 1974-04-04 323 455-4545 - 3 Paula Brown pb@herowndomain.org 1978-05-24 416 323-3232 - 4 James Smith jim@supergig.co.uk 1980-10-20 416 323-8888 - =========== ========== ========== ===================== ========== ============ - - Dataset *sales* - - Table name: "sales" - - Column names: "CustomerID,Date,SaleAmount" - - ============= ============ ============ - #CustomerID Date 
SaleAmount - ============= ============ ============ - 2 2004-05-06 100.22 - 1 2004-05-07 99.95 - 3 2004-05-07 122.95 - 3 2004-05-13 100.00 - 4 2004-05-22 555.55 - ============= ============ ============ - - The query - - :: - - SELECT FirstName,LastName,sum(SaleAmount) as "TotalSales" - FROM customers join sales on customers.CustomerID = sales.CustomerID - GROUP BY customers.CustomerID ORDER BY TotalSales DESC; - - Produces this tabular output: - - ========== ======== ========== - #FirstName LastName TotalSales - ========== ======== ========== - James Smith 555.55 - Paula Brown 222.95 - Steven Goldfish 100.22 - John Smith 99.95 - ========== ======== ========== - - - If the optional Table name and Column names inputs are not used, the query would be: - - :: - - SELECT t1.c2 as "FirstName", t1.c3 as "LastName", sum(t2.c3) as "TotalSales" - FROM t1 join t2 on t1.c1 = t2.c1 - GROUP BY t1.c1 ORDER BY TotalSales DESC; - - You can selectively name columns, e.g. on the customers input you could just name columns 2,3, and 5: - - Column names: ,FirstName,LastName,,BirthDate - - Results in the following data base table - - =========== ========== ========== ===================== ========== ============ - #c1 FirstName LastName c4 BirthDate c6 - =========== ========== ========== ===================== ========== ============ - 1 John Smith John.Smith@yahoo.com 1968-02-04 626 222-2222 - 2 Steven Goldfish goldfish@fishhere.net 1974-04-04 323 455-4545 - 3 Paula Brown pb@herowndomain.org 1978-05-24 416 323-3232 - 4 James Smith jim@supergig.co.uk 1980-10-20 416 323-8888 - =========== ========== ========== ===================== ========== ============ - - - Regular_expression_ functions are included for: - - :: - - matching: re_match('pattern',column) +@QUERY_HELP@ - SELECT t1.FirstName, t1.LastName - FROM t1 - WHERE re_match('^.*\.(net|org)$',c4) - - Results: - - =========== ========== - #FirstName LastName - =========== ========== - Steven Goldfish - Paula Brown - =========== 
========== - - - :: - - searching: re_search('pattern',column) - substituting: re_sub('pattern','replacement,column) - - SELECT t1.FirstName, t1.LastName, re_sub('^\d{2}(\d{2})-(\d\d)-(\d\d)','\3/\2/\1',BirthDate) as "DOB" - FROM t1 - WHERE re_search('[hp]er',c4) - - Results: - - - =========== ========== ========== - #FirstName LastName DOB - =========== ========== ========== - Steven Goldfish 04/04/74 - Paula Brown 24/05/78 - James Smith 20/10/80 - =========== ========== ========== - - -**Line Filtering Example** - *(Six filters are applied as the following file is read)* - - :: - - Input Tabular File: - - #People with pets - Pets FirstName LastName DOB PetNames PetType - 2 Paula Brown 24/05/78 Rex,Fluff dog,cat - 1 Steven Jones 04/04/74 Allie cat - 0 Jane Doe 24/05/78 - 1 James Smith 20/10/80 Spot - - - Filter 1 - append a line number column: - - #People with pets 1 - Pets FirstName LastName DOB PetNames PetType 2 - 2 Paula Brown 24/05/78 Rex,Fluff dog,cat 3 - 1 Steven Jones 04/04/74 Allie cat 4 - 0 Jane Doe 24/05/78 5 - 1 James Smith 20/10/80 Spot 6 - - Filter 2 - by regex expression matching [include]: '^\d+' (include lines that start with a number) - - 2 Paula Brown 24/05/78 Rex,Fluff dog,cat 3 - 1 Steven Jones 04/04/74 Allie cat 4 - 0 Jane Doe 24/05/78 5 - 1 James Smith 20/10/80 Spot 6 - - Filter 3 - append a line number column: - - 2 Paula Brown 24/05/78 Rex,Fluff dog,cat 3 1 - 1 Steven Jones 04/04/74 Allie cat 4 2 - 0 Jane Doe 24/05/78 5 3 - 1 James Smith 20/10/80 Spot 6 4 - - Filter 4 - regex replace value in column[4]: '(\d+)/(\d+)/(\d+)' '19\3-\2-\1' (convert dates to sqlite format) - - 2 Paula Brown 1978-05-24 Rex,Fluff dog,cat 3 1 - 1 Steven Jones 1974-04-04 Allie cat 4 2 - 0 Jane Doe 1978-05-24 5 3 - 1 James Smith 1980-10-20 Spot 6 4 - - Filter 5 - normalize list columns[5,6]: - - 2 Paula Brown 1978-05-24 Rex dog 3 1 - 2 Paula Brown 1978-05-24 Fluff cat 3 1 - 1 Steven Jones 1974-04-04 Allie cat 4 2 - 0 Jane Doe 1978-05-24 5 3 - 1 James Smith 
1980-10-20 Spot 6 4 - - Filter 6 - append a line number column: - - 2 Paula Brown 1978-05-24 Rex dog 3 1 1 - 2 Paula Brown 1978-05-24 Fluff cat 3 1 2 - 1 Steven Jones 1974-04-04 Allie cat 4 2 3 - 0 Jane Doe 1978-05-24 5 3 4 - 1 James Smith 1980-10-20 Spot 6 4 5 +@LINEFILTERS_HELP_EXAMPLE@ Table name: pets @@ -634,6 +297,13 @@ **Normalizing by Line Filtering into 2 Tables** +*Relational database operations work with single-valued column entries.
+To apply relational operations to tabular files that contain fields with lists of values,
+we need to "normalize" those fields, duplicating lines for each item in the list.
+In this example we create 2 tables, one for single-valued fields and a second with list-valued fields normalized.
+Because we add a line number first for each table, we can join the 2 tables on the line number column.*
+https://en.wikipedia.org/wiki/First_normal_form + *People Table* :: @@ -679,7 +349,7 @@ == ======== ======== - Query: SELECT FirstName,LastName,PetName FROM People join Pet on People.id = Pet.id WHERE PetType = 'cat'; + Query: SELECT FirstName,LastName,PetName FROM People JOIN Pet ON People.id = Pet.id WHERE PetType = 'cat'; Result: @@ -690,8 +360,6 @@ Steven Jones Allie ========= ======== ======== -.. _Regular_expression: https://docs.python.org/release/2.7/library/re.html -.. _SQLite: http://www.sqlite.org/index.html ]]></help> </tool>