comparison query_tabular.xml @ 20:ab27c4bd14b9 draft

Uploaded
author jjohnson
date Fri, 14 Jul 2017 11:39:27 -0400
parents b9f797bf4f38
children 357fe86f245d
comparison
equal deleted inserted replaced
19:9d9ab2c69014 20:ab27c4bd14b9
1 <tool id="query_tabular" name="Query Tabular" version="4.0.0"> 1 <tool id="query_tabular" name="Query Tabular" version="5.0.0">
2 <description>using sqlite sql</description> 2 <description>using sqlite sql</description>
3
4 <macros>
5 <import>macros.xml</import>
6 </macros>
3 7
4 <requirements> 8 <requirements>
5 </requirements> 9 </requirements>
6 <stdio> 10 <stdio>
7 <exit_code range="1:" /> 11 <exit_code range="1:" />
74 #set $jtbl['unique'] = $idx_unique 78 #set $jtbl['unique'] = $idx_unique
75 #end if 79 #end if
76 #if len($idx_non) > 0: 80 #if len($idx_non) > 0:
77 #set $jtbl['index'] = $idx_non 81 #set $jtbl['index'] = $idx_non
78 #end if 82 #end if
79 #set $input_filters = [] 83 #set $linefilters = $tbl.input_opts.linefilters
80 #for $fi in $tbl.input_opts.linefilters: 84 @LINEFILTERS@
81 #if $fi.filter.filter_type == 'skip':
82 #set $skip_lines = None
83 #if str($fi.filter.skip_lines) != '':
84 #set $skip_lines = int($fi.filter.skip_lines)
85 #elif $tbl.table.metadata.comment_lines and $tbl.table.metadata.comment_lines > 0:
86 #set $skip_lines = int($tbl.table.metadata.comment_lines)
87 #end if
88 #if $skip_lines is not None:
89 #set $filter_dict = dict()
90 #set $filter_dict['filter'] = str($fi.filter.filter_type)
91 #set $filter_dict['count'] = $skip_lines
92 #silent $input_filters.append($filter_dict)
93 #end if
94 #elif $fi.filter.filter_type == 'comment':
95 #set $filter_dict = dict()
96 #set $filter_dict['filter'] = 'regex'
97 #set $filter_dict['pattern'] = '^(%s).*$' % '|'.join([chr(int(x)).replace('|','[|]') for x in (str($fi.filter.comment_char)).split(',')])
98 #set $filter_dict['action'] = 'exclude_match'
99 #silent $input_filters.append($filter_dict)
100 #elif $fi.filter.filter_type == 'regex':
101 #set $filter_dict = dict()
102 #set $filter_dict['filter'] = str($fi.filter.filter_type)
103 #set $filter_dict['pattern'] = str($fi.filter.regex_pattern)
104 #set $filter_dict['action'] = str($fi.filter.regex_action)
105 #silent $input_filters.append($filter_dict)
106 #elif $fi.filter.filter_type == 'select_columns':
107 #set $filter_dict = dict()
108 #set $filter_dict['filter'] = str($fi.filter.filter_type)
109 #set $filter_dict['columns'] = [int(str($ci).replace('c','')) for $ci in str($fi.filter.columns).split(',')]
110 #silent $input_filters.append($filter_dict)
111 #elif $fi.filter.filter_type == 'replace':
112 #set $filter_dict = dict()
113 #set $filter_dict['filter'] = str($fi.filter.filter_type)
114 #set $filter_dict['column'] = int(str($fi.filter.column).replace('c',''))
115 #set $filter_dict['pattern'] = str($fi.filter.regex_pattern)
116 #set $filter_dict['replace'] = str($fi.filter.regex_replace)
117 #silent $input_filters.append($filter_dict)
118 #elif str($fi.filter.filter_type).endswith('pend_line_num'):
119 #set $filter_dict = dict()
120 #set $filter_dict['filter'] = str($fi.filter.filter_type)
121 #silent $input_filters.append($filter_dict)
122 #elif str($fi.filter.filter_type).endswith('pend_text'):
123 #set $filter_dict = dict()
124 #set $filter_dict['filter'] = str($fi.filter.filter_type)
125 #set $filter_dict['column_text'] = str($fi.filter.column_text)
126 #silent $input_filters.append($filter_dict)
127 #elif $fi.filter.filter_type == 'normalize':
128 #set $filter_dict = dict()
129 #set $filter_dict['filter'] = str($fi.filter.filter_type)
130 #set $filter_dict['columns'] = [int(str($ci).replace('c','')) for $ci in str($fi.filter.columns).split(',')]
131 #set $filter_dict['separator'] = str($fi.filter.separator)
132 #silent $input_filters.append($filter_dict)
133 #end if
134 #end for
135 #if $input_filters: 85 #if $input_filters:
136 #set $jtbl['filters'] = $input_filters 86 #set $jtbl['filters'] = $input_filters
137 #end if 87 #end if
138 #set $jtbls += [$jtbl] 88 #set $jtbls += [$jtbl]
139 #end for 89 #end for
147 help="Make sure your added table names are not already in this database"/> 97 help="Make sure your added table names are not already in this database"/>
148 </section> 98 </section>
149 <repeat name="tables" title="Database Table" min="0"> 99 <repeat name="tables" title="Database Table" min="0">
150 <param name="table" type="data" format="tabular" label="Tabular Dataset for Table"/> 100 <param name="table" type="data" format="tabular" label="Tabular Dataset for Table"/>
151 <section name="input_opts" expanded="false" title="Filter Dataset Input"> 101 <section name="input_opts" expanded="false" title="Filter Dataset Input">
152 <repeat name="linefilters" title="Filter Tabular Input Lines"> 102 <expand macro="macro_line_filters" />
153 <conditional name="filter">
154 <param name="filter_type" type="select" label="Filter By">
155 <option value="skip">skip leading lines</option>
156 <option value="comment">comment char</option>
157 <option value="regex">by regex expression matching</option>
158 <option value="select_columns">select columns</option>
159 <option value="replace">regex replace value in column</option>
160 <option value="prepend_line_num">prepend a line number column</option>
161 <option value="append_line_num">append a line number column</option>
162 <option value="prepend_text">prepend a column with the given text</option>
163 <option value="append_text">append a column with the given text</option>
164 <option value="normalize">normalize list columns, replicates row for each item in list</option>
165 </param>
166 <when value="skip">
167 <param name="skip_lines" type="integer" value="" min="0" optional="true" label="Skip lines"
168 help="Leave blank to use the comment lines metadata for this dataset" />
169 </when>
170 <when value="comment">
171 <param name="comment_char" type="select" display="checkboxes" multiple="True" label="Ignore lines beginning with these characters" help="lines beginning with these are skipped">
172 <option value="62">&gt;</option>
173 <option value="64">@</option>
174 <option value="43">+</option>
175 <option value="60">&lt;</option>
176 <option value="42">*</option>
177 <option value="45">-</option>
178 <option value="61">=</option>
179 <option value="124">|</option>
180 <option value="63">?</option>
181 <option value="36">$</option>
182 <option value="46">.</option>
183 <option value="58">:</option>
184 <option value="38">&amp;</option>
185 <option value="37">%</option>
186 <option value="94">^</option>
187 <option value="35">&#35;</option>
188 <option value="33">!</option>
189 </param>
190 </when>
191 <when value="prepend_line_num"/>
192 <when value="append_line_num"/>
193 <when value="prepend_text">
194 <param name="column_text" type="text" value="" label="text for column">
195 </param>
196 </when>
197 <when value="append_text">
198 <param name="column_text" type="text" value="" label="text for column">
199 </param>
200 </when>
201 <when value="regex">
202 <param name="regex_pattern" type="text" value="" label="regex pattern">
203 <sanitizer sanitize="False"/>
204 </param>
205 <param name="regex_action" type="select" label="action for regex match">
206 <option value="exclude_match">exclude line on pattern match</option>
207 <option value="include_match">include line on pattern match</option>
208 <option value="exclude_find">exclude line if pattern found</option>
209 <option value="include_find">include line if pattern found</option>
210 </param>
211 </when>
212 <when value="select_columns">
213 <param name="columns" type="text" value="" label="enter column numbers to keep"
214 help="example: 1,4,2 or c1,c4,c2(selects the first,fourth, and second columns)">
215 <validator type="regex" message="Column ordinal positions separated by commas">^(c?[1-9]\d*)(,c?[1-9]\d*)*$</validator>
216 </param>
217 </when>
218 <when value="replace">
219 <param name="column" type="text" value="" label="enter column number to replace"
220 help="example: 1 or c1 (selects the first column)">
221 <validator type="regex" message="Column ordinal position separated by commas">^(c?[1-9]\d*)$</validator>
222 </param>
223 <param name="regex_pattern" type="text" value="" label="regex pattern">
224 <sanitizer sanitize="False"/>
225 </param>
226 <param name="regex_replace" type="text" value="" label="replacement expression">
227 <sanitizer sanitize="False"/>
228 </param>
229 </when>
230 <when value="normalize">
231 <param name="columns" type="text" value="" label="enter column numbers to normalize">
232 <help><![CDATA[
233 example: 2,4 or c2,c4 (selects the second, and fourth columns)
234 If multiple columns are selected, they should have the same length and separator on each line
235 ]]></help>
236 <validator type="regex" message="Column ordinal positions separated by commas">^(c?[1-9]\d*)(,c?[1-9]\d*)*$</validator>
237 </param>
238 <param name="separator" type="text" value="," label="List item delimiter in column">
239 <sanitizer sanitize="False"/>
240 <validator type="regex" message="Anything but TAB or Newline">^[^\t\n\r\f\v]+$</validator>
241 </param>
242 </when>
243 </conditional>
244 </repeat>
245 </section> 103 </section>
246 <section name="tbl_opts" expanded="false" title="Table Options"> 104 <section name="tbl_opts" expanded="false" title="Table Options">
247 <param name="table_name" type="text" value="" optional="true" label="Specify Name for Table"> 105 <param name="table_name" type="text" value="" optional="true" label="Specify Name for Table">
248 <help>By default, tables will be named: t1,t2,...,tn (table names must be unique)</help> 106 <help>By default, tables will be named: t1,t2,...,tn (table names must be unique)</help>
249 <validator type="regex" message="Table name should start with a letter and may contain additional letters, digits, and underscores">^[A-Za-z]\w*$</validator> 107 <validator type="regex" message="Table name should start with a letter and may contain additional letters, digits, and underscores">^[A-Za-z]\w*$</validator>
267 <validator type="regex" message="Column name, separated by commes if more than one">^([A-Za-z]\w*|"\S+[^,"]*"|`\S+[^,`]*`|[[]\S+[^,"]*[]])(,([A-Za-z]\w*|"\S+.*"|`\S+[^,`]*`|[[]\S+[^,"]*[]])?)*$</validator> 125 <validator type="regex" message="Column name, separated by commes if more than one">^([A-Za-z]\w*|"\S+[^,"]*"|`\S+[^,`]*`|[[]\S+[^,"]*[]])(,([A-Za-z]\w*|"\S+.*"|`\S+[^,`]*`|[[]\S+[^,"]*[]])?)*$</validator>
268 </param> 126 </param>
269 </repeat> 127 </repeat>
270 </section> 128 </section>
271 </repeat> 129 </repeat>
272 <param name="save_db" type="boolean" truevalue="yes" falsevalue="no" checked="false" label="Save the sqlite database in your history"/> 130 <param name="save_db" type="boolean" truevalue="yes" falsevalue="no" checked="false" label="Save the sqlite database in your history"
131 help="SQLite to tabular tool can run additional queries on this database"/>
273 <param name="sqlquery" type="text" area="true" size="20x80" value="" optional="true" label="SQL Query to generate tabular output"> 132 <param name="sqlquery" type="text" area="true" size="20x80" value="" optional="true" label="SQL Query to generate tabular output">
274 <help>By default: tables are named: t1,t2,...,tn and columns in each table: c1,c2,...,cn</help> 133 <help>By default: tables are named: t1,t2,...,tn and columns in each table: c1,c2,...,cn</help>
275 <sanitizer sanitize="False"/> 134 <sanitizer sanitize="False"/>
276 <validator type="regex" message="">^(?ims)\s*select\s+.*\s+from\s+.*$</validator> 135 <validator type="regex" message="">^(?ims)\s*select\s+.*\s+from\s+.*$</validator>
277 </param> 136 </param>
278 <param name="no_header" type="boolean" truevalue="-n" falsevalue="" checked="False" label="Omit column headers from tabular output"/> 137 <param name="no_header" type="boolean" truevalue="-n" falsevalue="" checked="False" label="Omit column headers from tabular output"/>
279 </inputs> 138 </inputs>
280 <outputs> 139 <outputs>
281 <data format="sqlite" name="sqlitedb" label="sqlite db of ${on_string}"> 140 <data format="sqlite" name="sqlitedb" label="sqlite db of ${on_string}">
282 <filter>save_db or not (sqlquery and len(sqlquery) > 0)</filter> 141 <filter>save_db</filter>
283 </data> 142 </data>
284 <data format="tabular" name="output" label="query results on ${on_string}"> 143 <data format="tabular" name="output" label="query results on ${on_string}">
285 <filter>sqlquery and len(sqlquery) > 0</filter> 144 <filter>not save_db or (sqlquery and len(sqlquery.strip()) > 0)</filter>
286 </data> 145 </data>
287 </outputs> 146 </outputs>
288 <tests> 147 <tests>
289 148
290 <test> 149 <test>
398 Loads tabular datasets into a SQLite_ data base. 257 Loads tabular datasets into a SQLite_ data base.
399 258
400 An existing SQLite_ data base can be used as input, and any selected tabular datasets will be added as new tables in that data base. 259 An existing SQLite_ data base can be used as input, and any selected tabular datasets will be added as new tables in that data base.
401 260
402 261
403 **Input Line Filters** 262 @LINEFILTERS_HELP@
404
405 As a tabular file is being read, line filters may be applied.
406
407 ::
408
409 - skip leading lines skip the first *number* of lines
410 - comment char omit any lines that start with the specified comment character
411 - by regex expression matching *include/exclude* lines that match the regex expression
412 - select columns choose to include only selected columns in the order specified
413 - regex replace value in column replace a field in a column using a regex substitution (good for date reformatting)
414 - prepend a line number column each line has the ordinal value of the line read by this filter as the first column
415 - append a line number column each line has the ordinal value of the line read by this filter as the last column
416 - normalize list columns replicates the line for each item in the specified list *columns*
417 263
418 264
419 **Outputs** 265 **Outputs**
420 266
421 The results of a SQL query are output to the history as a tabular file. 267 The results of a SQL query are output to the history as a tabular file.
423 The SQLite_ data base can also be saved and output as a dataset in the history. 269 The SQLite_ data base can also be saved and output as a dataset in the history.
424 270
425 *(The* **SQLite to tabular** *tool can run additional queries on this database.)* 271 *(The* **SQLite to tabular** *tool can run additional queries on this database.)*
426 272
427 273
428 For help in using SQLite_ see: http://www.sqlite.org/docs.html 274 @QUERY_HELP@
429 275
430 **NOTE:** input for SQLite dates input field must be in the format: *YYYY-MM-DD* for example: 2015-09-30 276 @LINEFILTERS_HELP_EXAMPLE@
431
432 See: http://www.sqlite.org/lang_datefunc.html
433
434 **Example**
435
436 Given 2 tabular datasets: *customers* and *sales*
437
438 Dataset *customers*
439
440 Table name: "customers"
441
442 Column names: "CustomerID,FirstName,LastName,Email,DOB,Phone"
443
444 =========== ========== ========== ===================== ========== ============
445 #CustomerID FirstName LastName Email DOB Phone
446 =========== ========== ========== ===================== ========== ============
447 1 John Smith John.Smith@yahoo.com 1968-02-04 626 222-2222
448 2 Steven Goldfish goldfish@fishhere.net 1974-04-04 323 455-4545
449 3 Paula Brown pb@herowndomain.org 1978-05-24 416 323-3232
450 4 James Smith jim@supergig.co.uk 1980-10-20 416 323-8888
451 =========== ========== ========== ===================== ========== ============
452
453 Dataset *sales*
454
455 Table name: "sales"
456
457 Column names: "CustomerID,Date,SaleAmount"
458
459 ============= ============ ============
460 #CustomerID Date SaleAmount
461 ============= ============ ============
462 2 2004-05-06 100.22
463 1 2004-05-07 99.95
464 3 2004-05-07 122.95
465 3 2004-05-13 100.00
466 4 2004-05-22 555.55
467 ============= ============ ============
468
469 The query
470
471 ::
472
473 SELECT FirstName,LastName,sum(SaleAmount) as "TotalSales"
474 FROM customers join sales on customers.CustomerID = sales.CustomerID
475 GROUP BY customers.CustomerID ORDER BY TotalSales DESC;
476
477 Produces this tabular output:
478
479 ========== ======== ==========
480 #FirstName LastName TotalSales
481 ========== ======== ==========
482 James Smith 555.55
483 Paula Brown 222.95
484 Steven Goldfish 100.22
485 John Smith 99.95
486 ========== ======== ==========
487
488
489 If the optional Table name and Column names inputs are not used, the query would be:
490
491 ::
492
493 SELECT t1.c2 as "FirstName", t1.c3 as "LastName", sum(t2.c3) as "TotalSales"
494 FROM t1 join t2 on t1.c1 = t2.c1
495 GROUP BY t1.c1 ORDER BY TotalSales DESC;
496
497 You can selectively name columns, e.g. on the customers input you could just name columns 2,3, and 5:
498
499 Column names: ,FirstName,LastName,,BirthDate
500
501 Results in the following data base table
502
503 =========== ========== ========== ===================== ========== ============
504 #c1 FirstName LastName c4 BirthDate c6
505 =========== ========== ========== ===================== ========== ============
506 1 John Smith John.Smith@yahoo.com 1968-02-04 626 222-2222
507 2 Steven Goldfish goldfish@fishhere.net 1974-04-04 323 455-4545
508 3 Paula Brown pb@herowndomain.org 1978-05-24 416 323-3232
509 4 James Smith jim@supergig.co.uk 1980-10-20 416 323-8888
510 =========== ========== ========== ===================== ========== ============
511
512
513 Regular_expression_ functions are included for:
514
515 ::
516
517 matching: re_match('pattern',column)
518
519 SELECT t1.FirstName, t1.LastName
520 FROM t1
521 WHERE re_match('^.*\.(net|org)$',c4)
522
523 Results:
524
525 =========== ==========
526 #FirstName LastName
527 =========== ==========
528 Steven Goldfish
529 Paula Brown
530 =========== ==========
531
532
533 ::
534
535 searching: re_search('pattern',column)
536 substituting: re_sub('pattern','replacement',column)
537
538 SELECT t1.FirstName, t1.LastName, re_sub('^\d{2}(\d{2})-(\d\d)-(\d\d)','\3/\2/\1',BirthDate) as "DOB"
539 FROM t1
540 WHERE re_search('[hp]er',c4)
541
542 Results:
543
544
545 =========== ========== ==========
546 #FirstName LastName DOB
547 =========== ========== ==========
548 Steven Goldfish 04/04/74
549 Paula Brown 24/05/78
550 James Smith 20/10/80
551 =========== ========== ==========
552
553
554 **Line Filtering Example**
555 *(Six filters are applied as the following file is read)*
556
557 ::
558
559 Input Tabular File:
560
561 #People with pets
562 Pets FirstName LastName DOB PetNames PetType
563 2 Paula Brown 24/05/78 Rex,Fluff dog,cat
564 1 Steven Jones 04/04/74 Allie cat
565 0 Jane Doe 24/05/78
566 1 James Smith 20/10/80 Spot
567
568
569 Filter 1 - append a line number column:
570
571 #People with pets 1
572 Pets FirstName LastName DOB PetNames PetType 2
573 2 Paula Brown 24/05/78 Rex,Fluff dog,cat 3
574 1 Steven Jones 04/04/74 Allie cat 4
575 0 Jane Doe 24/05/78 5
576 1 James Smith 20/10/80 Spot 6
577
578 Filter 2 - by regex expression matching [include]: '^\d+' (include lines that start with a number)
579
580 2 Paula Brown 24/05/78 Rex,Fluff dog,cat 3
581 1 Steven Jones 04/04/74 Allie cat 4
582 0 Jane Doe 24/05/78 5
583 1 James Smith 20/10/80 Spot 6
584
585 Filter 3 - append a line number column:
586
587 2 Paula Brown 24/05/78 Rex,Fluff dog,cat 3 1
588 1 Steven Jones 04/04/74 Allie cat 4 2
589 0 Jane Doe 24/05/78 5 3
590 1 James Smith 20/10/80 Spot 6 4
591
592 Filter 4 - regex replace value in column[4]: '(\d+)/(\d+)/(\d+)' '19\3-\2-\1' (convert dates to sqlite format)
593
594 2 Paula Brown 1978-05-24 Rex,Fluff dog,cat 3 1
595 1 Steven Jones 1974-04-04 Allie cat 4 2
596 0 Jane Doe 1978-05-24 5 3
597 1 James Smith 1980-10-20 Spot 6 4
598
599 Filter 5 - normalize list columns[5,6]:
600
601 2 Paula Brown 1978-05-24 Rex dog 3 1
602 2 Paula Brown 1978-05-24 Fluff cat 3 1
603 1 Steven Jones 1974-04-04 Allie cat 4 2
604 0 Jane Doe 1978-05-24 5 3
605 1 James Smith 1980-10-20 Spot 6 4
606
607 Filter 6 - append a line number column:
608
609 2 Paula Brown 1978-05-24 Rex dog 3 1 1
610 2 Paula Brown 1978-05-24 Fluff cat 3 1 2
611 1 Steven Jones 1974-04-04 Allie cat 4 2 3
612 0 Jane Doe 1978-05-24 5 3 4
613 1 James Smith 1980-10-20 Spot 6 4 5
614 277
615 278
616 Table name: pets 279 Table name: pets
617 280
618 Table columns: Pets,FirstName,LastName,Birthdate,PetNames,PetType,line_num,entry_num,row_num 281 Table columns: Pets,FirstName,LastName,Birthdate,PetNames,PetType,line_num,entry_num,row_num
632 ====== ========== ======== ========== ========= ======== ========= ========== ======== 295 ====== ========== ======== ========== ========= ======== ========= ========== ========
633 296
634 297
635 **Normalizing by Line Filtering into 2 Tables** 298 **Normalizing by Line Filtering into 2 Tables**
636 299
300 *Relational database operations work with single-valued column entries.
301 To apply relational operations to tabular files that contain fields with lists of values,
302 we need to "normalize" those fields, duplicating lines for each item in the list.
303 In this example we create 2 tables, one for single-valued fields and a second with list-valued fields normalized.
304 Because we add a line number first for each table, we can join the 2 tables on the line number column.*
305 https://en.wikipedia.org/wiki/First_normal_form
306
637 *People Table* 307 *People Table*
638 308
639 :: 309 ::
640 310
641 Filter 1 - by regex expression matching [include]: '^\d+' (include lines that start with a number) 311 Filter 1 - by regex expression matching [include]: '^\d+' (include lines that start with a number)
677 2 Allie cat 347 2 Allie cat
678 4 Spot 348 4 Spot
679 == ======== ======== 349 == ======== ========
680 350
681 351
682 Query: SELECT FirstName,LastName,PetName FROM People join Pet on People.id = Pet.id WHERE PetType = 'cat'; 352 Query: SELECT FirstName,LastName,PetName FROM People JOIN Pet ON People.id = Pet.id WHERE PetType = 'cat';
683 353
684 Result: 354 Result:
685 355
686 ========= ======== ======== 356 ========= ======== ========
687 FirstName LastName PetName 357 FirstName LastName PetName
688 ========= ======== ======== 358 ========= ======== ========
689 Paula Brown Fluff 359 Paula Brown Fluff
690 Steven Jones Allie 360 Steven Jones Allie
691 ========= ======== ======== 361 ========= ======== ========
692 362
693 .. _Regular_expression: https://docs.python.org/release/2.7/library/re.html
694 .. _SQLite: http://www.sqlite.org/index.html
695 363
696 ]]></help> 364 ]]></help>
697 </tool> 365 </tool>