20
|
1 <macros>
|
|
<token name="@LINEFILTERS@">
<![CDATA[
  ## Translate every entry of the $linefilters repeat into a dict and collect
  ## the dicts, in the order given, in the list $input_filters.
  ## NOTE(review): the 'skip' branch reads $tbl.table.metadata.comment_lines, so the
  ## template that expands this token presumably defines $tbl -- confirm at each call site.
  #set $input_filters = []
  #for $fi in $linefilters:
    #if $fi.filter.filter_type == 'skip':
      ## An explicit count wins; a blank field falls back to the dataset's
      ## comment_lines metadata. When neither is available no filter is added.
      #set $skip_lines = None
      #if str($fi.filter.skip_lines) != '':
        #set $skip_lines = int($fi.filter.skip_lines)
      #elif $tbl.table.metadata.comment_lines and $tbl.table.metadata.comment_lines > 0:
        #set $skip_lines = int($tbl.table.metadata.comment_lines)
      #end if
      #if $skip_lines is not None:
        #set $filter_dict = dict()
        #set $filter_dict['filter'] = str($fi.filter.filter_type)
        #set $filter_dict['count'] = $skip_lines
        #silent $input_filters.append($filter_dict)
      #end if
    #elif $fi.filter.filter_type == 'comment':
      ## comment_char is a comma-separated list of character codes (e.g. "35,33").
      ## Non-numeric entries (nothing selected) are dropped so int() cannot fail;
      ## with no usable code there is nothing to filter and no dict is appended.
      #set $comment_codes = [x for x in str($fi.filter.comment_char).split(',') if x.strip().isdigit()]
      #if $comment_codes:
        #set $filter_dict = dict()
        #set $filter_dict['filter'] = 'regex'
        ## Every selectable character is punctuation, and several (* + ? . ^ $ |) are
        ## regex metacharacters, so each one is backslash-escaped to match literally.
        #set $filter_dict['pattern'] = '^(%s).*$' % '|'.join(['\\' + chr(int(x)) for x in $comment_codes])
        #set $filter_dict['action'] = 'exclude_match'
        #silent $input_filters.append($filter_dict)
      #end if
    #elif $fi.filter.filter_type == 'regex':
      #set $filter_dict = dict()
      #set $filter_dict['filter'] = str($fi.filter.filter_type)
      #set $filter_dict['pattern'] = str($fi.filter.regex_pattern)
      #set $filter_dict['action'] = str($fi.filter.regex_action)
      #silent $input_filters.append($filter_dict)
    #elif $fi.filter.filter_type == 'select_columns':
      #set $filter_dict = dict()
      #set $filter_dict['filter'] = str($fi.filter.filter_type)
      ## Accept both "1,4,2" and "c1,c4,c2": strip the optional 'c' prefix before int().
      #set $filter_dict['columns'] = [int(str($ci).replace('c','')) for $ci in str($fi.filter.columns).split(',')]
      #silent $input_filters.append($filter_dict)
    #elif $fi.filter.filter_type == 'replace':
      #set $filter_dict = dict()
      #set $filter_dict['filter'] = str($fi.filter.filter_type)
      #set $filter_dict['column'] = int(str($fi.filter.column).replace('c',''))
      #set $filter_dict['pattern'] = str($fi.filter.regex_pattern)
      #set $filter_dict['replace'] = str($fi.filter.regex_replace)
      #silent $input_filters.append($filter_dict)
    #elif str($fi.filter.filter_type).endswith('pend_line_num'):
      ## Covers both 'prepend_line_num' and 'append_line_num'; the type itself is the only setting.
      #set $filter_dict = dict()
      #set $filter_dict['filter'] = str($fi.filter.filter_type)
      #silent $input_filters.append($filter_dict)
    #elif str($fi.filter.filter_type).endswith('pend_text'):
      ## Covers both 'prepend_text' and 'append_text'.
      #set $filter_dict = dict()
      #set $filter_dict['filter'] = str($fi.filter.filter_type)
      #set $filter_dict['column_text'] = str($fi.filter.column_text)
      #silent $input_filters.append($filter_dict)
    #elif $fi.filter.filter_type == 'normalize':
      #set $filter_dict = dict()
      #set $filter_dict['filter'] = str($fi.filter.filter_type)
      #set $filter_dict['columns'] = [int(str($ci).replace('c','')) for $ci in str($fi.filter.columns).split(',')]
      #set $filter_dict['separator'] = str($fi.filter.separator)
      #silent $input_filters.append($filter_dict)
    #end if
  #end for
]]>
</token>
|
|
<!-- Repeatable "line filter" form section. Each repeat entry selects one
     filter_type and exposes only the parameters that filter type needs. -->
<xml name="macro_line_filters">
  <repeat name="linefilters" title="Filter Tabular Input Lines">
    <conditional name="filter">
      <param name="filter_type" type="select" label="Filter By">
        <option value="skip">skip leading lines</option>
        <option value="comment">comment char</option>
        <option value="regex">by regex expression matching</option>
        <option value="select_columns">select columns</option>
        <option value="replace">regex replace value in column</option>
        <option value="prepend_line_num">prepend a line number column</option>
        <option value="append_line_num">append a line number column</option>
        <option value="prepend_text">prepend a column with the given text</option>
        <option value="append_text">append a column with the given text</option>
        <option value="normalize">normalize list columns, replicates row for each item in list</option>
      </param>
      <when value="skip">
        <param name="skip_lines" type="integer" value="" min="0" optional="true" label="Skip lines"
               help="Leave blank to use the comment lines metadata for this dataset" />
      </when>
      <when value="comment">
        <!-- Option values are decimal character codes; labels that are XML
             markup characters are written as entities to keep the file well-formed. -->
        <param name="comment_char" type="select" display="checkboxes" multiple="True" label="Ignore lines beginning with these characters" help="lines beginning with these are skipped">
          <option value="62">&gt;</option>
          <option value="64">@</option>
          <option value="43">+</option>
          <option value="60">&lt;</option>
          <option value="42">*</option>
          <option value="45">-</option>
          <option value="61">=</option>
          <option value="124">|</option>
          <option value="63">?</option>
          <option value="36">$</option>
          <option value="46">.</option>
          <option value="58">:</option>
          <option value="38">&amp;</option>
          <option value="37">%</option>
          <option value="94">^</option>
          <option value="35">#</option>
          <option value="33">!</option>
        </param>
      </when>
      <!-- The line-number filters take no parameters. -->
      <when value="prepend_line_num"/>
      <when value="append_line_num"/>
      <when value="prepend_text">
        <param name="column_text" type="text" value="" label="text for column">
        </param>
      </when>
      <when value="append_text">
        <param name="column_text" type="text" value="" label="text for column">
        </param>
      </when>
      <when value="regex">
        <!-- Sanitizing is switched off so regex metacharacters reach the tool unchanged. -->
        <param name="regex_pattern" type="text" value="" label="regex pattern">
          <sanitizer sanitize="False"/>
        </param>
        <param name="regex_action" type="select" label="action for regex match">
          <option value="exclude_match">exclude line on pattern match</option>
          <option value="include_match">include line on pattern match</option>
          <option value="exclude_find">exclude line if pattern found</option>
          <option value="include_find">include line if pattern found</option>
        </param>
      </when>
      <when value="select_columns">
        <param name="columns" type="text" value="" label="enter column numbers to keep"
               help="example: 1,4,2 or c1,c4,c2(selects the first,fourth, and second columns)">
          <validator type="regex" message="Column ordinal positions separated by commas">^(c?[1-9]\d*)(,c?[1-9]\d*)*$</validator>
        </param>
      </when>
      <when value="replace">
        <param name="column" type="text" value="" label="enter column number to replace"
               help="example: 1 or c1 (selects the first column)">
          <!-- The pattern accepts exactly one column, so the message says so. -->
          <validator type="regex" message="A single column ordinal position, e.g. 1 or c1">^(c?[1-9]\d*)$</validator>
        </param>
        <param name="regex_pattern" type="text" value="" label="regex pattern">
          <sanitizer sanitize="False"/>
        </param>
        <param name="regex_replace" type="text" value="" label="replacement expression">
          <sanitizer sanitize="False"/>
        </param>
      </when>
      <when value="normalize">
        <param name="columns" type="text" value="" label="enter column numbers to normalize">
          <help><![CDATA[
example: 2,4 or c2,c4 (selects the second, and fourth columns)
If multiple columns are selected, they should have the same length and separator on each line
]]></help>
          <validator type="regex" message="Column ordinal positions separated by commas">^(c?[1-9]\d*)(,c?[1-9]\d*)*$</validator>
        </param>
        <param name="separator" type="text" value="," label="List item delimiter in column">
          <sanitizer sanitize="False"/>
          <validator type="regex" message="Anything but TAB or Newline">^[^\t\n\r\f\v]+$</validator>
        </param>
      </when>
    </conditional>
  </repeat>
</xml>
|
|
158
|
|
159 <token name="@LINEFILTERS_HELP@">
|
|
160 <![CDATA[
|
|
161 **Input Line Filters**
|
|
162
|
|
163 As a tabular file is being read, line filters may be applied.
|
|
164
|
|
165 ::
|
|
166
|
|
167 - skip leading lines skip the first *number* of lines
|
|
168 - comment char omit any lines that start with the specified comment character
|
|
169 - by regex expression matching *include/exclude* lines that match the regular expression
|
|
170 - select columns choose to include only selected columns in the order specified
|
|
171 - regex replace value in column replace a field in a column using a regex substitution (good for date reformatting)
|
|
172 - prepend a line number column each line has the ordinal value of the line read by this filter as the first column
|
|
173 - append a line number column each line has the ordinal value of the line read by this filter as the last column
|
|
174 - prepend a text column each line has the text string as the first column
|
|
175 - append a text column each line has the text string as the last column
|
|
176 - normalize list columns replicates the line for each item in the specified list *columns*
|
|
177 ]]>
|
|
178 </token>
|
|
179
|
|
180 <token name="@LINEFILTERS_HELP_EXAMPLE@">
|
|
181 <![CDATA[
|
|
182 **Line Filtering Example**
|
|
183 *(Six filters are applied as the following file is read)*
|
|
184
|
|
185 ::
|
|
186
|
|
187 Input Tabular File:
|
|
188
|
|
189 #People with pets
|
|
190 Pets FirstName LastName DOB PetNames PetType
|
|
191 2 Paula Brown 24/05/78 Rex,Fluff dog,cat
|
|
192 1 Steven Jones 04/04/74 Allie cat
|
|
193 0 Jane Doe 24/05/78
|
|
194 1 James Smith 20/10/80 Spot
|
|
195
|
|
196
|
|
197 Filter 1 - append a line number column:
|
|
198
|
|
199 #People with pets 1
|
|
200 Pets FirstName LastName DOB PetNames PetType 2
|
|
201 2 Paula Brown 24/05/78 Rex,Fluff dog,cat 3
|
|
202 1 Steven Jones 04/04/74 Allie cat 4
|
|
203 0 Jane Doe 24/05/78 5
|
|
204 1 James Smith 20/10/80 Spot 6
|
|
205
|
|
206 Filter 2 - by regex expression matching [include]: '^\d+' (include lines that start with a number)
|
|
207
|
|
208 2 Paula Brown 24/05/78 Rex,Fluff dog,cat 3
|
|
209 1 Steven Jones 04/04/74 Allie cat 4
|
|
210 0 Jane Doe 24/05/78 5
|
|
211 1 James Smith 20/10/80 Spot 6
|
|
212
|
|
213 Filter 3 - append a line number column:
|
|
214
|
|
215 2 Paula Brown 24/05/78 Rex,Fluff dog,cat 3 1
|
|
216 1 Steven Jones 04/04/74 Allie cat 4 2
|
|
217 0 Jane Doe 24/05/78 5 3
|
|
218 1 James Smith 20/10/80 Spot 6 4
|
|
219
|
|
220 Filter 4 - regex replace value in column[4]: '(\d+)/(\d+)/(\d+)' '19\3-\2-\1' (convert dates to sqlite format)
|
|
221
|
|
222 2 Paula Brown 1978-05-24 Rex,Fluff dog,cat 3 1
|
|
223 1 Steven Jones 1974-04-04 Allie cat 4 2
|
|
224 0 Jane Doe 1978-05-24 5 3
|
|
225 1 James Smith 1980-10-20 Spot 6 4
|
|
226
|
|
227 Filter 5 - normalize list columns[5,6]:
|
|
228
|
|
229 2 Paula Brown 1978-05-24 Rex dog 3 1
|
|
230 2 Paula Brown 1978-05-24 Fluff cat 3 1
|
|
231 1 Steven Jones 1974-04-04 Allie cat 4 2
|
|
232 0 Jane Doe 1978-05-24 5 3
|
|
233 1 James Smith 1980-10-20 Spot 6 4
|
|
234
|
|
235 Filter 6 - append a line number column:
|
|
236
|
|
237 2 Paula Brown 1978-05-24 Rex dog 3 1 1
|
|
238 2 Paula Brown 1978-05-24 Fluff cat 3 1 2
|
|
239 1 Steven Jones 1974-04-04 Allie cat 4 2 3
|
|
240 0 Jane Doe 1978-05-24 5 3 4
|
|
241 1 James Smith 1980-10-20 Spot 6 4 5
|
|
242
|
|
243 ]]>
|
|
244 </token>
|
|
245
|
|
246 <token name="@QUERY_HELP@">
|
|
247 <![CDATA[
|
|
248
|
|
249 For help in using SQLite_ see: http://www.sqlite.org/docs.html
|
|
250
|
|
251 **NOTE:** SQLite date input fields must be in the format *YYYY-MM-DD*, for example: 2015-09-30
|
|
252
|
|
253 See: http://www.sqlite.org/lang_datefunc.html
|
|
254
|
|
255 **Example**
|
|
256
|
|
257 Given 2 tabular datasets: *customers* and *sales*
|
|
258
|
|
259 Dataset *customers*
|
|
260
|
|
261 Table name: "customers"
|
|
262
|
|
263 Column names: "CustomerID,FirstName,LastName,Email,DOB,Phone"
|
|
264
|
|
265 =========== ========== ========== ===================== ========== ============
|
|
266 #CustomerID FirstName LastName Email DOB Phone
|
|
267 =========== ========== ========== ===================== ========== ============
|
|
268 1 John Smith John.Smith@yahoo.com 1968-02-04 626 222-2222
|
|
269 2 Steven Goldfish goldfish@fishhere.net 1974-04-04 323 455-4545
|
|
270 3 Paula Brown pb@herowndomain.org 1978-05-24 416 323-3232
|
|
271 4 James Smith jim@supergig.co.uk 1980-10-20 416 323-8888
|
|
272 =========== ========== ========== ===================== ========== ============
|
|
273
|
|
274 Dataset *sales*
|
|
275
|
|
276 Table name: "sales"
|
|
277
|
|
278 Column names: "CustomerID,Date,SaleAmount"
|
|
279
|
|
280 ============= ============ ============
|
|
281 #CustomerID Date SaleAmount
|
|
282 ============= ============ ============
|
|
283 2 2004-05-06 100.22
|
|
284 1 2004-05-07 99.95
|
|
285 3 2004-05-07 122.95
|
|
286 3 2004-05-13 100.00
|
|
287 4 2004-05-22 555.55
|
|
288 ============= ============ ============
|
|
289
|
|
290 The query
|
|
291
|
|
292 ::
|
|
293
|
|
294 SELECT FirstName,LastName,sum(SaleAmount) as "TotalSales"
|
|
295 FROM customers join sales on customers.CustomerID = sales.CustomerID
|
|
296 GROUP BY customers.CustomerID ORDER BY TotalSales DESC;
|
|
297
|
|
298 Produces this tabular output:
|
|
299
|
|
300 ========== ======== ==========
|
|
301 #FirstName LastName TotalSales
|
|
302 ========== ======== ==========
|
|
303 James Smith 555.55
|
|
304 Paula Brown 222.95
|
|
305 Steven Goldfish 100.22
|
|
306 John Smith 99.95
|
|
307 ========== ======== ==========
|
|
308
|
|
309
|
|
310 If the optional Table name and Column names inputs are not used, the query would be:
|
|
311
|
|
312 ::
|
|
313
|
|
314 SELECT t1.c2 as "FirstName", t1.c3 as "LastName", sum(t2.c3) as "TotalSales"
|
|
315 FROM t1 join t2 on t1.c1 = t2.c1
|
|
316 GROUP BY t1.c1 ORDER BY TotalSales DESC;
|
|
317
|
|
318 You can selectively name columns, e.g. on the customers input you could just name columns 2,3, and 5:
|
|
319
|
|
320 Column names: ,FirstName,LastName,,BirthDate
|
|
321
|
|
322 Results in the following database table
|
|
323
|
|
324 =========== ========== ========== ===================== ========== ============
|
|
325 #c1 FirstName LastName c4 BirthDate c6
|
|
326 =========== ========== ========== ===================== ========== ============
|
|
327 1 John Smith John.Smith@yahoo.com 1968-02-04 626 222-2222
|
|
328 2 Steven Goldfish goldfish@fishhere.net 1974-04-04 323 455-4545
|
|
329 3 Paula Brown pb@herowndomain.org 1978-05-24 416 323-3232
|
|
330 4 James Smith jim@supergig.co.uk 1980-10-20 416 323-8888
|
|
331 =========== ========== ========== ===================== ========== ============
|
|
332
|
|
333
|
|
334 Regular_expression_ functions are included for:
|
|
335
|
|
336 ::
|
|
337
|
|
338 matching: re_match('pattern',column)
|
|
339
|
|
340 SELECT t1.FirstName, t1.LastName
|
|
341 FROM t1
|
|
342 WHERE re_match('^.*\.(net|org)$',c4)
|
|
343
|
|
344 Results:
|
|
345
|
|
346 =========== ==========
|
|
347 #FirstName LastName
|
|
348 =========== ==========
|
|
349 Steven Goldfish
|
|
350 Paula Brown
|
|
351 =========== ==========
|
|
352
|
|
353
|
|
354 ::
|
|
355
|
|
356 searching: re_search('pattern',column)
|
|
357 substituting: re_sub('pattern','replacement',column)
|
|
358
|
|
359 SELECT t1.FirstName, t1.LastName, re_sub('^\d{2}(\d{2})-(\d\d)-(\d\d)','\3/\2/\1',BirthDate) as "DOB"
|
|
360 FROM t1
|
|
361 WHERE re_search('[hp]er',c4)
|
|
362
|
|
363 Results:
|
|
364
|
|
365
|
|
366 =========== ========== ==========
|
|
367 #FirstName LastName DOB
|
|
368 =========== ========== ==========
|
|
369 Steven Goldfish 04/04/74
|
|
370 Paula Brown 24/05/78
|
|
371 James Smith 20/10/80
|
|
372 =========== ========== ==========
|
|
373
|
|
374 .. _Regular_expression: https://docs.python.org/release/2.7/library/re.html
|
|
375 .. _SQLite: http://www.sqlite.org/index.html
|
|
376 .. _SQLite_functions: http://www.sqlite.org/docs.html
|
|
377
|
|
378
|
|
379 ]]>
|
|
380 </token>
|
|
381
|
|
382 </macros>
|
|
383
|