comparison env/lib/python3.9/site-packages/galaxy/util/rules_dsl.py @ 0:4f3585e2f14b draft default tip

"planemo upload commit 60cee0fc7c0cda8592644e1aad72851dec82c959"
author shellac
date Mon, 22 Mar 2021 18:12:50 +0000
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:4f3585e2f14b
1 import abc
2 import itertools
3 import re
4 from typing import List, Type
5
6 import yaml
7 from pkg_resources import resource_stream
8
9
def get_rules_specification():
    """Load and return the parsed rules DSL specification.

    Opens the ``rules_dsl_spec.yml`` resource that is packaged next to this
    module and parses it with ``yaml.safe_load``.

    :returns: the object produced by ``yaml.safe_load`` for the spec file.
    """
    stream = resource_stream(__name__, 'rules_dsl_spec.yml')
    try:
        return yaml.safe_load(stream)
    finally:
        # Release the resource handle explicitly instead of leaving it to
        # the garbage collector.
        stream.close()
12
13
14 def _ensure_rule_contains_keys(rule, keys):
15 for key, instance_class in keys.items():
16 if key not in rule:
17 raise ValueError("Rule of type [{}] does not contain key [{}].".format(rule["type"], key))
18 value = rule[key]
19 if not isinstance(value, instance_class):
20 raise ValueError("Rule of type [{}] does not contain correct value type for key [{}].".format(rule["type"], key))
21
22
23 def _ensure_key_value_in(rule, key, values):
24 value = rule[key]
25 if value not in values:
26 raise ValueError(f"Invalid value [{value}] for [{key}] encountered.")
27
28
29 def _ensure_valid_pattern(expression):
30 re.compile(expression)
31
32
def apply_regex(regex, target, data, replacement=None, group_count=None):
    """Append regex-derived column(s) to every row of ``data``.

    For each row, ``regex`` is searched in ``row[target]`` and a new row
    (``row`` plus extra cells) is produced:

    * ``replacement`` given: one cell, the match expanded through the
      ``replacement`` template (``Match.expand``).
    * no ``replacement`` and a truthy ``group_count``: one cell per captured
      group; the number of groups must equal ``group_count``.
    * otherwise: one cell holding the whole match.

    :param regex: regular expression source string.
    :param target: index of the column to search.
    :param data: list of rows (lists).
    :param replacement: optional expansion template.
    :param group_count: optional expected number of capture groups.
    :returns: a new list of extended rows; the input rows are not mutated.
    :raises Exception: if the expression does not match a row, or the number
        of groups differs from ``group_count``.
    """
    pattern = re.compile(regex)

    def new_row(row):
        source = row[target]
        match = pattern.search(source)
        # Check for a failed match before branching so that the replacement
        # path reports the same descriptive error instead of failing with an
        # AttributeError on ``None``.
        if not match:
            raise Exception(f"Problem applying regular expression [{regex}] to [{source}].")

        if replacement is not None:
            return row + [match.expand(replacement)]

        if group_count:
            if len(match.groups()) != group_count:
                raise Exception("Problem applying regular expression, wrong number of groups found.")
            return row + list(match.groups())

        return row + [match.group(0)]

    return [new_row(row) for row in data]
57
58
class BaseRuleDefinition(metaclass=abc.ABCMeta):
    """Abstract interface implemented by every rule definition.

    Concrete subclasses supply ``rule_type`` (typically as a plain class
    attribute) together with ``validate_rule`` and ``apply``.
    """

    # ``abc.abstractproperty`` is deprecated; stacking ``property`` on top of
    # ``abc.abstractmethod`` is the documented replacement.
    @property
    @abc.abstractmethod
    def rule_type(self):
        """Short string describing type of rule (plugin class) to use."""

    @abc.abstractmethod
    def validate_rule(self, rule):
        """Validate dictified rule definition of this type."""

    @abc.abstractmethod
    def apply(self, rule, data, sources):
        """Apply validated, dictified rule definition to supplied data."""
72
73
class AddColumnMetadataRuleDefinition(BaseRuleDefinition):
    """Append a column taken from the metadata of each row's source.

    ``rule["value"]`` selects what is appended:

    * ``"identifier<N>"`` - ``sources[i]["identifiers"][N]``
    * ``"tags"`` - the source's tags, sorted and joined with commas
    """
    rule_type = "add_column_metadata"

    def validate_rule(self, rule):
        """Require a string ``value`` entry."""
        _ensure_rule_contains_keys(rule, {"value": str})

    def apply(self, rule, data, sources):
        """Return ``(new_rows, sources)`` with one metadata cell added per row.

        :raises ValueError: if ``rule["value"]`` is neither ``"tags"`` nor
            starts with ``"identifier"`` (also raised by ``int`` when the
            identifier suffix is not a number).
        """
        rule_value = rule["value"]
        if rule_value.startswith("identifier"):
            identifier_index = int(rule_value[len("identifier"):])
            new_rows = [
                row + [sources[index]["identifiers"][identifier_index]]
                for index, row in enumerate(data)
            ]
        elif rule_value == "tags":
            new_rows = [
                row + [",".join(sorted(sources[index]["tags"]))]
                for index, row in enumerate(data)
            ]
        else:
            # Without this branch ``new_rows`` was never bound and the
            # method failed with an UnboundLocalError.
            raise ValueError(f"Invalid value [{rule_value}] for [value] encountered.")

        return new_rows, sources
100
101
class AddColumnGroupTagValueRuleDefinition(BaseRuleDefinition):
    """Append the value of a ``group:<name>:<value>`` tag as a new column."""
    rule_type = "add_column_group_tag_value"

    def validate_rule(self, rule):
        """Require a string ``value`` entry (the group name)."""
        _ensure_rule_contains_keys(rule, {"value": str})

    def apply(self, rule, data, sources):
        """Return ``(new_rows, sources)`` with the group tag value appended.

        For each row the tags of the matching source are scanned in sorted
        order; the first tag starting with ``group:<value>:`` supplies the
        cell. When none matches, ``rule["default_value"]`` (or ``""`` if that
        key is absent) is used.
        """
        tag_prefix = "group:%s:" % rule["value"]
        prefix_length = len(tag_prefix)

        def group_value_for(source):
            candidates = (
                tag[prefix_length:]
                for tag in sorted(source["tags"])
                if tag.startswith(tag_prefix)
            )
            found = next(candidates, None)
            if found is None:
                found = rule.get("default_value", "")
            return found

        extended = [row + [group_value_for(sources[position])] for position, row in enumerate(data)]
        return extended, sources
128
129
class AddColumnConcatenateRuleDefinition(BaseRuleDefinition):
    """Append a column holding the concatenation of two existing columns."""
    rule_type = "add_column_concatenate"

    def validate_rule(self, rule):
        """Require integer ``target_column_0`` and ``target_column_1``."""
        _ensure_rule_contains_keys(rule, {"target_column_0": int, "target_column_1": int})

    def apply(self, rule, data, sources):
        """Return ``(new_rows, sources)``; each new cell is ``row[c0] + row[c1]``."""
        left = rule["target_column_0"]
        right = rule["target_column_1"]
        return [row + [row[left] + row[right]] for row in data], sources
145
146
class AddColumnBasenameRuleDefinition(BaseRuleDefinition):
    """Append the part of a column's value that follows the last ``/``."""
    rule_type = "add_column_basename"

    def validate_rule(self, rule):
        """Require an integer ``target_column``."""
        _ensure_rule_contains_keys(rule, {"target_column": int})

    def apply(self, rule, data, sources):
        """Return ``(new_rows, sources)`` using ``apply_regex`` with ``[^/]*$``."""
        target_column = rule["target_column"]
        # Matches the trailing run of characters that contains no slash.
        basename_expression = r"[^/]*$"
        new_data = apply_regex(basename_expression, target_column, data)
        return new_data, sources
157
158
class AddColumnRegexRuleDefinition(BaseRuleDefinition):
    """Append column(s) produced by applying a regular expression."""
    rule_type = "add_column_regex"

    def validate_rule(self, rule):
        """Require integer ``target_column`` and a compilable ``expression``."""
        _ensure_rule_contains_keys(rule, {"target_column": int, "expression": str})
        _ensure_valid_pattern(rule["expression"])

    def apply(self, rule, data, sources):
        """Return ``(new_rows, sources)`` by delegating to ``apply_regex``.

        The optional ``replacement`` and ``group_count`` entries of the rule
        are forwarded; each is ``None`` when absent.
        """
        target_column = rule["target_column"]
        pattern_source = rule["expression"]
        new_data = apply_regex(
            pattern_source,
            target_column,
            data,
            rule.get("replacement"),
            rule.get("group_count"),
        )
        return new_data, sources
173
174
class AddColumnRownumRuleDefinition(BaseRuleDefinition):
    """Append a running row number (as a string) to every row."""
    rule_type = "add_column_rownum"

    def validate_rule(self, rule):
        """Require an integer ``start``."""
        _ensure_rule_contains_keys(rule, {"start": int})

    def apply(self, rule, data, sources):
        """Return ``(new_rows, sources)``; row ``i`` gets ``"%d" % (i + start)``."""
        offset = rule["start"]
        numbered = [row + ["%d" % (position + offset)] for position, row in enumerate(data)]
        return numbered, sources
189
190
class AddColumnValueRuleDefinition(BaseRuleDefinition):
    """Append the same fixed value to every row."""
    rule_type = "add_column_value"

    def validate_rule(self, rule):
        """Require a string ``value`` entry."""
        _ensure_rule_contains_keys(rule, {"value": str})

    def apply(self, rule, data, sources):
        """Return ``(new_rows, sources)`` with ``str(rule["value"])`` appended."""
        constant = rule["value"]
        return [row + [str(constant)] for row in data], sources
205
206
class AddColumnSubstrRuleDefinition(BaseRuleDefinition):
    """Append a prefix- or suffix-based slice of an existing column."""
    rule_type = "add_column_substr"

    def validate_rule(self, rule):
        """Require ``target_column``/``length`` ints and a known ``substr_type``."""
        _ensure_rule_contains_keys(rule, {
            "target_column": int,
            "length": int,
            "substr_type": str,
        })
        _ensure_key_value_in(rule, "substr_type", ["keep_prefix", "drop_prefix", "keep_suffix", "drop_suffix"])

    def apply(self, rule, data, sources):
        """Return ``(new_rows, sources)`` with the selected substring appended.

        ``keep_prefix`` keeps the first ``length`` characters, ``drop_prefix``
        removes them, ``keep_suffix`` keeps the last ``length`` characters and
        any other type (``drop_suffix`` after validation) removes them. The
        suffix variants clamp the computed bound at zero.
        """
        target = rule["target_column"]
        length = rule["length"]
        substr_type = rule["substr_type"]

        def slice_bounds(total):
            # Returns the (start, end) pair used to slice a value of ``total`` length.
            if substr_type == "keep_prefix":
                return 0, length
            if substr_type == "drop_prefix":
                return length, total
            if substr_type == "keep_suffix":
                return max(total - length, 0), total
            return 0, max(total - length, 0)

        def new_row(row):
            original_value = row[target]
            start, end = slice_bounds(len(original_value))
            return row + [original_value[start:end]]

        return [new_row(row) for row in data], sources
244
245
class RemoveColumnsRuleDefinition(BaseRuleDefinition):
    """Drop the listed column indices from every row."""
    rule_type = "remove_columns"

    def validate_rule(self, rule):
        """Require a list-valued ``target_columns``."""
        _ensure_rule_contains_keys(rule, {
            "target_columns": list,
        })

    def apply(self, rule, data, sources):
        """Return ``(new_rows, sources)`` without the cells at ``target_columns``."""
        dropped = rule["target_columns"]
        trimmed = [
            [cell for position, cell in enumerate(row) if position not in dropped]
            for row in data
        ]
        return trimmed, sources
265
266
267 def _filter_index(func, iterable):
268 result = []
269 for index, x in enumerate(iterable):
270 if func(index):
271 result.append(x)
272
273 return result
274
275
class AddFilterRegexRuleDefinition(BaseRuleDefinition):
    """Keep (or, when inverted, drop) rows whose column matches a regex."""
    rule_type = "add_filter_regex"

    def validate_rule(self, rule):
        """Require ``target_column`` (int), ``invert`` (bool) and a valid ``expression``."""
        _ensure_rule_contains_keys(rule, {
            "target_column": int,
            "invert": bool,
            "expression": str,
        })
        _ensure_valid_pattern(rule["expression"])

    def apply(self, rule, data, sources):
        """Return the filtered ``(data, sources)`` pair.

        A row is kept when ``expression`` is found in ``row[target_column]``
        and ``invert`` is false, or when it is not found and ``invert`` is
        true. The same index predicate is applied to both ``data`` and
        ``sources``.
        """
        target_column = rule["target_column"]
        invert = rule["invert"]
        # Compile once up front: the predicate runs twice per row (once for
        # data, once for sources) and the pattern never changes.
        pattern = re.compile(rule["expression"])

        def _filter(index):
            val = data[index][target_column]
            return not invert if pattern.search(val) else invert

        return _filter_index(_filter, data), _filter_index(_filter, sources)
299
300
class AddFilterCountRuleDefinition(BaseRuleDefinition):
    """Filter rows by position relative to the first or last ``count`` rows."""
    rule_type = "add_filter_count"

    def validate_rule(self, rule):
        """Require ``count`` (int), ``invert`` (bool) and ``which`` in first/last."""
        _ensure_rule_contains_keys(rule, {
            "count": int,
            "invert": bool,
            "which": str,
        })
        _ensure_key_value_in(rule, "which", ["first", "last"])

    def apply(self, rule, data, sources):
        """Return the filtered ``(data, sources)`` pair.

        With ``invert`` false, ``which == "first"`` keeps rows whose index is
        ``>= count`` and any other ``which`` keeps rows whose index is
        ``< len(data) - count``. ``invert`` flips the selection.
        """
        total_rows = len(data)
        invert = rule["invert"]
        count = rule["count"]
        which = rule["which"]

        def _filter(index):
            if which == "first":
                outside_window = index >= count
            else:
                outside_window = index < (total_rows - count)
            if outside_window:
                return not invert
            return invert

        return _filter_index(_filter, data), _filter_index(_filter, sources)
326
327
class AddFilterEmptyRuleDefinition(BaseRuleDefinition):
    """Filter rows on whether a column's value is empty."""
    rule_type = "add_filter_empty"

    def validate_rule(self, rule):
        """Require ``target_column`` (int) and ``invert`` (bool)."""
        _ensure_rule_contains_keys(rule, {
            "target_column": int,
            "invert": bool
        })

    def apply(self, rule, data, sources):
        """Return the filtered ``(data, sources)`` pair.

        With ``invert`` false, rows whose target cell has non-zero length are
        kept; ``invert`` flips the selection.
        """
        invert = rule["invert"]
        target_column = rule["target_column"]

        def _filter(index):
            if len(data[index][target_column]) != 0:
                return not invert
            return invert

        return _filter_index(_filter, data), _filter_index(_filter, sources)
346
347
class AddFilterMatchesRuleDefinition(BaseRuleDefinition):
    """Filter rows on exact equality of a column with a given value."""
    rule_type = "add_filter_matches"

    def validate_rule(self, rule):
        """Require ``target_column`` (int), ``invert`` (bool) and ``value`` (str)."""
        _ensure_rule_contains_keys(rule, {
            "target_column": int,
            "invert": bool,
            "value": str,
        })

    def apply(self, rule, data, sources):
        """Return the filtered ``(data, sources)`` pair.

        With ``invert`` false, rows whose target cell equals ``value`` are
        kept; ``invert`` flips the selection.
        """
        invert = rule["invert"]
        target_column = rule["target_column"]
        wanted = rule["value"]

        def _filter(index):
            if data[index][target_column] == wanted:
                return not invert
            return invert

        return _filter_index(_filter, data), _filter_index(_filter, sources)
369
370
class AddFilterCompareRuleDefinition(BaseRuleDefinition):
    """Filter rows by numerically comparing a column against a value."""
    rule_type = "add_filter_compare"

    def validate_rule(self, rule):
        """Require ``target_column``/``value`` ints and a known ``compare_type``."""
        _ensure_rule_contains_keys(rule, {
            "target_column": int,
            "value": int,
            "compare_type": str,
        })
        _ensure_key_value_in(rule, "compare_type", ["less_than", "less_than_equal", "greater_than", "greater_than_equal"])

    def apply(self, rule, data, sources):
        """Return the filtered ``(data, sources)`` pair.

        Each target cell is converted with ``float`` and a row is kept when
        the comparison named by ``compare_type`` against ``value`` holds.
        """
        target_column = rule["target_column"]
        threshold = rule["value"]
        compare_type = rule["compare_type"]

        def _filter(index):
            cell = float(data[index][target_column])
            if compare_type == "less_than":
                matches = cell < threshold
            elif compare_type == "less_than_equal":
                matches = cell <= threshold
            elif compare_type == "greater_than":
                matches = cell > threshold
            elif compare_type == "greater_than_equal":
                matches = cell >= threshold

            return matches

        return _filter_index(_filter, data), _filter_index(_filter, sources)
402
403
class SortRuleDefinition(BaseRuleDefinition):
    """Sort rows (and their sources in lockstep) by one column."""
    rule_type = "sort"

    def validate_rule(self, rule):
        """Require ``target_column`` (int) and ``numeric`` (bool)."""
        _ensure_rule_contains_keys(rule, {
            "target_column": int,
            "numeric": bool,
        })

    def apply(self, rule, data, sources):
        """Return ``(sorted_data, sorted_sources)``.

        Rows are paired with their sources and ordered by the target cell,
        converted with ``float`` first when ``numeric`` is true.
        """
        target = rule["target_column"]
        numeric = rule["numeric"]

        def sort_key(pair):
            cell = pair[0][target]
            return float(cell) if numeric else cell

        ordered = sorted(zip(data, sources), key=sort_key)
        new_data = [row for row, _ in ordered]
        new_sources = [source for _, source in ordered]
        return new_data, new_sources
435
436
class SwapColumnsRuleDefinition(BaseRuleDefinition):
    """Exchange the values of two columns in every row."""
    rule_type = "swap_columns"

    def validate_rule(self, rule):
        """Require integer ``target_column_0`` and ``target_column_1``."""
        _ensure_rule_contains_keys(rule, {
            "target_column_0": int,
            "target_column_1": int,
        })

    def apply(self, rule, data, sources):
        """Return ``(new_rows, sources)``; rows are copied, not mutated."""
        first = rule["target_column_0"]
        second = rule["target_column_1"]

        def swapped(row):
            duplicate = row[:]
            duplicate[first], duplicate[second] = row[second], row[first]
            return duplicate

        return [swapped(row) for row in data], sources
457
458
class SplitColumnsRuleDefinition(BaseRuleDefinition):
    """Turn every row into two rows by distributing selected columns."""
    rule_type = "split_columns"

    def validate_rule(self, rule):
        """Require list-valued ``target_columns_0`` and ``target_columns_1``."""
        _ensure_rule_contains_keys(rule, {
            "target_columns_0": list,
            "target_columns_1": list,
        })

    def apply(self, rule, data, sources):
        """Return ``(data, sources)`` with each row and source doubled.

        Cells listed in ``target_columns_0`` go only to the first output row,
        cells listed only in ``target_columns_1`` go only to the second, and
        all remaining cells are copied to both.
        """
        first_only = rule["target_columns_0"]
        second_only = rule["target_columns_1"]

        def split_row(row):
            first_row = [
                cell for position, cell in enumerate(row)
                if position in first_only or position not in second_only
            ]
            # Membership in the first list takes precedence, so the second
            # row simply omits those positions.
            second_row = [
                cell for position, cell in enumerate(row)
                if position not in first_only
            ]
            return [first_row, second_row]

        def duplicate(source):
            return [source, source]

        return flat_map(split_row, data), flat_map(duplicate, sources)
490
491
def flat_map(f, items):
    """Apply ``f`` to each item and concatenate the resulting iterables.

    :param f: callable returning an iterable for each item.
    :param items: iterable of inputs.
    :returns: a single flat list of all produced elements, in order.
    """
    return [element for item in items for element in f(item)]
494
495
class RuleSet:
    """A list of dictified rules plus the column mapping that goes with them."""

    def __init__(self, rule_set_as_dict):
        """Store ``rule_set_as_dict["rules"]`` and the optional ``"mapping"`` list."""
        self.raw_rules = rule_set_as_dict["rules"]
        self.raw_mapping = rule_set_as_dict.get("mapping", [])

    @property
    def rules(self):
        """The raw rule dictionaries, unchanged."""
        return self.raw_rules

    def _rules_with_definitions(self):
        """Yield ``(rule, definition)`` pairs, looking definitions up by rule type."""
        for raw_rule in self.raw_rules:
            definition = RULES_DEFINITIONS[raw_rule["type"]]
            yield (raw_rule, definition)

    def apply(self, data, sources):
        """Validate and apply every rule in order; return the final ``(data, sources)``."""
        for raw_rule, definition in self._rules_with_definitions():
            definition.validate_rule(raw_rule)
            data, sources = definition.apply(raw_rule, data, sources)

        return data, sources

    @property
    def has_errors(self):
        """``True`` if looking up or validating any rule raises, else ``False``."""
        try:
            for raw_rule, definition in self._rules_with_definitions():
                definition.validate_rule(raw_rule)
        except Exception:
            return True
        return False

    @property
    def mapping_as_dict(self):
        """The mapping entries keyed by their ``"type"`` (later entries win)."""
        return {entry["type"]: entry for entry in self.raw_mapping}

    # Rest of this is generic, things here are Galaxy collection specific, think about
    # a subclass of RuleSet for collection creation.
    @property
    def identifier_columns(self):
        """Columns of ``list_identifiers`` followed by the first ``paired_identifier`` column."""
        by_type = self.mapping_as_dict
        columns = []
        if "list_identifiers" in by_type:
            columns += by_type["list_identifiers"]["columns"]
        if "paired_identifier" in by_type:
            columns.append(by_type["paired_identifier"]["columns"][0])

        return columns

    @property
    def collection_type(self):
        """Colon-joined type string: one ``list`` per list column, then ``paired`` if mapped."""
        by_type = self.mapping_as_dict
        list_columns = by_type.get("list_identifiers", {"columns": []})["columns"]
        parts = ["list" for _ in list_columns]
        if "paired_identifier" in by_type:
            parts.append("paired")
        return ":".join(parts)

    @property
    def display(self):
        """Multi-line, human-readable listing of the rules and column definitions."""
        lines = ["Rules:\n"]
        lines.extend("- %s\n" % r for r in self.raw_rules)
        lines.append("Column Definitions:\n")
        lines.extend("- %s\n" % m for m in self.raw_mapping)
        return "".join(lines)
567
568
# Every rule definition class known to the DSL, in registration order.
RULES_DEFINITION_CLASSES: List[Type[BaseRuleDefinition]] = [
    # Column-adding rules.
    AddColumnMetadataRuleDefinition,
    AddColumnGroupTagValueRuleDefinition,
    AddColumnConcatenateRuleDefinition,
    AddColumnBasenameRuleDefinition,
    AddColumnRegexRuleDefinition,
    AddColumnRownumRuleDefinition,
    AddColumnValueRuleDefinition,
    AddColumnSubstrRuleDefinition,
    # Column removal.
    RemoveColumnsRuleDefinition,
    # Row filters.
    AddFilterRegexRuleDefinition,
    AddFilterCountRuleDefinition,
    AddFilterEmptyRuleDefinition,
    AddFilterMatchesRuleDefinition,
    AddFilterCompareRuleDefinition,
    # Ordering and reshaping.
    SortRuleDefinition,
    SwapColumnsRuleDefinition,
    SplitColumnsRuleDefinition,
]

# Maps each ``rule_type`` string to one shared instance of its definition
# class; this is the lookup table used to resolve a rule's ``"type"``.
RULES_DEFINITIONS = {}
for rule_class in RULES_DEFINITION_CLASSES:
    RULES_DEFINITIONS[rule_class.rule_type] = rule_class()