Mercurial > repos > shellac > sam_consensus_v3
comparison env/lib/python3.9/site-packages/galaxy/util/rules_dsl.py @ 0:4f3585e2f14b draft default tip
"planemo upload commit 60cee0fc7c0cda8592644e1aad72851dec82c959"
author | shellac |
---|---|
date | Mon, 22 Mar 2021 18:12:50 +0000 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:4f3585e2f14b |
---|---|
1 import abc | |
2 import itertools | |
3 import re | |
4 from typing import List, Type | |
5 | |
6 import yaml | |
7 from pkg_resources import resource_stream | |
8 | |
9 | |
def get_rules_specification():
    """Load and return the parsed contents of ``rules_dsl_spec.yml``.

    The file is opened as a package resource resolved relative to this
    module's name and parsed with ``yaml.safe_load``.
    """
    # Use a context manager so the resource stream is closed once parsing
    # finishes instead of being left for the garbage collector.
    with resource_stream(__name__, 'rules_dsl_spec.yml') as stream:
        return yaml.safe_load(stream)
12 | |
13 | |
def _ensure_rule_contains_keys(rule, keys):
    """Check that ``rule`` holds every key in ``keys`` with the expected type.

    ``keys`` maps each required key to the class its value must be an
    instance of.  Raises ``ValueError`` for the first key that is missing or
    whose value is not an instance of the expected class.
    """
    for required_key, expected_class in keys.items():
        if required_key not in rule:
            raise ValueError(f"Rule of type [{rule['type']}] does not contain key [{required_key}].")
        if not isinstance(rule[required_key], expected_class):
            raise ValueError(
                f"Rule of type [{rule['type']}] does not contain correct value type for key [{required_key}]."
            )
21 | |
22 | |
def _ensure_key_value_in(rule, key, values):
    """Raise ``ValueError`` unless ``rule[key]`` is one of ``values``."""
    candidate = rule[key]
    if candidate in values:
        return
    raise ValueError(f"Invalid value [{candidate}] for [{key}] encountered.")
27 | |
28 | |
def _ensure_valid_pattern(expression):
    """Raise ``re.error`` if ``expression`` cannot be compiled as a regex."""
    # Compiling is the validation; the compiled pattern itself is discarded.
    re.compile(expression)
31 | |
32 | |
def apply_regex(regex, target, data, replacement=None, group_count=None):
    """Return new rows with regex-derived value(s) appended to each row.

    For every row in ``data`` the pattern ``regex`` is searched in the cell at
    index ``target`` and the result is appended to a copy of the row:

    * ``replacement`` given: the match expanded through the ``replacement``
      template (one new column).
    * ``group_count`` truthy: all captured groups (one new column per group).
    * otherwise: the whole matched text (one new column).

    Raises ``Exception`` when the pattern does not match a cell, or when
    ``group_count`` is given and differs from the number of captured groups.
    """
    pattern = re.compile(regex)

    def new_row(row):
        source = row[target]
        match = pattern.search(source)
        # Check the match before either branch: previously the replacement
        # path called ``.expand`` on ``None`` and failed with AttributeError.
        if not match:
            raise Exception(f"Problem applying regular expression [{regex}] to [{source}].")

        if replacement is not None:
            return row + [match.expand(replacement)]

        if group_count:
            groups = match.groups()
            if len(groups) != group_count:
                raise Exception("Problem applying regular expression, wrong number of groups found.")
            return row + list(groups)

        return row + [match.group(0)]

    return [new_row(row) for row in data]
57 | |
58 | |
class BaseRuleDefinition(metaclass=abc.ABCMeta):
    """Abstract interface every rule definition (plugin class) implements."""

    # ``abc.abstractproperty`` is deprecated; stacking ``property`` on
    # ``abc.abstractmethod`` is the supported spelling.  Subclasses may still
    # satisfy it with a plain class attribute.
    @property
    @abc.abstractmethod
    def rule_type(self):
        """Short string describing type of rule (plugin class) to use."""

    @abc.abstractmethod
    def validate_rule(self, rule):
        """Validate dictified rule definition of this type."""

    @abc.abstractmethod
    def apply(self, rule, data, sources):
        """Apply validated, dictified rule definition to supplied data."""
72 | |
73 | |
class AddColumnMetadataRuleDefinition(BaseRuleDefinition):
    """Rule appending a column taken from each row's source metadata."""
    rule_type = "add_column_metadata"

    def validate_rule(self, rule):
        """Require a string ``value`` key on the rule."""
        _ensure_rule_contains_keys(rule, {"value": str})

    def apply(self, rule, data, sources):
        """Append a metadata column to every row and return ``(rows, sources)``.

        ``rule["value"]`` selects the metadata:

        * ``identifier<N>``: element ``N`` of the source's ``identifiers``.
        * ``tags``: the source's ``tags`` sorted and joined with commas.

        Raises ``ValueError`` for any other value (previously this surfaced as
        an ``UnboundLocalError`` on the return statement).
        """
        rule_value = rule["value"]
        if rule_value.startswith("identifier"):
            identifier_index = int(rule_value[len("identifier"):])
            new_rows = [
                row + [sources[index]["identifiers"][identifier_index]]
                for index, row in enumerate(data)
            ]
        elif rule_value == "tags":
            new_rows = [
                row + [",".join(sorted(sources[index]["tags"]))]
                for index, row in enumerate(data)
            ]
        else:
            raise ValueError(f"Invalid value [{rule_value}] for [value] encountered.")

        return new_rows, sources
100 | |
101 | |
class AddColumnGroupTagValueRuleDefinition(BaseRuleDefinition):
    """Rule appending the value of a ``group:<name>:`` tag as a new column."""
    rule_type = "add_column_group_tag_value"

    def validate_rule(self, rule):
        """Require a string ``value`` key on the rule."""
        _ensure_rule_contains_keys(rule, {"value": str})

    def apply(self, rule, data, sources):
        """Append the matching group tag value to each row.

        The first tag (in sorted order) starting with ``group:<value>:``
        supplies the column value; when no tag matches, the rule's
        ``default_value`` (or ``""``) is used.
        """
        prefix = "group:%s:" % rule["value"]
        prefix_length = len(prefix)

        extended = []
        for position, row in enumerate(data):
            matching_values = (
                tag[prefix_length:]
                for tag in sorted(sources[position]["tags"])
                if tag.startswith(prefix)
            )
            found = next(matching_values, None)
            if found is None:
                found = rule.get("default_value", "")
            extended.append(row + [found])

        return extended, sources
128 | |
129 | |
class AddColumnConcatenateRuleDefinition(BaseRuleDefinition):
    """Rule appending the concatenation of two existing columns."""
    rule_type = "add_column_concatenate"

    def validate_rule(self, rule):
        """Require integer ``target_column_0`` and ``target_column_1`` keys."""
        _ensure_rule_contains_keys(rule, {"target_column_0": int, "target_column_1": int})

    def apply(self, rule, data, sources):
        """Append ``row[target_column_0] + row[target_column_1]`` to each row."""
        left = rule["target_column_0"]
        right = rule["target_column_1"]
        extended = [row + [row[left] + row[right]] for row in data]
        return extended, sources
145 | |
146 | |
class AddColumnBasenameRuleDefinition(BaseRuleDefinition):
    """Rule appending the text after the last ``/`` of a column."""
    rule_type = "add_column_basename"

    def validate_rule(self, rule):
        """Require an integer ``target_column`` key."""
        _ensure_rule_contains_keys(rule, {"target_column": int})

    def apply(self, rule, data, sources):
        """Append the trailing run of non-``/`` characters of the target column."""
        # Named so that it does not shadow the ``re`` module inside the method.
        basename_pattern = r"[^/]*$"
        return apply_regex(basename_pattern, rule["target_column"], data), sources
157 | |
158 | |
class AddColumnRegexRuleDefinition(BaseRuleDefinition):
    """Rule appending column(s) derived by applying a regular expression."""
    rule_type = "add_column_regex"

    def validate_rule(self, rule):
        """Require integer ``target_column`` and a compilable ``expression``."""
        _ensure_rule_contains_keys(rule, {"target_column": int, "expression": str})
        _ensure_valid_pattern(rule["expression"])

    def apply(self, rule, data, sources):
        """Delegate to ``apply_regex`` with the rule's optional settings."""
        new_data = apply_regex(
            rule["expression"],
            rule["target_column"],
            data,
            rule.get("replacement"),
            rule.get("group_count"),
        )
        return new_data, sources
173 | |
174 | |
class AddColumnRownumRuleDefinition(BaseRuleDefinition):
    """Rule appending a running row number as a string column."""
    rule_type = "add_column_rownum"

    def validate_rule(self, rule):
        """Require an integer ``start`` key."""
        _ensure_rule_contains_keys(rule, {"start": int})

    def apply(self, rule, data, sources):
        """Append ``start + row position`` (formatted with ``%d``) to each row."""
        offset = rule["start"]
        numbered = [row + ["%d" % (position + offset)] for position, row in enumerate(data)]
        return numbered, sources
189 | |
190 | |
class AddColumnValueRuleDefinition(BaseRuleDefinition):
    """Rule appending the same fixed value to every row."""
    rule_type = "add_column_value"

    def validate_rule(self, rule):
        """Require a string ``value`` key."""
        _ensure_rule_contains_keys(rule, {"value": str})

    def apply(self, rule, data, sources):
        """Append ``str(rule["value"])`` to each row."""
        constant = rule["value"]
        return [row + [str(constant)] for row in data], sources
205 | |
206 | |
class AddColumnSubstrRuleDefinition(BaseRuleDefinition):
    """Rule appending a prefix/suffix-based substring of a column."""
    rule_type = "add_column_substr"

    def validate_rule(self, rule):
        """Require ``target_column``, ``length`` and a known ``substr_type``."""
        required = {
            "target_column": int,
            "length": int,
            "substr_type": str,
        }
        _ensure_rule_contains_keys(rule, required)
        _ensure_key_value_in(rule, "substr_type", ["keep_prefix", "drop_prefix", "keep_suffix", "drop_suffix"])

    def apply(self, rule, data, sources):
        """Append the slice selected by ``substr_type`` and ``length``."""
        column = rule["target_column"]
        count = rule["length"]
        mode = rule["substr_type"]

        def extend(row):
            text = row[column]
            size = len(text)

            if mode == "keep_prefix":
                begin, stop = 0, count
            elif mode == "drop_prefix":
                begin, stop = count, size
            elif mode == "keep_suffix":
                # Clamp at zero so an over-long length keeps the whole value.
                begin, stop = max(size - count, 0), size
            else:
                # Any remaining type is handled as "drop_suffix".
                begin, stop = 0, max(size - count, 0)

            return row + [text[begin:stop]]

        return [extend(row) for row in data], sources
244 | |
245 | |
class RemoveColumnsRuleDefinition(BaseRuleDefinition):
    """Rule dropping the listed column indexes from every row."""
    rule_type = "remove_columns"

    def validate_rule(self, rule):
        """Require a list-valued ``target_columns`` key."""
        _ensure_rule_contains_keys(rule, {"target_columns": list})

    def apply(self, rule, data, sources):
        """Return rows without the cells whose index is in ``target_columns``."""
        dropped = rule["target_columns"]
        trimmed = [
            [cell for position, cell in enumerate(row) if position not in dropped]
            for row in data
        ]
        return trimmed, sources
265 | |
266 | |
def _filter_index(func, iterable):
    """Return the items of ``iterable`` whose *index* satisfies ``func``.

    ``func`` is called with each item's position, not the item itself.
    """
    return [item for position, item in enumerate(iterable) if func(position)]
274 | |
275 | |
class AddFilterRegexRuleDefinition(BaseRuleDefinition):
    """Rule filtering rows by whether a column matches a regular expression."""
    rule_type = "add_filter_regex"

    def validate_rule(self, rule):
        """Require ``target_column``, ``invert`` and a compilable ``expression``."""
        _ensure_rule_contains_keys(rule, {
            "target_column": int,
            "invert": bool,
            "expression": str,
        })
        _ensure_valid_pattern(rule["expression"])

    def apply(self, rule, data, sources):
        """Keep rows (and their sources) whose target column matches.

        With ``invert`` false, rows where the expression is found are kept;
        with ``invert`` true, rows where it is not found are kept.
        """
        target_column = rule["target_column"]
        invert = rule["invert"]
        # Compile once up front rather than on every predicate call.
        pattern = re.compile(rule["expression"])

        def _filter(index):
            val = data[index][target_column]
            return not invert if pattern.search(val) else invert

        return _filter_index(_filter, data), _filter_index(_filter, sources)
299 | |
300 | |
class AddFilterCountRuleDefinition(BaseRuleDefinition):
    """Rule filtering rows by their position from the start or the end."""
    rule_type = "add_filter_count"

    def validate_rule(self, rule):
        """Require ``count``, ``invert`` and ``which`` in ``first``/``last``."""
        required = {
            "count": int,
            "invert": bool,
            "which": str,
        }
        _ensure_rule_contains_keys(rule, required)
        _ensure_key_value_in(rule, "which", ["first", "last"])

    def apply(self, rule, data, sources):
        """Filter rows and sources by index relative to ``count``.

        A position "matches" when it lies outside the first (or last)
        ``count`` rows; matching positions are kept unless ``invert`` is set.
        """
        total = len(data)
        invert = rule["invert"]
        limit = rule["count"]
        side = rule["which"]

        def keep(position):
            if side == "first":
                outside = position >= limit
            else:
                outside = position < (total - limit)
            if outside:
                return not invert
            return invert

        return _filter_index(keep, data), _filter_index(keep, sources)
326 | |
327 | |
class AddFilterEmptyRuleDefinition(BaseRuleDefinition):
    """Rule filtering rows by whether a column is empty."""
    rule_type = "add_filter_empty"

    def validate_rule(self, rule):
        """Require integer ``target_column`` and boolean ``invert`` keys."""
        _ensure_rule_contains_keys(rule, {"target_column": int, "invert": bool})

    def apply(self, rule, data, sources):
        """Keep rows with a non-empty target column (or empty when inverted)."""
        column = rule["target_column"]
        invert = rule["invert"]

        def keep(position):
            if len(data[position][column]) != 0:
                return not invert
            return invert

        return _filter_index(keep, data), _filter_index(keep, sources)
346 | |
347 | |
class AddFilterMatchesRuleDefinition(BaseRuleDefinition):
    """Rule filtering rows by exact equality of a column with a value."""
    rule_type = "add_filter_matches"

    def validate_rule(self, rule):
        """Require ``target_column``, ``invert`` and a string ``value``."""
        required = {
            "target_column": int,
            "invert": bool,
            "value": str,
        }
        _ensure_rule_contains_keys(rule, required)

    def apply(self, rule, data, sources):
        """Keep rows whose target column equals ``value`` (or not, if inverted)."""
        column = rule["target_column"]
        expected = rule["value"]
        invert = rule["invert"]

        def keep(position):
            if data[position][column] == expected:
                return not invert
            return invert

        return _filter_index(keep, data), _filter_index(keep, sources)
369 | |
370 | |
class AddFilterCompareRuleDefinition(BaseRuleDefinition):
    """Rule filtering rows by numeric comparison of a column with a value."""
    rule_type = "add_filter_compare"

    def validate_rule(self, rule):
        """Require ``target_column``, integer ``value`` and a known ``compare_type``."""
        _ensure_rule_contains_keys(rule, {
            "target_column": int,
            "value": int,
            "compare_type": str,
        })
        _ensure_key_value_in(rule, "compare_type", ["less_than", "less_than_equal", "greater_than", "greater_than_equal"])

    def apply(self, rule, data, sources):
        """Keep rows whose target column, converted to ``float``, satisfies the comparison.

        Raises ``ValueError`` for an unrecognised ``compare_type`` once a row
        is evaluated (previously an ``UnboundLocalError`` from the unset local).
        """
        target_column = rule["target_column"]
        value = rule["value"]
        compare_type = rule["compare_type"]

        def _filter(index):
            target_value = float(data[index][target_column])
            if compare_type == "less_than":
                return target_value < value
            if compare_type == "less_than_equal":
                return target_value <= value
            if compare_type == "greater_than":
                return target_value > value
            if compare_type == "greater_than_equal":
                return target_value >= value
            raise ValueError(f"Invalid value [{compare_type}] for [compare_type] encountered.")

        return _filter_index(_filter, data), _filter_index(_filter, sources)
402 | |
403 | |
class SortRuleDefinition(BaseRuleDefinition):
    """Rule sorting rows (and their sources in lockstep) by one column."""
    rule_type = "sort"

    def validate_rule(self, rule):
        """Require integer ``target_column`` and boolean ``numeric`` keys."""
        _ensure_rule_contains_keys(rule, {"target_column": int, "numeric": bool})

    def apply(self, rule, data, sources):
        """Return rows and sources ordered by the target column.

        When ``numeric`` is set the column is converted with ``float`` before
        comparison; otherwise the raw cell values are compared.
        """
        column = rule["target_column"]
        as_number = rule["numeric"]

        def sort_key(pair):
            cell = pair[0][column]
            return float(cell) if as_number else cell

        ordered = sorted(zip(data, sources), key=sort_key)
        ordered_rows = [row for row, _ in ordered]
        ordered_sources = [source for _, source in ordered]
        return ordered_rows, ordered_sources
435 | |
436 | |
class SwapColumnsRuleDefinition(BaseRuleDefinition):
    """Rule exchanging the contents of two columns in every row."""
    rule_type = "swap_columns"

    def validate_rule(self, rule):
        """Require integer ``target_column_0`` and ``target_column_1`` keys."""
        required = {
            "target_column_0": int,
            "target_column_1": int,
        }
        _ensure_rule_contains_keys(rule, required)

    def apply(self, rule, data, sources):
        """Return copies of the rows with the two target cells swapped."""
        first = rule["target_column_0"]
        second = rule["target_column_1"]

        def swapped(row):
            # Slice-copy so the caller's row is left untouched.
            duplicate = row[:]
            duplicate[first], duplicate[second] = row[second], row[first]
            return duplicate

        return [swapped(row) for row in data], sources
457 | |
458 | |
class SplitColumnsRuleDefinition(BaseRuleDefinition):
    """Rule turning each row into two rows that differ in selected columns."""
    rule_type = "split_columns"

    def validate_rule(self, rule):
        """Require list-valued ``target_columns_0`` and ``target_columns_1`` keys."""
        required = {
            "target_columns_0": list,
            "target_columns_1": list,
        }
        _ensure_rule_contains_keys(rule, required)

    def apply(self, rule, data, sources):
        """Split every row in two and duplicate each source accordingly.

        Cells listed in ``target_columns_0`` go only to the first new row,
        cells listed in ``target_columns_1`` only to the second, and all other
        cells are copied into both.
        """
        first_only = rule["target_columns_0"]
        second_only = rule["target_columns_1"]

        def split_row(row):
            first_half = []
            second_half = []
            for position, cell in enumerate(row):
                in_first = position in first_only
                if in_first or position not in second_only:
                    first_half.append(cell)
                if not in_first:
                    second_half.append(cell)
            return [first_half, second_half]

        doubled_data = flat_map(split_row, data)
        doubled_sources = flat_map(lambda source: [source, source], sources)
        return doubled_data, doubled_sources
490 | |
491 | |
def flat_map(f, items):
    """Apply ``f`` to each item and concatenate the resulting iterables into one list."""
    return [element for item in items for element in f(item)]
494 | |
495 | |
class RuleSet:
    """A list of rules plus column mapping, built from a plain dictionary."""

    def __init__(self, rule_set_as_dict):
        """Store the required ``rules`` and the optional ``mapping`` (default ``[]``)."""
        self.raw_rules = rule_set_as_dict["rules"]
        self.raw_mapping = rule_set_as_dict.get("mapping", [])

    @property
    def rules(self):
        """The rules exactly as supplied."""
        return self.raw_rules

    def _rules_with_definitions(self):
        """Yield ``(rule, definition)`` pairs, looking each rule up by its ``type``."""
        for raw_rule in self.raw_rules:
            yield (raw_rule, RULES_DEFINITIONS[raw_rule["type"]])

    def apply(self, data, sources):
        """Validate and apply each rule in order, threading data and sources through."""
        for raw_rule, definition in self._rules_with_definitions():
            definition.validate_rule(raw_rule)
            data, sources = definition.apply(raw_rule, data, sources)

        return data, sources

    @property
    def has_errors(self):
        """``True`` if looking up or validating any rule raises an exception."""
        try:
            for raw_rule, definition in self._rules_with_definitions():
                definition.validate_rule(raw_rule)
        except Exception:
            return True
        return False

    @property
    def mapping_as_dict(self):
        """The mapping entries keyed by their ``type`` (later entries win)."""
        return {entry["type"]: entry for entry in self.raw_mapping}

    # Rest of this is generic, things here are Galaxy collection specific, think about
    # subclass of RuleSet for collection creation.
    @property
    def identifier_columns(self):
        """Columns of ``list_identifiers`` followed by the first ``paired_identifier`` column."""
        by_type = self.mapping_as_dict
        columns = []
        list_mapping = by_type.get("list_identifiers")
        if list_mapping is not None:
            columns.extend(list_mapping["columns"])
        paired_mapping = by_type.get("paired_identifier")
        if paired_mapping is not None:
            columns.append(paired_mapping["columns"][0])

        return columns

    @property
    def collection_type(self):
        """Colon-joined type: one ``list`` per list identifier column, then ``paired`` if mapped."""
        by_type = self.mapping_as_dict
        list_columns = by_type.get("list_identifiers", {"columns": []})["columns"]
        parts = ["list" for _ in list_columns]
        if "paired_identifier" in by_type:
            parts.append("paired")
        return ":".join(parts)

    @property
    def display(self):
        """Multi-line text listing the raw rules and the raw column definitions."""
        pieces = ["Rules:\n"]
        pieces.extend("- %s\n" % raw_rule for raw_rule in self.raw_rules)
        pieces.append("Column Definitions:\n")
        pieces.extend("- %s\n" % entry for entry in self.raw_mapping)
        return "".join(pieces)
567 | |
568 | |
# Every rule definition class known to this module, in registration order.
RULES_DEFINITION_CLASSES: List[Type[BaseRuleDefinition]] = [
    AddColumnMetadataRuleDefinition,
    AddColumnGroupTagValueRuleDefinition,
    AddColumnConcatenateRuleDefinition,
    AddColumnBasenameRuleDefinition,
    AddColumnRegexRuleDefinition,
    AddColumnRownumRuleDefinition,
    AddColumnValueRuleDefinition,
    AddColumnSubstrRuleDefinition,
    RemoveColumnsRuleDefinition,
    AddFilterRegexRuleDefinition,
    AddFilterCountRuleDefinition,
    AddFilterEmptyRuleDefinition,
    AddFilterMatchesRuleDefinition,
    AddFilterCompareRuleDefinition,
    SortRuleDefinition,
    SwapColumnsRuleDefinition,
    SplitColumnsRuleDefinition,
]
# One shared instance per class, keyed by the class's ``rule_type`` string.
RULES_DEFINITIONS = {
    definition_class.rule_type: definition_class()
    for definition_class in RULES_DEFINITION_CLASSES
}