Mercurial > repos > guerler > springsuite
comparison planemo/lib/python3.7/site-packages/galaxy/util/rules_dsl.py @ 1:56ad4e20f292 draft
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
| author | guerler |
|---|---|
| date | Fri, 31 Jul 2020 00:32:28 -0400 |
| parents | |
| children |
Comparison legend: lines below are marked as equal, deleted, inserted, or replaced.
| 0:d30785e31577 | 1:56ad4e20f292 |
|---|---|
| 1 import abc | |
| 2 import itertools | |
| 3 import re | |
| 4 | |
| 5 import six | |
| 6 from six.moves import map | |
| 7 | |
| 8 from galaxy.util import strip_control_characters_nested | |
| 9 | |
| 10 | |
| 11 def _ensure_rule_contains_keys(rule, keys): | |
| 12 for key, instance_class in keys.items(): | |
| 13 if key not in rule: | |
| 14 raise ValueError("Rule of type [%s] does not contain key [%s]." % (rule["type"], key)) | |
| 15 value = rule[key] | |
| 16 if not isinstance(value, instance_class): | |
| 17 raise ValueError("Rule of type [%s] does not contain correct value type for key [%s]." % (rule["type"], key)) | |
| 18 | |
| 19 | |
| 20 def _ensure_key_value_in(rule, key, values): | |
| 21 value = rule[key] | |
| 22 if value not in values: | |
| 23 raise ValueError("Invalid value [%s] for [%s] encountered." % (value, key)) | |
| 24 | |
| 25 | |
| 26 def _ensure_valid_pattern(expression): | |
| 27 re.compile(expression) | |
| 28 | |
| 29 | |
def apply_regex(regex, target, data, replacement=None, group_count=None):
    """Append regex-derived column value(s) to every row of ``data``.

    :param regex: pattern applied (via ``search``) to the ``target`` column.
    :param target: index of the column to search within each row.
    :param data: list of rows (lists of strings); returns a new list with
        each row extended by the derived value(s).
    :param replacement: optional template expanded against the match
        (``Match.expand`` semantics, e.g. ``\\1``).
    :param group_count: optional expected number of capture groups; a
        mismatch raises.
    :raises Exception: when the pattern does not match a row's value or the
        group count is wrong.
    """
    pattern = re.compile(regex)

    def new_row(row):
        source = row[target]
        match = pattern.search(source)
        # Fail with a descriptive message in both branches; previously the
        # replacement branch crashed with an AttributeError on a non-match.
        if not match:
            raise Exception("Problem applying regular expression [%s] to [%s]." % (regex, source))

        if replacement is not None:
            return row + [match.expand(replacement)]
        if group_count:
            if len(match.groups()) != group_count:
                raise Exception("Problem applying regular expression, wrong number of groups found.")
            return row + list(match.groups())
        return row + [match.group(0)]

    return [new_row(row) for row in data]
| 54 | |
| 55 | |
@six.add_metaclass(abc.ABCMeta)
class BaseRuleDefinition(object):
    """Abstract interface implemented by every rule plugin.

    Concrete subclasses supply a ``rule_type`` identifier plus validation
    and application logic for dictified rules of that type.
    """

    # ``abc.abstractproperty`` is deprecated (since Python 3.3); the
    # recommended spelling is @property stacked over @abc.abstractmethod.
    @property
    @abc.abstractmethod
    def rule_type(self):
        """Short string describing type of rule (plugin class) to use."""

    @abc.abstractmethod
    def validate_rule(self, rule):
        """Validate dictified rule definition of this type."""

    @abc.abstractmethod
    def apply(self, rule, data, sources):
        """Apply validated, dictified rule definition to supplied data."""
| 70 | |
| 71 | |
class AddColumnMetadataRuleDefinition(BaseRuleDefinition):
    """Add a column populated from collection source metadata.

    ``value`` is either ``identifierN`` (the N-th list identifier of each
    row's source) or ``tags`` (the source's tags, sorted and comma-joined).
    """
    rule_type = "add_column_metadata"

    def validate_rule(self, rule):
        _ensure_rule_contains_keys(rule, {"value": six.string_types})

    def apply(self, rule, data, sources):
        rule_value = rule["value"]
        if rule_value.startswith("identifier"):
            # e.g. "identifier0" selects index 0 of each source's identifiers.
            identifier_index = int(rule_value[len("identifier"):])
            new_rows = [
                row + [sources[index]["identifiers"][identifier_index]]
                for index, row in enumerate(data)
            ]
        elif rule_value == "tags":
            new_rows = [
                row + [",".join(sorted(sources[index]["tags"]))]
                for index, row in enumerate(data)
            ]
        else:
            # Previously an unrecognized value crashed with an
            # UnboundLocalError on new_rows; fail with a clear message.
            raise ValueError("Invalid value [%s] for [value] encountered." % rule_value)

        return new_rows, sources
| 98 | |
| 99 | |
class AddColumnGroupTagValueRuleDefinition(BaseRuleDefinition):
    """Add a column containing the value of a named group tag.

    Looks for a ``group:<name>:<value>`` tag on each row's source and adds
    ``<value>`` as a new column, falling back to the rule's
    ``default_value`` (or the empty string) when no such tag exists.
    """
    rule_type = "add_column_group_tag_value"

    def validate_rule(self, rule):
        _ensure_rule_contains_keys(rule, {"value": six.string_types})

    def apply(self, rule, data, sources):
        tag_prefix = "group:%s:" % rule["value"]
        fallback = rule.get("default_value", "")

        def tag_value_for(source):
            # The first matching tag in sorted order wins.
            for tag in sorted(source["tags"]):
                if tag.startswith(tag_prefix):
                    return tag[len(tag_prefix):]
            return fallback

        new_rows = [
            row + [tag_value_for(sources[index])]
            for index, row in enumerate(data)
        ]
        return new_rows, sources
| 126 | |
| 127 | |
class AddColumnConcatenateRuleDefinition(BaseRuleDefinition):
    """Add a column formed by concatenating two existing columns."""
    rule_type = "add_column_concatenate"

    def validate_rule(self, rule):
        _ensure_rule_contains_keys(rule, {"target_column_0": int, "target_column_1": int})

    def apply(self, rule, data, sources):
        first = rule["target_column_0"]
        second = rule["target_column_1"]
        new_rows = [row + [row[first] + row[second]] for row in data]
        return new_rows, sources
| 143 | |
| 144 | |
class AddColumnBasenameRuleDefinition(BaseRuleDefinition):
    """Add a column containing the basename of a path/URL column."""
    rule_type = "add_column_basename"

    def validate_rule(self, rule):
        _ensure_rule_contains_keys(rule, {"target_column": int})

    def apply(self, rule, data, sources):
        column = rule["target_column"]
        # Everything after the final "/" (the whole value when no slash).
        # Named so it no longer shadows the module-level ``re`` import.
        basename_pattern = r"[^/]*$"
        return apply_regex(basename_pattern, column, data), sources
| 155 | |
| 156 | |
class AddColumnRegexRuleDefinition(BaseRuleDefinition):
    """Add column(s) derived from a regular expression over a column."""
    rule_type = "add_column_regex"

    def validate_rule(self, rule):
        _ensure_rule_contains_keys(rule, {"target_column": int, "expression": six.string_types})
        _ensure_valid_pattern(rule["expression"])

    def apply(self, rule, data, sources):
        new_data = apply_regex(
            rule["expression"],
            rule["target_column"],
            data,
            rule.get("replacement"),
            rule.get("group_count"),
        )
        return new_data, sources
| 171 | |
| 172 | |
class AddColumnRownumRuleDefinition(BaseRuleDefinition):
    """Add a column containing the row number, offset by ``start``."""
    rule_type = "add_column_rownum"

    def validate_rule(self, rule):
        _ensure_rule_contains_keys(rule, {"start": int})

    def apply(self, rule, data, sources):
        start = rule["start"]
        # enumerate's start argument replaces the manual index + start math.
        new_rows = [row + ["%d" % rownum] for rownum, row in enumerate(data, start)]
        return new_rows, sources
| 187 | |
| 188 | |
class AddColumnValueRuleDefinition(BaseRuleDefinition):
    """Add a column containing a fixed, constant value."""
    rule_type = "add_column_value"

    def validate_rule(self, rule):
        _ensure_rule_contains_keys(rule, {"value": six.string_types})

    def apply(self, rule, data, sources):
        # str() once up front; the enumerate index in the original loop was
        # unused, so a plain comprehension suffices.
        value = str(rule["value"])
        new_rows = [row + [value] for row in data]
        return new_rows, sources
| 203 | |
| 204 | |
class AddColumnSubstrRuleDefinition(BaseRuleDefinition):
    """Add a column holding a prefix/suffix-trimmed copy of another column.

    ``substr_type`` selects the slice: keep or drop ``length`` characters
    from either the front (prefix) or back (suffix) of the value.
    """
    rule_type = "add_column_substr"

    def validate_rule(self, rule):
        _ensure_rule_contains_keys(rule, {
            "target_column": int,
            "length": int,
            "substr_type": six.string_types,
        })
        _ensure_key_value_in(rule, "substr_type", ["keep_prefix", "drop_prefix", "keep_suffix", "drop_suffix"])

    def apply(self, rule, data, sources):
        column = rule["target_column"]
        length = rule["length"]
        substr_type = rule["substr_type"]

        def clipped(value):
            if substr_type == "keep_prefix":
                return value[:length]
            if substr_type == "drop_prefix":
                return value[length:]
            if substr_type == "keep_suffix":
                # Clamp so a short value yields the whole string, not a
                # negative-start slice.
                return value[max(len(value) - length, 0):]
            # drop_suffix, with the same clamp against negative ends.
            return value[:max(len(value) - length, 0)]

        return [row + [clipped(row[column])] for row in data], sources
| 242 | |
| 243 | |
class RemoveColumnsRuleDefinition(BaseRuleDefinition):
    """Drop the columns whose indexes appear in ``target_columns``."""
    rule_type = "remove_columns"

    def validate_rule(self, rule):
        _ensure_rule_contains_keys(rule, {
            "target_columns": list,
        })

    def apply(self, rule, data, sources):
        dropped = rule["target_columns"]
        new_rows = [
            [value for index, value in enumerate(row) if index not in dropped]
            for row in data
        ]
        return new_rows, sources
| 263 | |
| 264 | |
| 265 def _filter_index(func, iterable): | |
| 266 result = [] | |
| 267 for index, x in enumerate(iterable): | |
| 268 if func(index): | |
| 269 result.append(x) | |
| 270 | |
| 271 return result | |
| 272 | |
| 273 | |
class AddFilterRegexRuleDefinition(BaseRuleDefinition):
    """Keep rows whose target column matches a regex (drop them if ``invert``)."""
    rule_type = "add_filter_regex"

    def validate_rule(self, rule):
        _ensure_rule_contains_keys(rule, {
            "target_column": int,
            "invert": bool,
            "expression": six.string_types,
        })
        _ensure_valid_pattern(rule["expression"])

    def apply(self, rule, data, sources):
        target_column = rule["target_column"]
        invert = rule["invert"]
        # Compile once up front; the original recompiled per row.
        pattern = re.compile(rule["expression"])

        def _filter(index):
            matches = pattern.search(data[index][target_column]) is not None
            return not invert if matches else invert

        # Filter data and sources with the same index predicate so they
        # stay aligned.
        return _filter_index(_filter, data), _filter_index(_filter, sources)
| 297 | |
| 298 | |
class AddFilterCountRuleDefinition(BaseRuleDefinition):
    """Drop the first or last ``count`` rows (keep only them when inverted)."""
    rule_type = "add_filter_count"

    def validate_rule(self, rule):
        _ensure_rule_contains_keys(rule, {
            "count": int,
            "invert": bool,
            "which": six.string_types,
        })
        _ensure_key_value_in(rule, "which", ["first", "last"])

    def apply(self, rule, data, sources):
        total = len(data)
        invert = rule["invert"]
        count = rule["count"]
        from_start = rule["which"] == "first"

        def _filter(index):
            if from_start:
                kept = index >= count
            else:
                kept = index < (total - count)
            return not invert if kept else invert

        return _filter_index(_filter, data), _filter_index(_filter, sources)
| 324 | |
| 325 | |
class AddFilterEmptyRuleDefinition(BaseRuleDefinition):
    """Drop rows whose target column is empty (non-empty when inverted)."""
    rule_type = "add_filter_empty"

    def validate_rule(self, rule):
        _ensure_rule_contains_keys(rule, {
            "target_column": int,
            "invert": bool
        })

    def apply(self, rule, data, sources):
        invert = rule["invert"]
        column = rule["target_column"]

        def _filter(index):
            has_value = len(data[index][column]) != 0
            return not invert if has_value else invert

        return _filter_index(_filter, data), _filter_index(_filter, sources)
| 344 | |
| 345 | |
class AddFilterMatchesRuleDefinition(BaseRuleDefinition):
    """Keep rows whose target column equals ``value`` (drop when inverted)."""
    rule_type = "add_filter_matches"

    def validate_rule(self, rule):
        _ensure_rule_contains_keys(rule, {
            "target_column": int,
            "invert": bool,
            "value": six.string_types,
        })

    def apply(self, rule, data, sources):
        invert = rule["invert"]
        column = rule["target_column"]
        expected = rule["value"]

        def _filter(index):
            matched = data[index][column] == expected
            return not invert if matched else invert

        return _filter_index(_filter, data), _filter_index(_filter, sources)
| 367 | |
| 368 | |
class AddFilterCompareRuleDefinition(BaseRuleDefinition):
    """Keep rows whose numeric column compares favourably against ``value``."""
    rule_type = "add_filter_compare"

    def validate_rule(self, rule):
        _ensure_rule_contains_keys(rule, {
            "target_column": int,
            "value": int,
            "compare_type": six.string_types,
        })
        _ensure_key_value_in(rule, "compare_type", ["less_than", "less_than_equal", "greater_than", "greater_than_equal"])

    def apply(self, rule, data, sources):
        column = rule["target_column"]
        threshold = rule["value"]
        compare_type = rule["compare_type"]

        def _filter(index):
            # Column values are strings; compare numerically.
            target_value = float(data[index][column])
            if compare_type == "less_than":
                return target_value < threshold
            if compare_type == "less_than_equal":
                return target_value <= threshold
            if compare_type == "greater_than":
                return target_value > threshold
            # validate_rule guarantees the only remaining case.
            return target_value >= threshold

        return _filter_index(_filter, data), _filter_index(_filter, sources)
| 400 | |
| 401 | |
class SortRuleDefinition(BaseRuleDefinition):
    """Sort rows (and their sources, kept aligned) on a column."""
    rule_type = "sort"

    def validate_rule(self, rule):
        _ensure_rule_contains_keys(rule, {
            "target_column": int,
            "numeric": bool,
        })

    def apply(self, rule, data, sources):
        target = rule["target_column"]
        numeric = rule["numeric"]

        def sort_key(pair):
            # pair is (row, source); sort on the row's target column,
            # coerced to float when a numeric sort was requested.
            value = pair[0][target]
            return float(value) if numeric else value

        ordered = sorted(zip(data, sources), key=sort_key)
        new_data = [row for row, _ in ordered]
        new_sources = [source for _, source in ordered]
        return new_data, new_sources
| 433 | |
| 434 | |
class SwapColumnsRuleDefinition(BaseRuleDefinition):
    """Exchange the contents of two columns in every row."""
    rule_type = "swap_columns"

    def validate_rule(self, rule):
        _ensure_rule_contains_keys(rule, {
            "target_column_0": int,
            "target_column_1": int,
        })

    def apply(self, rule, data, sources):
        first = rule["target_column_0"]
        second = rule["target_column_1"]

        def swapped(row):
            copy = list(row)
            copy[first], copy[second] = row[second], row[first]
            return copy

        return [swapped(row) for row in data], sources
| 455 | |
| 456 | |
class SplitColumnsRuleDefinition(BaseRuleDefinition):
    """Split every row into two rows, partitioning its columns.

    Columns in ``target_columns_0`` go only to the first output row, those
    in ``target_columns_1`` only to the second; every other column is
    copied into both. Each source is duplicated to stay aligned.
    """
    rule_type = "split_columns"

    def validate_rule(self, rule):
        _ensure_rule_contains_keys(rule, {
            "target_columns_0": list,
            "target_columns_1": list,
        })

    def apply(self, rule, data, sources):
        only_first = rule["target_columns_0"]
        only_second = rule["target_columns_1"]

        def split_row(row):
            row_0 = []
            row_1 = []
            for index, value in enumerate(row):
                if index in only_first:
                    row_0.append(value)
                elif index in only_second:
                    row_1.append(value)
                else:
                    row_0.append(value)
                    row_1.append(value)
            return [row_0, row_1]

        new_data = flat_map(split_row, data)
        new_sources = flat_map(lambda source: [source, source], sources)
        return new_data, new_sources
| 488 | |
| 489 | |
def flat_map(f, items):
    """Map ``f`` over ``items`` and concatenate the resulting iterables."""
    result = []
    for item in items:
        result.extend(f(item))
    return result
| 492 | |
| 493 | |
class RuleSet(object):
    """A parsed set of rules plus the column mapping describing their output."""

    def __init__(self, rule_set_as_dict):
        self.raw_rules = strip_control_characters_nested(rule_set_as_dict["rules"])
        self.raw_mapping = rule_set_as_dict.get("mapping", [])

    @property
    def rules(self):
        return self.raw_rules

    def _rules_with_definitions(self):
        """Yield (raw rule, plugin instance) pairs for each configured rule."""
        for rule in self.raw_rules:
            yield rule, RULES_DEFINITIONS[rule["type"]]

    def apply(self, data, sources):
        """Validate and run every rule in order, threading data and sources through."""
        for rule, definition in self._rules_with_definitions():
            definition.validate_rule(rule)
            data, sources = definition.apply(rule, data, sources)
        return data, sources

    @property
    def has_errors(self):
        """True when any configured rule fails validation."""
        try:
            for rule, definition in self._rules_with_definitions():
                definition.validate_rule(rule)
        except Exception:
            return True
        return False

    @property
    def mapping_as_dict(self):
        """The raw mapping entries re-keyed by their ``type``."""
        return {mapping["type"]: mapping for mapping in self.raw_mapping}

    # Everything above is generic; what follows is Galaxy-collection
    # specific — consider a RuleSet subclass for collection creation.
    @property
    def identifier_columns(self):
        """Column indexes used as list and/or paired identifiers."""
        mapping_as_dict = self.mapping_as_dict
        columns = []
        if "list_identifiers" in mapping_as_dict:
            columns.extend(mapping_as_dict["list_identifiers"]["columns"])
        if "paired_identifier" in mapping_as_dict:
            columns.append(mapping_as_dict["paired_identifier"]["columns"][0])
        return columns

    @property
    def collection_type(self):
        """Collection type string, e.g. "list", "list:list", or "list:paired"."""
        mapping_as_dict = self.mapping_as_dict
        list_columns = mapping_as_dict.get("list_identifiers", {"columns": []})["columns"]
        # One "list" level per identifier column.
        collection_type = ":".join("list" for _ in list_columns)
        if "paired_identifier" in mapping_as_dict:
            collection_type = (collection_type + ":paired") if collection_type else "paired"
        return collection_type

    @property
    def display(self):
        """Human-readable summary of the rules and column definitions."""
        message = "Rules:\n"
        message += "".join("- %s\n" % rule for rule in self.raw_rules)
        message += "Column Definitions:\n"
        message += "".join("- %s\n" % mapping for mapping in self.raw_mapping)
        return message
| 565 | |
| 566 | |
# All concrete rule plugin classes, in display order.
RULES_DEFINITION_CLASSES = [
    AddColumnMetadataRuleDefinition,
    AddColumnGroupTagValueRuleDefinition,
    AddColumnConcatenateRuleDefinition,
    AddColumnBasenameRuleDefinition,
    AddColumnRegexRuleDefinition,
    AddColumnRownumRuleDefinition,
    AddColumnValueRuleDefinition,
    AddColumnSubstrRuleDefinition,
    RemoveColumnsRuleDefinition,
    AddFilterRegexRuleDefinition,
    AddFilterCountRuleDefinition,
    AddFilterEmptyRuleDefinition,
    AddFilterMatchesRuleDefinition,
    AddFilterCompareRuleDefinition,
    SortRuleDefinition,
    SwapColumnsRuleDefinition,
    SplitColumnsRuleDefinition,
]
# Registry mapping each rule_type string to a shared plugin instance.
RULES_DEFINITIONS = {
    definition_class.rule_type: definition_class()
    for definition_class in RULES_DEFINITION_CLASSES
}
