Mercurial > repos > guerler > springsuite
comparison planemo/lib/python3.7/site-packages/soupsieve/css_match.py @ 1:56ad4e20f292 draft
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
| author | guerler |
|---|---|
| date | Fri, 31 Jul 2020 00:32:28 -0400 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| 0:d30785e31577 | 1:56ad4e20f292 |
|---|---|
| 1 """CSS matcher.""" | |
| 2 from datetime import datetime | |
| 3 from . import util | |
| 4 import re | |
| 5 from .import css_types as ct | |
| 6 import unicodedata | |
| 7 | |
# Matches a single character that is not CSS whitespace. A hit means the text
# is not "empty" (text made only of whitespace still counts as empty).
RE_NOT_EMPTY = re.compile('[^ \t\r\n\f]')

# Matches each run of non-whitespace characters (whitespace separated tokens).
RE_NOT_WS = re.compile('[^ \t\r\n\f]+')

# Relationships (combinators resolved against parents / previous siblings)
REL_PARENT = ' '
REL_CLOSE_PARENT = '>'
REL_SIBLING = '~'
REL_CLOSE_SIBLING = '+'

# Relationships for :has() (forward looking).
# Same symbols as above, prefixed with `:` so they can be told apart.
REL_HAS_PARENT = ': '
REL_HAS_CLOSE_PARENT = ':>'
REL_HAS_SIBLING = ':~'
REL_HAS_CLOSE_SIBLING = ':+'

# Namespace URIs for XHTML and for the reserved `xml` prefix.
NS_XHTML = 'http://www.w3.org/1999/xhtml'
NS_XML = 'http://www.w3.org/XML/1998/namespace'

# Combined `css_types` flags for directionality and for range selectors.
DIR_FLAGS = ct.SEL_DIR_LTR | ct.SEL_DIR_RTL
RANGES = ct.SEL_IN_RANGE | ct.SEL_OUT_OF_RANGE

# Maps a (lowercased) `dir` attribute value to its direction flag.
# `auto` maps to 0, i.e. neither direction flag.
DIR_MAP = {
    'ltr': ct.SEL_DIR_LTR,
    'rtl': ct.SEL_DIR_RTL,
    'auto': 0
}

# Input value patterns. Each one must match the whole value.
# Number: optional leading `-`, then digits with an optional fraction, or a bare fraction (`.5`).
RE_NUM = re.compile(r"^(?P<value>-?(?:[0-9]{1,}(\.[0-9]+)?|\.[0-9]+))$")
# Time: `HH:MM`.
RE_TIME = re.compile(r'^(?P<hour>[0-9]{2}):(?P<minutes>[0-9]{2})$')
# Month: `YYYY-MM` (the year may have more than four digits).
RE_MONTH = re.compile(r'^(?P<year>[0-9]{4,})-(?P<month>[0-9]{2})$')
# Week: `YYYY-Www`.
RE_WEEK = re.compile(r'^(?P<year>[0-9]{4,})-W(?P<week>[0-9]{2})$')
# Date: `YYYY-MM-DD`.
RE_DATE = re.compile(r'^(?P<year>[0-9]{4,})-(?P<month>[0-9]{2})-(?P<day>[0-9]{2})$')
# Local date and time: `YYYY-MM-DDTHH:MM`.
RE_DATETIME = re.compile(
    r'^(?P<year>[0-9]{4,})-(?P<month>[0-9]{2})-(?P<day>[0-9]{2})T(?P<hour>[0-9]{2}):(?P<minutes>[0-9]{2})$'
)
# Matches `-*-` (plus any `*-` / `*` that directly follow it) or a trailing `-*`.
# Substituted with `-` to drop non-leading wildcards from a language range.
RE_WILD_STRIP = re.compile(r'(?:(?:-\*-)(?:\*(?:-|$))*|-\*$)')

# Calendar limits used when validating day and week values.
MONTHS_30 = (4, 6, 9, 11)  # April, June, September, and November
FEB = 2
SHORT_MONTH = 30
LONG_MONTH = 31
FEB_MONTH = 28
FEB_LEAP_MONTH = 29
DAYS_IN_WEEK = 7
| 54 | |
| 55 | |
class _FakeParent(object):
    """
    Placeholder parent for an element that has none.

    A fragment that is not attached to a `BeautifulSoup` document object
    has no parent to enumerate, which `nth` selectors require. This object
    stands in for that parent and exposes the root element as its only child.
    """

    def __init__(self, element):
        """Store `element` as the single entry of `contents`."""

        only_child = element
        self.contents = [only_child]

    def __len__(self):
        """Return the number of children held by the placeholder."""

        child_count = len(self.contents)
        return child_count
| 74 | |
| 75 | |
class _DocumentNav(object):
    """
    Navigate a Beautiful Soup document.

    Thin helpers around `bs4` node attributes. `bs4` is imported lazily inside
    the type checks. `is_iframe` and `is_root` additionally use `is_html_tag`,
    `root` and `is_html`, which this class does not define itself; they are
    expected to be supplied by the class it is combined with.
    """

    @classmethod
    def assert_valid_input(cls, tag):
        """Raise `TypeError` unless `tag` is a `bs4.Tag` (or subclass)."""

        # Fail on unexpected types.
        if not cls.is_tag(tag):
            raise TypeError("Expected a BeautifulSoup 'Tag', but instead received type {}".format(type(tag)))

    @staticmethod
    def is_doc(obj):
        """Is `BeautifulSoup` object."""

        import bs4
        return isinstance(obj, bs4.BeautifulSoup)

    @staticmethod
    def is_tag(obj):
        """Is tag."""

        import bs4
        return isinstance(obj, bs4.Tag)

    @staticmethod
    def is_declaration(obj):  # pragma: no cover
        """Is declaration."""

        import bs4
        return isinstance(obj, bs4.Declaration)

    @staticmethod
    def is_cdata(obj):
        """Is CDATA."""

        import bs4
        return isinstance(obj, bs4.CData)

    @staticmethod
    def is_processing_instruction(obj):  # pragma: no cover
        """Is processing instruction."""

        import bs4
        return isinstance(obj, bs4.ProcessingInstruction)

    @staticmethod
    def is_navigable_string(obj):
        """Is navigable string."""

        import bs4
        return isinstance(obj, bs4.NavigableString)

    @staticmethod
    def is_special_string(obj):
        """Is special string (comment, declaration, CDATA, processing instruction or doctype)."""

        import bs4
        return isinstance(obj, (bs4.Comment, bs4.Declaration, bs4.CData, bs4.ProcessingInstruction, bs4.Doctype))

    @classmethod
    def is_content_string(cls, obj):
        """Check if node is a navigable string that is not one of the special strings."""

        return cls.is_navigable_string(obj) and not cls.is_special_string(obj)

    @staticmethod
    def create_fake_parent(el):
        """Create fake parent for a given element."""

        return _FakeParent(el)

    @staticmethod
    def is_xml_tree(el):
        """Check if element (or document) is from a XML tree."""

        return el._is_xml

    def is_iframe(self, el):
        """
        Check if element is an `iframe`.

        The tag name is compared as-is for XML trees and lowercased otherwise,
        and the element must also be an HTML tag.
        """

        return ((el.name if self.is_xml_tree(el) else util.lower(el.name)) == 'iframe') and self.is_html_tag(el)

    def is_root(self, el):
        """
        Return whether element is a root element.

        We check that the element is the root of the tree (which we have already pre-calculated),
        and we check if it is the root element under an `iframe`.
        """

        root = self.root and self.root is el
        if not root:
            parent = self.get_parent(el)
            root = parent is not None and self.is_html and self.is_iframe(parent)
        return root

    def get_contents(self, el, no_iframe=False):
        """
        Yield the direct contents of `el` in document order.

        Nothing is yielded when `no_iframe` is set and `el` is an `iframe`.
        """
        if not no_iframe or not self.is_iframe(el):
            for content in el.contents:
                yield content

    def get_children(self, el, start=None, reverse=False, tags=True, no_iframe=False):
        """
        Yield the children of `el`.

        `start` is the index in `el.contents` to begin at (defaults to the first
        child, or the last one when `reverse` is set); an out of range `start`
        yields nothing. With `tags` set only tags are yielded. Nothing is yielded
        when `no_iframe` is set and `el` is an `iframe`.
        """

        if not no_iframe or not self.is_iframe(el):
            last = len(el.contents) - 1
            if start is None:
                index = last if reverse else 0
            else:
                index = start
            # One step past the final index in the direction of travel.
            end = -1 if reverse else last + 1
            incr = -1 if reverse else 1

            if 0 <= index <= last:
                while index != end:
                    node = el.contents[index]
                    index += incr
                    if not tags or self.is_tag(node):
                        yield node

    def get_descendants(self, el, tags=True, no_iframe=False):
        """
        Yield the descendants of `el`.

        With `tags` set only tags are yielded. With `no_iframe` set, an `iframe`
        found along the way is yielded itself but its content is skipped, and
        nothing is yielded at all when `el` is an `iframe`.
        """

        if not no_iframe or not self.is_iframe(el):
            # When set, every node is skipped until this node is reached.
            next_good = None
            for child in el.descendants:

                if next_good is not None:
                    if child is not next_good:
                        continue
                    next_good = None

                is_tag = self.is_tag(child)

                if no_iframe and is_tag and self.is_iframe(child):
                    if child.next_sibling is not None:
                        next_good = child.next_sibling
                    else:
                        # No sibling: resume at whatever follows the deepest last descendant.
                        last_child = child
                        while self.is_tag(last_child) and last_child.contents:
                            last_child = last_child.contents[-1]
                        next_good = last_child.next_element
                    yield child
                    if next_good is None:
                        break
                    # Coverage isn't seeing this even though it's executed
                    continue  # pragma: no cover

                if not tags or is_tag:
                    yield child

    def get_parent(self, el, no_iframe=False):
        """Get parent (`None` when `no_iframe` is set and the parent is an `iframe`)."""

        parent = el.parent
        if no_iframe and parent is not None and self.is_iframe(parent):
            parent = None
        return parent

    @staticmethod
    def get_tag_name(el):
        """Get tag."""

        return el.name

    @staticmethod
    def get_prefix_name(el):
        """Get prefix."""

        return el.prefix

    @staticmethod
    def get_uri(el):
        """Get namespace `URI`."""

        return el.namespace

    @classmethod
    def get_next(cls, el, tags=True):
        """Get next sibling (the next sibling tag when `tags` is set), or `None`."""

        sibling = el.next_sibling
        # Test for `None` first so the type check is not run on a missing sibling.
        while tags and sibling is not None and not cls.is_tag(sibling):
            sibling = sibling.next_sibling
        return sibling

    @classmethod
    def get_previous(cls, el, tags=True):
        """Get previous sibling (the previous sibling tag when `tags` is set), or `None`."""

        sibling = el.previous_sibling
        # Test for `None` first so the type check is not run on a missing sibling.
        while tags and sibling is not None and not cls.is_tag(sibling):
            sibling = sibling.previous_sibling
        return sibling

    @staticmethod
    def has_html_ns(el):
        """
        Check if element has an HTML namespace.

        This is a bit different than whether a element is treated as having an HTML namespace,
        like we do in the case of `is_html_tag`.

        Always returns a `bool`, including for a missing element or namespace.
        """

        ns = getattr(el, 'namespace') if el else None
        return bool(ns and ns == NS_XHTML)

    @staticmethod
    def split_namespace(el, attr_name):
        """
        Return namespace and attribute name without the prefix.

        Both are read from attributes of `attr_name` itself and are `None`
        when the attribute name object does not carry them.
        """

        return getattr(attr_name, 'namespace', None), getattr(attr_name, 'name', None)

    @staticmethod
    def get_attribute_by_name(el, name, default=None):
        """
        Get attribute by name.

        XML trees use an exact key lookup. Otherwise each attribute key is
        lowercased before being compared with `name`. `default` is returned
        when nothing matches.
        """

        value = default
        if el._is_xml:
            try:
                value = el.attrs[name]
            except KeyError:
                pass
        else:
            for k, v in el.attrs.items():
                if util.lower(k) == name:
                    value = v
                    break
        return value

    @staticmethod
    def iter_attributes(el):
        """Iterate attributes as `(name, value)` pairs."""

        for k, v in el.attrs.items():
            yield k, v

    @classmethod
    def get_classes(cls, el):
        """Get classes (a string value is split on whitespace; the default is an empty list)."""

        classes = cls.get_attribute_by_name(el, 'class', [])
        if isinstance(classes, str):
            classes = RE_NOT_WS.findall(classes)
        return classes

    def get_text(self, el, no_iframe=False):
        """Get the concatenated content strings found among the descendants of `el`."""

        return ''.join(
            [node for node in self.get_descendants(el, tags=False, no_iframe=no_iframe) if self.is_content_string(node)]
        )
| 330 | |
| 331 | |
class Inputs(object):
    """Class for parsing and validating input items."""

    @staticmethod
    def validate_day(year, month, day):
        """Validate day against the length of `month` in `year` (leap years included)."""

        max_days = LONG_MONTH
        if month == FEB:
            max_days = FEB_LEAP_MONTH if ((year % 4 == 0) and (year % 100 != 0)) or (year % 400 == 0) else FEB_MONTH
        elif month in MONTHS_30:
            max_days = SHORT_MONTH
        return 1 <= day <= max_days

    @staticmethod
    def validate_week(year, week):
        """
        Validate week against the number of ISO weeks (52 or 53) in `year`.

        December 28th always falls in the last ISO week of its year, so its
        week number is the week count. The Gregorian calendar repeats every
        400 years, so the year is first folded into 2000-2399; that keeps the
        lookup valid for any year, not just those `datetime` can represent.
        """

        max_week = datetime(2000 + year % 400, 12, 28).isocalendar()[1]
        return 1 <= week <= max_week

    @staticmethod
    def validate_month(month):
        """Validate month."""

        return 1 <= month <= 12

    @staticmethod
    def validate_year(year):
        """Validate year."""

        return 1 <= year

    @staticmethod
    def validate_hour(hour):
        """Validate hour."""

        return 0 <= hour <= 23

    @staticmethod
    def validate_minutes(minutes):
        """Validate minutes."""

        return 0 <= minutes <= 59

    @classmethod
    def parse_value(cls, itype, value):
        """
        Parse the input value.

        `itype` is the input type (`date`, `month`, `week`, `time`,
        `datetime-local`, `number` or `range`). Returns a tuple of integers for
        the date/time types, a float for `number` and `range`, and `None` when
        the type is not handled or the value is malformed or out of range.
        """

        parsed = None
        if itype == "date":
            m = RE_DATE.match(value)
            if m:
                year = int(m.group('year'), 10)
                month = int(m.group('month'), 10)
                day = int(m.group('day'), 10)
                if cls.validate_year(year) and cls.validate_month(month) and cls.validate_day(year, month, day):
                    parsed = (year, month, day)
        elif itype == "month":
            m = RE_MONTH.match(value)
            if m:
                year = int(m.group('year'), 10)
                month = int(m.group('month'), 10)
                if cls.validate_year(year) and cls.validate_month(month):
                    parsed = (year, month)
        elif itype == "week":
            m = RE_WEEK.match(value)
            if m:
                year = int(m.group('year'), 10)
                week = int(m.group('week'), 10)
                if cls.validate_year(year) and cls.validate_week(year, week):
                    parsed = (year, week)
        elif itype == "time":
            m = RE_TIME.match(value)
            if m:
                hour = int(m.group('hour'), 10)
                minutes = int(m.group('minutes'), 10)
                if cls.validate_hour(hour) and cls.validate_minutes(minutes):
                    parsed = (hour, minutes)
        elif itype == "datetime-local":
            m = RE_DATETIME.match(value)
            if m:
                year = int(m.group('year'), 10)
                month = int(m.group('month'), 10)
                day = int(m.group('day'), 10)
                hour = int(m.group('hour'), 10)
                minutes = int(m.group('minutes'), 10)
                if (
                    cls.validate_year(year) and cls.validate_month(month) and cls.validate_day(year, month, day) and
                    cls.validate_hour(hour) and cls.validate_minutes(minutes)
                ):
                    parsed = (year, month, day, hour, minutes)
        elif itype in ("number", "range"):
            m = RE_NUM.match(value)
            if m:
                parsed = float(m.group('value'))
        return parsed
| 431 | |
| 432 | |
| 433 class _Match(object): | |
| 434 """Perform CSS matching.""" | |
| 435 | |
def __init__(self, selectors, scope, namespaces, flags):
    """
    Initialize.

    Validates `scope`, stores the matching inputs, and works out the tree
    root, the effective scope and whether the tree is XML and/or HTML.
    """

    self.assert_valid_input(scope)

    # Matching inputs
    self.tag = scope
    self.selectors = selectors
    self.flags = flags
    self.namespaces = namespaces if namespaces is not None else {}
    self.iframe_restrict = False

    # Caches filled in lazily while matching
    self.cached_meta_lang = []
    self.cached_default_forms = []
    self.cached_indeterminate_forms = []

    # Climb to the top-most node of the tree
    doc = scope
    while True:
        above = self.get_parent(doc)
        if not above:
            break
        doc = above

    # A document object is not the root itself; its first child tag is.
    if self.is_doc(doc):
        root = next(iter(self.get_children(doc)), None)
    else:
        root = doc

    self.root = root
    self.scope = root if scope is doc else scope
    self.has_html_namespace = self.has_html_ns(root)

    # A document can be both XML and HTML (XHTML)
    self.is_xml = self.is_xml_tree(doc)
    self.is_html = not self.is_xml or self.has_html_namespace
| 470 | |
def supports_namespaces(self):
    """Tell whether namespaces apply: the tree is XML or its root has the HTML namespace."""

    if self.is_xml:
        return self.is_xml
    return self.has_html_namespace
| 475 | |
def get_tag_ns(self, el):
    """
    Return the namespace of `el`.

    Without namespace support every tag is reported as XHTML. Otherwise the
    element's own namespace is returned, or an empty string if it has none.
    """

    if not self.supports_namespaces():
        return NS_XHTML
    return self.get_uri(el) or ''
| 487 | |
def is_html_tag(self, el):
    """Tell whether the namespace reported for `el` is the XHTML namespace."""

    tag_namespace = self.get_tag_ns(el)
    return tag_namespace == NS_XHTML
| 492 | |
def get_tag(self, el):
    """Return the tag name of `el`, lowercased unless the tree is XML or the name is missing."""

    name = self.get_tag_name(el)
    if name is None or self.is_xml:
        return name
    return util.lower(name)
| 498 | |
def get_prefix(self, el):
    """Return the prefix of `el`, lowercased unless the tree is XML or there is no prefix."""

    prefix = self.get_prefix_name(el)
    if prefix is None or self.is_xml:
        return prefix
    return util.lower(prefix)
| 504 | |
def find_bidi(self, el):
    """
    Get directionality from element text.

    Walks the children of `el` in order and returns the direction flag of the
    first strongly directional character found, or `None` if there is none.
    """

    skipped_tags = ('bdi', 'script', 'style', 'textarea', 'iframe')

    for node in self.get_children(el, tags=False):

        if self.is_tag(node):
            # Avoid analyzing certain elements specified in the specification:
            # the tags above, non HTML tags, and tags with a recognized `dir`.
            direction = DIR_MAP.get(util.lower(self.get_attribute_by_name(node, 'dir', '')), None)
            excluded = (
                self.get_tag(node) in skipped_tags or
                not self.is_html_tag(node) or
                direction is not None
            )
            if not excluded:
                # Check directionality of this node's text
                value = self.find_bidi(node)
                if value is not None:
                    return value
            # Excluded, or direction could not be determined
            continue  # pragma: no cover

        # Skip `doctype` comments, etc.
        if self.is_special_string(node):
            continue

        # Analyze text nodes for directionality.
        for c in node:
            bidi = unicodedata.bidirectional(c)
            if bidi == 'L':
                return ct.SEL_DIR_LTR
            if bidi in ('AL', 'R'):
                return ct.SEL_DIR_RTL
    return None
| 540 | |
def extended_language_filter(self, lang_range, lang_tag):
    """
    Filter the language tags.

    Compares `lang_range` (the pattern, which may contain `*` wildcards)
    with `lang_tag` (the language to test), subtag by subtag and ignoring
    case. Returns `True` when the tag satisfies the range.
    """

    match = True
    # Non-leading wildcards are dropped from the range before comparing.
    lang_range = RE_WILD_STRIP.sub('-', lang_range).lower()
    ranges = lang_range.split('-')
    subtags = lang_tag.lower().split('-')
    length = len(ranges)
    # Positions in `ranges` and `subtags` respectively.
    rindex = 0
    sindex = 0
    r = ranges[rindex]
    s = subtags[sindex]

    # Primary tag needs to match
    if r != '*' and r != s:
        match = False

    rindex += 1
    sindex += 1

    # Match until we run out of ranges
    while match and rindex < length:
        r = ranges[rindex]
        try:
            s = subtags[sindex]
        except IndexError:
            # Ran out of subtags,
            # but we still have ranges
            match = False
            continue

        # Empty range
        if not r:
            match = False
            continue

        # Matched range
        elif s == r:
            rindex += 1

        # Implicit wildcard cannot match
        # singletons
        elif len(s) == 1:
            match = False
            continue

        # Implicitly matched, so grab next subtag
        # (also reached after a matched range, so both positions advance then).
        sindex += 1

    return match
| 591 | |
def match_attribute_name(self, el, attr, prefix):
    """
    Match attribute name and return value if it exists.

    Returns the value of the first attribute of `el` whose name (and
    namespace, when `prefix` resolves to one) matches, or `None`.
    """

    # No namespace support: compare names case insensitively.
    if not self.supports_namespaces():
        for k, v in self.iter_attributes(el):
            if util.lower(attr) == util.lower(k):
                return v
        return None

    # If we have not defined namespaces, we can't very well find them, so don't bother trying.
    ns = None
    if prefix:
        ns = self.namespaces.get(prefix)
        if ns is None and prefix != '*':
            return None

    for k, v in self.iter_attributes(el):

        # Get attribute parts
        namespace, name = self.split_namespace(el, k)

        # Can't match a prefix attribute as we haven't specified one to match
        # Try to match it normally as a whole `p:a` as selector may be trying `p\:a`.
        if ns is None:
            if (self.is_xml and attr == k) or (not self.is_xml and util.lower(attr) == util.lower(k)):
                return v
            continue

        # We can't match our desired prefix attribute as the attribute doesn't have a prefix
        if namespace is None or (ns != namespace and prefix != '*'):
            continue

        # The attribute doesn't match.
        if self.is_xml:
            if attr != name:
                continue
        elif util.lower(attr) != util.lower(name):
            continue

        return v

    return None
| 639 | |
def match_namespace(self, el, tag):
    """
    Match the namespace of the element.

    `tag.prefix` drives the check: `None` means "no prefix given", an empty
    string means "no namespace", `*` means "any namespace", and anything
    else is looked up in the registered namespaces.
    """

    namespace = self.get_tag_ns(el)
    prefix = tag.prefix

    # We must match the default namespace if one is not provided
    if prefix is None:
        default_namespace = self.namespaces.get('')
        return not (default_namespace is not None and namespace != default_namespace)

    # If we specified `|tag`, we must not have a namespace.
    if prefix == '':
        return not namespace

    # Verify prefix matches
    if prefix == '*':
        return True
    tag_ns = self.namespaces.get(prefix, None)
    return not (tag_ns is None or namespace != tag_ns)
| 660 | |
def match_attributes(self, el, attributes):
    """
    Match attributes.

    Every attribute selector must find its attribute on `el`, and the value
    must satisfy the selector's pattern when it has one. List values are
    joined with spaces before the pattern is applied.
    """

    for a in (attributes or ()):
        value = self.match_attribute_name(el, a.attribute, a.prefix)
        # XML trees use the XML specific pattern when one is provided.
        if self.is_xml and a.xml_type_pattern:
            pattern = a.xml_type_pattern
        else:
            pattern = a.pattern
        if isinstance(value, list):
            value = ' '.join(value)
        if value is None:
            return False
        if pattern is not None and pattern.match(value) is None:
            return False
    return True
| 680 | |
def match_tagname(self, el, tag):
    """
    Match tag name.

    A selector without a name matches anything; otherwise the name must be
    `*` or equal the element's tag (compared lowercased for non XML trees).
    """

    name = tag.name
    if name is not None and not self.is_xml:
        name = util.lower(name)
    if name is None:
        return True
    return name in (self.get_tag(el), '*')
| 689 | |
def match_tag(self, el, tag):
    """Match the tag: both its namespace and its name must match (no tag selector matches anything)."""

    if tag is None:
        return True

    # Both checks are always evaluated.
    namespace_ok = self.match_namespace(el, tag)
    name_ok = self.match_tagname(el, tag)
    return bool(namespace_ok) and bool(name_ok)
| 701 | |
def match_past_relations(self, el, relation):
    """
    Match past relationship.

    Tests `relation` against nodes that come before `el`: any ancestor, the
    direct parent, any previous sibling tag, or the closest previous sibling
    tag, depending on the relationship type of the first selector.
    """

    rel_type = relation[0].rel_type
    restrict = self.iframe_restrict
    found = False

    if rel_type == REL_PARENT:
        # Any ancestor
        parent = self.get_parent(el, no_iframe=restrict)
        while not found and parent:
            found = self.match_selectors(parent, relation)
            parent = self.get_parent(parent, no_iframe=self.iframe_restrict)
    elif rel_type == REL_CLOSE_PARENT:
        # Direct parent only
        parent = self.get_parent(el, no_iframe=restrict)
        if parent:
            found = self.match_selectors(parent, relation)
    elif rel_type == REL_SIBLING:
        # Any previous sibling tag
        sibling = self.get_previous(el)
        while not found and sibling:
            found = self.match_selectors(sibling, relation)
            sibling = self.get_previous(sibling)
    elif rel_type == REL_CLOSE_SIBLING:
        # Closest previous sibling tag only
        sibling = self.get_previous(el)
        if sibling and self.is_tag(sibling):
            found = self.match_selectors(sibling, relation)

    return found
| 725 | |
def match_future_child(self, parent, relation, recursive=False):
    """
    Match future child.

    Tests `relation` against the children of `parent` (or all of its
    descendants when `recursive` is set) and stops at the first match.
    """

    walker = self.get_descendants if recursive else self.get_children
    candidates = walker(parent, no_iframe=self.iframe_restrict)
    return any(self.match_selectors(child, relation) for child in candidates)
| 736 | |
def match_future_relations(self, el, relation):
    """
    Match future relationship.

    Tests `relation` against nodes that come after `el`: any descendant, a
    direct child, any following sibling tag, or the closest following sibling
    tag, depending on the relationship type of the first selector.
    """

    rel_type = relation[0].rel_type

    if rel_type == REL_HAS_PARENT:
        return self.match_future_child(el, relation, True)
    if rel_type == REL_HAS_CLOSE_PARENT:
        return self.match_future_child(el, relation)

    found = False
    if rel_type == REL_HAS_SIBLING:
        # Any following sibling tag
        sibling = self.get_next(el)
        while not found and sibling:
            found = self.match_selectors(sibling, relation)
            sibling = self.get_next(sibling)
    elif rel_type == REL_HAS_CLOSE_SIBLING:
        # Closest following sibling tag only
        sibling = self.get_next(el)
        if sibling and self.is_tag(sibling):
            found = self.match_selectors(sibling, relation)
    return found
| 755 | |
def match_relations(self, el, relation):
    """
    Match relationship to other elements.

    Relationship types that start with `:` are forward looking and go to
    `match_future_relations`; all others go to `match_past_relations`.
    """

    if relation[0].rel_type.startswith(':'):
        handler = self.match_future_relations
    else:
        handler = self.match_past_relations
    return handler(el, relation)
| 767 | |
def match_id(self, el, ids):
    """Match element's ID: every requested ID must equal the `id` attribute (missing counts as empty)."""

    return all(i == self.get_attribute_by_name(el, 'id', '') for i in ids)
| 777 | |
def match_classes(self, el, classes):
    """Match element's classes: every requested class must be among the element's classes."""

    available = self.get_classes(el)
    return all(c in available for c in classes)
| 788 | |
def match_root(self, el):
    """
    Match element as root.

    Besides being a root element, `el` must have no sibling that is a tag,
    a content string with non whitespace text, or CDATA. Previous siblings
    are checked first, then following ones.
    """

    is_root = self.is_root(el)

    for step in (self.get_previous, self.get_next):
        if not is_root:
            break
        sibling = step(el, tags=False)
        while sibling is not None:
            if (
                self.is_tag(sibling) or (self.is_content_string(sibling) and sibling.strip()) or
                self.is_cdata(sibling)
            ):
                is_root = False
                break
            sibling = step(sibling, tags=False)

    return is_root
| 814 | |
def match_scope(self, el):
    """Match element as scope (identity comparison with the stored scope)."""

    scope = self.scope
    return scope is el
| 819 | |
def match_nth_tag_type(self, el, child):
    """Match tag type for `nth` matches: `child` must share both tag name and namespace with `el`."""

    if self.get_tag(child) != self.get_tag(el):
        return False
    return self.get_tag_ns(child) == self.get_tag_ns(el)
| 827 | |
def match_nth(self, el, nth):
    """
    Match `nth` elements.

    Each entry of `nth` describes one `nth` style selector through its
    attributes: `a`, `b` and `n` (the target position is `a * count + b`
    when `n` is set, otherwise just `a`), `last` (count from the end),
    `of_type` (only count siblings of the same tag type) and `selectors`
    (only count siblings matching them). `el` must satisfy every entry;
    an empty `nth` matches.
    """

    matched = True

    for n in nth:
        matched = False
        # The element itself must match the `of S` selectors, if any.
        if n.selectors and not self.match_selectors(el, n.selectors):
            break
        parent = self.get_parent(el)
        if parent is None:
            # No parent: wrap the element so it can be walked as an only child.
            parent = self.create_fake_parent(el)
        last = n.last
        last_index = len(parent) - 1
        # Position in the parent's contents where walking starts (or resumes).
        index = last_index if last else 0
        # How many qualifying siblings have been counted so far.
        relative_index = 0
        a = n.a
        b = n.b
        var = n.n
        count = 0
        count_incr = 1
        # Walking direction through the parent's contents.
        factor = -1 if last else 1
        idx = last_idx = a * count + b if var else a

        # We can only adjust bounds within a variable index
        if var:
            # Abort if our nth index is out of bounds and only getting further out of bounds as we increment.
            # Otherwise, increment to try to get in bounds.
            adjust = None
            while idx < 1 or idx > last_index:
                if idx < 0:
                    diff_low = 0 - idx
                    if adjust is not None and adjust == 1:
                        break
                    adjust = -1
                    count += count_incr
                    idx = last_idx = a * count + b if var else a
                    diff = 0 - idx
                    if diff >= diff_low:
                        break
                else:
                    diff_high = idx - last_index
                    if adjust is not None and adjust == -1:
                        break
                    adjust = 1
                    count += count_incr
                    idx = last_idx = a * count + b if var else a
                    diff = idx - last_index
                    if diff >= diff_high:
                        break
                    diff_high = diff

            # If a < 0, our count is working backwards, so floor the index by increasing the count.
            # Find the count that yields the lowest, in bound value and use that.
            # Lastly reverse count increment so that we'll increase our index.
            lowest = count
            if a < 0:
                while idx >= 1:
                    lowest = count
                    count += count_incr
                    idx = last_idx = a * count + b if var else a
                count_incr = -1
            count = lowest
            idx = last_idx = a * count + b if var else a

        # Evaluate elements while our calculated nth index is still in range
        while 1 <= idx <= last_index + 1:
            child = None
            # Evaluate while our child index is still in range.
            for child in self.get_children(parent, start=index, reverse=factor < 0, tags=False):
                index += factor
                if not self.is_tag(child):
                    continue
                # Handle `of S` in `nth-child`
                if n.selectors and not self.match_selectors(child, n.selectors):
                    continue
                # Handle `of-type`
                if n.of_type and not self.match_nth_tag_type(el, child):
                    continue
                relative_index += 1
                if relative_index == idx:
                    # Reached the target position: it is a match only if it is our element.
                    if child is el:
                        matched = True
                    else:
                        break
                if child is el:
                    break
            # Once our element has been reached the outcome is settled.
            if child is el:
                break
            last_idx = idx
            count += count_incr
            if count < 0:
                # Count is counting down and has now ventured into invalid territory.
                break
            idx = a * count + b if var else a
            # The index did not move, so there is nothing further to try.
            if last_idx == idx:
                break
        if not matched:
            break
    return matched
| 928 | |
def match_empty(self, el):
    """
    Check if element is empty (if requested).

    An element is empty when it has no child tags and no content string
    containing anything other than whitespace.
    """

    for child in self.get_children(el, tags=False):
        if self.is_tag(child):
            return False
        if self.is_content_string(child) and RE_NOT_EMPTY.search(child):
            return False
    return True
| 941 | |
def match_subselectors(self, el, selectors):
    """
    Match selectors.

    Returns `True` only when `el` matches every entry of `selectors`
    (an empty list matches). Evaluation stops at the first entry that
    fails, since the outcome cannot change after that.
    """

    for sel in selectors:
        if not self.match_selectors(el, sel):
            return False
    return True
| 950 | |
def match_contains(self, el, contains):
    """
    Match the element if its text satisfies every `contains` entry.

    Each entry exposes a `text` sequence; the entry is satisfied when at
    least one of those strings occurs in the element's text. The text is
    only fetched when there is at least one entry to check.
    """

    haystack = None
    result = True
    for group in contains:
        # Fetch the element text lazily, on the first entry only.
        if haystack is None:
            haystack = self.get_text(el, no_iframe=self.is_html)
        if not any(needle in haystack for needle in group.text):
            result = False
    return result
| 967 | |
def match_default(self, el):
    """
    Match the default submit button of the element's form.

    The element matches when it is the first `input` or `button` with
    `type="submit"` found among the descendants of its nearest enclosing
    HTML `form`. The located button is remembered per form in
    `cached_default_forms` so later calls can skip the search.
    """

    # Climb the ancestors until the enclosing HTML form is reached.
    form = None
    ancestor = self.get_parent(el, no_iframe=True)
    while ancestor and form is None:
        if self.get_tag(ancestor) == 'form' and self.is_html_tag(ancestor):
            form = ancestor
        else:
            ancestor = self.get_parent(ancestor, no_iframe=True)

    # A cached entry for this form already tells us which button is the default.
    for cached_form, default_button in self.cached_default_forms:
        if cached_form is form:
            return default_button is el

    # Nothing cached: scan the form for its first submit button.
    for node in self.get_descendants(form, no_iframe=True):
        tag_name = self.get_tag(node)
        # Can't do nested forms (haven't figured out why we never hit this)
        if tag_name == 'form':  # pragma: no cover
            break
        if tag_name not in ('input', 'button'):
            continue
        kind = self.get_attribute_by_name(node, 'type', '')
        if kind and util.lower(kind) == 'submit':
            self.cached_default_forms.append([form, node])
            return el is node
    return False
| 1006 | |
def match_indeterminate(self, el):
    """
    Match indeterminate.

    The element matches when no *other* `input` in the same form is a radio button
    that carries the same `name` and has a `checked` attribute. The outcome is cached
    per form and name in `cached_indeterminate_forms`.
    """

    match = False
    name = self.get_attribute_by_name(el, 'name')

    def get_parent_form(el):
        """Find this input's form, falling back to the topmost ancestor if there is no `form`."""
        form = None
        parent = self.get_parent(el, no_iframe=True)
        while form is None:
            if self.get_tag(parent) == 'form' and self.is_html_tag(parent):
                form = parent
                break
            last_parent = parent
            parent = self.get_parent(parent, no_iframe=True)
            if parent is None:
                # Ran out of ancestors: treat the last one seen as the "form".
                form = last_parent
                break
        return form

    form = get_parent_form(el)

    # Look in form cache to see if we've already evaluated that its fellow radio buttons are indeterminate
    found_form = False
    for f, n, i in self.cached_indeterminate_forms:
        if f is form and n == name:
            found_form = True
            if i is True:
                match = True
            break

    # We didn't have the form cached, so validate that the radio button is indeterminate
    if not found_form:
        checked = False
        for child in self.get_descendants(form, no_iframe=True):
            # Only the other inputs matter, never the element itself.
            if child is el:
                continue
            tag_name = self.get_tag(child)
            if tag_name == 'input':
                is_radio = False
                check = False
                has_name = False
                for k, v in self.iter_attributes(child):
                    if util.lower(k) == 'type' and util.lower(v) == 'radio':
                        is_radio = True
                    elif util.lower(k) == 'name' and v == name:
                        has_name = True
                    elif util.lower(k) == 'checked':
                        check = True
                    # Stop reading attributes as soon as all three conditions are satisfied.
                    if is_radio and check and has_name and get_parent_form(child) is form:
                        checked = True
                        break
            if checked:
                break
        if not checked:
            match = True
        # Remember the verdict for this form/name pair.
        self.cached_indeterminate_forms.append([form, name, match])

    return match
| 1067 | |
def match_lang(self, el, langs):
    """
    Match languages.

    The language is taken from the closest `lang` attribute (or `lang` in the XML namespace)
    found on the element or one of its ancestors. If none is found, a cached or freshly
    located `meta` tag with `http-equiv="content-language"` is used instead. The element
    matches when every pattern group in `langs` has at least one pattern accepted by
    `extended_language_filter` for that language.
    """

    match = False
    has_ns = self.supports_namespaces()
    root = self.root
    has_html_namespace = self.has_html_namespace

    # Walk parents looking for `lang` (HTML) or `xml:lang` XML property.
    parent = el
    found_lang = None
    last = None
    while not found_lang:
        has_html_ns = self.has_html_ns(parent)
        for k, v in self.iter_attributes(parent):
            attr_ns, attr = self.split_namespace(parent, k)
            if (
                ((not has_ns or has_html_ns) and (util.lower(k) if not self.is_xml else k) == 'lang') or
                (
                    has_ns and not has_html_ns and attr_ns == NS_XML and
                    (util.lower(attr) if not self.is_xml and attr is not None else attr) == 'lang'
                )
            ):
                found_lang = v
                break
        last = parent
        parent = self.get_parent(parent, no_iframe=self.is_html)

        if parent is None:
            # Reached the top: the last node visited becomes the root used for the meta lookup.
            root = last
            has_html_namespace = self.has_html_ns(root)
            parent = last
            break

    # Use cached meta language.
    if not found_lang and self.cached_meta_lang:
        for cache in self.cached_meta_lang:
            if root is cache[0]:
                # May be `False`, which records that an earlier meta search found nothing.
                found_lang = cache[1]

    # If we couldn't find a language, and the document is HTML, look to meta to determine language.
    if found_lang is None and (not self.is_xml or (has_html_namespace and root.name == 'html')):
        # Find head
        found = False
        for tag in ('html', 'head'):
            found = False
            for child in self.get_children(parent, no_iframe=self.is_html):
                if self.get_tag(child) == tag and self.is_html_tag(child):
                    found = True
                    parent = child
                    break
            if not found:  # pragma: no cover
                break

        # Search meta tags
        if found:
            for child in parent:
                if self.is_tag(child) and self.get_tag(child) == 'meta' and self.is_html_tag(parent):
                    c_lang = False
                    content = None
                    for k, v in self.iter_attributes(child):
                        if util.lower(k) == 'http-equiv' and util.lower(v) == 'content-language':
                            c_lang = True
                        if util.lower(k) == 'content':
                            content = v
                        if c_lang and content:
                            found_lang = content
                            self.cached_meta_lang.append((root, found_lang))
                            break
                if found_lang:
                    break
            if not found_lang:
                # Cache the miss so the meta tags are not searched again for this root.
                self.cached_meta_lang.append((root, False))

    # If we determined a language, compare.
    if found_lang:
        for patterns in langs:
            match = False
            for pattern in patterns:
                if self.extended_language_filter(pattern, found_lang):
                    match = True
            if not match:
                break

    return match
| 1153 | |
def match_dir(self, el, directionality):
    """
    Check directionality.

    Returns whether the direction resolved for `el` equals `directionality`. An explicit
    `dir` of `ltr`/`rtl` wins; `auto` (and `bdi` without `dir`) is resolved from the text;
    otherwise the check recurses to the parent, with the root defaulting to left to right.
    A `None` element or a non-HTML tag never matches.
    """

    # If we have to match both left and right, we can't match either.
    if directionality & ct.SEL_DIR_LTR and directionality & ct.SEL_DIR_RTL:
        return False

    if el is None or not self.is_html_tag(el):
        return False

    # Element has defined direction of left to right or right to left
    direction = DIR_MAP.get(util.lower(self.get_attribute_by_name(el, 'dir', '')), None)
    if direction not in (None, 0):
        return direction == directionality

    # Element is the document element (the root) and no direction assigned, assume left to right.
    is_root = self.is_root(el)
    if is_root and direction is None:
        return ct.SEL_DIR_LTR == directionality

    # If `input[type=tel]` and no direction is assigned, assume left to right.
    name = self.get_tag(el)
    is_input = name == 'input'
    is_textarea = name == 'textarea'
    is_bdi = name == 'bdi'
    itype = util.lower(self.get_attribute_by_name(el, 'type', '')) if is_input else ''
    if is_input and itype == 'tel' and direction is None:
        return ct.SEL_DIR_LTR == directionality

    # Auto handling for text inputs
    if ((is_input and itype in ('text', 'search', 'tel', 'url', 'email')) or is_textarea) and direction == 0:
        if is_textarea:
            # A textarea's value is the concatenation of its content strings.
            value = []
            for node in self.get_contents(el, no_iframe=True):
                if self.is_content_string(node):
                    value.append(node)
            value = ''.join(value)
        else:
            value = self.get_attribute_by_name(el, 'value', '')
        if value:
            # The first strongly directional character decides the direction.
            for c in value:
                bidi = unicodedata.bidirectional(c)
                if bidi in ('AL', 'R', 'L'):
                    direction = ct.SEL_DIR_LTR if bidi == 'L' else ct.SEL_DIR_RTL
                    return direction == directionality
            # Assume left to right
            return ct.SEL_DIR_LTR == directionality
        elif is_root:
            return ct.SEL_DIR_LTR == directionality
        return self.match_dir(self.get_parent(el, no_iframe=True), directionality)

    # Auto handling for `bdi` and other non text inputs.
    if (is_bdi and direction is None) or direction == 0:
        direction = self.find_bidi(el)
        if direction is not None:
            return direction == directionality
        elif is_root:
            return ct.SEL_DIR_LTR == directionality
        return self.match_dir(self.get_parent(el, no_iframe=True), directionality)

    # Match parents direction
    return self.match_dir(self.get_parent(el, no_iframe=True), directionality)
| 1216 | |
def match_range(self, el, condition):
    """
    Match `:in-range` / `:out-of-range`.

    This follows what browsers appear to do: they check whether the value is
    out of range and otherwise treat it as in range. A missing or unparsable
    value is therefore never out of range, and so counts as in range. When
    neither `min` nor `max` yields a usable bound, nothing matches at all.
    """

    itype = util.lower(self.get_attribute_by_name(el, 'type'))

    lower = self.get_attribute_by_name(el, 'min', None)
    if lower is not None:
        lower = Inputs.parse_value(itype, lower)
    upper = self.get_attribute_by_name(el, 'max', None)
    if upper is not None:
        upper = Inputs.parse_value(itype, upper)

    # There is no valid min or max, so we cannot evaluate a range
    if lower is None and upper is None:
        return False

    value = self.get_attribute_by_name(el, 'value', None)
    if value is not None:
        value = Inputs.parse_value(itype, value)

    out_of_range = False
    if value is not None:
        if itype in ("date", "datetime-local", "month", "week", "number", "range"):
            out_of_range = (lower is not None and value < lower) or (upper is not None and value > upper)
        elif itype == "time":
            if lower is not None and upper is not None and lower > upper:
                # Time is periodic, so this is a reversed/discontinuous range
                out_of_range = value < lower and value > upper
            else:
                out_of_range = (lower is not None and value < lower) or (upper is not None and value > upper)

    if condition & ct.SEL_IN_RANGE:
        return not out_of_range
    return out_of_range
| 1262 | |
def match_defined(self, el):
    """
    Match defined.

    `:defined` is related to custom elements in a browser. An element is
    treated as defined when any of the following holds:

    - its tag name has no hyphen (so it is not a custom element name),
    - its tag name contains a colon,
    - the parser reports a prefix for it.

    An element whose tag name cannot be determined (`None`) never matches.

    This of course requires the parser to provide us with the proper prefix
    info; if it doesn't, there is nothing we can do.
    """

    name = self.get_tag(el)
    # Guard against a missing tag name instead of failing on `None.find`.
    return name is not None and (
        name.find('-') == -1 or
        name.find(':') != -1 or
        self.get_prefix(el) is not None
    )
| 1283 | |
def match_placeholder_shown(self, el):
    """
    Match placeholder shown according to HTML spec.

    The placeholder counts as shown when the element's text is empty.
    A single newline does not count as content.
    """

    return self.get_text(el) in ('', '\n')
| 1298 | |
def match_selectors(self, el, selectors):
    """
    Check if element matches one of the selectors.

    Each selector in the list is tried in turn; the first one whose checks all pass decides
    the result. For a list flagged `is_not` the outcome is inverted: the result is `False`
    on the first full match and `True` otherwise. A list flagged `is_html` is only evaluated
    for HTML documents, and is evaluated with the `html` namespace and iframe restriction
    temporarily forced on.
    """

    match = False
    is_not = selectors.is_not
    is_html = selectors.is_html

    # Internal selector lists that use the HTML flag, will automatically get the `html` namespace.
    if is_html:
        namespaces = self.namespaces
        iframe_restrict = self.iframe_restrict
        self.namespaces = {'html': NS_XHTML}
        self.iframe_restrict = True

    if not is_html or self.is_html:
        for selector in selectors:
            # Start from the "no match" answer for this list; every `continue` below keeps it.
            match = is_not
            # We have an un-matchable situation (like `:focus`, as you cannot focus an element in this environment)
            if isinstance(selector, ct.SelectorNull):
                continue
            # Verify tag matches
            if not self.match_tag(el, selector.tag):
                continue
            # Verify tag is defined
            if selector.flags & ct.SEL_DEFINED and not self.match_defined(el):
                continue
            # Verify element is root
            if selector.flags & ct.SEL_ROOT and not self.match_root(el):
                continue
            # Verify element is scope
            if selector.flags & ct.SEL_SCOPE and not self.match_scope(el):
                continue
            # Verify element has placeholder shown
            if selector.flags & ct.SEL_PLACEHOLDER_SHOWN and not self.match_placeholder_shown(el):
                continue
            # Verify `nth` matches
            if not self.match_nth(el, selector.nth):
                continue
            # Verify element is empty
            if selector.flags & ct.SEL_EMPTY and not self.match_empty(el):
                continue
            # Verify id matches
            if selector.ids and not self.match_id(el, selector.ids):
                continue
            # Verify classes match
            if selector.classes and not self.match_classes(el, selector.classes):
                continue
            # Verify attribute(s) match
            if not self.match_attributes(el, selector.attributes):
                continue
            # Verify ranges
            if selector.flags & RANGES and not self.match_range(el, selector.flags & RANGES):
                continue
            # Verify language patterns
            if selector.lang and not self.match_lang(el, selector.lang):
                continue
            # Verify pseudo selector patterns
            if selector.selectors and not self.match_subselectors(el, selector.selectors):
                continue
            # Verify relationship selectors
            if selector.relation and not self.match_relations(el, selector.relation):
                continue
            # Validate that the current default selector match corresponds to the first submit button in the form
            if selector.flags & ct.SEL_DEFAULT and not self.match_default(el):
                continue
            # Validate that the unset radio button is among radio buttons with the same name in a form that are
            # also not set.
            if selector.flags & ct.SEL_INDETERMINATE and not self.match_indeterminate(el):
                continue
            # Validate element directionality
            if selector.flags & DIR_FLAGS and not self.match_dir(el, selector.flags & DIR_FLAGS):
                continue
            # Validate that the tag contains the specified text.
            if not self.match_contains(el, selector.contains):
                continue
            # Every check passed: this selector matched, so the list's answer is settled.
            match = not is_not
            break

    # Restore actual namespaces being used for external selector lists
    if is_html:
        self.namespaces = namespaces
        self.iframe_restrict = iframe_restrict

    return match
| 1382 | |
def select(self, limit=0):
    """
    Yield the matching tags found under the targeted tag.

    A `limit` below 1 means "no limit"; otherwise iteration stops once
    `limit` matches have been yielded.
    """

    remaining = None if limit < 1 else limit

    for node in self.get_descendants(self.tag):
        if not self.match(node):
            continue
        yield node
        if remaining is None:
            continue
        remaining -= 1
        if remaining < 1:
            return
| 1396 | |
def closest(self):
    """
    Return the closest match, starting with the targeted tag itself.

    The targeted tag is tested first, then each parent in turn; `None` is
    returned when the chain of parents runs out without a match.
    """

    node = self.tag
    while node is not None:
        if self.match(node):
            return node
        node = self.get_parent(node)
    return None
| 1408 | |
def filter(self):  # noqa A001
    """Return the targeted tag's contents that are not navigable strings and that match."""

    kept = []
    for node in self.get_contents(self.tag):
        if self.is_navigable_string(node):
            continue
        if self.match(node):
            kept.append(node)
    return kept
| 1413 | |
def match(self, el):
    """Match a node: it must not be the document, must be a tag, and must satisfy the selectors."""

    if self.is_doc(el):
        return False
    is_element = self.is_tag(el)
    if not is_element:
        return is_element
    return self.match_selectors(el, self.selectors)
| 1418 | |
| 1419 | |
class CSSMatch(_DocumentNav, _Match):
    """
    The Beautiful Soup CSS match class.

    Adds no members of its own; it only combines the `_DocumentNav` and `_Match` base classes.
    """
| 1422 | |
| 1423 | |
class SoupSieve(ct.Immutable):
    """Compiled Soup Sieve selector object offering the match/select API."""

    __slots__ = ("pattern", "selectors", "namespaces", "custom", "flags", "_hash")

    def __init__(self, pattern, selectors, namespaces, custom, flags):
        """Hand the compiled selector data over to the immutable base class."""

        super().__init__(
            pattern=pattern,
            selectors=selectors,
            namespaces=namespaces,
            custom=custom,
            flags=flags
        )

    def _matcher(self, tag):
        """Build a `CSSMatch` for `tag` from this object's selectors, namespaces and flags."""

        return CSSMatch(self.selectors, tag, self.namespaces, self.flags)

    def match(self, tag):
        """Check whether `tag` matches the selector."""

        return self._matcher(tag).match(tag)

    def closest(self, tag):
        """Return the closest match among `tag` and its ancestors."""

        return self._matcher(tag).closest()

    def filter(self, iterable):  # noqa A001
        """
        Filter.

        When a tag is given, its contents are filtered with a single `CSSMatch`.
        `CSSMatch` can cache certain searches for tags of the same document, and
        a tag's contents all belong to one document, so that optimization applies.

        Any other iterable may mix tags from different documents or detached
        tags, so each item is matched with its own `CSSMatch`.
        """

        if CSSMatch.is_tag(iterable):
            return self._matcher(iterable).filter()

        kept = []
        for node in iterable:
            if CSSMatch.is_navigable_string(node):
                continue
            if self.match(node):
                kept.append(node)
        return kept

    def select_one(self, tag):
        """Return the first selected tag, or `None` when nothing is selected."""

        found = self.select(tag, limit=1)
        if not found:
            return None
        return found[0]

    def select(self, tag, limit=0):
        """Return the selected tags as a list."""

        return list(self.iselect(tag, limit))

    def iselect(self, tag, limit=0):
        """Iterate over the selected tags."""

        yield from self._matcher(tag).select(limit)

    def __repr__(self):  # pragma: no cover
        """Representation."""

        return "SoupSieve(pattern={!r}, namespaces={!r}, custom={!r}, flags={!r})".format(
            self.pattern,
            self.namespaces,
            self.custom,
            self.flags
        )

    __str__ = __repr__
| 1495 | |
| 1496 | |
# Register `SoupSieve` with the pickle helper from `css_types` (presumably so that compiled
# selector objects can be pickled -- confirm against `css_types.pickle_register`).
ct.pickle_register(SoupSieve)
