Mercurial > repos > shellac > sam_consensus_v3
comparison env/lib/python3.9/site-packages/soupsieve/css_match.py @ 0:4f3585e2f14b draft default tip
"planemo upload commit 60cee0fc7c0cda8592644e1aad72851dec82c959"
author | shellac |
---|---|
date | Mon, 22 Mar 2021 18:12:50 +0000 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:4f3585e2f14b |
---|---|
1 """CSS matcher.""" | |
2 from datetime import datetime | |
3 from . import util | |
4 import re | |
5 from .import css_types as ct | |
6 import unicodedata | |
7 from collections.abc import Sequence | |
8 | |
9 import bs4 | |
10 | |
# Empty tag pattern (whitespace okay): finds any character that is not a
# space, tab, carriage return, line feed, or form feed.
RE_NOT_EMPTY = re.compile('[^ \t\r\n\f]')

# Runs of non-whitespace characters (same whitespace set as above).
RE_NOT_WS = re.compile('[^ \t\r\n\f]+')

# Relationships
REL_PARENT = ' '
REL_CLOSE_PARENT = '>'
REL_SIBLING = '~'
REL_CLOSE_SIBLING = '+'

# Relationships for :has() (forward looking).
# Each is the matching backward-looking symbol prefixed with ':'.
REL_HAS_PARENT = ': '
REL_HAS_CLOSE_PARENT = ':>'
REL_HAS_SIBLING = ':~'
REL_HAS_CLOSE_SIBLING = ':+'

# Namespace URIs
NS_XHTML = 'http://www.w3.org/1999/xhtml'
NS_XML = 'http://www.w3.org/XML/1998/namespace'

# Combined flag masks built from the selector flags in `css_types`.
DIR_FLAGS = ct.SEL_DIR_LTR | ct.SEL_DIR_RTL
RANGES = ct.SEL_IN_RANGE | ct.SEL_OUT_OF_RANGE

# Maps a lowercase direction keyword to its direction flag ('auto' maps to 0).
DIR_MAP = {
    'ltr': ct.SEL_DIR_LTR,
    'rtl': ct.SEL_DIR_RTL,
    'auto': 0
}

# Patterns for the textual value formats parsed by `Inputs.parse_value`.
# Years require at least four digits; every other field is exactly two digits.
RE_NUM = re.compile(r"^(?P<value>-?(?:[0-9]{1,}(\.[0-9]+)?|\.[0-9]+))$")
RE_TIME = re.compile(r'^(?P<hour>[0-9]{2}):(?P<minutes>[0-9]{2})$')
RE_MONTH = re.compile(r'^(?P<year>[0-9]{4,})-(?P<month>[0-9]{2})$')
RE_WEEK = re.compile(r'^(?P<year>[0-9]{4,})-W(?P<week>[0-9]{2})$')
RE_DATE = re.compile(r'^(?P<year>[0-9]{4,})-(?P<month>[0-9]{2})-(?P<day>[0-9]{2})$')
RE_DATETIME = re.compile(
    r'^(?P<year>[0-9]{4,})-(?P<month>[0-9]{2})-(?P<day>[0-9]{2})T(?P<hour>[0-9]{2}):(?P<minutes>[0-9]{2})$'
)

# Used by the extended language filter to collapse wildcard subtags
# (`-*-`, repeated `*-`, and a trailing `-*`) in a language range.
RE_WILD_STRIP = re.compile(r'(?:(?:-\*-)(?:\*(?:-|$))*|-\*$)')

# Calendar constants used when validating dates.
MONTHS_30 = (4, 6, 9, 11)  # April, June, September, and November
FEB = 2
SHORT_MONTH = 30
LONG_MONTH = 31
FEB_MONTH = 28
FEB_LEAP_MONTH = 29
DAYS_IN_WEEK = 7
57 | |
58 | |
class _FakeParent:
    """
    Stand-in parent for a parentless element.

    A fragment that has no `BeautifulSoup` document object above it gives
    `nth` selectors nothing to walk. Wrapping the root element in this
    temporary container lets it be traversed as if it were a child.
    """

    def __init__(self, element):
        """Store `element` as the container's only child."""

        self.contents = [element]

    def __len__(self):
        """Return the number of stored children."""

        children = self.contents
        return len(children)
77 | |
78 | |
class _DocumentNav(object):
    """
    Navigate a Beautiful Soup document.

    A collection of node-type checks and tree-traversal helpers. Two helpers
    (`is_iframe` and `is_root`) read `self.root` and `self.is_html` and call
    `self.is_html_tag`, none of which are defined here, so this class is meant
    to be combined with a class that provides them.
    """

    @classmethod
    def assert_valid_input(cls, tag):
        """
        Check if valid input tag or document.

        Raises `TypeError` when `tag` is not a `bs4.Tag`.
        """

        # Fail on unexpected types.
        if not cls.is_tag(tag):
            raise TypeError("Expected a BeautifulSoup 'Tag', but instead received type {}".format(type(tag)))

    @staticmethod
    def is_doc(obj):
        """Is `BeautifulSoup` object."""
        return isinstance(obj, bs4.BeautifulSoup)

    @staticmethod
    def is_tag(obj):
        """Is tag."""
        return isinstance(obj, bs4.Tag)

    @staticmethod
    def is_declaration(obj):  # pragma: no cover
        """Is declaration."""
        return isinstance(obj, bs4.Declaration)

    @staticmethod
    def is_cdata(obj):
        """Is CDATA."""
        return isinstance(obj, bs4.CData)

    @staticmethod
    def is_processing_instruction(obj):  # pragma: no cover
        """Is processing instruction."""
        return isinstance(obj, bs4.ProcessingInstruction)

    @staticmethod
    def is_navigable_string(obj):
        """Is navigable string."""
        return isinstance(obj, bs4.NavigableString)

    @staticmethod
    def is_special_string(obj):
        """Is special string (comment, declaration, CDATA, processing instruction, or doctype)."""
        return isinstance(obj, (bs4.Comment, bs4.Declaration, bs4.CData, bs4.ProcessingInstruction, bs4.Doctype))

    @classmethod
    def is_content_string(cls, obj):
        """Check if node is content string (a navigable string that is not a special string)."""

        return cls.is_navigable_string(obj) and not cls.is_special_string(obj)

    @staticmethod
    def create_fake_parent(el):
        """Create fake parent for a given element."""

        return _FakeParent(el)

    @staticmethod
    def is_xml_tree(el):
        """Check if element (or document) is from a XML tree."""

        return el._is_xml

    def is_iframe(self, el):
        """
        Check if element is an `iframe`.

        The name is compared case-sensitively for XML trees and lowercased otherwise;
        the element must also be an HTML tag according to `self.is_html_tag`.
        """

        return ((el.name if self.is_xml_tree(el) else util.lower(el.name)) == 'iframe') and self.is_html_tag(el)

    def is_root(self, el):
        """
        Return whether element is a root element.

        We check that the element is the root of the tree (which we have already pre-calculated),
        and we check if it is the root element under an `iframe`.
        """

        root = self.root and self.root is el
        if not root:
            parent = self.get_parent(el)
            root = parent is not None and self.is_html and self.is_iframe(parent)
        return root

    def get_contents(self, el, no_iframe=False):
        """
        Yield the direct contents of `el` in document order.

        Nothing is yielded when `no_iframe` is set and `el` is an `iframe`.
        """
        if not no_iframe or not self.is_iframe(el):
            for content in el.contents:
                yield content

    def get_children(self, el, start=None, reverse=False, tags=True, no_iframe=False):
        """
        Get children.

        Yields entries of `el.contents` beginning at index `start` (the first entry,
        or the last one when `reverse` is set, if `start` is `None`) and walking to the
        end of the list in the chosen direction. With `tags` set only tags are yielded.
        Nothing is yielded when `no_iframe` is set and `el` is an `iframe`, or when
        `start` is outside the list.
        """

        if not no_iframe or not self.is_iframe(el):
            last = len(el.contents) - 1
            if start is None:
                index = last if reverse else 0
            else:
                index = start
            end = -1 if reverse else last + 1
            incr = -1 if reverse else 1

            if 0 <= index <= last:
                while index != end:
                    node = el.contents[index]
                    index += incr
                    if not tags or self.is_tag(node):
                        yield node

    def get_descendants(self, el, tags=True, no_iframe=False):
        """
        Get descendants.

        With `no_iframe` set, an `iframe` descendant is itself yielded but everything
        nested inside it is skipped. With `tags` set only tags are yielded.
        """

        if not no_iframe or not self.is_iframe(el):
            # When set, descendants are skipped until this node is reached.
            next_good = None
            for child in el.descendants:

                if next_good is not None:
                    if child is not next_good:
                        continue
                    next_good = None

                is_tag = self.is_tag(child)

                if no_iframe and is_tag and self.is_iframe(child):
                    if child.next_sibling is not None:
                        next_good = child.next_sibling
                    else:
                        # No sibling: resume at whatever follows the deepest last node of the `iframe`.
                        last_child = child
                        while self.is_tag(last_child) and last_child.contents:
                            last_child = last_child.contents[-1]
                        next_good = last_child.next_element
                    yield child
                    if next_good is None:
                        break
                    # Coverage isn't seeing this even though it's executed
                    continue  # pragma: no cover

                if not tags or is_tag:
                    yield child

    def get_parent(self, el, no_iframe=False):
        """Get parent (`None` when `no_iframe` is set and the parent is an `iframe`)."""

        parent = el.parent
        if no_iframe and parent is not None and self.is_iframe(parent):
            parent = None
        return parent

    @staticmethod
    def get_tag_name(el):
        """Get tag."""

        return el.name

    @staticmethod
    def get_prefix_name(el):
        """Get prefix."""

        return el.prefix

    @staticmethod
    def get_uri(el):
        """Get namespace `URI`."""

        return el.namespace

    @classmethod
    def get_next(cls, el, tags=True):
        """Get next sibling (the next sibling that is a tag when `tags` is set)."""

        sibling = el.next_sibling
        while tags and not cls.is_tag(sibling) and sibling is not None:
            sibling = sibling.next_sibling
        return sibling

    @classmethod
    def get_previous(cls, el, tags=True):
        """Get previous sibling (the previous sibling that is a tag when `tags` is set)."""

        sibling = el.previous_sibling
        while tags and not cls.is_tag(sibling) and sibling is not None:
            sibling = sibling.previous_sibling
        return sibling

    @staticmethod
    def has_html_ns(el):
        """
        Check if element has an HTML namespace.

        This is a bit different than whether a element is treated as having an HTML namespace,
        like we do in the case of `is_html_tag`.

        Always returns a real boolean (a falsy `el` or an empty namespace gives `False`).
        """

        ns = getattr(el, 'namespace') if el else None
        return bool(ns and ns == NS_XHTML)

    @staticmethod
    def split_namespace(el, attr_name):
        """
        Return namespace and attribute name without the prefix.

        Both parts are read from `attr_name` itself and are `None` when it lacks them.
        """

        return getattr(attr_name, 'namespace', None), getattr(attr_name, 'name', None)

    @classmethod
    def normalize_value(cls, value):
        """
        Normalize the value to be a string or list of strings.

        `None` becomes an empty string, byte strings are decoded as UTF-8, and the
        members of a sequence are normalized individually. Anything else goes through `str`.
        """

        # Treat `None` as empty string.
        if value is None:
            return ''

        # Pass through strings
        if (isinstance(value, str)):
            return value

        # If it's a byte string, convert it to Unicode, treating it as UTF-8.
        if isinstance(value, bytes):
            return value.decode("utf8")

        # BeautifulSoup supports sequences of attribute values, so make sure the children are strings.
        if isinstance(value, Sequence):
            new_value = []
            for v in value:
                # `str` and `bytes` are sequences too, but they are values here, not nested
                # containers, so they must still be normalized (byte strings get decoded).
                if isinstance(v, Sequence) and not isinstance(v, (str, bytes)):
                    # This is most certainly a user error and will crash and burn later,
                    # but to avoid excessive recursion, kick out now.
                    new_value.append(v)
                else:
                    # Convert the child to a string
                    new_value.append(cls.normalize_value(v))
            return new_value

        # Try and make anything else a string
        return str(value)

    @classmethod
    def get_attribute_by_name(cls, el, name, default=None):
        """
        Get attribute by name.

        XML trees use an exact key lookup. Otherwise each attribute key is lowercased
        before it is compared with `name`. `default` is returned when nothing matches.
        """

        value = default
        if el._is_xml:
            try:
                value = cls.normalize_value(el.attrs[name])
            except KeyError:
                pass
        else:
            for k, v in el.attrs.items():
                if util.lower(k) == name:
                    value = cls.normalize_value(v)
                    break
        return value

    @classmethod
    def iter_attributes(cls, el):
        """Iterate attributes as `(key, normalized value)` pairs."""

        for k, v in el.attrs.items():
            yield k, cls.normalize_value(v)

    @classmethod
    def get_classes(cls, el):
        """Get classes (a string value is split on whitespace)."""

        classes = cls.get_attribute_by_name(el, 'class', [])
        if isinstance(classes, str):
            classes = RE_NOT_WS.findall(classes)
        return classes

    def get_text(self, el, no_iframe=False):
        """Get text: every descendant content string joined into one string."""

        return ''.join(
            [node for node in self.get_descendants(el, tags=False, no_iframe=no_iframe) if self.is_content_string(node)]
        )

    def get_own_text(self, el, no_iframe=False):
        """Get Own Text: the list of content strings that are direct contents of `el`."""

        return [node for node in self.get_contents(el, no_iframe=no_iframe) if self.is_content_string(node)]
356 | |
357 | |
class Inputs(object):
    """Class for parsing and validating input items."""

    @staticmethod
    def validate_day(year, month, day):
        """Validate day: it must exist in the given month of the given (Gregorian) year."""

        max_days = LONG_MONTH
        if month == FEB:
            max_days = FEB_LEAP_MONTH if ((year % 4 == 0) and (year % 100 != 0)) or (year % 400 == 0) else FEB_MONTH
        elif month in MONTHS_30:
            max_days = SHORT_MONTH
        return 1 <= day <= max_days

    @staticmethod
    def validate_week(year, week):
        """
        Validate week: it must be between 1 and the number of ISO weeks in `year`.

        December 28th always falls in the last ISO week of its year, so its week
        number is the week count (52 or 53). December 31st cannot be used for this
        because it may already belong to week 1 of the following year.
        """

        # The Gregorian calendar repeats exactly every 400 years (146097 days, a whole
        # number of weeks), so fold the year into a range `datetime` can always represent.
        # This keeps years below 1000 or above 9999 from raising.
        proxy_year = 2000 + (year % 400)
        max_week = datetime(proxy_year, 12, 28).isocalendar()[1]
        return 1 <= week <= max_week

    @staticmethod
    def validate_month(month):
        """Validate month."""

        return 1 <= month <= 12

    @staticmethod
    def validate_year(year):
        """Validate year."""

        return 1 <= year

    @staticmethod
    def validate_hour(hour):
        """Validate hour."""

        return 0 <= hour <= 23

    @staticmethod
    def validate_minutes(minutes):
        """Validate minutes."""

        return 0 <= minutes <= 59

    @classmethod
    def parse_value(cls, itype, value):
        """
        Parse the input value.

        `itype` selects the expected format: "date", "month", "week", "time",
        "datetime-local", "number" or "range". Returns a tuple of integers for the
        date/time types and a float for "number"/"range". Returns `None` when `itype`
        is not recognized, the text does not match the format, or a field is out of range.
        """

        parsed = None
        if itype == "date":
            m = RE_DATE.match(value)
            if m:
                year = int(m.group('year'), 10)
                month = int(m.group('month'), 10)
                day = int(m.group('day'), 10)
                if cls.validate_year(year) and cls.validate_month(month) and cls.validate_day(year, month, day):
                    parsed = (year, month, day)
        elif itype == "month":
            m = RE_MONTH.match(value)
            if m:
                year = int(m.group('year'), 10)
                month = int(m.group('month'), 10)
                if cls.validate_year(year) and cls.validate_month(month):
                    parsed = (year, month)
        elif itype == "week":
            m = RE_WEEK.match(value)
            if m:
                year = int(m.group('year'), 10)
                week = int(m.group('week'), 10)
                if cls.validate_year(year) and cls.validate_week(year, week):
                    parsed = (year, week)
        elif itype == "time":
            m = RE_TIME.match(value)
            if m:
                hour = int(m.group('hour'), 10)
                minutes = int(m.group('minutes'), 10)
                if cls.validate_hour(hour) and cls.validate_minutes(minutes):
                    parsed = (hour, minutes)
        elif itype == "datetime-local":
            m = RE_DATETIME.match(value)
            if m:
                year = int(m.group('year'), 10)
                month = int(m.group('month'), 10)
                day = int(m.group('day'), 10)
                hour = int(m.group('hour'), 10)
                minutes = int(m.group('minutes'), 10)
                if (
                    cls.validate_year(year) and cls.validate_month(month) and cls.validate_day(year, month, day) and
                    cls.validate_hour(hour) and cls.validate_minutes(minutes)
                ):
                    parsed = (year, month, day, hour, minutes)
        elif itype in ("number", "range"):
            m = RE_NUM.match(value)
            if m:
                parsed = float(m.group('value'))
        return parsed
457 | |
458 | |
459 class _Match(object): | |
460 """Perform CSS matching.""" | |
461 | |
def __init__(self, selectors, scope, namespaces, flags):
    """
    Set up matching state for `selectors` relative to `scope`.

    Validates `scope`, resets the caches, and then locates the top of the tree
    to record the root element and whether the tree is XML and/or HTML.
    """

    self.assert_valid_input(scope)
    self.tag = scope
    self.cached_meta_lang = []
    self.cached_default_forms = []
    self.cached_indeterminate_forms = []
    self.selectors = selectors
    if namespaces is None:
        self.namespaces = {}
    else:
        self.namespaces = namespaces
    self.flags = flags
    self.iframe_restrict = False

    # Climb until there is no parent left; that node is the top of the tree.
    top = scope
    while True:
        above = self.get_parent(top)
        if not above:
            break
        top = above

    # The root is the top node itself, or the first child tag of a document object.
    if self.is_doc(top):
        root = next(iter(self.get_children(top)), None)
    else:
        root = top

    self.root = root
    # Scoping to the document object itself means scoping to its root element.
    self.scope = root if scope is top else scope
    self.has_html_namespace = self.has_html_ns(root)

    # A document can be both XML and HTML (XHTML)
    self.is_xml = self.is_xml_tree(top)
    self.is_html = not self.is_xml or self.has_html_namespace
496 | |
def supports_namespaces(self):
    """Report whether namespaces apply: the tree is XML or its root has the HTML namespace."""

    if self.is_xml:
        return self.is_xml
    return self.has_html_namespace
501 | |
def get_tag_ns(self, el):
    """
    Return the namespace to use for `el`.

    Without namespace support every element is treated as XHTML. Otherwise the
    element's own namespace `URI` is used, or an empty string if it has none.
    """

    if not self.supports_namespaces():
        return NS_XHTML
    uri = self.get_uri(el)
    return uri if uri else ''
513 | |
def is_html_tag(self, el):
    """Report whether the namespace resolved for `el` is the XHTML namespace."""

    resolved = self.get_tag_ns(el)
    return resolved == NS_XHTML
518 | |
def get_tag(self, el):
    """Return the tag name of `el`, lowercased unless the tree is XML."""

    name = self.get_tag_name(el)
    if name is None or self.is_xml:
        return name
    return util.lower(name)
524 | |
def get_prefix(self, el):
    """Return the namespace prefix of `el`, lowercased unless the tree is XML."""

    prefix = self.get_prefix_name(el)
    if prefix is None or self.is_xml:
        return prefix
    return util.lower(prefix)
530 | |
def find_bidi(self, el):
    """
    Determine directionality from the text found under `el`.

    Returns the direction flag for the first character with a strong
    bidirectional class ('L', 'R' or 'AL'), or `None` if there is none.
    """

    skipped_tags = ('bdi', 'script', 'style', 'textarea', 'iframe')

    for node in self.get_children(el, tags=False):

        if not self.is_tag(node):
            # Skip `doctype` comments, etc.
            if self.is_special_string(node):
                continue

            # The first strongly typed character of a text node decides the direction.
            for char in node:
                category = unicodedata.bidirectional(char)
                if category == 'L':
                    return ct.SEL_DIR_LTR
                if category in ('AL', 'R'):
                    return ct.SEL_DIR_RTL
            continue

        # Child tags: skip the ones excluded by the specification, non-HTML tags,
        # and tags that declare a recognized `dir` of their own.
        declared = util.lower(self.get_attribute_by_name(node, 'dir', ''))
        direction = DIR_MAP.get(declared, None)
        if self.get_tag(node) in skipped_tags or not self.is_html_tag(node) or direction is not None:
            continue  # pragma: no cover

        # Otherwise look inside the child; keep scanning if it could not decide.
        nested = self.find_bidi(node)
        if nested is not None:
            return nested

    return None
566 | |
def extended_language_filter(self, lang_range, lang_tag):
    """
    Check whether `lang_tag` satisfies the language range `lang_range`.

    Both are compared case-insensitively, subtag by subtag. The first subtag
    must match exactly unless the range starts with `*`. Every later range
    subtag must then appear, in order, among the remaining tag subtags; tag
    subtags in between may be skipped unless they are a single character.
    """

    ranges = RE_WILD_STRIP.sub('-', lang_range).lower().split('-')
    subtags = lang_tag.lower().split('-')

    # Primary tag needs to match
    if ranges[0] != '*' and ranges[0] != subtags[0]:
        return False

    total_ranges = len(ranges)
    total_subtags = len(subtags)
    range_pos = 1
    subtag_pos = 1

    # Match until we run out of ranges
    while range_pos < total_ranges:
        # Ran out of subtags, but we still have ranges
        if subtag_pos >= total_subtags:
            return False

        wanted = ranges[range_pos]
        current = subtags[subtag_pos]

        # Empty range
        if not wanted:
            return False

        if current == wanted:
            # Matched range
            range_pos += 1
        elif len(current) == 1:
            # Implicit wildcard cannot match singletons
            return False

        # Matched (explicitly or implicitly), so grab next subtag
        subtag_pos += 1

    return True
617 | |
def match_attribute_name(self, el, attr, prefix):
    """
    Match attribute name and return value if it exists.

    `attr` is the attribute name to look for and `prefix` the optional namespace
    prefix from the selector. Returns the value of the first matching attribute
    of `el` (as produced by `iter_attributes`), or `None` if nothing matches.
    Names are compared exactly for XML trees and case-insensitively otherwise.
    """

    value = None
    if self.supports_namespaces():
        value = None
        # If we have not defined namespaces, we can't very well find them, so don't bother trying.
        if prefix:
            ns = self.namespaces.get(prefix)
            # An unregistered prefix can never match, except for the wildcard `*`.
            if ns is None and prefix != '*':
                return None
        else:
            ns = None

        for k, v in self.iter_attributes(el):

            # Get attribute parts
            namespace, name = self.split_namespace(el, k)

            # Can't match a prefix attribute as we haven't specified one to match
            # Try to match it normally as a whole `p:a` as selector may be trying `p\:a`.
            if ns is None:
                if (self.is_xml and attr == k) or (not self.is_xml and util.lower(attr) == util.lower(k)):
                    value = v
                    break
                # Coverage is not finding this even though it is executed.
                # Adding a print statement before this (and erasing coverage) causes coverage to find the line.
                # Ignore the false positive message.
                continue  # pragma: no cover

            # We can't match our desired prefix attribute as the attribute doesn't have a prefix
            # NOTE: `and` binds tighter than `or`, so this reads
            # `namespace is None or (ns != namespace and prefix != '*')`.
            if namespace is None or ns != namespace and prefix != '*':
                continue

            # The attribute doesn't match.
            if (util.lower(attr) != util.lower(name)) if not self.is_xml else (attr != name):
                continue

            value = v
            break
    else:
        # No namespace support: compare whole attribute names case-insensitively.
        for k, v in self.iter_attributes(el):
            if util.lower(attr) != util.lower(k):
                continue
            value = v
            break
    return value
665 | |
def match_namespace(self, el, tag):
    """
    Check the namespace of `el` against the prefix of the selector `tag`.

    The prefix is looked up in `self.namespaces`; the empty-string key holds
    the default namespace.
    """

    prefix = tag.prefix
    element_ns = self.get_tag_ns(el)

    # No prefix given: we must match the default namespace if one is defined.
    if prefix is None:
        default_ns = self.namespaces.get('')
        return default_ns is None or element_ns == default_ns

    # If we specified `|tag`, we must not have a namespace.
    if prefix == '':
        return not element_ns

    # `*|tag` accepts any namespace.
    if prefix == '*':
        return True

    # Otherwise the prefix must be registered and map to the element's namespace.
    wanted_ns = self.namespaces.get(prefix, None)
    return wanted_ns is not None and element_ns == wanted_ns
686 | |
def match_attributes(self, el, attributes):
    """
    Check that `el` satisfies every attribute selector in `attributes`.

    Each attribute must be present. If the selector carries a pattern, the
    value (list values are joined with spaces) must match it. For XML trees
    the selector's XML-specific pattern is preferred when it has one.
    """

    for selector in attributes or ():
        value = self.match_attribute_name(el, selector.attribute, selector.prefix)
        if self.is_xml and selector.xml_type_pattern:
            pattern = selector.xml_type_pattern
        else:
            pattern = selector.pattern
        if isinstance(value, list):
            value = ' '.join(value)
        # The attribute is missing altogether.
        if value is None:
            return False
        # A selector without a pattern only requires presence.
        if pattern is not None and pattern.match(value) is None:
            return False
    return True
706 | |
def match_tagname(self, el, tag):
    """
    Check the tag name of `el` against the selector `tag`.

    A selector with no name, or with the name `*`, matches any element. The
    selector name is lowercased first unless the tree is XML.
    """

    wanted = tag.name
    if wanted is not None and not self.is_xml:
        wanted = util.lower(wanted)
    return wanted is None or wanted in (self.get_tag(el), '*')
715 | |
def match_tag(self, el, tag):
    """
    Check `el` against the selector `tag` (namespace and name).

    A missing selector tag (`None`) matches everything. Both checks are always
    evaluated.
    """

    if tag is None:
        return True

    # Verify namespace, then the name.
    namespace_ok = self.match_namespace(el, tag)
    name_ok = self.match_tagname(el, tag)
    return True if namespace_ok and name_ok else False
727 | |
def match_past_relations(self, el, relation):
    """
    Match a backward-looking relationship (ancestors and preceding siblings).

    The relationship type is taken from the first entry of `relation`.
    """

    kind = relation[0].rel_type
    found = False

    if kind == REL_PARENT:
        # Any ancestor may match.
        ancestor = self.get_parent(el, no_iframe=self.iframe_restrict)
        while not found and ancestor:
            found = self.match_selectors(ancestor, relation)
            ancestor = self.get_parent(ancestor, no_iframe=self.iframe_restrict)
    elif kind == REL_CLOSE_PARENT:
        # Only the direct parent may match.
        direct_parent = self.get_parent(el, no_iframe=self.iframe_restrict)
        if direct_parent:
            found = self.match_selectors(direct_parent, relation)
    elif kind == REL_SIBLING:
        # Any preceding sibling tag may match.
        earlier = self.get_previous(el)
        while not found and earlier:
            found = self.match_selectors(earlier, relation)
            earlier = self.get_previous(earlier)
    elif kind == REL_CLOSE_SIBLING:
        # Only the immediately preceding sibling tag may match.
        earlier = self.get_previous(el)
        if earlier and self.is_tag(earlier):
            found = self.match_selectors(earlier, relation)

    return found
751 | |
def match_future_child(self, parent, relation, recursive=False):
    """
    Match `relation` against the children of `parent`.

    With `recursive` set every descendant is considered instead of only the
    direct children. The search stops at the first match.
    """

    walk = self.get_descendants if recursive else self.get_children
    outcome = False
    for candidate in walk(parent, no_iframe=self.iframe_restrict):
        outcome = self.match_selectors(candidate, relation)
        if outcome:
            return outcome
    return outcome
762 | |
def match_future_relations(self, el, relation):
    """
    Match a forward-looking relationship (descendants and following siblings).

    The relationship type is taken from the first entry of `relation`.
    """

    kind = relation[0].rel_type
    found = False

    if kind == REL_HAS_PARENT:
        # Any descendant may match.
        found = self.match_future_child(el, relation, True)
    elif kind == REL_HAS_CLOSE_PARENT:
        # Only direct children may match.
        found = self.match_future_child(el, relation)
    elif kind == REL_HAS_SIBLING:
        # Any following sibling tag may match.
        later = self.get_next(el)
        while not found and later:
            found = self.match_selectors(later, relation)
            later = self.get_next(later)
    elif kind == REL_HAS_CLOSE_SIBLING:
        # Only the immediately following sibling tag may match.
        later = self.get_next(el)
        if later and self.is_tag(later):
            found = self.match_selectors(later, relation)

    return found
781 | |
def match_relations(self, el, relation):
    """
    Match the relationship of `el` to other elements.

    Relationship types that start with ':' look forward; all others look backward.
    """

    looks_forward = relation[0].rel_type.startswith(':')
    handler = self.match_future_relations if looks_forward else self.match_past_relations
    return handler(el, relation)
793 | |
def match_id(self, el, ids):
    """Check that the `id` attribute of `el` equals every entry of `ids`."""

    return all(wanted == self.get_attribute_by_name(el, 'id', '') for wanted in ids)
803 | |
def match_classes(self, el, classes):
    """Check that `el` carries every class listed in `classes`."""

    present = self.get_classes(el)
    return all(name in present for name in classes)
814 | |
def match_root(self, el):
    """
    Match element as root.

    Besides being a root element, `el` must have no sibling on either side
    that is a tag, non-blank text, or CDATA.
    """

    is_root = self.is_root(el)
    if not is_root:
        return is_root

    # Scan the preceding siblings first, then the following ones.
    for step in (self.get_previous, self.get_next):
        neighbor = step(el, tags=False)
        while neighbor is not None:
            if (
                self.is_tag(neighbor) or
                (self.is_content_string(neighbor) and neighbor.strip()) or
                self.is_cdata(neighbor)
            ):
                return False
            neighbor = step(neighbor, tags=False)

    return is_root
840 | |
def match_scope(self, el):
    """Report whether `el` is the very element recorded as the scope."""

    return el is self.scope
845 | |
def match_nth_tag_type(self, el, child):
    """Report whether `child` has the same tag name and namespace as `el` (for `of-type`)."""

    if self.get_tag(child) != self.get_tag(el):
        return False
    return self.get_tag_ns(child) == self.get_tag_ns(el)
853 | |
def match_nth(self, el, nth):
    """
    Match `nth` elements.

    Every entry of `nth` must be satisfied for `el` to match. For each entry the
    target position is `a * count + b` when the entry's `n` is truthy, or just `a`
    otherwise (`a`, `b` and `n` are read from the entry). Siblings are counted from
    the end when `last` is set, only siblings of the same type count when `of_type`
    is set, and only siblings matching the entry's `selectors` count when it has any.
    """

    matched = True

    for n in nth:
        matched = False
        # With `of S`, the element itself must match `S` to be counted at all.
        if n.selectors and not self.match_selectors(el, n.selectors):
            break
        parent = self.get_parent(el)
        if parent is None:
            # A parentless element gets a temporary parent so it can be walked as a child.
            parent = self.create_fake_parent(el)
        last = n.last
        last_index = len(parent) - 1
        # Position in the parent's contents where the scan (re)starts.
        index = last_index if last else 0
        # Number of counted siblings seen so far.
        relative_index = 0
        a = n.a
        b = n.b
        var = n.n
        count = 0
        count_incr = 1
        # Scan direction: backwards when counting from the end.
        factor = -1 if last else 1
        idx = last_idx = a * count + b if var else a

        # We can only adjust bounds within a variable index
        if var:
            # Abort if our nth index is out of bounds and only getting further out of bounds as we increment.
            # Otherwise, increment to try to get in bounds.
            adjust = None
            while idx < 1 or idx > last_index:
                if idx < 0:
                    diff_low = 0 - idx
                    if adjust is not None and adjust == 1:
                        break
                    adjust = -1
                    count += count_incr
                    idx = last_idx = a * count + b if var else a
                    diff = 0 - idx
                    if diff >= diff_low:
                        break
                else:
                    diff_high = idx - last_index
                    if adjust is not None and adjust == -1:
                        break
                    adjust = 1
                    count += count_incr
                    idx = last_idx = a * count + b if var else a
                    diff = idx - last_index
                    if diff >= diff_high:
                        break
                    diff_high = diff

            # If a < 0, our count is working backwards, so floor the index by increasing the count.
            # Find the count that yields the lowest, in bound value and use that.
            # Lastly reverse count increment so that we'll increase our index.
            lowest = count
            if a < 0:
                while idx >= 1:
                    lowest = count
                    count += count_incr
                    idx = last_idx = a * count + b if var else a
                count_incr = -1
            count = lowest
            idx = last_idx = a * count + b if var else a

        # Evaluate elements while our calculated nth index is still in range
        while 1 <= idx <= last_index + 1:
            child = None
            # Evaluate while our child index is still in range.
            for child in self.get_children(parent, start=index, reverse=factor < 0, tags=False):
                index += factor
                if not self.is_tag(child):
                    continue
                # Handle `of S` in `nth-child`
                if n.selectors and not self.match_selectors(child, n.selectors):
                    continue
                # Handle `of-type`
                if n.of_type and not self.match_nth_tag_type(el, child):
                    continue
                relative_index += 1
                if relative_index == idx:
                    if child is el:
                        matched = True
                    else:
                        # Target position reached by another element; try the next index.
                        break
                if child is el:
                    # Reached the element itself, so scanning further cannot change the outcome.
                    break
            if child is el:
                break
            last_idx = idx
            count += count_incr
            if count < 0:
                # Count is counting down and has now ventured into invalid territory.
                break
            idx = a * count + b if var else a
            # The index did not move, so no other position is left to test.
            if last_idx == idx:
                break
        if not matched:
            break
    return matched
954 | |
def match_empty(self, el):
    """
    Report whether `el` is empty.

    An element is empty when it has no child tag and no content string holding
    a non-whitespace character.
    """

    for node in self.get_children(el, tags=False):
        if self.is_tag(node):
            return False
        if self.is_content_string(node) and RE_NOT_EMPTY.search(node):
            return False
    return True
967 | |
def match_subselectors(self, el, selectors):
    """
    Match selectors.

    Returns `True` only when `el` matches every selector list in `selectors`
    (trivially `True` when there are none). Evaluation stops at the first
    selector list that fails, since the outcome is already decided then.
    """

    return all(self.match_selectors(el, sel) for sel in selectors)
976 | |
def match_contains(self, el, contains):
    """
    Match element if it contains text.

    Every entry of `contains` must be satisfied, and an entry is satisfied when
    any one of its `text` values is found. Entries flagged `own` are checked
    against each of the element's own text nodes; the others are checked against
    the combined text of all its descendants.
    """

    # The two kinds of entry need different content (a list of own text nodes versus
    # one joined string), so each is fetched lazily and cached separately.
    own_text = None
    full_text = None

    for contain_list in contains:
        if contain_list.own:
            if own_text is None:
                own_text = self.get_own_text(el, no_iframe=self.is_html)
            found = any(text in chunk for text in contain_list.text for chunk in own_text)
        else:
            if full_text is None:
                full_text = self.get_text(el, no_iframe=self.is_html)
            found = any(text in full_text for text in contain_list.text)

        # All entries are required, so the first miss settles the result.
        if not found:
            return False
    return True
1004 | |
def match_default(self, el):
    """
    Match default.

    The element matches when it is the first `input` or `button` with
    `type="submit"` found among the descendants of its enclosing HTML form.
    Located buttons are remembered in `self.cached_default_forms`.
    """

    # Climb the ancestors until the enclosing HTML `form` is found (if any).
    form = None
    ancestor = self.get_parent(el, no_iframe=True)
    while ancestor and form is None:
        if self.get_tag(ancestor) == 'form' and self.is_html_tag(ancestor):
            form = ancestor
        else:
            ancestor = self.get_parent(ancestor, no_iframe=True)

    # A cached form already knows its default button; just compare identities.
    for cached_form, cached_button in self.cached_default_forms:
        if cached_form is form:
            return cached_button is el

    # Form not cached yet: scan for the first submit control.
    for node in self.get_descendants(form, no_iframe=True):
        tag_name = self.get_tag(node)
        # Can't do nested forms (haven't figured out why we never hit this)
        if tag_name == 'form':  # pragma: no cover
            break
        if tag_name in ('input', 'button'):
            kind = self.get_attribute_by_name(node, 'type', '')
            if kind and util.lower(kind) == 'submit':
                self.cached_default_forms.append([form, node])
                return el is node
    return False
1043 | |
def match_indeterminate(self, el):
    """
    Match indeterminate.

    The element matches when no other `input` of type `radio` with the same
    `name`, belonging to the same form, carries a `checked` attribute. Results
    are remembered per (form, name) pair in `self.cached_indeterminate_forms`.
    """

    name = self.get_attribute_by_name(el, 'name')

    def get_parent_form(node):
        """Return the nearest HTML `form` ancestor, or the topmost ancestor if there is none."""

        ancestor = self.get_parent(node, no_iframe=True)
        while True:
            if self.get_tag(ancestor) == 'form' and self.is_html_tag(ancestor):
                return ancestor
            previous = ancestor
            ancestor = self.get_parent(ancestor, no_iframe=True)
            if ancestor is None:
                # Ran out of ancestors: the last one seen stands in for the form.
                return previous

    form = get_parent_form(el)

    # Reuse an earlier verdict for this form and radio group name when available.
    for cached_form, cached_name, cached_result in self.cached_indeterminate_forms:
        if cached_form is form and cached_name == name:
            return cached_result is True

    def is_checked_peer(node):
        """Report whether `node` is a checked radio of the same name within the same form."""

        radio = has_name = is_checked = False
        for key, value in self.iter_attributes(node):
            lowered = util.lower(key)
            if lowered == 'type' and util.lower(value) == 'radio':
                radio = True
            elif lowered == 'name' and value == name:
                has_name = True
            elif lowered == 'checked':
                is_checked = True
            if radio and is_checked and has_name and get_parent_form(node) is form:
                return True
        return False

    # Not cached: the group is indeterminate only if no other radio in it is checked.
    any_checked = any(
        is_checked_peer(node)
        for node in self.get_descendants(form, no_iframe=True)
        if node is not el and self.get_tag(node) == 'input'
    )
    match = not any_checked
    self.cached_indeterminate_forms.append([form, name, match])

    return match
1104 | |
def match_lang(self, el, langs):
    """
    Match languages.

    The element's language is resolved in this order:

    1. The first non-empty `lang` (or `xml:lang` for namespaced, non-HTML nodes)
       attribute found on the element or one of its ancestors.
    2. A previously cached `meta` language for the same root.
    3. A `<meta http-equiv="content-language" content="...">` found under `html > head`.

    `langs` is an iterable of pattern groups. Every group must have at least one
    pattern accepted by `self.extended_language_filter` for the element to match.
    Returns `False` when no language could be determined.
    """

    match = False
    has_ns = self.supports_namespaces()
    root = self.root
    has_html_namespace = self.has_html_namespace

    # Walk parents looking for `lang` (HTML) or `xml:lang` XML property.
    parent = el
    found_lang = None
    last = None
    # NOTE: the loop condition is a truthiness test, so an empty `lang` value does not stop the walk.
    while not found_lang:
        has_html_ns = self.has_html_ns(parent)
        for k, v in self.iter_attributes(parent):
            attr_ns, attr = self.split_namespace(parent, k)
            if (
                ((not has_ns or has_html_ns) and (util.lower(k) if not self.is_xml else k) == 'lang') or
                (
                    has_ns and not has_html_ns and attr_ns == NS_XML and
                    (util.lower(attr) if not self.is_xml and attr is not None else attr) == 'lang'
                )
            ):
                found_lang = v
                break
        last = parent
        parent = self.get_parent(parent, no_iframe=self.is_html)

        if parent is None:
            # Ran out of ancestors: treat the last node seen as the root for the steps below.
            root = last
            has_html_namespace = self.has_html_ns(root)
            parent = last
            break

    # Use cached meta language.
    # A cached value of `False` records that this root was already searched without success.
    if not found_lang and self.cached_meta_lang:
        for cache in self.cached_meta_lang:
            if root is cache[0]:
                found_lang = cache[1]

    # If we couldn't find a language, and the document is HTML, look to meta to determine language.
    # The `is None` test skips this search when the cache supplied `False`.
    if found_lang is None and (not self.is_xml or (has_html_namespace and root.name == 'html')):
        # Find head
        found = False
        for tag in ('html', 'head'):
            found = False
            for child in self.get_children(parent, no_iframe=self.is_html):
                if self.get_tag(child) == tag and self.is_html_tag(child):
                    found = True
                    parent = child
                    break
            if not found:  # pragma: no cover
                break

        # Search meta tags
        if found:
            for child in parent:
                if self.is_tag(child) and self.get_tag(child) == 'meta' and self.is_html_tag(parent):
                    c_lang = False
                    content = None
                    for k, v in self.iter_attributes(child):
                        if util.lower(k) == 'http-equiv' and util.lower(v) == 'content-language':
                            c_lang = True
                        if util.lower(k) == 'content':
                            content = v
                        if c_lang and content:
                            found_lang = content
                            self.cached_meta_lang.append((root, found_lang))
                            break
                if found_lang:
                    break
            if not found_lang:
                # Remember the failed search so this root is not scanned again.
                self.cached_meta_lang.append((root, False))

    # If we determined a language, compare.
    if found_lang:
        for patterns in langs:
            match = False
            for pattern in patterns:
                if self.extended_language_filter(pattern, found_lang):
                    match = True
            if not match:
                break

    return match
1190 | |
def match_dir(self, el, directionality):
    """
    Check directionality.

    Resolves the direction of `el` and compares it with `directionality`
    (a combination of the `ct.SEL_DIR_LTR` / `ct.SEL_DIR_RTL` flags).
    When the element does not settle its own direction, the check recurses
    into the parent. Returns `False` for `None` or non-HTML elements.
    """

    # If we have to match both left and right, we can't match either.
    if directionality & ct.SEL_DIR_LTR and directionality & ct.SEL_DIR_RTL:
        return False

    # This also ends the recursion once the parent chain is exhausted.
    if el is None or not self.is_html_tag(el):
        return False

    # Element has defined direction of left to right or right to left
    # `direction` is `None` when `dir` is missing or unrecognized, and `0` for `dir="auto"`.
    direction = DIR_MAP.get(util.lower(self.get_attribute_by_name(el, 'dir', '')), None)
    if direction not in (None, 0):
        return direction == directionality

    # Element is the document element (the root) and no direction assigned, assume left to right.
    is_root = self.is_root(el)
    if is_root and direction is None:
        return ct.SEL_DIR_LTR == directionality

    # If `input[type=tel]` and no direction is assigned, assume left to right.
    name = self.get_tag(el)
    is_input = name == 'input'
    is_textarea = name == 'textarea'
    is_bdi = name == 'bdi'
    itype = util.lower(self.get_attribute_by_name(el, 'type', '')) if is_input else ''
    if is_input and itype == 'tel' and direction is None:
        return ct.SEL_DIR_LTR == directionality

    # Auto handling for text inputs
    if ((is_input and itype in ('text', 'search', 'tel', 'url', 'email')) or is_textarea) and direction == 0:
        if is_textarea:
            # A `textarea` keeps its value in its content strings rather than in a `value` attribute.
            value = []
            for node in self.get_contents(el, no_iframe=True):
                if self.is_content_string(node):
                    value.append(node)
            value = ''.join(value)
        else:
            value = self.get_attribute_by_name(el, 'value', '')
        if value:
            # The first character with a strong bidirectional class decides the direction.
            for c in value:
                bidi = unicodedata.bidirectional(c)
                if bidi in ('AL', 'R', 'L'):
                    direction = ct.SEL_DIR_LTR if bidi == 'L' else ct.SEL_DIR_RTL
                    return direction == directionality
            # Assume left to right
            return ct.SEL_DIR_LTR == directionality
        elif is_root:
            return ct.SEL_DIR_LTR == directionality
        return self.match_dir(self.get_parent(el, no_iframe=True), directionality)

    # Auto handling for `bdi` and other non text inputs.
    if (is_bdi and direction is None) or direction == 0:
        direction = self.find_bidi(el)
        if direction is not None:
            return direction == directionality
        elif is_root:
            return ct.SEL_DIR_LTR == directionality
        return self.match_dir(self.get_parent(el, no_iframe=True), directionality)

    # Match parents direction
    return self.match_dir(self.get_parent(el, no_iframe=True), directionality)
1253 | |
def match_range(self, el, condition):
    """
    Match range.

    The behavior follows what browsers appear to do: a value is first tested for
    being out of range, and anything that is not out of range is in range. A
    missing or unparsable value is therefore never out of range, and so counts
    as in range. When neither `min` nor `max` yields a usable value, no range
    can be evaluated and the element does not match at all.
    """

    itype = util.lower(self.get_attribute_by_name(el, 'type'))

    raw_min = self.get_attribute_by_name(el, 'min', None)
    mn = Inputs.parse_value(itype, raw_min) if raw_min is not None else None
    raw_max = self.get_attribute_by_name(el, 'max', None)
    mx = Inputs.parse_value(itype, raw_max) if raw_max is not None else None

    # There is no valid min or max, so we cannot evaluate a range
    if mn is None and mx is None:
        return False

    raw_value = self.get_attribute_by_name(el, 'value', None)
    value = Inputs.parse_value(itype, raw_value) if raw_value is not None else None

    out_of_range = False
    if value is not None:
        if itype == "time" and mn is not None and mx is not None and mn > mx:
            # Time is periodic, so this is a reversed/discontinuous range
            out_of_range = value < mn and value > mx
        elif itype in ("date", "datetime-local", "month", "week", "number", "range", "time"):
            # Ordinary range: out of range when below the minimum or above the maximum.
            out_of_range = (mn is not None and value < mn) or (mx is not None and value > mx)

    if condition & ct.SEL_IN_RANGE:
        return not out_of_range
    return out_of_range
1299 | |
def match_defined(self, el):
    """
    Match defined.

    `:defined` is related to custom elements in a browser. Here an element is
    reported as defined when any of the following holds:

    - Its tag name has no hyphen (it is not a custom element name).
    - Its tag name contains a colon.
    - The element carries a prefix.

    This relies on the parser providing proper prefix information; if it
    doesn't, there is nothing we can do.
    """

    name = self.get_tag(el)
    if '-' not in name:
        return True
    if ':' in name:
        return True
    return self.get_prefix(el) is not None
1320 | |
def match_placeholder_shown(self, el):
    """
    Match placeholder shown according to HTML spec.

    - The element's text is checked for content. A single newline does not count as content.

    """

    return self.get_text(el) in ('', '\n')
1335 | |
def match_selectors(self, el, selectors):
    """
    Check if element matches one of the selectors.

    `selectors` is a selector list exposing `is_not` and `is_html`. For a normal
    list the element matches when any selector matches; for a `:not()` list
    (`is_not` set) the result is inverted. Lists flagged `is_html` only apply to
    HTML documents and are evaluated with a temporary `html` namespace mapping
    and iframe restriction, which are always restored afterwards, even if a
    sub-matcher raises.
    """

    match = False
    is_not = selectors.is_not
    is_html = selectors.is_html

    # Internal selector lists that use the HTML flag, will automatically get the `html` namespace.
    if is_html:
        namespaces = self.namespaces
        iframe_restrict = self.iframe_restrict
        self.namespaces = {'html': NS_XHTML}
        self.iframe_restrict = True

    try:
        if not is_html or self.is_html:
            for selector in selectors:
                match = is_not
                # We have a un-matchable situation (like `:focus` as you cannot focus an element in this environment)
                if isinstance(selector, ct.SelectorNull):
                    continue
                # Verify tag matches
                if not self.match_tag(el, selector.tag):
                    continue
                # Verify tag is defined
                if selector.flags & ct.SEL_DEFINED and not self.match_defined(el):
                    continue
                # Verify element is root
                if selector.flags & ct.SEL_ROOT and not self.match_root(el):
                    continue
                # Verify element is scope
                if selector.flags & ct.SEL_SCOPE and not self.match_scope(el):
                    continue
                # Verify element has placeholder shown
                if selector.flags & ct.SEL_PLACEHOLDER_SHOWN and not self.match_placeholder_shown(el):
                    continue
                # Verify `nth` matches
                if not self.match_nth(el, selector.nth):
                    continue
                # Verify element is empty
                if selector.flags & ct.SEL_EMPTY and not self.match_empty(el):
                    continue
                # Verify id matches
                if selector.ids and not self.match_id(el, selector.ids):
                    continue
                # Verify classes match
                if selector.classes and not self.match_classes(el, selector.classes):
                    continue
                # Verify attribute(s) match
                if not self.match_attributes(el, selector.attributes):
                    continue
                # Verify ranges
                if selector.flags & RANGES and not self.match_range(el, selector.flags & RANGES):
                    continue
                # Verify language patterns
                if selector.lang and not self.match_lang(el, selector.lang):
                    continue
                # Verify pseudo selector patterns
                if selector.selectors and not self.match_subselectors(el, selector.selectors):
                    continue
                # Verify relationship selectors
                if selector.relation and not self.match_relations(el, selector.relation):
                    continue
                # Validate that the current default selector match corresponds to the first submit button in the form
                if selector.flags & ct.SEL_DEFAULT and not self.match_default(el):
                    continue
                # Validate that the unset radio button is among radio buttons with the same name in a form that are
                # also not set.
                if selector.flags & ct.SEL_INDETERMINATE and not self.match_indeterminate(el):
                    continue
                # Validate element directionality
                if selector.flags & DIR_FLAGS and not self.match_dir(el, selector.flags & DIR_FLAGS):
                    continue
                # Validate that the tag contains the specified text.
                if not self.match_contains(el, selector.contains):
                    continue
                # Every check passed: this selector decides the outcome.
                match = not is_not
                break
    finally:
        # Restore actual namespaces being used for external selector lists
        if is_html:
            self.namespaces = namespaces
            self.iframe_restrict = iframe_restrict

    return match
1419 | |
def select(self, limit=0):
    """
    Match all tags under the targeted tag.

    Yields each matching descendant of `self.tag` in traversal order. A `limit`
    below 1 means no limit; otherwise iteration stops after `limit` matches.
    """

    remaining = None if limit < 1 else limit

    for node in self.get_descendants(self.tag):
        if not self.match(node):
            continue
        yield node
        if remaining is None:
            continue
        remaining -= 1
        if remaining < 1:
            return
1433 | |
def closest(self):
    """
    Match closest ancestor.

    Starts at `self.tag` itself and climbs through its parents, returning the
    first node that matches, or `None` when the chain ends without a match.
    """

    node = self.tag
    while node is not None:
        if self.match(node):
            return node
        node = self.get_parent(node)
    return None
1445 | |
def filter(self):  # noqa A001
    """
    Filter tag's children.

    Returns the direct contents of `self.tag` that are not navigable strings
    and that match, in their original order.
    """

    kept = []
    for node in self.get_contents(self.tag):
        if self.is_navigable_string(node):
            continue
        if self.match(node):
            kept.append(node)
    return kept
1450 | |
def match(self, el):
    """
    Match.

    Only tags can match: the document object and non-tag nodes are rejected
    before the selectors in `self.selectors` are consulted.
    """

    if self.is_doc(el):
        return False
    tag_check = self.is_tag(el)
    if not tag_check:
        return tag_check
    return self.match_selectors(el, self.selectors)
1455 | |
1456 | |
class CSSMatch(_DocumentNav, _Match):
    """
    The Beautiful Soup CSS match class.

    Combines the `_DocumentNav` and `_Match` base classes into a single
    concrete class; it defines no members of its own.
    """
1459 | |
1460 | |
class SoupSieve(ct.Immutable):
    """
    Compiled Soup Sieve selector matching object.

    Holds the original pattern, the compiled selectors, and the namespaces,
    custom selectors and flags they were compiled with. Each operation builds
    a `CSSMatch` for the tag it is given.
    """

    __slots__ = ("pattern", "selectors", "namespaces", "custom", "flags", "_hash")

    def __init__(self, pattern, selectors, namespaces, custom, flags):
        """Initialize."""

        super().__init__(
            pattern=pattern,
            selectors=selectors,
            namespaces=namespaces,
            custom=custom,
            flags=flags
        )

    def match(self, tag):
        """Check whether `tag` matches the compiled selectors."""

        matcher = CSSMatch(self.selectors, tag, self.namespaces, self.flags)
        return matcher.match(tag)

    def closest(self, tag):
        """Return the closest matching ancestor of `tag` (including `tag` itself)."""

        matcher = CSSMatch(self.selectors, tag, self.namespaces, self.flags)
        return matcher.closest()

    def filter(self, iterable):  # noqa A001
        """
        Filter.

        When given a tag, its contents all belong to one document, so a single
        `CSSMatch` is used and any caching it does can be shared.

        Any other iterable may mix tags from different documents or detached
        tags, so each item is matched with its own `CSSMatch`.
        """

        if not CSSMatch.is_tag(iterable):
            return [node for node in iterable if not CSSMatch.is_navigable_string(node) and self.match(node)]
        return CSSMatch(self.selectors, iterable, self.namespaces, self.flags).filter()

    def select_one(self, tag):
        """Return the first matching tag under `tag`, or `None` if there is none."""

        found = self.select(tag, limit=1)
        if found:
            return found[0]
        return None

    def select(self, tag, limit=0):
        """Return a list of the matching tags under `tag`, optionally capped at `limit`."""

        return list(self.iselect(tag, limit))

    def iselect(self, tag, limit=0):
        """Lazily yield the matching tags under `tag`, optionally capped at `limit`."""

        matcher = CSSMatch(self.selectors, tag, self.namespaces, self.flags)
        yield from matcher.select(limit)

    def __repr__(self):  # pragma: no cover
        """Representation."""

        return "SoupSieve(pattern={!r}, namespaces={!r}, custom={!r}, flags={!r})".format(
            self.pattern,
            self.namespaces,
            self.custom,
            self.flags
        )

    __str__ = __repr__
1532 | |
1533 | |
# Register `SoupSieve` with the pickle helper from `css_types`
# (presumably so compiled selector objects can be pickled; confirm against `css_types`).
ct.pickle_register(SoupSieve)