comparison env/lib/python3.9/site-packages/soupsieve/css_match.py @ 0:4f3585e2f14b draft default tip

"planemo upload commit 60cee0fc7c0cda8592644e1aad72851dec82c959"
author shellac
date Mon, 22 Mar 2021 18:12:50 +0000
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:4f3585e2f14b
1 """CSS matcher."""
2 from datetime import datetime
3 from . import util
4 import re
5 from .import css_types as ct
6 import unicodedata
7 from collections.abc import Sequence
8
9 import bs4
10
# Empty tag pattern (whitespace okay).
# Matches a single character that is not space, tab, CR, LF or form feed;
# `match_empty` searches text nodes with it to decide whether they hold content.
RE_NOT_EMPTY = re.compile('[^ \t\r\n\f]')

# Runs of non-whitespace characters; `get_classes` uses `findall` with it to
# split a `class` attribute string into individual class names.
RE_NOT_WS = re.compile('[^ \t\r\n\f]+')

# Relationships
# These are compared against `rel_type` of the first selector in a relation.
REL_PARENT = ' '
REL_CLOSE_PARENT = '>'
REL_SIBLING = '~'
REL_CLOSE_SIBLING = '+'

# Relationships for :has() (forward looking)
# Every forward looking value starts with ':'; `match_relations` relies on that
# prefix to choose between forward and backward matching.
REL_HAS_PARENT = ': '
REL_HAS_CLOSE_PARENT = ':>'
REL_HAS_SIBLING = ':~'
REL_HAS_CLOSE_SIBLING = ':+'

# Namespace URIs compared against element and attribute namespaces.
NS_XHTML = 'http://www.w3.org/1999/xhtml'
NS_XML = 'http://www.w3.org/XML/1998/namespace'

# Combined selector flag masks.
DIR_FLAGS = ct.SEL_DIR_LTR | ct.SEL_DIR_RTL
RANGES = ct.SEL_IN_RANGE | ct.SEL_OUT_OF_RANGE

# Maps a lower-cased `dir` attribute value to a direction flag; `find_bidi`
# looks values up here and treats a missing key as "no explicit direction".
DIR_MAP = {
    'ltr': ct.SEL_DIR_LTR,
    'rtl': ct.SEL_DIR_RTL,
    'auto': 0
}

# Patterns used by `Inputs.parse_value` to pull the numeric fields out of
# form control values. Years accept four or more digits; every other date/time
# field is exactly two digits.
RE_NUM = re.compile(r"^(?P<value>-?(?:[0-9]{1,}(\.[0-9]+)?|\.[0-9]+))$")
RE_TIME = re.compile(r'^(?P<hour>[0-9]{2}):(?P<minutes>[0-9]{2})$')
RE_MONTH = re.compile(r'^(?P<year>[0-9]{4,})-(?P<month>[0-9]{2})$')
RE_WEEK = re.compile(r'^(?P<year>[0-9]{4,})-W(?P<week>[0-9]{2})$')
RE_DATE = re.compile(r'^(?P<year>[0-9]{4,})-(?P<month>[0-9]{2})-(?P<day>[0-9]{2})$')
RE_DATETIME = re.compile(
    r'^(?P<year>[0-9]{4,})-(?P<month>[0-9]{2})-(?P<day>[0-9]{2})T(?P<hour>[0-9]{2}):(?P<minutes>[0-9]{2})$'
)
# Wildcard runs inside a language range; `extended_language_filter` replaces
# each match with a single '-' before splitting the range into subtags.
RE_WILD_STRIP = re.compile(r'(?:(?:-\*-)(?:\*(?:-|$))*|-\*$)')

# Calendar limits used by `Inputs.validate_day`.
MONTHS_30 = (4, 6, 9, 11)  # April, June, September, and November
FEB = 2
SHORT_MONTH = 30
LONG_MONTH = 31
FEB_MONTH = 28
FEB_LEAP_MONTH = 29
DAYS_IN_WEEK = 7
57
58
class _FakeParent(object):
    """
    Stand-in parent for an element that has none.

    A fragment that is not attached to a `BeautifulSoup` document has no
    parent to walk, which makes `nth` style selectors impossible to
    evaluate. Wrapping the element in this object lets it be traversed
    as the only child of a temporary parent.
    """

    def __init__(self, element):
        """Store `element` as the single entry of `contents`."""

        children = []
        children.append(element)
        self.contents = children

    def __len__(self):
        """Return how many children the fake parent holds."""

        size = len(self.contents)
        return size
77
78
class _DocumentNav(object):
    """
    Navigate a Beautiful Soup document.

    The helpers here wrap the `bs4` node API so the matcher never touches
    node attributes directly. `is_iframe` and `is_root` additionally rely on
    `is_html_tag`, `root` and `is_html`, which are not defined in this class
    and must be supplied by the class this one is combined with.
    """

    @classmethod
    def assert_valid_input(cls, tag):
        """
        Check if valid input tag or document.

        Raises `TypeError` when `tag` is not a `bs4.Tag`.
        """

        # Fail on unexpected types.
        if not cls.is_tag(tag):
            raise TypeError("Expected a BeautifulSoup 'Tag', but instead received type {}".format(type(tag)))

    @staticmethod
    def is_doc(obj):
        """Is `BeautifulSoup` object."""
        return isinstance(obj, bs4.BeautifulSoup)

    @staticmethod
    def is_tag(obj):
        """Is tag."""
        return isinstance(obj, bs4.Tag)

    @staticmethod
    def is_declaration(obj):  # pragma: no cover
        """Is declaration."""
        return isinstance(obj, bs4.Declaration)

    @staticmethod
    def is_cdata(obj):
        """Is CDATA."""
        return isinstance(obj, bs4.CData)

    @staticmethod
    def is_processing_instruction(obj):  # pragma: no cover
        """Is processing instruction."""
        return isinstance(obj, bs4.ProcessingInstruction)

    @staticmethod
    def is_navigable_string(obj):
        """Is navigable string."""
        return isinstance(obj, bs4.NavigableString)

    @staticmethod
    def is_special_string(obj):
        """Is special string (comment, declaration, CDATA, processing instruction or doctype)."""
        return isinstance(obj, (bs4.Comment, bs4.Declaration, bs4.CData, bs4.ProcessingInstruction, bs4.Doctype))

    @classmethod
    def is_content_string(cls, obj):
        """Check if node is content string (a navigable string that is not a special string)."""

        return cls.is_navigable_string(obj) and not cls.is_special_string(obj)

    @staticmethod
    def create_fake_parent(el):
        """Create fake parent for a given element."""

        return _FakeParent(el)

    @staticmethod
    def is_xml_tree(el):
        """Check if element (or document) is from a XML tree."""

        return el._is_xml

    def is_iframe(self, el):
        """
        Check if element is an `iframe`.

        The tag name is compared case-sensitively for XML trees and
        lower-cased otherwise; the element must also be an HTML tag.
        """

        return ((el.name if self.is_xml_tree(el) else util.lower(el.name)) == 'iframe') and self.is_html_tag(el)

    def is_root(self, el):
        """
        Return whether element is a root element.

        We check that the element is the root of the tree (which we have already pre-calculated),
        and we check if it is the root element under an `iframe`.
        """

        root = self.root and self.root is el
        if not root:
            parent = self.get_parent(el)
            root = parent is not None and self.is_html and self.is_iframe(parent)
        return root

    def get_contents(self, el, no_iframe=False):
        """
        Yield the direct contents of the element in document order.

        Nothing is yielded when `no_iframe` is set and the element is an `iframe`.
        """
        if not no_iframe or not self.is_iframe(el):
            for content in el.contents:
                yield content

    def get_children(self, el, start=None, reverse=False, tags=True, no_iframe=False):
        """
        Yield children of the element.

        `start` is the index in `el.contents` to begin at (defaults to the first
        child, or the last when `reverse` is set). With `tags` set only tag nodes
        are yielded. Nothing is yielded when `no_iframe` is set and the element
        is an `iframe`, or when `start` is out of range.
        """

        if not no_iframe or not self.is_iframe(el):
            last = len(el.contents) - 1
            if start is None:
                index = last if reverse else 0
            else:
                index = start
            end = -1 if reverse else last + 1
            incr = -1 if reverse else 1

            if 0 <= index <= last:
                while index != end:
                    node = el.contents[index]
                    index += incr
                    if not tags or self.is_tag(node):
                        yield node

    def get_descendants(self, el, tags=True, no_iframe=False):
        """
        Yield descendants of the element.

        With `no_iframe` set, an `iframe` descendant is itself yielded but
        everything nested inside it is skipped.
        """

        if not no_iframe or not self.is_iframe(el):
            # When set, nodes are skipped until this exact node is reached again.
            next_good = None
            for child in el.descendants:

                if next_good is not None:
                    if child is not next_good:
                        continue
                    next_good = None

                is_tag = self.is_tag(child)

                if no_iframe and is_tag and self.is_iframe(child):
                    if child.next_sibling is not None:
                        next_good = child.next_sibling
                    else:
                        # No sibling to resume at: find the deepest last node inside
                        # the `iframe` and resume at whatever follows it.
                        last_child = child
                        while self.is_tag(last_child) and last_child.contents:
                            last_child = last_child.contents[-1]
                        next_good = last_child.next_element
                    yield child
                    if next_good is None:
                        break
                    # Coverage isn't seeing this even though it's executed
                    continue  # pragma: no cover

                if not tags or is_tag:
                    yield child

    def get_parent(self, el, no_iframe=False):
        """Get parent, or `None` when `no_iframe` is set and the parent is an `iframe`."""

        parent = el.parent
        if no_iframe and parent is not None and self.is_iframe(parent):
            parent = None
        return parent

    @staticmethod
    def get_tag_name(el):
        """Get tag."""

        return el.name

    @staticmethod
    def get_prefix_name(el):
        """Get prefix."""

        return el.prefix

    @staticmethod
    def get_uri(el):
        """Get namespace `URI`."""

        return el.namespace

    @classmethod
    def get_next(cls, el, tags=True):
        """Get next sibling; with `tags` set, skip forward to the next sibling that is a tag."""

        sibling = el.next_sibling
        while tags and not cls.is_tag(sibling) and sibling is not None:
            sibling = sibling.next_sibling
        return sibling

    @classmethod
    def get_previous(cls, el, tags=True):
        """Get previous sibling; with `tags` set, skip back to the previous sibling that is a tag."""

        sibling = el.previous_sibling
        while tags and not cls.is_tag(sibling) and sibling is not None:
            sibling = sibling.previous_sibling
        return sibling

    @staticmethod
    def has_html_ns(el):
        """
        Check if element has an HTML namespace.

        This is a bit different than whether a element is treated as having an HTML namespace,
        like we do in the case of `is_html_tag`.

        Always returns a real `bool` (a falsy `el` or namespace gives `False`).
        """

        ns = getattr(el, 'namespace') if el else None
        return bool(ns and ns == NS_XHTML)

    @staticmethod
    def split_namespace(el, attr_name):
        """
        Return namespace and attribute name without the prefix.

        Both values are read from the `namespace` and `name` attributes of
        `attr_name`; each is `None` when the attribute name object lacks it.
        """

        return getattr(attr_name, 'namespace', None), getattr(attr_name, 'name', None)

    @classmethod
    def normalize_value(cls, value):
        """
        Normalize the value to be a string or list of strings.

        `None` becomes an empty string, bytes are decoded as UTF-8, and the
        members of a sequence are normalized individually.
        """

        # Treat `None` as empty string.
        if value is None:
            return ''

        # Pass through strings
        if (isinstance(value, str)):
            return value

        # If it's a byte string, convert it to Unicode, treating it as UTF-8.
        if isinstance(value, bytes):
            return value.decode("utf8")

        # BeautifulSoup supports sequences of attribute values, so make sure the children are strings.
        if isinstance(value, Sequence):
            new_value = []
            for v in value:
                # `str` and `bytes` are sequences too, so exclude them explicitly;
                # otherwise byte string members would never be decoded.
                if isinstance(v, Sequence) and not isinstance(v, (str, bytes)):
                    # This is most certainly a user error and will crash and burn later,
                    # but to avoid excessive recursion, kick out now.
                    new_value.append(v)
                else:
                    # Convert the child to a string
                    new_value.append(cls.normalize_value(v))
            return new_value

        # Try and make anything else a string
        return str(value)

    @classmethod
    def get_attribute_by_name(cls, el, name, default=None):
        """
        Get attribute by name.

        XML trees are looked up with the exact name; otherwise attribute keys
        are lower-cased before comparing them with `name`. Returns `default`
        when the attribute is absent.
        """

        value = default
        if el._is_xml:
            try:
                value = cls.normalize_value(el.attrs[name])
            except KeyError:
                pass
        else:
            for k, v in el.attrs.items():
                if util.lower(k) == name:
                    value = cls.normalize_value(v)
                    break
        return value

    @classmethod
    def iter_attributes(cls, el):
        """Iterate attributes as `(name, normalized value)` pairs."""

        for k, v in el.attrs.items():
            yield k, cls.normalize_value(v)

    @classmethod
    def get_classes(cls, el):
        """Get classes as a list; a string `class` value is split on whitespace."""

        classes = cls.get_attribute_by_name(el, 'class', [])
        if isinstance(classes, str):
            classes = RE_NOT_WS.findall(classes)
        return classes

    def get_text(self, el, no_iframe=False):
        """Get the content strings of all descendants joined into one string."""

        return ''.join(
            [node for node in self.get_descendants(el, tags=False, no_iframe=no_iframe) if self.is_content_string(node)]
        )

    def get_own_text(self, el, no_iframe=False):
        """Get the element's own content strings (direct children only) as a list."""

        return [node for node in self.get_contents(el, no_iframe=no_iframe) if self.is_content_string(node)]
356
357
class Inputs(object):
    """Class for parsing and validating input items."""

    @staticmethod
    def validate_day(year, month, day):
        """Validate that `day` exists in the given `month` of `year` (leap years included)."""

        max_days = LONG_MONTH
        if month == FEB:
            max_days = FEB_LEAP_MONTH if ((year % 4 == 0) and (year % 100 != 0)) or (year % 400 == 0) else FEB_MONTH
        elif month in MONTHS_30:
            max_days = SHORT_MONTH
        return 1 <= day <= max_days

    @staticmethod
    def validate_week(year, week):
        """
        Validate that `week` is an ISO week number that exists in `year`.

        Works for any positive year, including years with fewer or more than
        four digits.
        """

        # The Gregorian calendar repeats exactly every 400 years
        # (146097 days, a whole number of weeks), so map the year into a range
        # `datetime` can represent without changing its week layout.
        equivalent_year = 2000 + year % 400
        # December 28th always falls in the last ISO week of its year, whereas
        # December 31st may already belong to week 1 of the following year.
        max_week = datetime(equivalent_year, 12, 28).isocalendar()[1]
        return 1 <= week <= max_week

    @staticmethod
    def validate_month(month):
        """Validate month."""

        return 1 <= month <= 12

    @staticmethod
    def validate_year(year):
        """Validate year."""

        return 1 <= year

    @staticmethod
    def validate_hour(hour):
        """Validate hour."""

        return 0 <= hour <= 23

    @staticmethod
    def validate_minutes(minutes):
        """Validate minutes."""

        return 0 <= minutes <= 59

    @classmethod
    def parse_value(cls, itype, value):
        """
        Parse the input value.

        `itype` is the input type name ("date", "month", "week", "time",
        "datetime-local", "number" or "range"). Returns a tuple of integers for
        the date/time types, a float for the numeric types, and `None` when the
        type is not handled or the value is malformed or out of range.
        """

        parsed = None
        if itype == "date":
            m = RE_DATE.match(value)
            if m:
                year = int(m.group('year'), 10)
                month = int(m.group('month'), 10)
                day = int(m.group('day'), 10)
                if cls.validate_year(year) and cls.validate_month(month) and cls.validate_day(year, month, day):
                    parsed = (year, month, day)
        elif itype == "month":
            m = RE_MONTH.match(value)
            if m:
                year = int(m.group('year'), 10)
                month = int(m.group('month'), 10)
                if cls.validate_year(year) and cls.validate_month(month):
                    parsed = (year, month)
        elif itype == "week":
            m = RE_WEEK.match(value)
            if m:
                year = int(m.group('year'), 10)
                week = int(m.group('week'), 10)
                if cls.validate_year(year) and cls.validate_week(year, week):
                    parsed = (year, week)
        elif itype == "time":
            m = RE_TIME.match(value)
            if m:
                hour = int(m.group('hour'), 10)
                minutes = int(m.group('minutes'), 10)
                if cls.validate_hour(hour) and cls.validate_minutes(minutes):
                    parsed = (hour, minutes)
        elif itype == "datetime-local":
            m = RE_DATETIME.match(value)
            if m:
                year = int(m.group('year'), 10)
                month = int(m.group('month'), 10)
                day = int(m.group('day'), 10)
                hour = int(m.group('hour'), 10)
                minutes = int(m.group('minutes'), 10)
                if (
                    cls.validate_year(year) and cls.validate_month(month) and cls.validate_day(year, month, day) and
                    cls.validate_hour(hour) and cls.validate_minutes(minutes)
                ):
                    parsed = (year, month, day, hour, minutes)
        elif itype in ("number", "range"):
            m = RE_NUM.match(value)
            if m:
                parsed = float(m.group('value'))
        return parsed
457
458
459 class _Match(object):
460 """Perform CSS matching."""
461
def __init__(self, selectors, scope, namespaces, flags):
    """
    Set up matching state for `scope`.

    Stores the selectors, namespaces and flags, creates the empty caches,
    and works out the document root and whether the tree should be treated
    as XML, HTML, or both.
    """

    self.assert_valid_input(scope)

    # Plain state
    self.tag = scope
    self.selectors = selectors
    self.flags = flags
    self.iframe_restrict = False
    if namespaces is None:
        self.namespaces = {}
    else:
        self.namespaces = namespaces

    # Caches
    self.cached_meta_lang = []
    self.cached_default_forms = []
    self.cached_indeterminate_forms = []

    # Climb to the top most node of the tree
    top = scope
    above = self.get_parent(top)
    while above:
        top = above
        above = self.get_parent(top)

    # The root is the top node itself, unless the top node is the document
    # object, in which case it is the document's first child tag (if any).
    if self.is_doc(top):
        root = next(iter(self.get_children(top)), None)
    else:
        root = top

    self.root = root
    self.scope = root if scope is top else scope
    self.has_html_namespace = self.has_html_ns(root)

    # A document can be both XML and HTML (XHTML)
    self.is_xml = self.is_xml_tree(top)
    self.is_html = not self.is_xml or self.has_html_namespace
496
def supports_namespaces(self):
    """Tell whether namespaces apply: true for XML trees and for trees whose root has the HTML namespace."""

    xml = self.is_xml
    return xml if xml else self.has_html_namespace
501
def get_tag_ns(self, el):
    """
    Return the namespace the tag is treated as having.

    When namespaces are not supported every tag counts as XHTML. Otherwise
    the element's own namespace is used, with an empty string standing in
    for a missing one.
    """

    if not self.supports_namespaces():
        return NS_XHTML

    uri = self.get_uri(el)
    return uri if uri else ''
513
def is_html_tag(self, el):
    """Tell whether the tag's effective namespace is the XHTML namespace."""

    effective_ns = self.get_tag_ns(el)
    return effective_ns == NS_XHTML
518
def get_tag(self, el):
    """Return the tag name, lower-cased unless the tree is XML or the name is missing."""

    name = self.get_tag_name(el)
    if name is None or self.is_xml:
        return name
    return util.lower(name)
524
def get_prefix(self, el):
    """Return the tag prefix, lower-cased unless the tree is XML or the prefix is missing."""

    prefix = self.get_prefix_name(el)
    if prefix is None or self.is_xml:
        return prefix
    return util.lower(prefix)
530
def find_bidi(self, el):
    """
    Work out directionality from the element's text.

    Children are scanned in order. Text nodes are inspected character by
    character until a strongly directional character is found; eligible
    child tags are searched recursively. Returns a direction flag, or
    `None` when nothing conclusive is found.
    """

    excluded_tags = ('bdi', 'script', 'style', 'textarea', 'iframe')

    for node in self.get_children(el, tags=False):

        if self.is_tag(node):
            # Only descend into HTML tags that are not on the exclusion list
            # and that carry no recognized `dir` value of their own.
            declared = DIR_MAP.get(util.lower(self.get_attribute_by_name(node, 'dir', '')), None)
            searchable = (
                self.get_tag(node) not in excluded_tags and
                self.is_html_tag(node) and
                declared is None
            )
            if searchable:
                nested = self.find_bidi(node)
                if nested is not None:
                    return nested

        elif not self.is_special_string(node):
            # Ordinary text (comments, `doctype`, etc. were filtered out above)
            for char in node:
                category = unicodedata.bidirectional(char)
                if category == 'L':
                    return ct.SEL_DIR_LTR
                if category in ('AL', 'R'):
                    return ct.SEL_DIR_RTL

    return None
566
def extended_language_filter(self, lang_range, lang_tag):
    """
    Tell whether the language tag satisfies the language range.

    Both values are compared case-insensitively, subtag by subtag. The
    primary subtags must agree (or the range must start with `*`); after
    that, each remaining range subtag has to be found, in order, among the
    tag's subtags, skipping over unmatched multi-character subtags only.
    """

    wanted = RE_WILD_STRIP.sub('-', lang_range).lower().split('-')
    present = lang_tag.lower().split('-')

    # Primary tag needs to match
    if wanted[0] != '*' and wanted[0] != present[0]:
        return False

    wanted_pos = 1
    present_pos = 1
    wanted_total = len(wanted)
    present_total = len(present)

    # Keep going until every range subtag has been accounted for
    while wanted_pos < wanted_total:
        # Subtags exhausted while range subtags remain
        if present_pos >= present_total:
            return False

        target = wanted[wanted_pos]
        candidate = present[present_pos]

        # An empty range subtag can never match
        if not target:
            return False

        if candidate == target:
            wanted_pos += 1
        elif len(candidate) == 1:
            # Singletons cannot be skipped over implicitly
            return False

        # Either matched or implicitly skipped; move to the next subtag
        present_pos += 1

    return True
617
def match_attribute_name(self, el, attr, prefix):
    """
    Find the attribute called `attr` and return its value, or `None`.

    Without namespace support the name is compared case-insensitively.
    With namespace support, a `prefix` is resolved through the configured
    namespaces; an unknown prefix other than `*` can never match. When no
    namespace is resolved, the whole attribute key is compared instead.
    """

    if not self.supports_namespaces():
        for key, val in self.iter_attributes(el):
            if util.lower(attr) == util.lower(key):
                return val
        return None

    # If we have not defined namespaces, we can't very well find them, so don't bother trying.
    wanted_ns = self.namespaces.get(prefix) if prefix else None
    if prefix and wanted_ns is None and prefix != '*':
        return None

    for key, val in self.iter_attributes(el):
        attr_ns, local_name = self.split_namespace(el, key)

        if wanted_ns is None:
            # No namespace to look for, so compare against the key as a whole;
            # the selector may have been written as `p\:a`.
            if self.is_xml:
                same = attr == key
            else:
                same = util.lower(attr) == util.lower(key)
            if same:
                return val
            continue

        # The attribute has no namespace, or has the wrong one
        if attr_ns is None or (wanted_ns != attr_ns and prefix != '*'):
            continue

        if self.is_xml:
            same = attr == local_name
        else:
            same = util.lower(attr) == util.lower(local_name)
        if same:
            return val

    return None
665
def match_namespace(self, el, tag):
    """
    Tell whether the element's namespace satisfies the selector's prefix.

    - No prefix: the element must be in the default namespace, if one is configured.
    - Empty prefix (`|tag`): the element must have no namespace.
    - `*`: any namespace is accepted.
    - Any other prefix: it must be configured and equal the element's namespace.
    """

    prefix = tag.prefix
    element_ns = self.get_tag_ns(el)

    if prefix is None:
        default_ns = self.namespaces.get('')
        return default_ns is None or element_ns == default_ns

    if prefix == '':
        return not element_ns

    if prefix == '*':
        return True

    expected_ns = self.namespaces.get(prefix, None)
    return expected_ns is not None and element_ns == expected_ns
686
def match_attributes(self, el, attributes):
    """
    Tell whether the element satisfies every attribute selector.

    Each attribute must exist; when the selector carries a pattern, the
    value (list values are joined with single spaces) must match it. For
    XML trees the selector's XML specific pattern is preferred when present.
    """

    if not attributes:
        return True

    for selector in attributes:
        value = self.match_attribute_name(el, selector.attribute, selector.prefix)

        if self.is_xml and selector.xml_type_pattern:
            pattern = selector.xml_type_pattern
        else:
            pattern = selector.pattern

        if isinstance(value, list):
            value = ' '.join(value)

        # Attribute is absent
        if value is None:
            return False

        # Attribute is present but its value is rejected
        if pattern is not None and pattern.match(value) is None:
            return False

    return True
706
def match_tagname(self, el, tag):
    """
    Tell whether the element's tag name satisfies the selector.

    A selector without a name, or with the name `*`, matches anything.
    Outside of XML the selector's name is lower-cased before comparing.
    """

    wanted = tag.name
    if wanted is None:
        return True

    if not self.is_xml:
        wanted = util.lower(wanted)

    return wanted == self.get_tag(el) or wanted == '*'
715
def match_tag(self, el, tag):
    """
    Tell whether the element satisfies the tag selector.

    A missing selector matches everything; otherwise both the namespace
    and the tag name have to agree. Both checks are always performed.
    """

    if tag is None:
        return True

    namespace_ok = self.match_namespace(el, tag)
    name_ok = self.match_tagname(el, tag)
    return bool(namespace_ok) and bool(name_ok)
727
def match_past_relations(self, el, relation):
    """
    Match a relationship that looks backwards from the element.

    Depending on the relation type this checks any ancestor, the direct
    parent, any preceding sibling tag, or the closest preceding sibling tag
    against `relation`. Parent lookups honor `self.iframe_restrict`.
    """

    rel_type = relation[0].rel_type
    found = False

    if rel_type == REL_PARENT:
        ancestor = self.get_parent(el, no_iframe=self.iframe_restrict)
        while ancestor:
            found = self.match_selectors(ancestor, relation)
            if found:
                break
            ancestor = self.get_parent(ancestor, no_iframe=self.iframe_restrict)

    elif rel_type == REL_CLOSE_PARENT:
        ancestor = self.get_parent(el, no_iframe=self.iframe_restrict)
        if ancestor:
            found = self.match_selectors(ancestor, relation)

    elif rel_type == REL_SIBLING:
        earlier = self.get_previous(el)
        while earlier:
            found = self.match_selectors(earlier, relation)
            if found:
                break
            earlier = self.get_previous(earlier)

    elif rel_type == REL_CLOSE_SIBLING:
        earlier = self.get_previous(el)
        if earlier and self.is_tag(earlier):
            found = self.match_selectors(earlier, relation)

    return found
751
def match_future_child(self, parent, relation, recursive=False):
    """
    Match `relation` against the children of `parent`.

    With `recursive` set all descendants are considered instead of only
    the direct children. Stops at the first match.
    """

    if recursive:
        candidates = self.get_descendants(parent, no_iframe=self.iframe_restrict)
    else:
        candidates = self.get_children(parent, no_iframe=self.iframe_restrict)

    outcome = False
    for candidate in candidates:
        outcome = self.match_selectors(candidate, relation)
        if outcome:
            return outcome
    return outcome
762
def match_future_relations(self, el, relation):
    """
    Match a relationship that looks forward from the element.

    Depending on the relation type this checks any descendant, the direct
    children, any following sibling tag, or the closest following sibling
    tag against `relation`.
    """

    rel_type = relation[0].rel_type
    found = False

    if rel_type == REL_HAS_PARENT:
        found = self.match_future_child(el, relation, True)

    elif rel_type == REL_HAS_CLOSE_PARENT:
        found = self.match_future_child(el, relation)

    elif rel_type == REL_HAS_SIBLING:
        later = self.get_next(el)
        while later:
            found = self.match_selectors(later, relation)
            if found:
                break
            later = self.get_next(later)

    elif rel_type == REL_HAS_CLOSE_SIBLING:
        later = self.get_next(el)
        if later and self.is_tag(later):
            found = self.match_selectors(later, relation)

    return found
781
def match_relations(self, el, relation):
    """
    Match the element's relationship to other elements.

    Relation types that begin with `:` look forward from the element;
    every other type looks backward.
    """

    looks_forward = relation[0].rel_type.startswith(':')
    matcher = self.match_future_relations if looks_forward else self.match_past_relations
    return matcher(el, relation)
793
def match_id(self, el, ids):
    """Tell whether every requested ID equals the element's `id` attribute (missing counts as empty)."""

    return not any(wanted != self.get_attribute_by_name(el, 'id', '') for wanted in ids)
803
def match_classes(self, el, classes):
    """Tell whether the element carries every one of the requested classes."""

    present = self.get_classes(el)
    return all(wanted in present for wanted in classes)
814
def match_root(self, el):
    """
    Match the element as root.

    Besides being the root, the element may only have insignificant
    siblings: any sibling that is a tag, a non-blank text node, or CDATA
    disqualifies it.
    """

    rooted = self.is_root(el)
    if not rooted:
        return rooted

    def is_significant(node):
        """Tell whether a sibling prevents the element from counting as root."""

        if self.is_tag(node):
            return True
        if self.is_content_string(node) and node.strip():
            return True
        return bool(self.is_cdata(node))

    # Look backward first, then forward
    for step in (self.get_previous, self.get_next):
        neighbor = step(el, tags=False)
        while neighbor is not None:
            if is_significant(neighbor):
                return False
            neighbor = step(neighbor, tags=False)

    return rooted
840
def match_scope(self, el):
    """Tell whether the element is the very object stored as the scope."""

    scope = self.scope
    return scope is el
845
def match_nth_tag_type(self, el, child):
    """Tell whether `child` has the same tag name and namespace as `el` (used for `of-type` matching)."""

    if self.get_tag(child) != self.get_tag(el):
        return False
    return self.get_tag_ns(child) == self.get_tag_ns(el)
853
def match_nth(self, el, nth):
    """
    Match `nth` elements.

    `nth` is an iterable of selector parts, each providing `a`, `b`, `n`
    (whether the index is variable, i.e. `a * count + b`, or the fixed value
    `a`), `last` (count from the end), `of_type` and optional `selectors`.
    The element has to satisfy every part; an empty `nth` matches.
    """

    matched = True

    for n in nth:
        # Each part starts out unmatched and has to prove itself.
        matched = False
        # The element itself must satisfy the `of S` selectors, if any.
        if n.selectors and not self.match_selectors(el, n.selectors):
            break
        parent = self.get_parent(el)
        if parent is None:
            # Parentless element: walk it as the only child of a temporary parent.
            parent = self.create_fake_parent(el)
        last = n.last
        last_index = len(parent) - 1
        # `index` is the position in the parent's contents where scanning resumes.
        index = last_index if last else 0
        # Number of qualifying siblings seen so far (1-based once incremented).
        relative_index = 0
        a = n.a
        b = n.b
        var = n.n
        count = 0
        count_incr = 1
        # Scan direction: backwards when counting from the end.
        factor = -1 if last else 1
        # `idx` is the 1-based position currently being looked for.
        idx = last_idx = a * count + b if var else a

        # We can only adjust bounds within a variable index
        if var:
            # Abort if our nth index is out of bounds and only getting further out of bounds as we increment.
            # Otherwise, increment to try to get in bounds.
            adjust = None
            while idx < 1 or idx > last_index:
                if idx < 0:
                    diff_low = 0 - idx
                    # Previously too high and now too low: stepping cannot help.
                    if adjust is not None and adjust == 1:
                        break
                    adjust = -1
                    count += count_incr
                    idx = last_idx = a * count + b if var else a
                    diff = 0 - idx
                    # Stop if the step did not bring the index closer.
                    if diff >= diff_low:
                        break
                else:
                    diff_high = idx - last_index
                    # Previously too low and now too high: stepping cannot help.
                    if adjust is not None and adjust == -1:
                        break
                    adjust = 1
                    count += count_incr
                    idx = last_idx = a * count + b if var else a
                    diff = idx - last_index
                    # Stop if the step did not bring the index closer.
                    if diff >= diff_high:
                        break
                    diff_high = diff

            # If a < 0, our count is working backwards, so floor the index by increasing the count.
            # Find the count that yields the lowest, in bound value and use that.
            # Lastly reverse count increment so that we'll increase our index.
            lowest = count
            if a < 0:
                while idx >= 1:
                    lowest = count
                    count += count_incr
                    idx = last_idx = a * count + b if var else a
                count_incr = -1
            count = lowest
            idx = last_idx = a * count + b if var else a

        # Evaluate elements while our calculated nth index is still in range
        while 1 <= idx <= last_index + 1:
            child = None
            # Evaluate while our child index is still in range.
            for child in self.get_children(parent, start=index, reverse=factor < 0, tags=False):
                # Remember where to resume on the next pass of the outer loop.
                index += factor
                if not self.is_tag(child):
                    continue
                # Handle `of S` in `nth-child`
                if n.selectors and not self.match_selectors(child, n.selectors):
                    continue
                # Handle `of-type`
                if n.of_type and not self.match_nth_tag_type(el, child):
                    continue
                relative_index += 1
                if relative_index == idx:
                    if child is el:
                        matched = True
                    else:
                        # Position reached by a different element; try the next index.
                        break
                # Reaching the element itself ends the scan, matched or not.
                if child is el:
                    break
            if child is el:
                break
            last_idx = idx
            count += count_incr
            if count < 0:
                # Count is counting down and has now ventured into invalid territory.
                break
            idx = a * count + b if var else a
            # A fixed index never changes, so there is nothing further to try.
            if last_idx == idx:
                break
        if not matched:
            break
    return matched
954
def match_empty(self, el):
    """
    Tell whether the element is empty.

    An element stops being empty as soon as it has a child tag or a text
    node containing anything other than whitespace.
    """

    for child in self.get_children(el, tags=False):
        if self.is_tag(child):
            return False
        if self.is_content_string(child) and RE_NOT_EMPTY.search(child):
            return False
    return True
967
def match_subselectors(self, el, selectors):
    """
    Tell whether the element satisfies every selector in `selectors`.

    Every selector is evaluated, even after one has already failed.
    """

    # Build the full list first so no evaluation is skipped.
    outcomes = [self.match_selectors(el, sel) for sel in selectors]
    return all(outcomes)
976
def match_contains(self, el, contains):
    """
    Match element if it contains text.

    `contains` is an iterable of entries, each with a `text` collection and
    an `own` flag. An entry is satisfied when at least one of its texts is
    found: in one of the element's own text nodes when `own` is set, or
    anywhere in the element's combined descendant text otherwise. Every
    entry has to be satisfied; an empty `contains` matches.
    """

    # The two kinds of text have different shapes (a list of strings versus
    # one joined string), so they are fetched lazily and cached separately.
    # Sharing a single cache would compare entries against the wrong shape
    # whenever own and non-own entries are mixed.
    own_text = None
    full_text = None

    for contain_list in contains:
        if contain_list.own:
            if own_text is None:
                own_text = self.get_own_text(el, no_iframe=self.is_html)
            found = any(text in chunk for text in contain_list.text for chunk in own_text)
        else:
            if full_text is None:
                full_text = self.get_text(el, no_iframe=self.is_html)
            found = any(text in full_text for text in contain_list.text)

        # One unsatisfied entry settles the outcome.
        if not found:
            return False

    return True
1004
def match_default(self, el):
    """
    Match default.

    The element matches when it is the first `input` or `button` with
    `type="submit"` found among the descendants of its enclosing HTML
    `form`. Lookups do not cross `iframe` boundaries. Returns `False` when
    the element has no enclosing form.
    """

    match = False

    # Find this input's form
    form = None
    parent = self.get_parent(el, no_iframe=True)
    while parent and form is None:
        if self.get_tag(parent) == 'form' and self.is_html_tag(parent):
            form = parent
        else:
            parent = self.get_parent(parent, no_iframe=True)

    # Without a form there is nothing to search, and passing `None` on to
    # `get_descendants` would fail when it is dereferenced.
    if form is None:
        return False

    # Look in form cache to see if we've already located its default button
    found_form = False
    for f, t in self.cached_default_forms:
        if f is form:
            found_form = True
            if t is el:
                match = True
            break

    # We didn't have the form cached, so look for its default button
    if not found_form:
        for child in self.get_descendants(form, no_iframe=True):
            name = self.get_tag(child)
            # Can't do nested forms (haven't figured out why we never hit this)
            if name == 'form':  # pragma: no cover
                break
            if name in ('input', 'button'):
                v = self.get_attribute_by_name(child, 'type', '')
                if v and util.lower(v) == 'submit':
                    # Remember the form's first submit control for later calls.
                    self.cached_default_forms.append([form, child])
                    if el is child:
                        match = True
                    break
    return match
1043
def match_indeterminate(self, el):
    """
    Match indeterminate.

    The element matches when no other `input` in the same form is a radio
    button with the same `name` and a `checked` attribute. Results are
    cached per form and name. Lookups do not cross `iframe` boundaries.
    """

    match = False
    name = self.get_attribute_by_name(el, 'name')

    def get_parent_form(el):
        """
        Find this input's form.

        Falls back to the top most ancestor reached when no HTML `form`
        ancestor exists.
        """
        form = None
        parent = self.get_parent(el, no_iframe=True)
        while form is None:
            if self.get_tag(parent) == 'form' and self.is_html_tag(parent):
                form = parent
                break
            last_parent = parent
            parent = self.get_parent(parent, no_iframe=True)
            if parent is None:
                # Ran out of ancestors: treat the last one as the form.
                form = last_parent
                break
        return form

    form = get_parent_form(el)

    # Look in form cache to see if we've already evaluated that its fellow radio buttons are indeterminate
    found_form = False
    for f, n, i in self.cached_indeterminate_forms:
        if f is form and n == name:
            found_form = True
            if i is True:
                match = True
            break

    # We didn't have the form cached, so validate that the radio button is indeterminate
    if not found_form:
        checked = False
        for child in self.get_descendants(form, no_iframe=True):
            if child is el:
                continue
            tag_name = self.get_tag(child)
            if tag_name == 'input':
                is_radio = False
                check = False
                has_name = False
                for k, v in self.iter_attributes(child):
                    if util.lower(k) == 'type' and util.lower(v) == 'radio':
                        is_radio = True
                    elif util.lower(k) == 'name' and v == name:
                        has_name = True
                    elif util.lower(k) == 'checked':
                        check = True
                    # Only counts when the checked radio belongs to this same form.
                    if is_radio and check and has_name and get_parent_form(child) is form:
                        checked = True
                        break
            if checked:
                break
        if not checked:
            match = True
        # Cache the outcome for this form and name, whichever way it went.
        self.cached_indeterminate_forms.append([form, name, match])

    return match
1104
def match_lang(self, el, langs):
    """
    Match languages.

    The element's language is taken from the nearest `lang` (or namespaced
    `xml:lang`) attribute on the element or its ancestors. For HTML
    documents without one, a `meta` tag with `http-equiv="content-language"`
    in the document head is used instead, and that lookup is cached per
    root. `langs` is an iterable of pattern groups; every group needs at
    least one pattern that passes `extended_language_filter`.
    """

    match = False
    has_ns = self.supports_namespaces()
    root = self.root
    has_html_namespace = self.has_html_namespace

    # Walk parents looking for `lang` (HTML) or `xml:lang` XML property.
    parent = el
    found_lang = None
    last = None
    while not found_lang:
        has_html_ns = self.has_html_ns(parent)
        for k, v in self.iter_attributes(parent):
            attr_ns, attr = self.split_namespace(parent, k)
            if (
                ((not has_ns or has_html_ns) and (util.lower(k) if not self.is_xml else k) == 'lang') or
                (
                    has_ns and not has_html_ns and attr_ns == NS_XML and
                    (util.lower(attr) if not self.is_xml and attr is not None else attr) == 'lang'
                )
            ):
                found_lang = v
                break
        last = parent
        parent = self.get_parent(parent, no_iframe=self.is_html)

        if parent is None:
            # Reached the top (or an `iframe` boundary for HTML): the last node
            # visited becomes the root used for the cache and the meta lookup.
            root = last
            has_html_namespace = self.has_html_ns(root)
            parent = last
            break

    # Use cached meta language.
    # A cached `False` records that an earlier meta lookup found nothing,
    # which also prevents the lookup below from running again.
    if not found_lang and self.cached_meta_lang:
        for cache in self.cached_meta_lang:
            if root is cache[0]:
                found_lang = cache[1]

    # If we couldn't find a language, and the document is HTML, look to meta to determine language.
    if found_lang is None and (not self.is_xml or (has_html_namespace and root.name == 'html')):
        # Find head
        found = False
        for tag in ('html', 'head'):
            found = False
            for child in self.get_children(parent, no_iframe=self.is_html):
                if self.get_tag(child) == tag and self.is_html_tag(child):
                    found = True
                    parent = child
                    break
            if not found:  # pragma: no cover
                break

        # Search meta tags
        if found:
            for child in parent:
                # NOTE(review): the namespace check below is applied to `parent`
                # (the located head), not to `child` - confirm this is intended.
                if self.is_tag(child) and self.get_tag(child) == 'meta' and self.is_html_tag(parent):
                    c_lang = False
                    content = None
                    for k, v in self.iter_attributes(child):
                        if util.lower(k) == 'http-equiv' and util.lower(v) == 'content-language':
                            c_lang = True
                        if util.lower(k) == 'content':
                            content = v
                        if c_lang and content:
                            found_lang = content
                            self.cached_meta_lang.append((root, found_lang))
                            break
                if found_lang:
                    break
            if not found_lang:
                # Remember the miss so the head is not searched again for this root.
                self.cached_meta_lang.append((root, False))

    # If we determined a language, compare.
    if found_lang:
        for patterns in langs:
            match = False
            for pattern in patterns:
                if self.extended_language_filter(pattern, found_lang):
                    match = True
            if not match:
                break

    return match
1190
def match_dir(self, el, directionality):
    """
    Check directionality.

    `directionality` is a flag value built from `ct.SEL_DIR_LTR` / `ct.SEL_DIR_RTL`.
    Returns whether the direction resolved for `el` equals it.

    - Requesting both directions at once can never match.
    - `None` or a non-HTML element never matches.
    - An explicit `dir="ltr"` / `dir="rtl"` decides immediately.
    - With no usable `dir`, the root element and `input[type=tel]` are left to right.
    - With `dir="auto"`, text inputs and text areas use the first strongly
      directional character of their value/content; other elements (and `bdi`
      without `dir`) use `self.find_bidi`.
    - Otherwise the parent's direction is inherited.
    """

    wants_ltr = directionality & ct.SEL_DIR_LTR
    wants_rtl = directionality & ct.SEL_DIR_RTL
    # If we have to match both left and right, we can't match either.
    if wants_ltr and wants_rtl:
        return False

    if el is None or not self.is_html_tag(el):
        return False

    # Result used whenever the fallback direction (left to right) applies.
    ltr_result = ct.SEL_DIR_LTR == directionality

    # `None`: no valid `dir` value; `0`: `dir="auto"`; anything else: explicit direction.
    declared = DIR_MAP.get(util.lower(self.get_attribute_by_name(el, 'dir', '')), None)
    if declared not in (None, 0):
        return declared == directionality
    unset = declared is None
    auto = declared == 0

    # The document element with no direction assigned is left to right.
    at_root = self.is_root(el)
    if at_root and unset:
        return ltr_result

    tag_name = self.get_tag(el)
    is_input = tag_name == 'input'
    is_textarea = tag_name == 'textarea'
    input_type = util.lower(self.get_attribute_by_name(el, 'type', '')) if is_input else ''

    # `input[type=tel]` with no direction assigned is left to right.
    if is_input and input_type == 'tel' and unset:
        return ltr_result

    # Auto handling for text inputs and text areas.
    is_text_control = is_textarea or (is_input and input_type in ('text', 'search', 'tel', 'url', 'email'))
    if auto and is_text_control:
        if is_textarea:
            text = ''.join(
                [node for node in self.get_contents(el, no_iframe=True) if self.is_content_string(node)]
            )
        else:
            text = self.get_attribute_by_name(el, 'value', '')
        if text:
            # The first strongly directional character decides.
            for char in text:
                bidi_class = unicodedata.bidirectional(char)
                if bidi_class == 'L':
                    return ct.SEL_DIR_LTR == directionality
                if bidi_class in ('AL', 'R'):
                    return ct.SEL_DIR_RTL == directionality
            # No strong character found: assume left to right.
            return ltr_result
        if at_root:
            return ltr_result
        return self.match_dir(self.get_parent(el, no_iframe=True), directionality)

    # Auto handling for `bdi` and other non text inputs.
    if auto or (tag_name == 'bdi' and unset):
        detected = self.find_bidi(el)
        if detected is not None:
            return detected == directionality
        if at_root:
            return ltr_result

    # Match parent's direction.
    return self.match_dir(self.get_parent(el, no_iframe=True), directionality)
1253
def match_range(self, el, condition):
    """
    Match range.

    Evaluates whether the element's `value` is out of range with respect to its
    `min` / `max` attributes, then answers according to `condition`: the
    in-range answer when `condition` has `ct.SEL_IN_RANGE` set, the out-of-range
    answer otherwise.

    Modeled after observed browser behavior: only "out of range" is actually
    computed, and anything not out of range counts as in range. A missing or
    unparsable value is therefore reported as in range. If neither `min` nor
    `max` yields a usable value, nothing matches at all.
    """

    kind = util.lower(self.get_attribute_by_name(el, 'type'))

    def parsed(attr_name):
        """Return the attribute parsed for this input type, or `None` if the attribute is absent."""

        raw = self.get_attribute_by_name(el, attr_name, None)
        return raw if raw is None else Inputs.parse_value(kind, raw)

    lower = parsed('min')
    upper = parsed('max')

    # There is no valid min or max, so we cannot evaluate a range
    if lower is None and upper is None:
        return False

    outside = False
    current = parsed('value')
    if current is not None:
        if kind == "time" and lower is not None and upper is not None and lower > upper:
            # Time is periodic, so this is a reversed/discontinuous range:
            # only values strictly between max and min are out of range.
            outside = current < lower and current > upper
        elif kind in ("date", "datetime-local", "month", "week", "number", "range", "time"):
            outside = (lower is not None and current < lower) or (upper is not None and current > upper)

    if condition & ct.SEL_IN_RANGE:
        return not outside
    return outside
1299
def match_defined(self, el):
    """
    Match defined.

    `:defined` is related to custom elements in a browser. This method looks
    only at the tag name and prefix reported for `el`:

    - Tags that are not custom (don't have a hyphen in the name) are defined.
    - Tags whose name contains a colon, or that have a prefix (with or without
      a namespace), are not treated as custom elements and are therefore defined.
    - Everything else (a hyphenated, unprefixed name) is not defined.
    - An element that reports no tag name at all is not defined.

    This of course requires the parser to provide us with the proper prefix and
    namespace info; if it doesn't, there is nothing we can do.
    """

    name = self.get_tag(el)
    # Guard against a missing name instead of failing on a string method of `None`.
    if name is None:
        return False
    return '-' not in name or ':' in name or self.get_prefix(el) is not None
1320
def match_placeholder_shown(self, el):
    """
    Match placeholder shown according to HTML spec.

    The placeholder counts as shown when the element's text, as returned by
    `self.get_text`, is empty. A single newline does not count as content.
    """

    return self.get_text(el) in ('', '\n')
1335
1336 def match_selectors(self, el, selectors):
1337 """Check if element matches one of the selectors."""
1338
1339 match = False
1340 is_not = selectors.is_not
1341 is_html = selectors.is_html
1342
1343 # Internal selector lists that use the HTML flag, will automatically get the `html` namespace.
1344 if is_html:
1345 namespaces = self.namespaces
1346 iframe_restrict = self.iframe_restrict
1347 self.namespaces = {'html': NS_XHTML}
1348 self.iframe_restrict = True
1349
1350 if not is_html or self.is_html:
1351 for selector in selectors:
1352 match = is_not
1353 # We have a un-matchable situation (like `:focus` as you can focus an element in this environment)
1354 if isinstance(selector, ct.SelectorNull):
1355 continue
1356 # Verify tag matches
1357 if not self.match_tag(el, selector.tag):
1358 continue
1359 # Verify tag is defined
1360 if selector.flags & ct.SEL_DEFINED and not self.match_defined(el):
1361 continue
1362 # Verify element is root
1363 if selector.flags & ct.SEL_ROOT and not self.match_root(el):
1364 continue
1365 # Verify element is scope
1366 if selector.flags & ct.SEL_SCOPE and not self.match_scope(el):
1367 continue
1368 # Verify element has placeholder shown
1369 if selector.flags & ct.SEL_PLACEHOLDER_SHOWN and not self.match_placeholder_shown(el):
1370 continue
1371 # Verify `nth` matches
1372 if not self.match_nth(el, selector.nth):
1373 continue
1374 if selector.flags & ct.SEL_EMPTY and not self.match_empty(el):
1375 continue
1376 # Verify id matches
1377 if selector.ids and not self.match_id(el, selector.ids):
1378 continue
1379 # Verify classes match
1380 if selector.classes and not self.match_classes(el, selector.classes):
1381 continue
1382 # Verify attribute(s) match
1383 if not self.match_attributes(el, selector.attributes):
1384 continue
1385 # Verify ranges
1386 if selector.flags & RANGES and not self.match_range(el, selector.flags & RANGES):
1387 continue
1388 # Verify language patterns
1389 if selector.lang and not self.match_lang(el, selector.lang):
1390 continue
1391 # Verify pseudo selector patterns
1392 if selector.selectors and not self.match_subselectors(el, selector.selectors):
1393 continue
1394 # Verify relationship selectors
1395 if selector.relation and not self.match_relations(el, selector.relation):
1396 continue
1397 # Validate that the current default selector match corresponds to the first submit button in the form
1398 if selector.flags & ct.SEL_DEFAULT and not self.match_default(el):
1399 continue
1400 # Validate that the unset radio button is among radio buttons with the same name in a form that are
1401 # also not set.
1402 if selector.flags & ct.SEL_INDETERMINATE and not self.match_indeterminate(el):
1403 continue
1404 # Validate element directionality
1405 if selector.flags & DIR_FLAGS and not self.match_dir(el, selector.flags & DIR_FLAGS):
1406 continue
1407 # Validate that the tag contains the specified text.
1408 if not self.match_contains(el, selector.contains):
1409 continue
1410 match = not is_not
1411 break
1412
1413 # Restore actual namespaces being used for external selector lists
1414 if is_html:
1415 self.namespaces = namespaces
1416 self.iframe_restrict = iframe_restrict
1417
1418 return match
1419
def select(self, limit=0):
    """
    Match all tags under the targeted tag.

    Lazily yields every descendant of `self.tag` accepted by `self.match`, in
    the order `self.get_descendants` produces them. A `limit` below 1 means
    "no limit"; otherwise iteration stops after `limit` matches.
    """

    remaining = None if limit < 1 else limit

    for candidate in self.get_descendants(self.tag):
        if not self.match(candidate):
            continue
        yield candidate
        if remaining is None:
            continue
        remaining -= 1
        if remaining < 1:
            break
1433
def closest(self):
    """
    Match closest ancestor.

    Starting at `self.tag` itself and moving up through `self.get_parent`,
    return the first element accepted by `self.match`, or `None` when the
    chain of parents is exhausted without a match.
    """

    node = self.tag
    while node is not None:
        if self.match(node):
            return node
        node = self.get_parent(node)
    return None
1445
def filter(self):  # noqa A001
    """
    Filter tag's children.

    Return, in order, the entries of `self.get_contents(self.tag)` that are not
    navigable strings and that `self.match` accepts.
    """

    kept = []
    for node in self.get_contents(self.tag):
        if self.is_navigable_string(node):
            continue
        if self.match(node):
            kept.append(node)
    return kept
1450
def match(self, el):
    """
    Match.

    Document objects and anything that is not a tag never match; every other
    element is checked against `self.selectors` via `self.match_selectors`.
    """

    if self.is_doc(el):
        return False
    tag_check = self.is_tag(el)
    if not tag_check:
        # Hand back the falsy check result itself, as an `and` chain would.
        return tag_check
    return self.match_selectors(el, self.selectors)
1455
1456
class CSSMatch(_DocumentNav, _Match):
    """
    The Beautiful Soup CSS match class.

    Combines the `_DocumentNav` and `_Match` base classes and adds no members
    of its own. `SoupSieve` creates instances as
    `CSSMatch(selectors, tag, namespaces, flags)`.
    """
1459
1460
class SoupSieve(ct.Immutable):
    """
    Compiled Soup Sieve selector matching object.

    Holds a selector pattern together with its compiled selectors, namespaces,
    custom selectors and flags, and exposes the matching operations
    (`match`, `closest`, `filter`, `select_one`, `select`, `iselect`), each of
    which delegates to a `CSSMatch` built for the tag in question.
    """

    __slots__ = ("pattern", "selectors", "namespaces", "custom", "flags", "_hash")

    def __init__(self, pattern, selectors, namespaces, custom, flags):
        """Initialize by handing every field to the immutable base class."""

        super(SoupSieve, self).__init__(
            pattern=pattern,
            selectors=selectors,
            namespaces=namespaces,
            custom=custom,
            flags=flags
        )

    def _matcher(self, tag):
        """Build a `CSSMatch` for `tag` using this object's compiled selectors."""

        return CSSMatch(self.selectors, tag, self.namespaces, self.flags)

    def match(self, tag):
        """Return whether `tag` matches the compiled selectors."""

        return self._matcher(tag).match(tag)

    def closest(self, tag):
        """Return the closest matching ancestor of `tag` (possibly `tag` itself), or `None`."""

        return self._matcher(tag).closest()

    def filter(self, iterable):  # noqa A001
        """
        Filter.

        When `iterable` is itself a tag, its contents all belong to one
        document, so a single `CSSMatch` is used and its caching of
        per-document lookups can be taken advantage of.

        Any other iterable may mix tags from different documents or detached
        tags, so each item is matched with a fresh `CSSMatch`. Navigable
        strings are skipped.
        """

        if CSSMatch.is_tag(iterable):
            return self._matcher(iterable).filter()

        return [
            node
            for node in iterable
            if not CSSMatch.is_navigable_string(node) and self.match(node)
        ]

    def select_one(self, tag):
        """Return the first matching tag under `tag`, or `None` when nothing matches."""

        for found in self.select(tag, limit=1):
            return found
        return None

    def select(self, tag, limit=0):
        """Return a list of the matching tags under `tag`, honoring `limit`."""

        return list(self.iselect(tag, limit))

    def iselect(self, tag, limit=0):
        """Iterate the matching tags under `tag`, honoring `limit`."""

        yield from self._matcher(tag).select(limit)

    def __repr__(self):  # pragma: no cover
        """Representation."""

        template = "SoupSieve(pattern={!r}, namespaces={!r}, custom={!r}, flags={!r})"
        return template.format(self.pattern, self.namespaces, self.custom, self.flags)

    __str__ = __repr__
1532
1533
# Register `SoupSieve` with the pickle helper from `css_types`.
# NOTE(review): presumably this is what allows compiled selector objects to be
# pickled/unpickled; confirm against `ct.pickle_register`.
ct.pickle_register(SoupSieve)