Mercurial > repos > shellac > sam_consensus_v3

"""CSS matcher."""
from datetime import datetime
from . import util
import re
from .import css_types as ct
import unicodedata
from collections.abc import Sequence

import bs4

# Empty tag pattern (whitespace okay)
RE_NOT_EMPTY = re.compile('[^ \t\r\n\f]')

RE_NOT_WS = re.compile('[^ \t\r\n\f]+')

# Relationships
REL_PARENT = ' '
REL_CLOSE_PARENT = '>'
REL_SIBLING = '~'
REL_CLOSE_SIBLING = '+'

# Relationships for :has() (forward looking)
REL_HAS_PARENT = ': '
REL_HAS_CLOSE_PARENT = ':>'
REL_HAS_SIBLING = ':~'
REL_HAS_CLOSE_SIBLING = ':+'

NS_XHTML = 'http://www.w3.org/1999/xhtml'
NS_XML = 'http://www.w3.org/XML/1998/namespace'

DIR_FLAGS = ct.SEL_DIR_LTR | ct.SEL_DIR_RTL
RANGES = ct.SEL_IN_RANGE | ct.SEL_OUT_OF_RANGE

DIR_MAP = {
    'ltr': ct.SEL_DIR_LTR,
    'rtl': ct.SEL_DIR_RTL,
    'auto': 0
}

RE_NUM = re.compile(r"^(?P<value>-?(?:[0-9]{1,}(\.[0-9]+)?|\.[0-9]+))$")
RE_TIME = re.compile(r'^(?P<hour>[0-9]{2}):(?P<minutes>[0-9]{2})$')
RE_MONTH = re.compile(r'^(?P<year>[0-9]{4,})-(?P<month>[0-9]{2})$')
RE_WEEK = re.compile(r'^(?P<year>[0-9]{4,})-W(?P<week>[0-9]{2})$')
RE_DATE = re.compile(r'^(?P<year>[0-9]{4,})-(?P<month>[0-9]{2})-(?P<day>[0-9]{2})$')
RE_DATETIME = re.compile(
    r'^(?P<year>[0-9]{4,})-(?P<month>[0-9]{2})-(?P<day>[0-9]{2})T(?P<hour>[0-9]{2}):(?P<minutes>[0-9]{2})$'
)
RE_WILD_STRIP = re.compile(r'(?:(?:-\*-)(?:\*(?:-|$))*|-\*$)')

MONTHS_30 = (4, 6, 9, 11)  # April, June, September, and November
FEB = 2
SHORT_MONTH = 30
LONG_MONTH = 31
FEB_MONTH = 28
FEB_LEAP_MONTH = 29
DAYS_IN_WEEK = 7


class _FakeParent(object):
    """
    Fake parent class.

    When we have a fragment with no `BeautifulSoup` document object,
    we can't evaluate `nth` selectors properly.  Create a temporary
    fake parent so we can traverse the root element as a child.
    """

    def __init__(self, element):
        """Initialize."""

        self.contents = [element]

    def __len__(self):
        """Length."""

        return len(self.contents)


class _DocumentNav(object):
    """Navigate a Beautiful Soup document."""

    @classmethod
    def assert_valid_input(cls, tag):
        """Check if valid input tag or document."""

        # Fail on unexpected types.
        if not cls.is_tag(tag):
            raise TypeError("Expected a BeautifulSoup 'Tag', but instead recieved type {}".format(type(tag)))

    @staticmethod
    def is_doc(obj):
        """Is `BeautifulSoup` object."""
        return isinstance(obj, bs4.BeautifulSoup)

    @staticmethod
    def is_tag(obj):
        """Is tag."""
        return isinstance(obj, bs4.Tag)

    @staticmethod
    def is_declaration(obj):  # pragma: no cover
        """Is declaration."""
        return isinstance(obj, bs4.Declaration)

    @staticmethod
    def is_cdata(obj):
        """Is CDATA."""
        return isinstance(obj, bs4.CData)

    @staticmethod
    def is_processing_instruction(obj):  # pragma: no cover
        """Is processing instruction."""
        return isinstance(obj, bs4.ProcessingInstruction)

    @staticmethod
    def is_navigable_string(obj):
        """Is navigable string."""
        return isinstance(obj, bs4.NavigableString)

    @staticmethod
    def is_special_string(obj):
        """Is special string."""
        return isinstance(obj, (bs4.Comment, bs4.Declaration, bs4.CData, bs4.ProcessingInstruction, bs4.Doctype))

    @classmethod
    def is_content_string(cls, obj):
        """Check if node is content string."""

        return cls.is_navigable_string(obj) and not cls.is_special_string(obj)

    @staticmethod
    def create_fake_parent(el):
        """Create fake parent for a given element."""

        return _FakeParent(el)

    @staticmethod
    def is_xml_tree(el):
        """Check if element (or document) is from a XML tree."""

        return el._is_xml

    def is_iframe(self, el):
        """Check if element is an `iframe`."""

        return ((el.name if self.is_xml_tree(el) else util.lower(el.name)) == 'iframe') and self.is_html_tag(el)

    def is_root(self, el):
        """
        Return whether element is a root element.

        We check that the element is the root of the tree (which we have already pre-calculated),
        and we check if it is the root element under an `iframe`.
        """

        root = self.root and self.root is el
        if not root:
            parent = self.get_parent(el)
            root = parent is not None and self.is_html and self.is_iframe(parent)
        return root

    def get_contents(self, el, no_iframe=False):
        """Get contents or contents in reverse."""
        if not no_iframe or not self.is_iframe(el):
            for content in el.contents:
                yield content

    def get_children(self, el, start=None, reverse=False, tags=True, no_iframe=False):
        """Get children."""

        if not no_iframe or not self.is_iframe(el):
            last = len(el.contents) - 1
            if start is None:
                index = last if reverse else 0
            else:
                index = start
            end = -1 if reverse else last + 1
            incr = -1 if reverse else 1

            if 0 <= index <= last:
                while index != end:
                    node = el.contents[index]
                    index += incr
                    if not tags or self.is_tag(node):
                        yield node

    def get_descendants(self, el, tags=True, no_iframe=False):
        """Get descendants."""

        if not no_iframe or not self.is_iframe(el):
            next_good = None
            for child in el.descendants:

                if next_good is not None:
                    if child is not next_good:
                        continue
                    next_good = None

                is_tag = self.is_tag(child)

                if no_iframe and is_tag and self.is_iframe(child):
                    if child.next_sibling is not None:
                        next_good = child.next_sibling
                    else:
                        last_child = child
                        while self.is_tag(last_child) and last_child.contents:
                            last_child = last_child.contents[-1]
                        next_good = last_child.next_element
                    yield child
                    if next_good is None:
                        break
                    # Coverage isn't seeing this even though it's executed
                    continue  # pragma: no cover

                if not tags or is_tag:
                    yield child

    def get_parent(self, el, no_iframe=False):
        """Get parent."""

        parent = el.parent
        if no_iframe and parent is not None and self.is_iframe(parent):
            parent = None
        return parent

    @staticmethod
    def get_tag_name(el):
        """Get tag."""

        return el.name

    @staticmethod
    def get_prefix_name(el):
        """Get prefix."""

        return el.prefix

    @staticmethod
    def get_uri(el):
        """Get namespace `URI`."""

        return el.namespace

    @classmethod
    def get_next(cls, el, tags=True):
        """Get next sibling tag."""

        sibling = el.next_sibling
        while tags and not cls.is_tag(sibling) and sibling is not None:
            sibling = sibling.next_sibling
        return sibling

    @classmethod
    def get_previous(cls, el, tags=True):
        """Get previous sibling tag."""

        sibling = el.previous_sibling
        while tags and not cls.is_tag(sibling) and sibling is not None:
            sibling = sibling.previous_sibling
        return sibling

    @staticmethod
    def has_html_ns(el):
        """
        Check if element has an HTML namespace.

        This is a bit different than whether a element is treated as having an HTML namespace,
        like we do in the case of `is_html_tag`.
        """

        ns = getattr(el, 'namespace') if el else None
        return ns and ns == NS_XHTML

    @staticmethod
    def split_namespace(el, attr_name):
        """Return namespace and attribute name without the prefix."""

        return getattr(attr_name, 'namespace', None), getattr(attr_name, 'name', None)

    @classmethod
    def normalize_value(cls, value):
        """Normalize the value to be a string or list of strings."""

        # Treat `None` as empty string.
        if value is None:
            return ''

        # Pass through strings
        if (isinstance(value, str)):
            return value

        # If it's a byte string, convert it to Unicode, treating it as UTF-8.
        if isinstance(value, bytes):
            return value.decode("utf8")

        # BeautifulSoup supports sequences of attribute values, so make sure the children are strings.
        if isinstance(value, Sequence):
            new_value = []
            for v in value:
                if isinstance(v, Sequence):
                    # This is most certainly a user error and will crash and burn later,
                    # but to avoid excessive recursion, kick out now.
                    new_value.append(v)
                else:
                    # Convert the child to a string
                    new_value.append(cls.normalize_value(v))
            return new_value

        # Try and make anything else a string
        return str(value)

    @classmethod
    def get_attribute_by_name(cls, el, name, default=None):
        """Get attribute by name."""

        value = default
        if el._is_xml:
            try:
                value = cls.normalize_value(el.attrs[name])
            except KeyError:
                pass
        else:
            for k, v in el.attrs.items():
                if util.lower(k) == name:
                    value = cls.normalize_value(v)
                    break
        return value

    @classmethod
    def iter_attributes(cls, el):
        """Iterate attributes."""

        for k, v in el.attrs.items():
            yield k, cls.normalize_value(v)

    @classmethod
    def get_classes(cls, el):
        """Get classes."""

        classes = cls.get_attribute_by_name(el, 'class', [])
        if isinstance(classes, str):
            classes = RE_NOT_WS.findall(classes)
        return classes

    def get_text(self, el, no_iframe=False):
        """Get text."""

        return ''.join(
            [node for node in self.get_descendants(el, tags=False, no_iframe=no_iframe) if self.is_content_string(node)]
        )

    def get_own_text(self, el, no_iframe=False):
        """Get Own Text."""

        return [node for node in self.get_contents(el, no_iframe=no_iframe) if self.is_content_string(node)]


class Inputs(object):
    """Class for parsing and validating input items."""

    @staticmethod
    def validate_day(year, month, day):
        """Validate day."""

        max_days = LONG_MONTH
        if month == FEB:
            max_days = FEB_LEAP_MONTH if ((year % 4 == 0) and (year % 100 != 0)) or (year % 400 == 0) else FEB_MONTH
        elif month in MONTHS_30:
            max_days = SHORT_MONTH
        return 1 <= day <= max_days

    @staticmethod
    def validate_week(year, week):
        """Validate week."""

        max_week = datetime.strptime("{}-{}-{}".format(12, 31, year), "%m-%d-%Y").isocalendar()[1]
        if max_week == 1:
            max_week = 53
        return 1 <= week <= max_week

    @staticmethod
    def validate_month(month):
        """Validate month."""

        return 1 <= month <= 12

    @staticmethod
    def validate_year(year):
        """Validate year."""

        return 1 <= year

    @staticmethod
    def validate_hour(hour):
        """Validate hour."""

        return 0 <= hour <= 23

    @staticmethod
    def validate_minutes(minutes):
        """Validate minutes."""

        return 0 <= minutes <= 59

    @classmethod
    def parse_value(cls, itype, value):
        """Parse the input value."""

        parsed = None
        if itype == "date":
            m = RE_DATE.match(value)
            if m:
                year = int(m.group('year'), 10)
                month = int(m.group('month'), 10)
                day = int(m.group('day'), 10)
                if cls.validate_year(year) and cls.validate_month(month) and cls.validate_day(year, month, day):
                    parsed = (year, month, day)
        elif itype == "month":
            m = RE_MONTH.match(value)
            if m:
                year = int(m.group('year'), 10)
                month = int(m.group('month'), 10)
                if cls.validate_year(year) and cls.validate_month(month):
                    parsed = (year, month)
        elif itype == "week":
            m = RE_WEEK.match(value)
            if m:
                year = int(m.group('year'), 10)
                week = int(m.group('week'), 10)
                if cls.validate_year(year) and cls.validate_week(year, week):
                    parsed = (year, week)
        elif itype == "time":
            m = RE_TIME.match(value)
            if m:
                hour = int(m.group('hour'), 10)
                minutes = int(m.group('minutes'), 10)
                if cls.validate_hour(hour) and cls.validate_minutes(minutes):
                    parsed = (hour, minutes)
        elif itype == "datetime-local":
            m = RE_DATETIME.match(value)
            if m:
                year = int(m.group('year'), 10)
                month = int(m.group('month'), 10)
                day = int(m.group('day'), 10)
                hour = int(m.group('hour'), 10)
                minutes = int(m.group('minutes'), 10)
                if (
                    cls.validate_year(year) and cls.validate_month(month) and cls.validate_day(year, month, day) and
                    cls.validate_hour(hour) and cls.validate_minutes(minutes)
                ):
                    parsed = (year, month, day, hour, minutes)
        elif itype in ("number", "range"):
            m = RE_NUM.match(value)
            if m:
                parsed = float(m.group('value'))
        return parsed


class _Match(object):
    """Perform CSS matching."""

    def __init__(self, selectors, scope, namespaces, flags):
        """Initialize."""

        self.assert_valid_input(scope)
        self.tag = scope
        self.cached_meta_lang = []
        self.cached_default_forms = []
        self.cached_indeterminate_forms = []
        self.selectors = selectors
        self.namespaces = {} if namespaces is None else namespaces
        self.flags = flags
        self.iframe_restrict = False

        # Find the root element for the whole tree
        doc = scope
        parent = self.get_parent(doc)
        while parent:
            doc = parent
            parent = self.get_parent(doc)
        root = None
        if not self.is_doc(doc):
            root = doc
        else:
            for child in self.get_children(doc):
                root = child
                break

        self.root = root
        self.scope = scope if scope is not doc else root
        self.has_html_namespace = self.has_html_ns(root)

        # A document can be both XML and HTML (XHTML)
        self.is_xml = self.is_xml_tree(doc)
        self.is_html = not self.is_xml or self.has_html_namespace

    def supports_namespaces(self):
        """Check if namespaces are supported in the HTML type."""

        return self.is_xml or self.has_html_namespace

    def get_tag_ns(self, el):
        """Get tag namespace."""

        if self.supports_namespaces():
            namespace = ''
            ns = self.get_uri(el)
            if ns:
                namespace = ns
        else:
            namespace = NS_XHTML
        return namespace

    def is_html_tag(self, el):
        """Check if tag is in HTML namespace."""

        return self.get_tag_ns(el) == NS_XHTML

    def get_tag(self, el):
        """Get tag."""

        name = self.get_tag_name(el)
        return util.lower(name) if name is not None and not self.is_xml else name

    def get_prefix(self, el):
        """Get prefix."""

        prefix = self.get_prefix_name(el)
        return util.lower(prefix) if prefix is not None and not self.is_xml else prefix

    def find_bidi(self, el):
        """Get directionality from element text."""

        for node in self.get_children(el, tags=False):

            # Analyze child text nodes
            if self.is_tag(node):

                # Avoid analyzing certain elements specified in the specification.
                direction = DIR_MAP.get(util.lower(self.get_attribute_by_name(node, 'dir', '')), None)
                if (
                    self.get_tag(node) in ('bdi', 'script', 'style', 'textarea', 'iframe') or
                    not self.is_html_tag(node) or
                    direction is not None
                ):
                    continue  # pragma: no cover

                # Check directionality of this node's text
                value = self.find_bidi(node)
                if value is not None:
                    return value

                # Direction could not be determined
                continue  # pragma: no cover

            # Skip `doctype` comments, etc.
            if self.is_special_string(node):
                continue

            # Analyze text nodes for directionality.
            for c in node:
                bidi = unicodedata.bidirectional(c)
                if bidi in ('AL', 'R', 'L'):
                    return ct.SEL_DIR_LTR if bidi == 'L' else ct.SEL_DIR_RTL
        return None

    def extended_language_filter(self, lang_range, lang_tag):
        """Filter the language tags."""

        match = True
        lang_range = RE_WILD_STRIP.sub('-', lang_range).lower()
        ranges = lang_range.split('-')
        subtags = lang_tag.lower().split('-')
        length = len(ranges)
        rindex = 0
        sindex = 0
        r = ranges[rindex]
        s = subtags[sindex]

        # Primary tag needs to match
        if r != '*' and r != s:
            match = False

        rindex += 1
        sindex += 1

        # Match until we run out of ranges
        while match and rindex < length:
            r = ranges[rindex]
            try:
                s = subtags[sindex]
            except IndexError:
                # Ran out of subtags,
                # but we still have ranges
                match = False
                continue

            # Empty range
            if not r:
                match = False
                continue

            # Matched range
            elif s == r:
                rindex += 1

            # Implicit wildcard cannot match
            # singletons
            elif len(s) == 1:
                match = False
                continue

            # Implicitly matched, so grab next subtag
            sindex += 1

        return match

    def match_attribute_name(self, el, attr, prefix):
        """Match attribute name and return value if it exists."""

        value = None
        if self.supports_namespaces():
            value = None
            # If we have not defined namespaces, we can't very well find them, so don't bother trying.
            if prefix:
                ns = self.namespaces.get(prefix)
                if ns is None and prefix != '*':
                    return None
            else:
                ns = None

            for k, v in self.iter_attributes(el):

                # Get attribute parts
                namespace, name = self.split_namespace(el, k)

                # Can't match a prefix attribute as we haven't specified one to match
                # Try to match it normally as a whole `p:a` as selector may be trying `p\:a`.
                if ns is None:
                    if (self.is_xml and attr == k) or (not self.is_xml and util.lower(attr) == util.lower(k)):
                        value = v
                        break
                    # Coverage is not finding this even though it is executed.
                    # Adding a print statement before this (and erasing coverage) causes coverage to find the line.
                    # Ignore the false positive message.
                    continue  # pragma: no cover

                # We can't match our desired prefix attribute as the attribute doesn't have a prefix
                if namespace is None or ns != namespace and prefix != '*':
                    continue

                # The attribute doesn't match.
                if (util.lower(attr) != util.lower(name)) if not self.is_xml else (attr != name):
                    continue

                value = v
                break
        else:
            for k, v in self.iter_attributes(el):
                if util.lower(attr) != util.lower(k):
                    continue
                value = v
                break
        return value

    def match_namespace(self, el, tag):
        """Match the namespace of the element."""

        match = True
        namespace = self.get_tag_ns(el)
        default_namespace = self.namespaces.get('')
        tag_ns = '' if tag.prefix is None else self.namespaces.get(tag.prefix, None)
        # We must match the default namespace if one is not provided
        if tag.prefix is None and (default_namespace is not None and namespace != default_namespace):
            match = False
        # If we specified `|tag`, we must not have a namespace.
        elif (tag.prefix is not None and tag.prefix == '' and namespace):
            match = False
        # Verify prefix matches
        elif (
            tag.prefix and
            tag.prefix != '*' and (tag_ns is None or namespace != tag_ns)
        ):
            match = False
        return match

    def match_attributes(self, el, attributes):
        """Match attributes."""

        match = True
        if attributes:
            for a in attributes:
                value = self.match_attribute_name(el, a.attribute, a.prefix)
                pattern = a.xml_type_pattern if self.is_xml and a.xml_type_pattern else a.pattern
                if isinstance(value, list):
                    value = ' '.join(value)
                if value is None:
                    match = False
                    break
                elif pattern is None:
                    continue
                elif pattern.match(value) is None:
                    match = False
                    break
        return match

    def match_tagname(self, el, tag):
        """Match tag name."""

        name = (util.lower(tag.name) if not self.is_xml and tag.name is not None else tag.name)
        return not (
            name is not None and
            name not in (self.get_tag(el), '*')
        )

    def match_tag(self, el, tag):
        """Match the tag."""

        match = True
        if tag is not None:
            # Verify namespace
            if not self.match_namespace(el, tag):
                match = False
            if not self.match_tagname(el, tag):
                match = False
        return match

    def match_past_relations(self, el, relation):
        """Match past relationship."""

        found = False
        if relation[0].rel_type == REL_PARENT:
            parent = self.get_parent(el, no_iframe=self.iframe_restrict)
            while not found and parent:
                found = self.match_selectors(parent, relation)
                parent = self.get_parent(parent, no_iframe=self.iframe_restrict)
        elif relation[0].rel_type == REL_CLOSE_PARENT:
            parent = self.get_parent(el, no_iframe=self.iframe_restrict)
            if parent:
                found = self.match_selectors(parent, relation)
        elif relation[0].rel_type == REL_SIBLING:
            sibling = self.get_previous(el)
            while not found and sibling:
                found = self.match_selectors(sibling, relation)
                sibling = self.get_previous(sibling)
        elif relation[0].rel_type == REL_CLOSE_SIBLING:
            sibling = self.get_previous(el)
            if sibling and self.is_tag(sibling):
                found = self.match_selectors(sibling, relation)
        return found

    def match_future_child(self, parent, relation, recursive=False):
        """Match future child."""

        match = False
        children = self.get_descendants if recursive else self.get_children
        for child in children(parent, no_iframe=self.iframe_restrict):
            match = self.match_selectors(child, relation)
            if match:
                break
        return match

    def match_future_relations(self, el, relation):
        """Match future relationship."""

        found = False
        if relation[0].rel_type == REL_HAS_PARENT:
            found = self.match_future_child(el, relation, True)
        elif relation[0].rel_type == REL_HAS_CLOSE_PARENT:
            found = self.match_future_child(el, relation)
        elif relation[0].rel_type == REL_HAS_SIBLING:
            sibling = self.get_next(el)
            while not found and sibling:
                found = self.match_selectors(sibling, relation)
                sibling = self.get_next(sibling)
        elif relation[0].rel_type == REL_HAS_CLOSE_SIBLING:
            sibling = self.get_next(el)
            if sibling and self.is_tag(sibling):
                found = self.match_selectors(sibling, relation)
        return found

    def match_relations(self, el, relation):
        """Match relationship to other elements."""

        found = False

        if relation[0].rel_type.startswith(':'):
            found = self.match_future_relations(el, relation)
        else:
            found = self.match_past_relations(el, relation)

        return found

    def match_id(self, el, ids):
        """Match element's ID."""

        found = True
        for i in ids:
            if i != self.get_attribute_by_name(el, 'id', ''):
                found = False
                break
        return found

    def match_classes(self, el, classes):
        """Match element's classes."""

        current_classes = self.get_classes(el)
        found = True
        for c in classes:
            if c not in current_classes:
                found = False
                break
        return found

    def match_root(self, el):
        """Match element as root."""

        is_root = self.is_root(el)
        if is_root:
            sibling = self.get_previous(el, tags=False)
            while is_root and sibling is not None:
                if (
                    self.is_tag(sibling) or (self.is_content_string(sibling) and sibling.strip()) or
                    self.is_cdata(sibling)
                ):
                    is_root = False
                else:
                    sibling = self.get_previous(sibling, tags=False)
        if is_root:
            sibling = self.get_next(el, tags=False)
            while is_root and sibling is not None:
                if (
                    self.is_tag(sibling) or (self.is_content_string(sibling) and sibling.strip()) or
                    self.is_cdata(sibling)
                ):
                    is_root = False
                else:
                    sibling = self.get_next(sibling, tags=False)
        return is_root

    def match_scope(self, el):
        """Match element as scope."""

        return self.scope is el

    def match_nth_tag_type(self, el, child):
        """Match tag type for `nth` matches."""

        return(
            (self.get_tag(child) == self.get_tag(el)) and
            (self.get_tag_ns(child) == self.get_tag_ns(el))
        )

    def match_nth(self, el, nth):
        """Match `nth` elements."""

        matched = True

        for n in nth:
            matched = False
            if n.selectors and not self.match_selectors(el, n.selectors):
                break
            parent = self.get_parent(el)
            if parent is None:
                parent = self.create_fake_parent(el)
            last = n.last
            last_index = len(parent) - 1
            index = last_index if last else 0
            relative_index = 0
            a = n.a
            b = n.b
            var = n.n
            count = 0
            count_incr = 1
            factor = -1 if last else 1
            idx = last_idx = a * count + b if var else a

            # We can only adjust bounds within a variable index
            if var:
                # Abort if our nth index is out of bounds and only getting further out of bounds as we increment.
                # Otherwise, increment to try to get in bounds.
                adjust = None
                while idx < 1 or idx > last_index:
                    if idx < 0:
                        diff_low = 0 - idx
                        if adjust is not None and adjust == 1:
                            break
                        adjust = -1
                        count += count_incr
                        idx = last_idx = a * count + b if var else a
                        diff = 0 - idx
                        if diff >= diff_low:
                            break
                    else:
                        diff_high = idx - last_index
                        if adjust is not None and adjust == -1:
                            break
                        adjust = 1
                        count += count_incr
                        idx = last_idx = a * count + b if var else a
                        diff = idx - last_index
                        if diff >= diff_high:
                            break
                        diff_high = diff

                # If a < 0, our count is working backwards, so floor the index by increasing the count.
                # Find the count that yields the lowest, in bound value and use that.
                # Lastly reverse count increment so that we'll increase our index.
                lowest = count
                if a < 0:
                    while idx >= 1:
                        lowest = count
                        count += count_incr
                        idx = last_idx = a * count + b if var else a
                    count_incr = -1
                count = lowest
                idx = last_idx = a * count + b if var else a

            # Evaluate elements while our calculated nth index is still in range
            while 1 <= idx <= last_index + 1:
                child = None
                # Evaluate while our child index is still in range.
                for child in self.get_children(parent, start=index, reverse=factor < 0, tags=False):
                    index += factor
                    if not self.is_tag(child):
                        continue
                    # Handle `of S` in `nth-child`
                    if n.selectors and not self.match_selectors(child, n.selectors):
                        continue
                    # Handle `of-type`
                    if n.of_type and not self.match_nth_tag_type(el, child):
                        continue
                    relative_index += 1
                    if relative_index == idx:
                        if child is el:
                            matched = True
                        else:
                            break
                    if child is el:
                        break
                if child is el:
                    break
                last_idx = idx
                count += count_incr
                if count < 0:
                    # Count is counting down and has now ventured into invalid territory.
                    break
                idx = a * count + b if var else a
                if last_idx == idx:
                    break
            if not matched:
                break
        return matched

    def match_empty(self, el):
        """Check if element is empty (if requested)."""

        is_empty = True
        for child in self.get_children(el, tags=False):
            if self.is_tag(child):
                is_empty = False
                break
            elif self.is_content_string(child) and RE_NOT_EMPTY.search(child):
                is_empty = False
                break
        return is_empty

    def match_subselectors(self, el, selectors):
        """Match selectors."""

        match = True
        for sel in selectors:
            if not self.match_selectors(el, sel):
                match = False
        return match

    def match_contains(self, el, contains):
        """Match element if it contains text."""

        match = True
        content = None
        for contain_list in contains:
            if content is None:
                if contain_list.own:
                    content = self.get_own_text(el, no_iframe=self.is_html)
                else:
                    content = self.get_text(el, no_iframe=self.is_html)
            found = False
            for text in contain_list.text:
                if contain_list.own:
                    for c in content:
                        if text in c:
                            found = True
                            break
                    if found:
                        break
                else:
                    if text in content:
                        found = True
                        break
            if not found:
                match = False
        return match

    def match_default(self, el):
        """Match default."""

        match = False

        # Find this input's form
        form = None
        parent = self.get_parent(el, no_iframe=True)
        while parent and form is None:
            if self.get_tag(parent) == 'form' and self.is_html_tag(parent):
                form = parent
            else:
                parent = self.get_parent(parent, no_iframe=True)

        # Look in form cache to see if we've already located its default button
        found_form = False
        for f, t in self.cached_default_forms:
            if f is form:
                found_form = True
                if t is el:
                    match = True
                break

        # We didn't have the form cached, so look for its default button
        if not found_form:
            for child in self.get_descendants(form, no_iframe=True):
                name = self.get_tag(child)
                # Can't do nested forms (haven't figured out why we never hit this)
                if name == 'form':  # pragma: no cover
                    break
                if name in ('input', 'button'):
                    v = self.get_attribute_by_name(child, 'type', '')
                    if v and util.lower(v) == 'submit':
                        self.cached_default_forms.append([form, child])
                        if el is child:
                            match = True
                        break
        return match

    def match_indeterminate(self, el):
        """Match default."""

        match = False
        name = self.get_attribute_by_name(el, 'name')

        def get_parent_form(el):
            """Find this input's form."""
            form = None
            parent = self.get_parent(el, no_iframe=True)
            while form is None:
                if self.get_tag(parent) == 'form' and self.is_html_tag(parent):
                    form = parent
                    break
                last_parent = parent
                parent = self.get_parent(parent, no_iframe=True)
                if parent is None:
                    form = last_parent
                    break
            return form

        form = get_parent_form(el)

        # Look in form cache to see if we've already evaluated that its fellow radio buttons are indeterminate
        found_form = False
        for f, n, i in self.cached_indeterminate_forms:
            if f is form and n == name:
                found_form = True
                if i is True:
                    match = True
                break

        # We didn't have the form cached, so validate that the radio button is indeterminate
        if not found_form:
            checked = False
            for child in self.get_descendants(form, no_iframe=True):
                if child is el:
                    continue
                tag_name = self.get_tag(child)
                if tag_name == 'input':
                    is_radio = False
                    check = False
                    has_name = False
                    for k, v in self.iter_attributes(child):
                        if util.lower(k) == 'type' and util.lower(v) == 'radio':
                            is_radio = True
                        elif util.lower(k) == 'name' and v == name:
                            has_name = True
                        elif util.lower(k) == 'checked':
                            check = True
                        if is_radio and check and has_name and get_parent_form(child) is form:
                            checked = True
                            break
                if checked:
                    break
            if not checked:
                match = True
            self.cached_indeterminate_forms.append([form, name, match])

        return match

    def match_lang(self, el, langs):
        """Match languages."""

        match = False
        has_ns = self.supports_namespaces()
        root = self.root
        has_html_namespace = self.has_html_namespace

        # Walk parents looking for `lang` (HTML) or `xml:lang` XML property.
        parent = el
        found_lang = None
        last = None
        while not found_lang:
            has_html_ns = self.has_html_ns(parent)
            for k, v in self.iter_attributes(parent):
                attr_ns, attr = self.split_namespace(parent, k)
                if (
                    ((not has_ns or has_html_ns) and (util.lower(k) if not self.is_xml else k) == 'lang') or
                    (
                        has_ns and not has_html_ns and attr_ns == NS_XML and
                        (util.lower(attr) if not self.is_xml and attr is not None else attr) == 'lang'
                    )
                ):
                    found_lang = v
                    break
            last = parent
            parent = self.get_parent(parent, no_iframe=self.is_html)

            if parent is None:
                root = last
                has_html_namespace = self.has_html_ns(root)
                parent = last
                break

        # Use cached meta language.
        if not found_lang and self.cached_meta_lang:
            for cache in self.cached_meta_lang:
                if root is cache[0]:
                    found_lang = cache[1]

        # If we couldn't find a language, and the document is HTML, look to meta to determine language.
        if found_lang is None and (not self.is_xml or (has_html_namespace and root.name == 'html')):
            # Find head
            found = False
            for tag in ('html', 'head'):
                found = False
                for child in self.get_children(parent, no_iframe=self.is_html):
                    if self.get_tag(child) == tag and self.is_html_tag(child):
                        found = True
                        parent = child
                        break
                if not found:  # pragma: no cover
                    break

            # Search meta tags
            if found:
                for child in parent:
                    if self.is_tag(child) and self.get_tag(child) == 'meta' and self.is_html_tag(parent):
                        c_lang = False
                        content = None
                        for k, v in self.iter_attributes(child):
                            if util.lower(k) == 'http-equiv' and util.lower(v) == 'content-language':
                                c_lang = True
                            if util.lower(k) == 'content':
                                content = v
                            if c_lang and content:
                                found_lang = content
                                self.cached_meta_lang.append((root, found_lang))
                                break
                    if found_lang:
                        break
                if not found_lang:
                    self.cached_meta_lang.append((root, False))

        # If we determined a language, compare.
        if found_lang:
            for patterns in langs:
                match = False
                for pattern in patterns:
                    if self.extended_language_filter(pattern, found_lang):
                        match = True
                if not match:
                    break

        return match

    def match_dir(self, el, directionality):
        """Check directionality."""

        # If we have to match both left and right, we can't match either.
        if directionality & ct.SEL_DIR_LTR and directionality & ct.SEL_DIR_RTL:
            return False

        if el is None or not self.is_html_tag(el):
            return False

        # Element has defined direction of left to right or right to left
        direction = DIR_MAP.get(util.lower(self.get_attribute_by_name(el, 'dir', '')), None)
        if direction not in (None, 0):
            return direction == directionality

        # Element is the document element (the root) and no direction assigned, assume left to right.
        is_root = self.is_root(el)
        if is_root and direction is None:
            return ct.SEL_DIR_LTR == directionality

        # If `input[type=telephone]` and no direction is assigned, assume left to right.
        name = self.get_tag(el)
        is_input = name == 'input'
        is_textarea = name == 'textarea'
        is_bdi = name == 'bdi'
        itype = util.lower(self.get_attribute_by_name(el, 'type', '')) if is_input else ''
        if is_input and itype == 'tel' and direction is None:
            return ct.SEL_DIR_LTR == directionality

        # Auto handling for text inputs
        if ((is_input and itype in ('text', 'search', 'tel', 'url', 'email')) or is_textarea) and direction == 0:
            if is_textarea:
                value = []
                for node in self.get_contents(el, no_iframe=True):
                    if self.is_content_string(node):
                        value.append(node)
                value = ''.join(value)
            else:
                value = self.get_attribute_by_name(el, 'value', '')
            if value:
                for c in value:
                    bidi = unicodedata.bidirectional(c)
                    if bidi in ('AL', 'R', 'L'):
                        direction = ct.SEL_DIR_LTR if bidi == 'L' else ct.SEL_DIR_RTL
                        return direction == directionality
                # Assume left to right
                return ct.SEL_DIR_LTR == directionality
            elif is_root:
                return ct.SEL_DIR_LTR == directionality
            return self.match_dir(self.get_parent(el, no_iframe=True), directionality)

        # Auto handling for `bdi` and other non text inputs.
        if (is_bdi and direction is None) or direction == 0:
            direction = self.find_bidi(el)
            if direction is not None:
                return direction == directionality
            elif is_root:
                return ct.SEL_DIR_LTR == directionality
            return self.match_dir(self.get_parent(el, no_iframe=True), directionality)

        # Match parents direction
        return self.match_dir(self.get_parent(el, no_iframe=True), directionality)

    def match_range(self, el, condition):
        """
        Match range.

        Behavior is modeled after what we see in browsers. Browsers seem to evaluate
        if the value is out of range, and if not, it is in range. So a missing value
        will not evaluate out of range; therefore, value is in range. Personally, I
        feel like this should evaluate as neither in or out of range.
        """

        out_of_range = False

        itype = util.lower(self.get_attribute_by_name(el, 'type'))
        mn = self.get_attribute_by_name(el, 'min', None)
        if mn is not None:
            mn = Inputs.parse_value(itype, mn)
        mx = self.get_attribute_by_name(el, 'max', None)
        if mx is not None:
            mx = Inputs.parse_value(itype, mx)

        # There is no valid min or max, so we cannot evaluate a range
        if mn is None and mx is None:
            return False

        value = self.get_attribute_by_name(el, 'value', None)
        if value is not None:
            value = Inputs.parse_value(itype, value)
        if value is not None:
            if itype in ("date", "datetime-local", "month", "week", "number", "range"):
                if mn is not None and value < mn:
                    out_of_range = True
                if not out_of_range and mx is not None and value > mx:
                    out_of_range = True
            elif itype == "time":
                if mn is not None and mx is not None and mn > mx:
                    # Time is periodic, so this is a reversed/discontinuous range
                    if value < mn and value > mx:
                        out_of_range = True
                else:
                    if mn is not None and value < mn:
                        out_of_range = True
                    if not out_of_range and mx is not None and value > mx:
                        out_of_range = True

        return not out_of_range if condition & ct.SEL_IN_RANGE else out_of_range

    def match_defined(self, el):
        """
        Match defined.

        `:defined` is related to custom elements in a browser.

        - If the document is XML (not XHTML), all tags will match.
        - Tags that are not custom (don't have a hyphen) are marked defined.
        - If the tag has a prefix (without or without a namespace), it will not match.

        This is of course requires the parser to provide us with the proper prefix and namespace info,
        if it doesn't, there is nothing we can do.
        """

        name = self.get_tag(el)
        return (
            name.find('-') == -1 or
            name.find(':') != -1 or
            self.get_prefix(el) is not None
        )

    def match_placeholder_shown(self, el):
        """
        Match placeholder shown according to HTML spec.

        - text area should be checked if they have content. A single newline does not count as content.

        """

        match = False
        content = self.get_text(el)
        if content in ('', '\n'):
            match = True

        return match

    def match_selectors(self, el, selectors):
        """Check if element matches one of the selectors."""

        match = False
        is_not = selectors.is_not
        is_html = selectors.is_html

        # Internal selector lists that use the HTML flag, will automatically get the `html` namespace.
        if is_html:
            namespaces = self.namespaces
            iframe_restrict = self.iframe_restrict
            self.namespaces = {'html': NS_XHTML}
            self.iframe_restrict = True

        if not is_html or self.is_html:
            for selector in selectors:
                match = is_not
                # We have a un-matchable situation (like `:focus` as you can focus an element in this environment)
                if isinstance(selector, ct.SelectorNull):
                    continue
                # Verify tag matches
                if not self.match_tag(el, selector.tag):
                    continue
                # Verify tag is defined
                if selector.flags & ct.SEL_DEFINED and not self.match_defined(el):
                    continue
                # Verify element is root
                if selector.flags & ct.SEL_ROOT and not self.match_root(el):
                    continue
                # Verify element is scope
                if selector.flags & ct.SEL_SCOPE and not self.match_scope(el):
                    continue
                # Verify element has placeholder shown
                if selector.flags & ct.SEL_PLACEHOLDER_SHOWN and not self.match_placeholder_shown(el):
                    continue
                # Verify `nth` matches
                if not self.match_nth(el, selector.nth):
                    continue
                if selector.flags & ct.SEL_EMPTY and not self.match_empty(el):
                    continue
                # Verify id matches
                if selector.ids and not self.match_id(el, selector.ids):
                    continue
                # Verify classes match
                if selector.classes and not self.match_classes(el, selector.classes):
                    continue
                # Verify attribute(s) match
                if not self.match_attributes(el, selector.attributes):
                    continue
                # Verify ranges
                if selector.flags & RANGES and not self.match_range(el, selector.flags & RANGES):
                    continue
                # Verify language patterns
                if selector.lang and not self.match_lang(el, selector.lang):
                    continue
                # Verify pseudo selector patterns
                if selector.selectors and not self.match_subselectors(el, selector.selectors):
                    continue
                # Verify relationship selectors
                if selector.relation and not self.match_relations(el, selector.relation):
                    continue
                # Validate that the current default selector match corresponds to the first submit button in the form
                if selector.flags & ct.SEL_DEFAULT and not self.match_default(el):
                    continue
                # Validate that the unset radio button is among radio buttons with the same name in a form that are
                # also not set.
                if selector.flags & ct.SEL_INDETERMINATE and not self.match_indeterminate(el):
                    continue
                # Validate element directionality
                if selector.flags & DIR_FLAGS and not self.match_dir(el, selector.flags & DIR_FLAGS):
                    continue
                # Validate that the tag contains the specified text.
                if not self.match_contains(el, selector.contains):
                    continue
                match = not is_not
                break

        # Restore actual namespaces being used for external selector lists
        if is_html:
            self.namespaces = namespaces
            self.iframe_restrict = iframe_restrict

        return match

    def select(self, limit=0):
        """Match all tags under the targeted tag."""

        if limit < 1:
            limit = None

        for child in self.get_descendants(self.tag):
            if self.match(child):
                yield child
                if limit is not None:
                    limit -= 1
                    if limit < 1:
                        break

    def closest(self):
        """Match closest ancestor."""

        current = self.tag
        closest = None
        while closest is None and current is not None:
            if self.match(current):
                closest = current
            else:
                current = self.get_parent(current)
        return closest

    def filter(self):  # noqa A001
        """Filter tag's children."""

        return [tag for tag in self.get_contents(self.tag) if not self.is_navigable_string(tag) and self.match(tag)]

    def match(self, el):
        """Match."""

        return not self.is_doc(el) and self.is_tag(el) and self.match_selectors(el, self.selectors)


class CSSMatch(_DocumentNav, _Match):
    """The Beautiful Soup CSS match class."""


class SoupSieve(ct.Immutable):
    """Compiled Soup Sieve selector matching object."""

    __slots__ = ("pattern", "selectors", "namespaces", "custom", "flags", "_hash")

    def __init__(self, pattern, selectors, namespaces, custom, flags):
        """Initialize."""

        super(SoupSieve, self).__init__(
            pattern=pattern,
            selectors=selectors,
            namespaces=namespaces,
            custom=custom,
            flags=flags
        )

    def match(self, tag):
        """Match."""

        return CSSMatch(self.selectors, tag, self.namespaces, self.flags).match(tag)

    def closest(self, tag):
        """Match closest ancestor."""

        return CSSMatch(self.selectors, tag, self.namespaces, self.flags).closest()

    def filter(self, iterable):  # noqa A001
        """
        Filter.

        `CSSMatch` can cache certain searches for tags of the same document,
        so if we are given a tag, all tags are from the same document,
        and we can take advantage of the optimization.

        Any other kind of iterable could have tags from different documents or detached tags,
        so for those, we use a new `CSSMatch` for each item in the iterable.
        """

        if CSSMatch.is_tag(iterable):
            return CSSMatch(self.selectors, iterable, self.namespaces, self.flags).filter()
        else:
            return [node for node in iterable if not CSSMatch.is_navigable_string(node) and self.match(node)]

    def select_one(self, tag):
        """Select a single tag."""

        tags = self.select(tag, limit=1)
        return tags[0] if tags else None

    def select(self, tag, limit=0):
        """Select the specified tags."""

        return list(self.iselect(tag, limit))

    def iselect(self, tag, limit=0):
        """Iterate the specified tags."""

        for el in CSSMatch(self.selectors, tag, self.namespaces, self.flags).select(limit):
            yield el

    def __repr__(self):  # pragma: no cover
        """Representation."""

        return "SoupSieve(pattern={!r}, namespaces={!r}, custom={!r}, flags={!r})".format(
            self.pattern,
            self.namespaces,
            self.custom,
            self.flags
        )

    __str__ = __repr__


ct.pickle_register(SoupSieve)
author	shellac
date	Mon, 22 Mar 2021 18:12:50 +0000
parents
children