sam_consensus_v3: env/lib/python3.9/site-packages/bs4/element.py comparison

comparison env/lib/python3.9/site-packages/bs4/element.py @ 0:4f3585e2f14b draft default tip

"planemo upload commit 60cee0fc7c0cda8592644e1aad72851dec82c959"

author	shellac
date	Mon, 22 Mar 2021 18:12:50 +0000
parents
children

comparison

equal deleted inserted replaced

--1:000000000000
+:4f3585e2f14b
+# Use of this source code is governed by the MIT license.
+__license__ = "MIT"
+try:
+from collections.abc import Callable # Python 3.6
+except ImportError as e:
+from collections import Callable
+import re
+import sys
+import warnings
+try:
+import soupsieve
+except ImportError as e:
+soupsieve = None
+warnings.warn(
+'The soupsieve package is not installed. CSS selectors cannot be used.'
+)
+from bs4.formatter import (
+Formatter,
+HTMLFormatter,
+XMLFormatter,
+)
# Encoding name used as the module-wide default for output.
DEFAULT_OUTPUT_ENCODING = "utf-8"

# True when the interpreter's major version is 3 or later.
PY3K = sys.version_info[0] >= 3

# Matches one run of non-whitespace characters.
nonwhitespace_re = re.compile(r"\S+")

# NOTE: Unused since 4.7.0. Kept for a while in case someone imported
# it for their own use.
whitespace_re = re.compile(r"\s+")
+def _alias(attr):
+"""Alias one attribute name to another for backward compatibility"""
+@property
+def alias(self):
+return getattr(self, attr)
+@alias.setter
+def alias(self):
+return setattr(self, attr)
+return alias
+# These encodings are recognized by Python (so PageElement.encode
+# could theoretically support them) but XML and HTML don't recognize
+# them (so they should not show up in an XML or HTML document as that
+# document's encoding).
+#
+# If an XML document is encoded in one of these encodings, no encoding
+# will be mentioned in the XML declaration. If an HTML document is
+# encoded in one of these encodings, and the HTML document has a
+# <meta> tag that mentions an encoding, the encoding will be given as
+# the empty string.
+#
+# Source:
+# https://docs.python.org/3/library/codecs.html#python-specific-encodings
PYTHON_SPECIFIC_ENCODINGS = {
    "idna",
    "mbcs",
    "oem",
    "palmos",
    "punycode",
    "raw_unicode_escape",
    "undefined",
    "unicode_escape",
    "raw-unicode-escape",
    "unicode-escape",
    "string-escape",
    "string_escape",
}
class NamespacedAttribute(str):
    """A string such as 'xml:lang' that also records the prefix ('xml'),
    the local name ('lang') and the namespace it was created with.
    """

    def __new__(cls, prefix, name=None, namespace=None):
        """Build the combined string and attach its parts.

        :param prefix: The namespace prefix, or None.
        :param name: The local name; any falsy value is stored as None.
        :param namespace: Stored unchanged on the ``namespace`` attribute.
        """
        # A missing name means the default namespace, whose name "has
        # no value" per https://www.w3.org/TR/xml-names/#defaulting
        local_name = name if name else None

        if local_name is None:
            text = prefix
        elif prefix is None:
            # Not really namespaced.
            text = local_name
        else:
            text = prefix + ":" + local_name

        instance = str.__new__(cls, text)
        instance.prefix = prefix
        instance.name = local_name
        instance.namespace = namespace
        return instance
class AttributeValueWithCharsetSubstitution(str):
    """A stand-in object for a character encoding specified in HTML.

    This base class adds nothing to ``str`` itself.
    """
class CharsetMetaAttributeValue(AttributeValueWithCharsetSubstitution):
    """Generic stand-in for the value of a meta tag's 'charset' attribute.

    When Beautiful Soup parses the markup '<meta charset="utf8">', the
    value of the 'charset' attribute will be one of these objects.
    """

    def __new__(cls, original_value):
        """Create the string and remember the value it was built from."""
        instance = str.__new__(cls, original_value)
        instance.original_value = original_value
        return instance

    def encode(self, encoding):
        """Return the charset value to use when the document is encoded.

        :param encoding: Name of the encoding the document is written in.
        :return: ``encoding`` itself, or the empty string when it is one
            of PYTHON_SPECIFIC_ENCODINGS.
        """
        return '' if encoding in PYTHON_SPECIFIC_ENCODINGS else encoding
class ContentMetaAttributeValue(AttributeValueWithCharsetSubstitution):
    """Generic stand-in for the value of a meta tag's 'content' attribute.

    When Beautiful Soup parses the markup:
     <meta http-equiv="content-type" content="text/html; charset=utf8">

    the value of the 'content' attribute will be one of these objects.
    """

    # Group 1 is everything up to and including "charset="; group 3 is
    # the charset name itself.
    CHARSET_RE = re.compile(r"((^|;)\s*charset=)([^;]*)", re.M)

    def __new__(cls, original_value):
        """Create the value, or a plain ``str`` if it names no charset."""
        if cls.CHARSET_RE.search(original_value) is None:
            # No substitution necessary.
            return str.__new__(str, original_value)

        instance = str.__new__(cls, original_value)
        instance.original_value = original_value
        return instance

    def encode(self, encoding):
        """Return the original value with its charset replaced.

        :param encoding: Name of the encoding to write after "charset=".
        :return: The rewritten value, or the empty string when
            ``encoding`` is one of PYTHON_SPECIFIC_ENCODINGS.
        """
        if encoding in PYTHON_SPECIFIC_ENCODINGS:
            return ''
        return self.CHARSET_RE.sub(
            lambda match: match.group(1) + encoding, self.original_value)
+class PageElement(object):
+"""Contains the navigational information for some part of the page:
+that is, its current location in the parse tree.
+NavigableString, Tag, etc. are all subclasses of PageElement.
+"""
+def setup(self, parent=None, previous_element=None, next_element=None,
+previous_sibling=None, next_sibling=None):
+"""Sets up the initial relations between this element and
+other elements.
+:param parent: The parent of this element.
+:param previous_element: The element parsed immediately before
+this one.
+:param next_element: The element parsed immediately before
+this one.
+:param previous_sibling: The most recently encountered element
+on the same level of the parse tree as this one.
+:param previous_sibling: The next element to be encountered
+on the same level of the parse tree as this one.
+"""
+self.parent = parent
+self.previous_element = previous_element
+if previous_element is not None:
+self.previous_element.next_element = self
+self.next_element = next_element
+if self.next_element is not None:
+self.next_element.previous_element = self
+self.next_sibling = next_sibling
+if self.next_sibling is not None:
+self.next_sibling.previous_sibling = self
+if (previous_sibling is None
+and self.parent is not None and self.parent.contents):
+previous_sibling = self.parent.contents[-1]
+self.previous_sibling = previous_sibling
+if previous_sibling is not None:
+self.previous_sibling.next_sibling = self
def format_string(self, s, formatter):
    """Format the given string using the given formatter.

    :param s: A string.
    :param formatter: A Formatter object, a value accepted by
        formatter_for_name(), or None to return ``s`` untouched.
    :return: The result of the formatter's ``substitute(s)``.
    """
    if formatter is None:
        return s
    if isinstance(formatter, Formatter):
        chosen = formatter
    else:
        chosen = self.formatter_for_name(formatter)
    return chosen.substitute(s)
def formatter_for_name(self, formatter):
    """Look up or create a Formatter for the given identifier,
    if necessary.

    :param formatter: Can be a Formatter object (used as-is), a
        function (used as the entity substitution hook for an
        XMLFormatter or HTMLFormatter), or a string (used to look
        up an XMLFormatter or HTMLFormatter in the appropriate
        registry).
    :return: A Formatter.
    """
    if isinstance(formatter, Formatter):
        return formatter

    # XML trees get an XMLFormatter, everything else an HTMLFormatter.
    formatter_class = XMLFormatter if self._is_xml else HTMLFormatter

    if isinstance(formatter, Callable):
        return formatter_class(entity_substitution=formatter)
    return formatter_class.REGISTRY[formatter]
+@property
+def _is_xml(self):
+"""Is this element part of an XML tree or an HTML tree?
+This is used in formatter_for_name, when deciding whether an
+XMLFormatter or HTMLFormatter is more appropriate. It can be
+inefficient, but it should be called very rarely.
+"""
+if self.known_xml is not None:
+# Most of the time we will have determined this when the
+# document is parsed.
+return self.known_xml
+# Otherwise, it's likely that this element was created by
+# direct invocation of the constructor from within the user's
+# Python code.
+if self.parent is None:
+# This is the top-level object. It should have .known_xml set
+# from tree creation. If not, take a guess--BS is usually
+# used on HTML markup.
+return getattr(self, 'is_xml', False)
+return self.parent._is_xml
+nextSibling = _alias("next_sibling")  # BS3
+previousSibling = _alias("previous_sibling")  # BS3
def replace_with(self, replace_with):
    """Swap this PageElement for another one, leaving the rest of the
    tree as it was.

    :param replace_with: A PageElement.
    :return: `self`, no longer part of the tree. Returns None without
        touching the tree when `replace_with` is `self`.
    :raises ValueError: If this element has no parent, or if
        `replace_with` is this element's parent.
    """
    parent = self.parent
    if parent is None:
        raise ValueError(
            "Cannot replace one element with another when the "
            "element to be replaced is not part of a tree.")
    if replace_with is self:
        return
    if replace_with is parent:
        raise ValueError("Cannot replace a Tag with its parent.")

    position = parent.index(self)
    self.extract(_self_index=position)
    parent.insert(position, replace_with)
    return self
replaceWith = replace_with  # BS3
def unwrap(self):
    """Replace this PageElement with its contents.

    :return: `self`, no longer part of the tree.
    :raises ValueError: If this element has no parent.
    """
    my_parent = self.parent
    if my_parent is None:
        # The two literals are concatenated, so the first one needs a
        # trailing space to keep "that element" as two words.
        raise ValueError(
            "Cannot replace an element with its contents when that "
            "element is not part of a tree.")
    my_index = my_parent.index(self)
    self.extract(_self_index=my_index)
    # Inserting the children in reverse at one fixed index leaves them
    # in their original order; the copy guards against the list
    # changing as children are moved.
    for child in reversed(self.contents[:]):
        my_parent.insert(my_index, child)
    return self
replace_with_children = unwrap
replaceWithChildren = unwrap  # BS3
def wrap(self, wrap_inside):
    """Put this PageElement inside another one.

    :param wrap_inside: A PageElement.
    :return: `wrap_inside`, occupying the position in the tree that used
        to be occupied by `self`, and with `self` inside it.
    """
    detached = self.replace_with(wrap_inside)
    wrap_inside.append(detached)
    return wrap_inside
def extract(self, _self_index=None):
    """Destructively rip this element out of the tree.

    :param _self_index: The location of this element in its parent's
        .contents, if known. Passing this in allows for a performance
        optimization.
    :return: `self`, no longer part of the tree.
    """
    parent = self.parent
    if parent is not None:
        if _self_index is None:
            _self_index = parent.index(self)
        del parent.contents[_self_index]

    # Find the two elements that would be next to each other if this
    # element (and any children) hadn't been parsed, and join them.
    tail = self._last_descendant()
    following = tail.next_element
    preceding = self.previous_element

    if preceding is not None and preceding is not following:
        preceding.next_element = following
    if following is not None and following is not preceding:
        following.previous_element = preceding
    self.previous_element = None
    tail.next_element = None

    self.parent = None

    # Close the gap in the sibling chain the same way.
    older = self.previous_sibling
    younger = self.next_sibling
    if older is not None and older is not younger:
        older.next_sibling = younger
    if younger is not None and younger is not older:
        younger.previous_sibling = older
    self.previous_sibling = self.next_sibling = None
    return self
+def _last_descendant(self, is_initialized=True, accept_self=True):
+"""Finds the last element beneath this object to be parsed.
+:param is_initialized: Has `setup` been called on this PageElement
+yet?
+:param accept_self: Is `self` an acceptable answer to the question?
+"""
+if is_initialized and self.next_sibling is not None:
+last_child = self.next_sibling.previous_element
+else:
+last_child = self
+while isinstance(last_child, Tag) and last_child.contents:
+last_child = last_child.contents[-1]
+if not accept_self and last_child is self:
+last_child = None
+return last_child
+# BS3: Not part of the API!
+_lastRecursiveChild = _last_descendant
def insert(self, position, new_child):
    """Insert a new PageElement in the list of this PageElement's children.

    This works the same way as `list.insert`.

    :param position: The numeric position that should be occupied
        in `self.children` by the new PageElement.
    :param new_child: A PageElement. A plain string is wrapped in a
        NavigableString first.
    :raises ValueError: If `new_child` is None or is `self`.
    """
    if new_child is None:
        raise ValueError("Cannot insert None into a tag.")
    if new_child is self:
        raise ValueError("Cannot insert a tag into itself.")
    if (isinstance(new_child, str)
            and not isinstance(new_child, NavigableString)):
        new_child = NavigableString(new_child)

    from bs4 import BeautifulSoup
    if isinstance(new_child, BeautifulSoup):
        # One BeautifulSoup object must never contain another, so its
        # children are inserted one at a time instead.
        for subchild in list(new_child.contents):
            self.insert(position, subchild)
            position += 1
        return

    position = min(position, len(self.contents))
    if hasattr(new_child, 'parent') and new_child.parent is not None:
        if new_child.parent is self:
            # The element is already one of our children. If it is
            # moving further down the list, extracting it shifts the
            # target index down by one.
            if self.index(new_child) < position:
                position -= 1
        new_child.extract()

    new_child.parent = self

    # Link to whatever comes before the new child.
    if position == 0:
        new_child.previous_sibling = None
        new_child.previous_element = self
    else:
        before = self.contents[position - 1]
        new_child.previous_sibling = before
        before.next_sibling = new_child
        new_child.previous_element = before._last_descendant(False)
    if new_child.previous_element is not None:
        new_child.previous_element.next_element = new_child

    tail = new_child._last_descendant(False)

    # Link to whatever comes after the new child.
    if position >= len(self.contents):
        new_child.next_sibling = None

        # Climb the ancestors until one has a next sibling: that is the
        # element that comes next in the document. If none has one, the
        # new child's last element is the last element in the document.
        ancestor = self
        following = None
        while following is None and ancestor is not None:
            following = ancestor.next_sibling
            ancestor = ancestor.parent
        tail.next_element = following
    else:
        after = self.contents[position]
        new_child.next_sibling = after
        if after is not None:
            after.previous_sibling = new_child
        tail.next_element = after

    if tail.next_element is not None:
        tail.next_element.previous_element = tail
    self.contents.insert(position, new_child)
def append(self, tag):
    """Add the given PageElement to the end of this one's contents.

    :param tag: A PageElement.
    """
    end = len(self.contents)
    self.insert(end, tag)
def extend(self, tags):
    """Appends the given PageElements to this one's contents.

    :param tags: A list (or other iterable) of PageElements. A Tag may
        be passed instead, in which case its contents are used.
    """
    # A plain list needs no unwrapping; only a Tag stands in for its
    # own contents.
    if not isinstance(tags, list) and isinstance(tags, Tag):
        tags = tags.contents

    # self.append() pulls each element out of the tree it lives in.
    # That changes the source when it is another element's .contents
    # list or a live sibling generator, so iterating it directly would
    # skip elements. Iterate over a snapshot instead.
    for tag in list(tags):
        self.append(tag)
def insert_before(self, *args):
    """Make the given element(s) the immediate predecessor of this one.

    All the elements will have the same parent, and the given elements
    will be immediately before this one.

    :param args: One or more PageElements.
    :raises ValueError: If this element has no parent, or if it is
        itself one of `args`.
    """
    container = self.parent
    if container is None:
        raise ValueError(
            "Element has no parent, so 'before' has no meaning.")
    for candidate in args:
        if candidate is self:
            raise ValueError("Can't insert an element before itself.")

    for predecessor in args:
        # Extract first so that the index won't be thrown off if the
        # two are siblings.
        if isinstance(predecessor, PageElement):
            predecessor.extract()
        container.insert(container.index(self), predecessor)
def insert_after(self, *args):
    """Make the given element(s) the immediate successor of this one.

    The elements will have the same parent, and the given elements
    will be immediately after this one.

    :param args: One or more PageElements.
    :raises ValueError: If this element has no parent, or if it is
        itself one of `args`.
    """
    # All error checking happens before the tree is modified.
    container = self.parent
    if container is None:
        raise ValueError(
            "Element has no parent, so 'after' has no meaning.")
    for candidate in args:
        if candidate is self:
            raise ValueError("Can't insert an element after itself.")

    # `offset` is the number of successors already placed.
    for offset, successor in enumerate(args):
        # Extract first so that the index won't be thrown off if the
        # two are siblings.
        if isinstance(successor, PageElement):
            successor.extract()
        anchor = container.index(self)
        container.insert(anchor + 1 + offset, successor)
def find_next(self, name=None, attrs={}, text=None, **kwargs):
    """Return the first PageElement after this one in the document
    that matches the given criteria.

    All find_* methods take a common set of arguments. See the online
    documentation for detailed explanations.

    :param name: A filter on tag name.
    :param attrs: A dictionary of filters on attribute values.
    :param text: A filter for a NavigableString with specific text.
    :kwargs: A dictionary of filters on attribute values.
    :return: A PageElement, or None if nothing matches.
    :rtype: bs4.element.Tag | bs4.element.NavigableString
    """
    finder = self.find_all_next
    return self._find_one(finder, name, attrs, text, **kwargs)
findNext = find_next  # BS3
def find_all_next(self, name=None, attrs={}, text=None, limit=None,
                  **kwargs):
    """Return every PageElement after this one in the document that
    matches the given criteria.

    All find_* methods take a common set of arguments. See the online
    documentation for detailed explanations.

    :param name: A filter on tag name.
    :param attrs: A dictionary of filters on attribute values.
    :param text: A filter for a NavigableString with specific text.
    :param limit: Stop looking after finding this many results.
    :kwargs: A dictionary of filters on attribute values.
    :return: A ResultSet containing PageElements.
    """
    candidates = self.next_elements
    return self._find_all(name, attrs, text, limit, candidates, **kwargs)
findAllNext = find_all_next  # BS3
def find_next_sibling(self, name=None, attrs={}, text=None, **kwargs):
    """Return the closest later sibling of this PageElement that
    matches the given criteria.

    All find_* methods take a common set of arguments. See the
    online documentation for detailed explanations.

    :param name: A filter on tag name.
    :param attrs: A dictionary of filters on attribute values.
    :param text: A filter for a NavigableString with specific text.
    :kwargs: A dictionary of filters on attribute values.
    :return: A PageElement, or None if nothing matches.
    :rtype: bs4.element.Tag | bs4.element.NavigableString
    """
    finder = self.find_next_siblings
    return self._find_one(finder, name, attrs, text, **kwargs)
findNextSibling = find_next_sibling  # BS3
def find_next_siblings(self, name=None, attrs={}, text=None, limit=None,
                       **kwargs):
    """Return every later sibling of this PageElement that matches the
    given criteria.

    All find_* methods take a common set of arguments. See the online
    documentation for detailed explanations.

    :param name: A filter on tag name.
    :param attrs: A dictionary of filters on attribute values.
    :param text: A filter for a NavigableString with specific text.
    :param limit: Stop looking after finding this many results.
    :kwargs: A dictionary of filters on attribute values.
    :return: A ResultSet of PageElements.
    :rtype: bs4.element.ResultSet
    """
    candidates = self.next_siblings
    return self._find_all(name, attrs, text, limit, candidates, **kwargs)
findNextSiblings = find_next_siblings   # BS3
fetchNextSiblings = find_next_siblings  # BS2
def find_previous(self, name=None, attrs={}, text=None, **kwargs):
    """Search backwards through the document from this PageElement and
    return the first PageElement that matches the given criteria.

    All find_* methods take a common set of arguments. See the online
    documentation for detailed explanations.

    :param name: A filter on tag name.
    :param attrs: A dictionary of filters on attribute values.
    :param text: A filter for a NavigableString with specific text.
    :kwargs: A dictionary of filters on attribute values.
    :return: A PageElement, or None if nothing matches.
    :rtype: bs4.element.Tag | bs4.element.NavigableString
    """
    finder = self.find_all_previous
    return self._find_one(finder, name, attrs, text, **kwargs)
findPrevious = find_previous  # BS3
def find_all_previous(self, name=None, attrs={}, text=None, limit=None,
                      **kwargs):
    """Search backwards through the document from this PageElement and
    return every PageElement that matches the given criteria.

    All find_* methods take a common set of arguments. See the online
    documentation for detailed explanations.

    :param name: A filter on tag name.
    :param attrs: A dictionary of filters on attribute values.
    :param text: A filter for a NavigableString with specific text.
    :param limit: Stop looking after finding this many results.
    :kwargs: A dictionary of filters on attribute values.
    :return: A ResultSet of PageElements.
    :rtype: bs4.element.ResultSet
    """
    candidates = self.previous_elements
    return self._find_all(name, attrs, text, limit, candidates, **kwargs)
findAllPrevious = find_all_previous  # BS3
fetchPrevious = find_all_previous    # BS2
def find_previous_sibling(self, name=None, attrs={}, text=None, **kwargs):
    """Return the closest earlier sibling of this PageElement that
    matches the given criteria.

    All find_* methods take a common set of arguments. See the online
    documentation for detailed explanations.

    :param name: A filter on tag name.
    :param attrs: A dictionary of filters on attribute values.
    :param text: A filter for a NavigableString with specific text.
    :kwargs: A dictionary of filters on attribute values.
    :return: A PageElement, or None if nothing matches.
    :rtype: bs4.element.Tag | bs4.element.NavigableString
    """
    finder = self.find_previous_siblings
    return self._find_one(finder, name, attrs, text, **kwargs)
findPreviousSibling = find_previous_sibling  # BS3
def find_previous_siblings(self, name=None, attrs={}, text=None,
                           limit=None, **kwargs):
    """Return every earlier sibling of this PageElement that matches
    the given criteria.

    All find_* methods take a common set of arguments. See the online
    documentation for detailed explanations.

    :param name: A filter on tag name.
    :param attrs: A dictionary of filters on attribute values.
    :param text: A filter for a NavigableString with specific text.
    :param limit: Stop looking after finding this many results.
    :kwargs: A dictionary of filters on attribute values.
    :return: A ResultSet of PageElements.
    :rtype: bs4.element.ResultSet
    """
    candidates = self.previous_siblings
    return self._find_all(name, attrs, text, limit, candidates, **kwargs)
findPreviousSiblings = find_previous_siblings   # BS3
fetchPreviousSiblings = find_previous_siblings  # BS2
def find_parent(self, name=None, attrs={}, **kwargs):
    """Return the closest parent of this PageElement that matches the
    given criteria.

    All find_* methods take a common set of arguments. See the online
    documentation for detailed explanations.

    :param name: A filter on tag name.
    :param attrs: A dictionary of filters on attribute values.
    :kwargs: A dictionary of filters on attribute values.
    :return: A PageElement, or None if nothing matches.
    :rtype: bs4.element.Tag | bs4.element.NavigableString
    """
    # NOTE: _find_one can't be used here because find_parents takes a
    # different set of arguments (it has no `text`).
    matches = self.find_parents(name, attrs, 1, **kwargs)
    return matches[0] if matches else None
findParent = find_parent  # BS3
def find_parents(self, name=None, attrs={}, limit=None, **kwargs):
    """Return every parent of this PageElement that matches the given
    criteria.

    All find_* methods take a common set of arguments. See the online
    documentation for detailed explanations.

    :param name: A filter on tag name.
    :param attrs: A dictionary of filters on attribute values.
    :param limit: Stop looking after finding this many results.
    :kwargs: A dictionary of filters on attribute values.
    :return: Whatever _find_all returns for the `parents` generator
        (a ResultSet of PageElements).
    """
    ancestors = self.parents
    # There is no `text` filter when searching parents.
    return self._find_all(name, attrs, None, limit, ancestors, **kwargs)
findParents = find_parents   # BS3
fetchParents = find_parents  # BS2
@property
def next(self):
    """The PageElement, if any, that was parsed just after this one.

    Read-only shorthand for `next_element`.

    :return: A PageElement.
    :rtype: bs4.element.Tag | bs4.element.NavigableString
    """
    following = self.next_element
    return following
@property
def previous(self):
    """The PageElement, if any, that was parsed just before this one.

    Read-only shorthand for `previous_element`.

    :return: A PageElement.
    :rtype: bs4.element.Tag | bs4.element.NavigableString
    """
    preceding = self.previous_element
    return preceding
+#These methods do the real heavy lifting.
+def _find_one(self, method, name, attrs, text, **kwargs):
+r = None
+l = method(name, attrs, text, 1, **kwargs)
+if l:
+r = l[0]
+return r
def _find_all(self, name, attrs, text, limit, generator, **kwargs):
    """Iterate over a generator looking for things that match.

    :param name: A filter on tag name, or a SoupStrainer to use as-is.
    :param attrs: A dictionary of filters on attribute values.
    :param text: A filter for a NavigableString with specific text.
    :param limit: Stop looking after finding this many results.
    :param generator: An iterable of PageElements to search.
    :kwargs: A dictionary of filters on attribute values. A 'string'
        entry is used as `text` when `text` is None.
    :return: A ResultSet of the matching elements.
    """
    if text is None and 'string' in kwargs:
        text = kwargs.pop('string')

    if isinstance(name, SoupStrainer):
        strainer = name
    else:
        strainer = SoupStrainer(name, attrs, text, **kwargs)

    if text is None and not limit and not attrs and not kwargs:
        if name is True or name is None:
            # Optimization to find all tags.
            result = (element for element in generator
                      if isinstance(element, Tag))
            return ResultSet(strainer, result)
        elif isinstance(name, str):
            # Optimization to find all tags with a given name.
            if name.count(':') == 1:
                # This is a name with a prefix. If this is a
                # namespace-aware document, we need to match the local
                # name against tag.name. If not, we need to match the
                # fully-qualified name against tag.name.
                prefix, local_name = name.split(':', 1)
            else:
                prefix = None
                local_name = name
            # The isinstance() guard must cover BOTH name checks.
            # Written as "A and B or C" it only covered the first, so
            # the second was evaluated for non-Tag elements too.
            result = (
                element for element in generator
                if isinstance(element, Tag) and (
                    element.name == name
                    or (
                        element.name == local_name
                        and (prefix is None or element.prefix == prefix)
                    )
                )
            )
            return ResultSet(strainer, result)

    results = ResultSet(strainer)
    for element in generator:
        # Falsy elements are skipped without consulting the strainer.
        if element:
            found = strainer.search(element)
            if found:
                results.append(found)
                if limit and len(results) >= limit:
                    break
    return results
# These generators can be used to navigate starting from both
# NavigableStrings and Tags.
@property
def next_elements(self):
    """All PageElements that were parsed after this one.

    Follows the `next_element` links until one is None.

    :yield: A sequence of PageElements.
    """
    node = self.next_element
    while node is not None:
        yield node
        node = node.next_element
@property
def next_siblings(self):
    """All PageElements that are siblings of this one but were parsed
    later.

    Follows the `next_sibling` links until one is None.

    :yield: A sequence of PageElements.
    """
    node = self.next_sibling
    while node is not None:
        yield node
        node = node.next_sibling
@property
def previous_elements(self):
    """All PageElements that were parsed before this one.

    Follows the `previous_element` links until one is None, so the
    nearest element comes first.

    :yield: A sequence of PageElements.
    """
    node = self.previous_element
    while node is not None:
        yield node
        node = node.previous_element
@property
def previous_siblings(self):
    """All PageElements that are siblings of this one but were parsed
    earlier.

    Follows the `previous_sibling` links until one is None, so the
    nearest sibling comes first.

    :yield: A sequence of PageElements.
    """
    node = self.previous_sibling
    while node is not None:
        yield node
        node = node.previous_sibling
@property
def parents(self):
    """All PageElements that are parents of this PageElement.

    Follows the `parent` links until one is None, so the closest
    parent comes first.

    :yield: A sequence of PageElements.
    """
    ancestor = self.parent
    while ancestor is not None:
        yield ancestor
        ancestor = ancestor.parent
@property
def decomposed(self):
    """Check whether a PageElement has been decomposed.

    Reads the `_decomposed` attribute; a missing or falsy value is
    reported as False.

    :rtype: bool
    """
    flag = getattr(self, '_decomposed', False)
    return flag if flag else False
# Old method-style versions of the generators, kept for backwards
# compatibility with BS3. Each one returns the matching property.
def nextGenerator(self):
    """Method form of `next_elements`."""
    return self.next_elements

def nextSiblingGenerator(self):
    """Method form of `next_siblings`."""
    return self.next_siblings

def previousGenerator(self):
    """Method form of `previous_elements`."""
    return self.previous_elements

def previousSiblingGenerator(self):
    """Method form of `previous_siblings`."""
    return self.previous_siblings

def parentGenerator(self):
    """Method form of `parents`."""
    return self.parents
class NavigableString(str, PageElement):
    """A Python Unicode string that is part of a parse tree.

    When Beautiful Soup parses the markup <b>penguin</b>, it will
    create a NavigableString for the string "penguin".
    """

    # Text placed before and after the string by output_ready().
    PREFIX = ''
    SUFFIX = ''

    # Looking at a string alone can't reveal whether it sits in an XML
    # document or an HTML document.
    known_xml = None

    def __new__(cls, value):
        """Create a new NavigableString.

        When unpickling a NavigableString, this method is called with
        the string in DEFAULT_OUTPUT_ENCODING. That encoding needs to be
        passed in to the superclass's __new__ or the superclass won't know
        how to handle non-ASCII characters.
        """
        if isinstance(value, str):
            instance = str.__new__(cls, value)
        else:
            instance = str.__new__(cls, value, DEFAULT_OUTPUT_ENCODING)
        instance.setup()
        return instance

    def __copy__(self):
        """Return a copy with the same contents and class as the
        original, but not connected to the parse tree.
        """
        return type(self)(self)

    def __getnewargs__(self):
        return (str(self),)

    def __getattr__(self, attr):
        """text.string gives you text. This is for backwards
        compatibility for Navigable*String, but for CData* it lets you
        get the string without the CData wrapper."""
        if attr != 'string':
            raise AttributeError(
                "'%s' object has no attribute '%s'" % (
                    self.__class__.__name__, attr))
        return self

    def output_ready(self, formatter="minimal"):
        """Run the string through the provided formatter.

        :param formatter: A Formatter object, or a string naming one of
            the standard formatters.
        :return: The formatted string between PREFIX and SUFFIX.
        """
        formatted = self.format_string(self, formatter)
        return self.PREFIX + formatted + self.SUFFIX

    @property
    def name(self):
        """Since a NavigableString is not a Tag, it has no .name.

        This property is implemented so that code like this doesn't crash
        when run on a mixture of Tag and NavigableString objects:
            [x.name for x in tag.children]
        """
        return None

    @name.setter
    def name(self, name):
        """Prevent NavigableString.name from ever being set."""
        raise AttributeError("A NavigableString cannot be given a name.")
class PreformattedString(NavigableString):
    """A NavigableString not subject to the normal formatting rules.

    This is an abstract class used for special kinds of strings such
    as comments (the Comment class) and CDATA blocks (the CData
    class).
    """

    PREFIX = ''
    SUFFIX = ''

    def output_ready(self, formatter=None):
        """Make this string ready for output by adding any subclass-specific
        prefix or suffix.

        :param formatter: A Formatter object, or a string naming one
            of the standard formatters. The string will be passed into the
            Formatter, but only to trigger any side effects: the return
            value is ignored.

        :return: The string, with any subclass-specific prefix and
           suffix added on.
        """
        if formatter is not None:
            # Called for its side effects only; the result is dropped.
            self.format_string(self, formatter)
        return self.PREFIX + self + self.SUFFIX
class CData(PreformattedString):
    """A CDATA block."""
    # Markers placed around the string by output_ready().
    PREFIX = '<![CDATA['
    SUFFIX = ']]>'
class ProcessingInstruction(PreformattedString):
    """An SGML processing instruction."""
    # Markers placed around the string by output_ready().
    PREFIX = '<?'
    SUFFIX = '>'
class XMLProcessingInstruction(ProcessingInstruction):
    """An XML processing instruction."""
    # Same opening marker as the parent class, but closed with '?>'.
    PREFIX = '<?'
    SUFFIX = '?>'
class Comment(PreformattedString):
    """An HTML or XML comment."""
    # Markers placed around the string by output_ready().
    PREFIX = '<!--'
    SUFFIX = '-->'
class Declaration(PreformattedString):
    """An XML declaration."""
    # Markers placed around the string by output_ready().
    PREFIX = '<?'
    SUFFIX = '?>'
class Doctype(PreformattedString):
    """A document type declaration."""

    @classmethod
    def for_name_and_ids(cls, name, pub_id, system_id):
        """Build the document type declaration for a given public ID
        and system ID.

        :param name: The name of the document's root element, e.g. 'html'.
        :param pub_id: The Formal Public Identifier for this document type,
            e.g. '-//W3C//DTD XHTML 1.1//EN'
        :param system_id: The system identifier for this document type,
            e.g. 'http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd'

        :return: A Doctype.
        """
        declaration = name or ''
        if pub_id is None:
            # Without a public ID, a system ID needs the SYSTEM keyword.
            if system_id is not None:
                declaration += ' SYSTEM "%s"' % system_id
        else:
            declaration += ' PUBLIC "%s"' % pub_id
            if system_id is not None:
                declaration += ' "%s"' % system_id

        return Doctype(declaration)

    # Markers placed around the string by output_ready().
    PREFIX = '<!DOCTYPE '
    SUFFIX = '>\n'
class Stylesheet(NavigableString):
    """A NavigableString representing a stylesheet (probably
    CSS).

    Used to distinguish embedded stylesheets from textual content.
    """
    pass
class Script(NavigableString):
    """A NavigableString representing an executable script (probably
    JavaScript).

    Used to distinguish executable code from textual content.
    """
    pass
class TemplateString(NavigableString):
    """A NavigableString representing a string found inside an HTML
    template embedded in a larger document.

    Used to distinguish such strings from the main body of the document.
    """
    pass
+class Tag(PageElement):
+"""Represents an HTML or XML tag that is part of a parse tree, along
+with its attributes and contents.
+When Beautiful Soup parses the markup <b>penguin</b>, it will
+create a Tag object representing the <b> tag.
+"""
def __init__(self, parser=None, builder=None, name=None, namespace=None,
             prefix=None, attrs=None, parent=None, previous=None,
             is_xml=None, sourceline=None, sourcepos=None,
             can_be_empty_element=None, cdata_list_attributes=None,
             preserve_whitespace_tags=None
):
    """Basic constructor.

    :param parser: A BeautifulSoup object. Only its class is remembered.
    :param builder: A TreeBuilder. When given, it decides several of the
        settings below instead of the corresponding arguments.
    :param name: The name of the tag. Required.
    :param namespace: The URI of this Tag's XML namespace, if any.
    :param prefix: The prefix for this Tag's XML namespace, if any.
    :param attrs: A dictionary of this Tag's attribute values.
    :param parent: The PageElement to use as this Tag's parent.
    :param previous: The PageElement that was parsed immediately before
        this tag.
    :param is_xml: If True, this is an XML tag. Otherwise, this is an
        HTML tag. Only consulted when there is no builder.
    :param sourceline: The line number where this tag was found in its
        source document.
    :param sourcepos: The character position within `sourceline` where this
        tag was found.
    :param can_be_empty_element: If True, this tag should be
        represented as <tag/>. If False, this tag should be represented
        as <tag></tag>. Only used when there is no builder.
    :param cdata_list_attributes: A list of attributes whose values should
        be treated as CDATA if they ever show up on this tag. Only used
        when there is no builder.
    :param preserve_whitespace_tags: A list of tag names whose contents
        should have their whitespace preserved. Only used when there is
        no builder.
    :raise ValueError: If `name` is None.
    """
    # Remember only the parser's class, never the parser object itself:
    # that lets extracted chunks be garbage-collected.
    self.parser_class = None if parser is None else parser.__class__

    if name is None:
        raise ValueError("No value provided for new tag's name.")
    self.name = name
    self.namespace = namespace
    self.prefix = prefix

    # Position information is recorded only when some was supplied and
    # the builder (if there is one) keeps line numbers.
    have_position = sourceline is not None or sourcepos is not None
    if (not builder or builder.store_line_numbers) and have_position:
        self.sourceline = sourceline
        self.sourcepos = sourcepos

    if attrs is None:
        attrs = {}
    elif attrs and builder is not None and builder.cdata_list_attributes:
        # Let the builder rewrite the values of list-valued attributes.
        attrs = builder._replace_cdata_list_attribute_values(
            self.name, attrs)
    else:
        # Work on a private copy of whatever was passed in.
        attrs = dict(attrs)

    # If possible, determine ahead of time whether this tag is an
    # XML tag.
    self.known_xml = builder.is_xml if builder else is_xml

    self.attrs = attrs
    self.contents = []
    self.setup(parent, previous)
    self.hidden = False

    if builder is not None:
        # Set up any substitutions for this tag, such as the charset in
        # a META tag.
        builder.set_up_substitutions(self)

        # Ask the TreeBuilder whether this tag might be an empty-element
        # tag.
        self.can_be_empty_element = builder.can_be_empty_element(name)

        # Keep the builder's whole structure describing attributes that
        # might need to be treated as a list. For performance reasons
        # it is stored as-is rather than queried per tag: asking would
        # require building a new data structure every time, and (unlike
        # can_be_empty_element) it is almost never needed.
        self.cdata_list_attributes = builder.cdata_list_attributes

        # Keep track of the names that might cause this tag to be
        # treated as a whitespace-preserved tag.
        self.preserve_whitespace_tags = builder.preserve_whitespace_tags
    else:
        # In the absence of a TreeBuilder, use whatever values were
        # passed in here. They're probably None, unless this is a copy
        # of some other tag.
        self.can_be_empty_element = can_be_empty_element
        self.cdata_list_attributes = cdata_list_attributes
        self.preserve_whitespace_tags = preserve_whitespace_tags

parserClass = _alias("parser_class")  # BS3
def __copy__(self):
    """A copy of a Tag is a new Tag, unconnected to the parse tree.

    Its contents are a copy of the old Tag's contents.

    :return: A new object of the same type as this one, built with the
        same name, namespace, prefix and attributes, and holding copies
        of this object's children.
    """
    # No builder is passed to the constructor. The constructor in this
    # class never stores a `builder` attribute, so reading `self.builder`
    # falls through to __getattr__, which treats unknown names as a
    # search for a child tag. A document containing a <builder> element
    # would then hand that child to the constructor as if it were a
    # TreeBuilder, and the copy would fail when the constructor tried to
    # call builder methods on it. Everything a builder would have
    # decided is passed explicitly below instead.
    clone = type(self)(
        None, None, self.name, self.namespace,
        self.prefix, self.attrs, is_xml=self._is_xml,
        sourceline=self.sourceline, sourcepos=self.sourcepos,
        can_be_empty_element=self.can_be_empty_element,
        cdata_list_attributes=self.cdata_list_attributes,
        preserve_whitespace_tags=self.preserve_whitespace_tags
    )
    # Carry over state that is set after construction.
    for attr in ('can_be_empty_element', 'hidden'):
        setattr(clone, attr, getattr(self, attr))
    for child in self.contents:
        clone.append(child.__copy__())
    return clone
@property
def is_empty_element(self):
    """Is this tag an empty-element tag? (aka a self-closing tag)

    A tag with any contents is never an empty-element tag, so False
    is returned for it.

    For a tag without contents, the answer is whatever is stored in
    `can_be_empty_element`, which the constructor takes either from
    the builder or from its caller.

    :return: False if the tag has contents; otherwise the value of
        `self.can_be_empty_element`.
    """
    if len(self.contents) != 0:
        return False
    return self.can_be_empty_element

isSelfClosing = is_empty_element  # BS3
@property
def string(self):
    """Convenience property to get the single string within this
    PageElement.

    TODO It might make sense to have NavigableString.string return
    itself.

    :return: None unless this element has exactly one child. If that
        child is a NavigableString, the child itself; otherwise the
        child's own `string` value, recursively.
    """
    children = self.contents
    if len(children) != 1:
        return None
    only_child = children[0]
    return only_child if isinstance(only_child, NavigableString) else only_child.string

@string.setter
def string(self, string):
    """Replace this PageElement's contents with `string`.

    The existing children are removed with clear(); the value appended
    is a new object built by calling the class of `string` on `string`.
    """
    self.clear()
    replacement = string.__class__(string)
    self.append(replacement)
def _all_strings(self, strip=False, types=(NavigableString, CData)):
    """Yield all strings of certain classes, possibly stripping them.

    :param strip: If True, each string is stripped before being
        yielded, and strings that end up empty are dropped.
    :types: A tuple of NavigableString subclasses. A string is kept
        only if its exact type appears in the tuple, so by default
        only NavigableString and CData objects are considered (no
        comments, processing instructions, etc.). If None, every
        NavigableString instance is kept.
    :yield: A sequence of strings.
    """
    for node in self.descendants:
        if types is None:
            wanted = isinstance(node, NavigableString)
        else:
            # Exact type match: subclasses not listed are ignored.
            wanted = type(node) in types
        if not wanted:
            continue
        if strip:
            node = node.strip()
            if len(node) == 0:
                continue
        yield node

strings = property(_all_strings)
@property
def stripped_strings(self):
    """Yield the strings produced by _all_strings() with stripping
    turned on.

    :yield: A sequence of stripped strings.
    """
    for text in self._all_strings(True):
        yield text
def get_text(self, separator="", strip=False,
             types=(NavigableString, CData)):
    """Get all child strings, concatenated using the given separator.

    :param separator: Strings will be concatenated using this separator.
    :param strip: If True, strings will be stripped before being
        concatenated.
    :types: A tuple of NavigableString subclasses, passed through to
        _all_strings() to decide which strings are included. By
        default only NavigableString and CData objects are considered.
    :return: A string.
    """
    pieces = list(self._all_strings(strip, types=types))
    return separator.join(pieces)

getText = get_text
text = property(get_text)
def decompose(self):
    """Recursively destroys this PageElement and its children.

    This element is first removed from the tree with extract(). Then
    it, and every element reachable from it by following
    `next_element`, has its instance state wiped.

    The behavior of a decomposed PageElement is undefined and you
    should never use one for anything, but if you need to _check_
    whether an element has been decomposed, you can use the
    `decomposed` property.
    """
    self.extract()
    node = self
    while node is not None:
        # Grab the successor first: clearing __dict__ erases the link.
        following = node.next_element
        node.__dict__.clear()
        node.contents = []
        node._decomposed = True
        node = following
def clear(self, decompose=False):
    """Wipe out all children of this PageElement by calling extract()
    on them.

    :param decompose: If this is True, decompose() (a more
        destructive method) will be called instead of extract() on
        every child that is a Tag. Other children are still extracted.
    """
    # Iterate over a snapshot, since removing children changes
    # self.contents.
    for child in self.contents[:]:
        if decompose and isinstance(child, Tag):
            child.decompose()
        else:
            child.extract()
def smooth(self):
    """Smooth out this element's children by consolidating consecutive
    strings.

    This makes pretty-printed output look more natural following a
    lot of operations that modified the tree.

    Child tags are smoothed recursively. Two neighbouring children
    are merged only when both are NavigableStrings and neither is a
    PreformattedString.
    """
    # Record the first position of every pair of children that needs
    # consolidating, instead of copying self.contents: in most cases
    # very few strings are affected.
    merge_at = []
    for position, current in enumerate(self.contents):
        if isinstance(current, Tag):
            # Recursively smooth children.
            current.smooth()
        if position == len(self.contents) - 1:
            # The last child has no right-hand neighbour to merge with.
            continue
        following = self.contents[position + 1]
        both_plain_strings = all(
            isinstance(node, NavigableString)
            and not isinstance(node, PreformattedString)
            for node in (current, following)
        )
        if both_plain_strings:
            merge_at.append(position)

    # Walk the recorded positions from the end, so that removing items
    # from .contents doesn't shift the positions still to be handled.
    for position in reversed(merge_at):
        first = self.contents[position]
        second = self.contents[position + 1]
        second.extract()
        first.replace_with(NavigableString(first + second))
def index(self, element):
    """Find the index of a child by identity, not value.

    Avoids issues with tag.contents.index(element) getting the
    index of equal elements.

    :param element: Look for this PageElement in `self.contents`.
    :return: The position of `element` within `self.contents`.
    :raise ValueError: If no child is the very same object as `element`.
    """
    position = next(
        (i for i, child in enumerate(self.contents) if child is element),
        None,
    )
    if position is None:
        raise ValueError("Tag.index: element not in tag")
    return position
def get(self, key, default=None):
    """Look up an attribute of this tag.

    :param key: The attribute name.
    :param default: The value to return when the attribute is absent.
    :return: The value stored under `key` in `self.attrs`, or `default`.
    """
    attributes = self.attrs
    return attributes.get(key, default)
def get_attribute_list(self, key, default=None):
    """The same as get(), but always returns a list.

    :param key: The attribute to look for.
    :param default: Use this value if the attribute is not present
        on this PageElement.
    :return: The value itself if it is already a list; otherwise a
        one-element list wrapping the value (which may be `default`).
    """
    found = self.get(key, default)
    return found if isinstance(found, list) else [found]
def has_attr(self, key):
    """Report whether `key` is one of the names in `self.attrs`.

    :param key: The attribute name to look for.
    :return: True if the attribute is present, False otherwise.
    """
    present = key in self.attrs
    return present
def __hash__(self):
    """Hash this element by the hash of its string rendering."""
    rendered = str(self)
    return hash(rendered)
def __getitem__(self, key):
    """tag[key] returns the value of the 'key' attribute for the Tag.

    The lookup is a plain index into `self.attrs`, so a missing
    attribute raises whatever that mapping raises (KeyError for a dict).
    """
    attributes = self.attrs
    return attributes[key]
def __iter__(self):
    """Iterating over a Tag iterates over its list of contents."""
    children = self.contents
    return iter(children)
def __len__(self):
    """The length of a Tag is the number of items in its contents."""
    children = self.contents
    return len(children)
def __contains__(self, x):
    """`x in tag` tests membership in the tag's list of contents."""
    children = self.contents
    return x in children
def __bool__(self):
    """A tag is always truthy, even when it has no contents.

    Without this, truth-testing would fall back to __len__ and an
    empty tag would be falsy.
    """
    return True
def __setitem__(self, key, value):
    """Setting tag[key] stores `value` as the 'key' attribute of the
    tag, replacing any previous value."""
    attributes = self.attrs
    attributes[key] = value
def __delitem__(self, key):
    """Deleting tag[key] removes the 'key' attribute from the tag.

    Deleting an attribute that is not present is silently ignored.
    """
    try:
        del self.attrs[key]
    except KeyError:
        # Nothing to remove.
        pass
def __call__(self, *args, **kwargs):
    """Calling a Tag like a function is the same as calling its
    find_all() method with the same arguments. Eg. tag('a') is
    tag.find_all('a')."""
    matches = self.find_all(*args, **kwargs)
    return matches
def __getattr__(self, tag):
    """Calling tag.subtag is the same as calling tag.find(name="subtag")

    :param tag: The attribute name that normal lookup failed to find.
    :return: The result of find() for that name (or, for the deprecated
        `fooTag` spelling, for the name without its 'Tag' suffix).
    :raise AttributeError: For names starting with a double underscore
        and for the name "contents".
    """
    if len(tag) > 3 and tag.endswith('Tag'):
        # BS3: soup.aTag -> "soup.find("a")
        tag_name = tag[:-3]
        warnings.warn(
            '.%(name)sTag is deprecated, use .find("%(name)s") instead. If you really were looking for a tag called %(name)sTag, use .find("%(name)sTag")' % {
                'name': tag_name
            }
        )
        return self.find(tag_name)

    # Dunder names and "contents" are never treated as tag searches;
    # "contents" is special-cased to avoid recursion.
    if tag.startswith("__") or tag == "contents":
        raise AttributeError(
            "'%s' object has no attribute '%s'" % (self.__class__, tag))
    return self.find(tag)
def __eq__(self, other):
    """Returns true iff this Tag has the same name, the same attributes,
    and the same contents (recursively) as `other`.

    An object lacking a `name`, `attrs` or `contents` attribute is
    never equal to a Tag.
    """
    if self is other:
        return True
    for required in ('name', 'attrs', 'contents'):
        if not hasattr(other, required):
            return False
    if (self.name != other.name
            or self.attrs != other.attrs
            or len(self) != len(other)):
        return False
    # Same length, so compare the children pairwise by position.
    return not any(
        my_child != other.contents[i]
        for i, my_child in enumerate(self.contents)
    )
def __ne__(self, other):
    """Returns true iff this Tag is not identical to `other`,
    as defined in __eq__."""
    same = (self == other)
    return not same
def __repr__(self, encoding="unicode-escape"):
    """Renders this PageElement as a string.

    :param encoding: The encoding to use (Python 2 only).
    :return: Under Python 2, a bytestring; under Python 3,
        a Unicode string.
    """
    if not PY3K:
        # "The return value must be a string object", i.e. a bytestring.
        # By convention, the return value of __repr__ should also be
        # an ASCII string.
        return self.encode(encoding)
    # "The return value must be a string object", i.e. Unicode
    return self.decode()
def __unicode__(self):
    """Renders this PageElement as a Unicode string, by calling
    decode() with its default arguments."""
    rendered = self.decode()
    return rendered
def __str__(self):
    """Renders this PageElement as a generic string.

    :return: Under Python 2, a UTF-8 bytestring; under Python 3,
        a Unicode string.
    """
    return self.decode() if PY3K else self.encode()

# Under Python 3 both string conversions are simply the Unicode rendering.
if PY3K:
    __str__ = __repr__ = __unicode__
def encode(self, encoding=DEFAULT_OUTPUT_ENCODING,
           indent_level=None, formatter="minimal",
           errors="xmlcharrefreplace"):
    """Render a bytestring representation of this PageElement and its
    contents.

    :param encoding: The destination encoding. It is also handed to
        decode() as the eventual encoding.
    :param indent_level: Each line of the rendering will be
        indented this many spaces. Used internally in
        recursive calls while pretty-printing.
    :param formatter: A Formatter object, or a string naming one of
        the standard formatters.
    :param errors: An error handling strategy such as
        'xmlcharrefreplace'. This value is passed along into
        encode() and its value should be one of the constants
        defined by Python.
    :return: A bytestring.
    """
    # Render to Unicode first, then encode the Unicode.
    return self.decode(indent_level, encoding, formatter).encode(
        encoding, errors)
def decode(self, indent_level=None,
           eventual_encoding=DEFAULT_OUTPUT_ENCODING,
           formatter="minimal"):
    """Render a Unicode representation of this PageElement and its
    contents.

    :param indent_level: Each line of the rendering will be
        indented this many spaces. Used internally in
        recursive calls while pretty-printing.
    :param eventual_encoding: The tag is destined to be
        encoded into this encoding. This method is _not_
        responsible for performing that encoding. This information
        is passed in so that it can be substituted in if the
        document contains a <META> tag that mentions the document's
        encoding.
    :param formatter: A Formatter object, or a string naming one of
        the standard formatters.
    :return: The rendered markup as a string. For a hidden element
        only the rendered contents are returned, with no tag around
        them.
    """
    # Resolve a formatter name into a Formatter object once, up front,
    # so the lookup isn't repeated for every nested call.
    if not isinstance(formatter, Formatter):
        formatter = self.formatter_for_name(formatter)

    # Render each attribute as either `key` or `key=<quoted value>`.
    rendered_attrs = []
    for key, val in formatter.attributes(self):
        if val is None:
            rendered_attrs.append(key)
            continue
        if isinstance(val, (list, tuple)):
            val = ' '.join(val)
        elif not isinstance(val, str):
            val = str(val)
        elif (isinstance(val, AttributeValueWithCharsetSubstitution)
              and eventual_encoding is not None):
            val = val.encode(eventual_encoding)
        escaped = formatter.attribute_value(val)
        rendered_attrs.append(
            str(key) + '=' + formatter.quoted_attribute_value(escaped))

    prefix = (self.prefix + ":") if self.prefix else ''

    # An empty-element tag gets a closing marker inside the start tag;
    # every other tag gets a separate end tag.
    close = ''
    closing_tag = ''
    if self.is_empty_element:
        close = formatter.void_element_close_prefix or ''
    else:
        closing_tag = '</%s%s>' % (prefix, self.name)

    pretty_print = self._should_pretty_print(indent_level)
    indent_space = '' if indent_level is None else ' ' * (indent_level - 1)
    if pretty_print:
        space = indent_space
        indent_contents = indent_level + 1
    else:
        space = ''
        indent_contents = None
    contents = self.decode_contents(
        indent_contents, eventual_encoding, formatter
    )

    if self.hidden:
        # This is the 'document root' object.
        return contents

    pieces = []
    if indent_level is not None:
        # Even if this particular tag is not pretty-printed,
        # we should indent up to the start of the tag.
        pieces.append(indent_space)
    attribute_string = (' ' + ' '.join(rendered_attrs)) if rendered_attrs else ''
    pieces.append('<%s%s%s%s>' % (
        prefix, self.name, attribute_string, close))
    if pretty_print:
        pieces.append("\n")
    pieces.append(contents)
    if pretty_print and contents and contents[-1] != "\n":
        pieces.append("\n")
    if pretty_print and closing_tag:
        pieces.append(space)
    pieces.append(closing_tag)
    if indent_level is not None and closing_tag and self.next_sibling:
        # Even if this particular tag is not pretty-printed,
        # we're now done with the tag, and we should add a
        # newline if appropriate.
        pieces.append("\n")
    return ''.join(pieces)
def _should_pretty_print(self, indent_level):
    """Should this tag be pretty-printed?

    Most of them should, but some (such as <pre> in HTML
    documents) should not.

    :param indent_level: The current indentation level, or None when
        no pretty-printing was requested.
    :return: False when `indent_level` is None. Otherwise True unless
        this tag's name is listed in `preserve_whitespace_tags`.
    """
    if indent_level is None:
        return False
    preserved = self.preserve_whitespace_tags
    return not preserved or self.name not in preserved
def prettify(self, encoding=None, formatter="minimal"):
    """Pretty-print this PageElement as a string.

    :param encoding: The eventual encoding of the string. If this is None,
        a Unicode string will be returned.
    :param formatter: A Formatter object, or a string naming one of
        the standard formatters.
    :return: A Unicode string (if encoding==None) or a bytestring
        (otherwise).
    """
    # True is what gets passed as the indent level in both cases.
    if encoding is not None:
        return self.encode(encoding, True, formatter=formatter)
    return self.decode(True, formatter=formatter)
def decode_contents(self, indent_level=None,
                    eventual_encoding=DEFAULT_OUTPUT_ENCODING,
                    formatter="minimal"):
    """Renders the contents of this tag as a Unicode string.

    :param indent_level: Each line of the rendering will be
        indented this many spaces. Used internally in
        recursive calls while pretty-printing.
    :param eventual_encoding: The tag is destined to be
        encoded into this encoding. decode_contents() is _not_
        responsible for performing that encoding. This information
        is passed in so that it can be substituted in if the
        document contains a <META> tag that mentions the document's
        encoding.
    :param formatter: A Formatter object, or a string naming one of
        the standard Formatters.
    :return: The concatenated renderings of this tag's children.
    """
    # Resolve a formatter name into a Formatter object once, so the
    # lookup isn't repeated for every nested call.
    if not isinstance(formatter, Formatter):
        formatter = self.formatter_for_name(formatter)

    pretty_print = indent_level is not None
    pieces = []
    for child in self:
        text = None
        if isinstance(child, NavigableString):
            text = child.output_ready(formatter)
        elif isinstance(child, Tag):
            # Child tags render (and indent) themselves.
            pieces.append(child.decode(indent_level, eventual_encoding,
                                       formatter))
        preserve_whitespace = (
            self.preserve_whitespace_tags
            and self.name in self.preserve_whitespace_tags
        )
        if text and indent_level and not preserve_whitespace:
            text = text.strip()
        if not text:
            continue
        # Strings get their own indented line only when pretty-printing
        # outside a whitespace-preserving tag.
        on_own_line = pretty_print and not preserve_whitespace
        if on_own_line:
            pieces.append(" " * (indent_level - 1))
        pieces.append(text)
        if on_own_line:
            pieces.append("\n")
    return ''.join(pieces)
def encode_contents(
    self, indent_level=None, encoding=DEFAULT_OUTPUT_ENCODING,
    formatter="minimal"):
    """Renders the contents of this PageElement as a bytestring.

    :param indent_level: Each line of the rendering will be
        indented this many spaces. Used internally in
        recursive calls while pretty-printing.
    :param encoding: The bytestring will be in this encoding. It is
        also handed to decode_contents() as the eventual encoding.
    :param formatter: A Formatter object, or a string naming one of
        the standard Formatters.
    :return: A bytestring.
    """
    return self.decode_contents(indent_level, encoding, formatter).encode(
        encoding)
+# Old method for BS3 compatibility
def renderContents(self, encoding=DEFAULT_OUTPUT_ENCODING,
                   prettyPrint=False, indentLevel=0):
    """Deprecated method for BS3 compatibility.

    Delegates to encode_contents(). `indentLevel` is only honoured
    when `prettyPrint` is true; otherwise no indent level is passed.
    """
    level = indentLevel if prettyPrint else None
    return self.encode_contents(indent_level=level, encoding=encoding)
+#Soup methods
def find(self, name=None, attrs={}, recursive=True, text=None,
         **kwargs):
    """Look in the children of this PageElement and find the first
    PageElement that matches the given criteria.

    All find_* methods take a common set of arguments. See the online
    documentation for detailed explanations.

    This is find_all() with a limit of 1, returning the single
    result rather than a list.

    :param name: A filter on tag name.
    :param attrs: A dictionary of filters on attribute values.
    :param recursive: If this is True, find() will perform a
        recursive search of this PageElement's children. Otherwise,
        only the direct children will be considered.
    :param text: A filter on string content, passed through to
        find_all().
    :kwargs: A dictionary of filters on attribute values.
    :return: A PageElement, or None if nothing matched.
    :rtype: bs4.element.Tag | bs4.element.NavigableString
    """
    matches = self.find_all(name, attrs, recursive, text, 1, **kwargs)
    return matches[0] if matches else None

findChild = find #BS2
def find_all(self, name=None, attrs={}, recursive=True, text=None,
             limit=None, **kwargs):
    """Look in the children of this PageElement and find all
    PageElements that match the given criteria.

    All find_* methods take a common set of arguments. See the online
    documentation for detailed explanations.

    :param name: A filter on tag name.
    :param attrs: A dictionary of filters on attribute values.
    :param recursive: If this is True, find_all() will perform a
        recursive search of this PageElement's children. Otherwise,
        only the direct children will be considered.
    :param text: A filter on string content.
    :param limit: Stop looking after finding this many results.
    :kwargs: A dictionary of filters on attribute values.
    :return: A ResultSet of PageElements.
    :rtype: bs4.element.ResultSet
    """
    # Choose the pool of candidates: every descendant, or only the
    # direct children.
    candidates = self.descendants if recursive else self.children
    return self._find_all(name, attrs, text, limit, candidates, **kwargs)

findAll = find_all       # BS3
findChildren = find_all  # BS2
+#Generator methods
@property
def children(self):
    """Iterate over all direct children of this PageElement.

    :yield: A sequence of PageElements.
    """
    # An iterator is returned, rather than the list itself, to make the
    # purpose of the property clear.
    direct_children = self.contents
    return iter(direct_children)  # XXX This seems to be untested.
@property
def descendants(self):
    """Iterate over everything beneath this PageElement.

    Starting from the first child, elements are yielded by following
    `next_element` links, stopping before the element that comes
    after this element's last descendant.
    NOTE(review): this is presumably document order rather than a
    breadth-first traversal -- confirm against how next_element is
    maintained.

    :yield: A sequence of PageElements.
    """
    if len(self.contents) == 0:
        return
    # The element just past the last descendant marks the end.
    stop_node = self._last_descendant().next_element
    node = self.contents[0]
    while node is not stop_node:
        yield node
        node = node.next_element
+# CSS selector code
def select_one(self, selector, namespaces=None, **kwargs):
    """Perform a CSS selection operation on the current element and
    return only the first match.

    This is select() with a limit of 1.

    :param selector: A CSS selector.
    :param namespaces: A dictionary mapping namespace prefixes
        used in the CSS selector to namespace URIs. By default,
        Beautiful Soup will use the prefixes it encountered while
        parsing the document.
    :param kwargs: Keyword arguments to be passed into SoupSieve's
        soupsieve.select() method.
    :return: A Tag, or None if nothing matched.
    :rtype: bs4.element.Tag
    """
    matches = self.select(selector, namespaces, 1, **kwargs)
    return matches[0] if matches else None
def select(self, selector, namespaces=None, limit=None, **kwargs):
    """Perform a CSS selection operation on the current element.

    This uses the SoupSieve library.

    :param selector: A string containing a CSS selector.
    :param namespaces: A dictionary mapping namespace prefixes
        used in the CSS selector to namespace URIs. By default,
        Beautiful Soup will use the prefixes it encountered while
        parsing the document.
    :param limit: After finding this number of results, stop looking.
        None is passed to SoupSieve as 0.
    :param kwargs: Keyword arguments to be passed into SoupSieve's
        soupsieve.select() method.
    :return: A ResultSet of Tags.
    :rtype: bs4.element.ResultSet
    :raise NotImplementedError: If the soupsieve package is not
        installed.
    """
    namespaces = self._namespaces if namespaces is None else namespaces
    limit = 0 if limit is None else limit
    if soupsieve is None:
        raise NotImplementedError(
            "Cannot execute CSS selectors because the soupsieve package is not installed."
        )
    matched = soupsieve.select(selector, self, namespaces, limit, **kwargs)
    # Wrap the matches in a ResultSet because it's more consistent and
    # because ResultSet.__getattr__ has a helpful error message.
    return ResultSet(None, matched)
+# Old names for backwards compatibility
def childGenerator(self):
    """Deprecated generator; returns the value of `children`."""
    direct_children = self.children
    return direct_children
def recursiveChildGenerator(self):
    """Deprecated generator; returns the value of `descendants`."""
    everything_below = self.descendants
    return everything_below
def has_key(self, key):
    """Deprecated method. This was kind of misleading because has_key()
    (attributes) was different from __in__ (contents).

    has_key() is gone in Python 3, anyway.

    Issues a warning and then answers with has_attr(key).
    """
    message = 'has_key is deprecated. Use has_attr("%s") instead.' % key
    warnings.warn(message)
    return self.has_attr(key)
+# Next, a couple classes to represent queries and their results.
+class SoupStrainer(object):
+"""Encapsulates a number of ways of matching a markup element (tag or
+string).
+This is primarily used to underpin the find_* methods, but you can
+create one yourself and pass it in as `parse_only` to the
+`BeautifulSoup` constructor, to parse a subset of a large
+document.
+"""
def __init__(self, name=None, attrs={}, text=None, **kwargs):
    """Constructor.

    The SoupStrainer constructor takes the same arguments passed
    into the find_* methods. See the online documentation for
    detailed explanations.

    :param name: A filter on tag name.
    :param attrs: A dictionary of filters on attribute values. A
        non-dictionary value is treated as a filter on the 'class'
        attribute.
    :param text: A filter for a NavigableString with specific text.
    :kwargs: A dictionary of filters on attribute values. The keyword
        `class_` stands for the 'class' attribute.
    """
    self.name = self._normalize_search_value(name)

    if not isinstance(attrs, dict):
        # Treat a non-dict value for attrs as a search for the 'class'
        # attribute.
        kwargs['class'] = attrs
        attrs = None

    if 'class_' in kwargs:
        # Treat class_="foo" as a search for the 'class'
        # attribute, overriding any non-dict value for attrs.
        kwargs['class'] = kwargs.pop('class_')

    if kwargs:
        if attrs:
            # Merge into a copy so the caller's dictionary is untouched.
            attrs = attrs.copy()
            attrs.update(kwargs)
        else:
            attrs = kwargs

    self.attrs = {
        key: self._normalize_search_value(value)
        for key, value in list(attrs.items())
    }
    self.text = self._normalize_search_value(text)
def _normalize_search_value(self, value):
    """Convert a search criterion into a form the matching code expects.

    :param value: The criterion as supplied by the caller.
    :return: `value` unchanged if it is a string, a callable, an object
        with a `match` attribute (such as a regular expression), a
        boolean, or None. A bytestring is decoded as UTF-8. Any other
        iterable becomes a list whose items are normalized one level
        deep. Everything else is converted with str().
    """
    # Leave these kinds of value alone.
    if (isinstance(value, (str, Callable)) or hasattr(value, 'match')
            or isinstance(value, bool) or value is None):
        return value

    # If it's a bytestring, convert it to Unicode, treating it as UTF-8.
    if isinstance(value, bytes):
        return value.decode("utf8")

    # If it's listlike, convert it into a list of strings.
    if hasattr(value, '__iter__'):
        def normalize_item(item):
            nested = (hasattr(item, '__iter__')
                      and not isinstance(item, bytes)
                      and not isinstance(item, str))
            if nested:
                # This is almost certainly the user's mistake. In the
                # interests of avoiding infinite loops, let it through
                # as-is rather than doing a recursive call.
                return item
            return self._normalize_search_value(item)

        return [normalize_item(item) for item in value]

    # Otherwise, convert it into a Unicode string. The doubled str()
    # is kept so this does the same thing on Python 2 and Python 3.
    return str(str(value))
def __str__(self):
    """A human-readable representation of this SoupStrainer.

    :return: The text filter if one is set; otherwise the name and
        attribute filters joined by a '|'.
    """
    if self.text:
        return self.text
    return "%s|%s" % (self.name, self.attrs)
def search_tag(self, markup_name=None, markup_attrs={}):
    """Check whether a Tag with the given name and attributes would
    match this SoupStrainer.

    Used prospectively to decide whether to even bother creating a Tag
    object.

    :param markup_name: A tag name as found in some markup. A Tag
        object may be passed instead, in which case it also supplies
        the attributes.
    :param markup_attrs: A dictionary of attributes as found in some
        markup, or an iterable of (name, value) pairs.
    :return: On a match, the Tag (or the name that was passed in).
        Otherwise a false value: False on the early-exit name
        mismatch, None in the other cases.
    """
    found = None
    markup = None
    if isinstance(markup_name, Tag):
        markup = markup_name
        markup_attrs = markup

    # Optimization for a very common case where the user is searching
    # for a tag with one specific name, and we're looking at a tag
    # with a different name.
    if (isinstance(self.name, str)
            and markup and not markup.prefix
            and self.name != markup.name):
        return False

    # A callable name filter is called with the raw name and
    # attributes only when no Tag object is available.
    call_function_with_tag_data = (
        isinstance(self.name, Callable)
        and not isinstance(markup_name, Tag))

    name_is_acceptable = (
        (not self.name)
        or call_function_with_tag_data
        or (markup and self._matches(markup, self.name))
        or (not markup and self._matches(markup_name, self.name)))

    if name_is_acceptable:
        if call_function_with_tag_data:
            match = self.name(markup_name, markup_attrs)
        else:
            match = True
            attr_lookup = None
            for attr, match_against in list(self.attrs.items()):
                if not attr_lookup:
                    if hasattr(markup_attrs, 'get'):
                        attr_lookup = markup_attrs
                    else:
                        # A sequence of pairs: index it by name.
                        attr_lookup = {k: v for k, v in markup_attrs}
                attr_value = attr_lookup.get(attr)
                if not self._matches(attr_value, match_against):
                    match = False
                    break
        if match:
            found = markup if markup else markup_name

    # A text filter, when present, must also accept the tag's string.
    if found and self.text and not self._matches(found.string, self.text):
        found = None
    return found

# For BS3 compatibility.
searchTag = search_tag
def search(self, markup):
    """Find all items in `markup` that match this SoupStrainer.

    Used by the core _find_all() method, which is ultimately
    called by all find_* methods.

    :param markup: A PageElement or a list of them.
    :return: The matching element, or None when nothing matches.
    :raise Exception: If `markup` is of a kind this method does not
        know how to match against.
    """
    # If given a list of items, scan it for a text element that
    # matches.
    if hasattr(markup, '__iter__') and not isinstance(markup, (Tag, str)):
        for element in markup:
            if isinstance(element, NavigableString) and self.search(element):
                return element
        return None

    # If it's a Tag, make sure its name or attributes match.
    # Don't bother with Tags if we're searching only for text.
    if isinstance(markup, Tag):
        if not self.text or self.name or self.attrs:
            return self.search_tag(markup)
        return None

    # If it's text, make sure the text matches.
    if isinstance(markup, (NavigableString, str)):
        if (not self.name and not self.attrs
                and self._matches(markup, self.text)):
            return markup
        return None

    raise Exception(
        "I don't know how to match against a %s" % markup.__class__)
def _matches(self, markup, match_against, already_tried=None):
    """Decide whether `markup` satisfies the criterion `match_against`.

    :param markup: A Tag, a string, None, or a list/tuple of values
        (the latter for a multi-valued attribute like 'class').
    :param match_against: The criterion: True, a callable, a string,
        an object with a `search` method (such as a regular
        expression), or an iterable of such criteria.
    :param already_tried: Criteria already examined, used to avoid
        infinite recursion when `match_against` is an iterable.
    :return: A true value on a match (for a regular expression, the
        result of its search() call), a false value otherwise.
    """
    if isinstance(markup, (list, tuple)):
        # This should only happen when searching a multi-valued attribute
        # like 'class'.
        if any(self._matches(item, match_against) for item in markup):
            return True
        # No individual value matched, but maybe the attribute value
        # matches when considered as a single string.
        return bool(self._matches(' '.join(markup), match_against))

    if match_against is True:
        # True matches any non-None value.
        return markup is not None

    if isinstance(match_against, Callable):
        return match_against(markup)

    # Custom callables take the tag as an argument, but all
    # other ways of matching match the tag name as a string.
    original_markup = markup
    if isinstance(markup, Tag):
        markup = markup.name

    # Ensure that `markup` is either a Unicode string, or None.
    markup = self._normalize_search_value(markup)

    if markup is None:
        # None matches None, False, an empty string, an empty list, and so on.
        return not match_against

    if (hasattr(match_against, '__iter__')
            and not isinstance(match_against, str)):
        # We're asked to match against an iterable of items. The markup
        # must match at least one of them; each is tried in turn, and
        # items already seen are skipped to avoid infinite recursion.
        if not already_tried:
            already_tried = set()
        for item in match_against:
            # Unhashable items are tracked by identity instead.
            key = item if item.__hash__ else id(item)
            if key in already_tried:
                continue
            already_tried.add(key)
            if self._matches(original_markup, item, already_tried):
                return True
        return False

    # Beyond this point we might need to run the test twice: once against
    # the tag's name and once against its prefixed name.

    # Exact string match.
    match = isinstance(match_against, str) and markup == match_against

    if not match and hasattr(match_against, 'search'):
        # Regexp match
        return match_against.search(markup)

    if (not match
            and isinstance(original_markup, Tag)
            and original_markup.prefix):
        # Try the whole thing again with the prefixed tag name.
        prefixed_name = original_markup.prefix + ':' + original_markup.name
        return self._matches(prefixed_name, match_against)

    return match
class ResultSet(list):
    """A list of search results that also remembers the SoupStrainer
    that produced it."""

    def __init__(self, source, result=()):
        """Populate the list and record where the results came from.

        :param source: A SoupStrainer.
        :param result: A list of PageElements.
        """
        super().__init__(result)
        self.source = source

    def __getattr__(self, key):
        """Raise a helpful exception to explain a common code fix.

        Python calls this only after normal attribute lookup has
        failed, so list methods and `source` are unaffected.

        :param key: Name of the attribute that was not found.
        :raise AttributeError: Always.
        """
        message = "ResultSet object has no attribute '%s'. You're probably treating a list of elements like a single element. Did you call find_all() when you meant to call find()?" % key
        raise AttributeError(message)

Mercurial > repos > shellac > sam_consensus_v3

comparison env/lib/python3.9/site-packages/bs4/element.py @ 0:4f3585e2f14b draft default tip