Mercurial > repos > shellac > sam_consensus_v3
comparison env/lib/python3.9/site-packages/bs4/element.py @ 0:4f3585e2f14b draft default tip
"planemo upload commit 60cee0fc7c0cda8592644e1aad72851dec82c959"
| author | shellac |
|---|---|
| date | Mon, 22 Mar 2021 18:12:50 +0000 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:4f3585e2f14b |
|---|---|
| 1 # Use of this source code is governed by the MIT license. | |
| 2 __license__ = "MIT" | |
| 3 | |
| 4 try: | |
| 5 from collections.abc import Callable # Python 3.6 | |
| 6 except ImportError as e: | |
| 7 from collections import Callable | |
| 8 import re | |
| 9 import sys | |
| 10 import warnings | |
| 11 try: | |
| 12 import soupsieve | |
| 13 except ImportError as e: | |
| 14 soupsieve = None | |
| 15 warnings.warn( | |
| 16 'The soupsieve package is not installed. CSS selectors cannot be used.' | |
| 17 ) | |
| 18 | |
| 19 from bs4.formatter import ( | |
| 20 Formatter, | |
| 21 HTMLFormatter, | |
| 22 XMLFormatter, | |
| 23 ) | |
| 24 | |
# Name of the default output encoding.
DEFAULT_OUTPUT_ENCODING = "utf-8"
# True when the running interpreter's major version is greater than 2.
PY3K = (sys.version_info[0] > 2)

# Matches a run of one or more non-whitespace characters.
nonwhitespace_re = re.compile(r"\S+")

# NOTE: This isn't used as of 4.7.0. I'm leaving it for a little bit on
# the off chance someone imported it for their own use.
# Matches a run of one or more whitespace characters.
whitespace_re = re.compile(r"\s+")
| 33 | |
| 34 def _alias(attr): | |
| 35 """Alias one attribute name to another for backward compatibility""" | |
| 36 @property | |
| 37 def alias(self): | |
| 38 return getattr(self, attr) | |
| 39 | |
| 40 @alias.setter | |
| 41 def alias(self): | |
| 42 return setattr(self, attr) | |
| 43 return alias | |
| 44 | |
| 45 | |
# Python knows how to handle the encodings below (so
# PageElement.encode could in theory support them), but neither XML
# nor HTML recognizes them, so they must never be advertised as the
# encoding of an XML or HTML document.
#
# For an XML document in one of these encodings, the XML declaration
# mentions no encoding at all. For an HTML document in one of these
# encodings that has a <meta> tag naming an encoding, the encoding is
# written out as the empty string.
#
# Source:
# https://docs.python.org/3/library/codecs.html#python-specific-encodings
PYTHON_SPECIFIC_ENCODINGS = {
    "idna",
    "mbcs",
    "oem",
    "palmos",
    "punycode",
    "raw_unicode_escape",
    "undefined",
    "unicode_escape",
    "raw-unicode-escape",
    "unicode-escape",
    "string-escape",
    "string_escape",
}
| 73 | |
| 74 | |
class NamespacedAttribute(str):
    """A string such as 'xml:lang' that also records the prefix ('xml'),
    the local name ('lang') and the namespace it was built from.
    """

    def __new__(cls, prefix, name=None, namespace=None):
        """Build the string and attach `prefix`, `name` and `namespace`.

        :param prefix: The namespace prefix, or None.
        :param name: The local name. Any false value is stored as None.
        :param namespace: Stored unchanged on the new object.
        """
        # A missing/empty name means the default namespace, whose name
        # "has no value" per https://www.w3.org/TR/xml-names/#defaulting
        local_name = name if name else None

        if local_name is None:
            text = prefix
        elif prefix is None:
            # Not really namespaced.
            text = local_name
        else:
            text = prefix + ":" + local_name

        instance = str.__new__(cls, text)
        instance.prefix = prefix
        instance.name = local_name
        instance.namespace = namespace
        return instance
| 97 | |
class AttributeValueWithCharsetSubstitution(str):
    """A stand-in object for a character encoding specified in HTML.

    This str subclass adds no behavior of its own; it is the common base
    class of CharsetMetaAttributeValue and ContentMetaAttributeValue.
    """
| 100 | |
class CharsetMetaAttributeValue(AttributeValueWithCharsetSubstitution):
    """Stand-in for the value of a meta tag's 'charset' attribute.

    Parsing the markup '<meta charset="utf8">' gives the 'charset'
    attribute a value of this type.
    """

    def __new__(cls, original_value):
        """Create the string and remember the value it was built from."""
        instance = str.__new__(cls, original_value)
        instance.original_value = original_value
        return instance

    def encode(self, encoding):
        """Return the value to write when the document is encoded.

        The value of a meta tag's 'charset' is the name of the encoding
        the document is being encoded to, or the empty string when that
        encoding is one of PYTHON_SPECIFIC_ENCODINGS.

        :param encoding: Name of the target encoding.
        """
        return '' if encoding in PYTHON_SPECIFIC_ENCODINGS else encoding
| 120 | |
| 121 | |
class ContentMetaAttributeValue(AttributeValueWithCharsetSubstitution):
    """Stand-in for the value of a meta tag's 'content' attribute.

    Parsing the markup:
     <meta http-equiv="content-type" content="text/html; charset=utf8">

    gives the 'content' attribute a value of this type.
    """

    # Group 1 is everything up to and including "charset="; group 3 is
    # the charset name that follows it.
    CHARSET_RE = re.compile(r"((^|;)\s*charset=)([^;]*)", re.M)

    def __new__(cls, original_value):
        """Return an instance of this class only if a charset is present.

        A value in which CHARSET_RE finds nothing needs no substitution
        and is returned as a plain str.
        """
        if cls.CHARSET_RE.search(original_value) is None:
            return str.__new__(str, original_value)

        instance = str.__new__(cls, original_value)
        instance.original_value = original_value
        return instance

    def encode(self, encoding):
        """Return the original value with its charset name replaced.

        :param encoding: Name of the target encoding. If it is one of
            PYTHON_SPECIFIC_ENCODINGS the empty string is returned.
        """
        if encoding in PYTHON_SPECIFIC_ENCODINGS:
            return ''
        return self.CHARSET_RE.sub(
            lambda found: found.group(1) + encoding, self.original_value)
| 149 | |
| 150 | |
| 151 class PageElement(object): | |
| 152 """Contains the navigational information for some part of the page: | |
| 153 that is, its current location in the parse tree. | |
| 154 | |
| 155 NavigableString, Tag, etc. are all subclasses of PageElement. | |
| 156 """ | |
| 157 | |
| 158 def setup(self, parent=None, previous_element=None, next_element=None, | |
| 159 previous_sibling=None, next_sibling=None): | |
| 160 """Sets up the initial relations between this element and | |
| 161 other elements. | |
| 162 | |
| 163 :param parent: The parent of this element. | |
| 164 | |
| 165 :param previous_element: The element parsed immediately before | |
| 166 this one. | |
| 167 | |
| 168 :param next_element: The element parsed immediately before | |
| 169 this one. | |
| 170 | |
| 171 :param previous_sibling: The most recently encountered element | |
| 172 on the same level of the parse tree as this one. | |
| 173 | |
| 174 :param previous_sibling: The next element to be encountered | |
| 175 on the same level of the parse tree as this one. | |
| 176 """ | |
| 177 self.parent = parent | |
| 178 | |
| 179 self.previous_element = previous_element | |
| 180 if previous_element is not None: | |
| 181 self.previous_element.next_element = self | |
| 182 | |
| 183 self.next_element = next_element | |
| 184 if self.next_element is not None: | |
| 185 self.next_element.previous_element = self | |
| 186 | |
| 187 self.next_sibling = next_sibling | |
| 188 if self.next_sibling is not None: | |
| 189 self.next_sibling.previous_sibling = self | |
| 190 | |
| 191 if (previous_sibling is None | |
| 192 and self.parent is not None and self.parent.contents): | |
| 193 previous_sibling = self.parent.contents[-1] | |
| 194 | |
| 195 self.previous_sibling = previous_sibling | |
| 196 if previous_sibling is not None: | |
| 197 self.previous_sibling.next_sibling = self | |
| 198 | |
def format_string(self, s, formatter):
    """Run a string through a formatter's substitution step.

    :param s: A string.
    :param formatter: A Formatter object, or a string naming one of the
        standard formatters. None means "return `s` unchanged".
    """
    if formatter is None:
        return s
    if isinstance(formatter, Formatter):
        chosen = formatter
    else:
        # Anything else is resolved through formatter_for_name.
        chosen = self.formatter_for_name(formatter)
    return chosen.substitute(s)
| 211 | |
def formatter_for_name(self, formatter):
    """Resolve an identifier to a Formatter, creating one if needed.

    :param formatter: Can be a Formatter object (returned as-is), a
        callable (used as the entity substitution hook of a new
        XMLFormatter or HTMLFormatter), or any other value (used as a
        key into the REGISTRY of XMLFormatter or HTMLFormatter).
    """
    if isinstance(formatter, Formatter):
        return formatter
    # XML trees get an XMLFormatter, everything else an HTMLFormatter.
    formatter_class = XMLFormatter if self._is_xml else HTMLFormatter
    if isinstance(formatter, Callable):
        return formatter_class(entity_substitution=formatter)
    return formatter_class.REGISTRY[formatter]
| 231 | |
| 232 @property | |
| 233 def _is_xml(self): | |
| 234 """Is this element part of an XML tree or an HTML tree? | |
| 235 | |
| 236 This is used in formatter_for_name, when deciding whether an | |
| 237 XMLFormatter or HTMLFormatter is more appropriate. It can be | |
| 238 inefficient, but it should be called very rarely. | |
| 239 """ | |
| 240 if self.known_xml is not None: | |
| 241 # Most of the time we will have determined this when the | |
| 242 # document is parsed. | |
| 243 return self.known_xml | |
| 244 | |
| 245 # Otherwise, it's likely that this element was created by | |
| 246 # direct invocation of the constructor from within the user's | |
| 247 # Python code. | |
| 248 if self.parent is None: | |
| 249 # This is the top-level object. It should have .known_xml set | |
| 250 # from tree creation. If not, take a guess--BS is usually | |
| 251 # used on HTML markup. | |
| 252 return getattr(self, 'is_xml', False) | |
| 253 return self.parent._is_xml | |
| 254 | |
| 255 nextSibling = _alias("next_sibling") # BS3 | |
| 256 previousSibling = _alias("previous_sibling") # BS3 | |
| 257 | |
def replace_with(self, replace_with):
    """Swap this PageElement for another one, leaving the rest of the
    tree as it was.

    :param replace_with: A PageElement.
    :return: `self`, no longer part of the tree. When `replace_with` is
        `self`, nothing happens and None is returned.
    :raise ValueError: If this element has no parent, or if
        `replace_with` is this element's parent.
    """
    container = self.parent
    if container is None:
        raise ValueError(
            "Cannot replace one element with another when the "
            "element to be replaced is not part of a tree.")
    if replace_with is self:
        return
    if replace_with is container:
        raise ValueError("Cannot replace a Tag with its parent.")
    # Remember the slot first: extracting clears the link to the parent.
    slot = container.index(self)
    self.extract(_self_index=slot)
    container.insert(slot, replace_with)
    return self
| 278 replaceWith = replace_with # BS3 | |
| 279 | |
def unwrap(self):
    """Replace this PageElement with its contents.

    :return: `self`, no longer part of the tree.
    :raise ValueError: If this element has no parent.
    """
    my_parent = self.parent
    if my_parent is None:
        # The two literals are concatenated, so the first one needs the
        # trailing space to keep "that" and "element" apart.
        raise ValueError(
            "Cannot replace an element with its contents when that "
            "element is not part of a tree.")
    my_index = my_parent.index(self)
    self.extract(_self_index=my_index)
    # Inserting the children in reverse order at the same index leaves
    # them in their original order. Iterate over a copy because each
    # insert may move the child out of self.contents.
    for child in reversed(self.contents[:]):
        my_parent.insert(my_index, child)
    return self
| 295 replace_with_children = unwrap | |
| 296 replaceWithChildren = unwrap # BS3 | |
| 297 | |
def wrap(self, wrap_inside):
    """Put this PageElement inside another one.

    :param wrap_inside: A PageElement.
    :return: `wrap_inside`, now sitting where `self` used to be in the
        tree, with `self` appended to it.
    """
    # replace_with hands back the element that was taken out of the tree.
    detached = self.replace_with(wrap_inside)
    wrap_inside.append(detached)
    return wrap_inside
| 308 | |
def extract(self, _self_index=None):
    """Destructively rips this element out of the tree.

    The element is removed from its parent's .contents, the document-order
    chain is closed around it (and around its descendants), and its own
    parent and sibling links are cleared.

    :param _self_index: The location of this element in its parent's
       .contents, if known. Passing this in allows for a performance
       optimization.

    :return: `self`, no longer part of the tree.
    """
    if self.parent is not None:
        if _self_index is None:
            # Index not supplied by the caller; look it up.
            _self_index = self.parent.index(self)
        del self.parent.contents[_self_index]

    #Find the two elements that would be next to each other if
    #this element (and any children) hadn't been parsed. Connect
    #the two.
    last_child = self._last_descendant()
    next_element = last_child.next_element

    # The identity checks skip the relinking when the element before
    # this one and the element after its last descendant are the same
    # object.
    if (self.previous_element is not None and
        self.previous_element is not next_element):
        self.previous_element.next_element = next_element
    if next_element is not None and next_element is not self.previous_element:
        next_element.previous_element = self.previous_element
    # Detach both ends of the extracted subtree from the document order.
    self.previous_element = None
    last_child.next_element = None

    self.parent = None
    # Join the former siblings to each other, then forget them.
    if (self.previous_sibling is not None
        and self.previous_sibling is not self.next_sibling):
        self.previous_sibling.next_sibling = self.next_sibling
    if (self.next_sibling is not None
        and self.next_sibling is not self.previous_sibling):
        self.next_sibling.previous_sibling = self.previous_sibling
    self.previous_sibling = self.next_sibling = None
    return self
| 346 | |
| 347 def _last_descendant(self, is_initialized=True, accept_self=True): | |
| 348 """Finds the last element beneath this object to be parsed. | |
| 349 | |
| 350 :param is_initialized: Has `setup` been called on this PageElement | |
| 351 yet? | |
| 352 :param accept_self: Is `self` an acceptable answer to the question? | |
| 353 """ | |
| 354 if is_initialized and self.next_sibling is not None: | |
| 355 last_child = self.next_sibling.previous_element | |
| 356 else: | |
| 357 last_child = self | |
| 358 while isinstance(last_child, Tag) and last_child.contents: | |
| 359 last_child = last_child.contents[-1] | |
| 360 if not accept_self and last_child is self: | |
| 361 last_child = None | |
| 362 return last_child | |
| 363 # BS3: Not part of the API! | |
| 364 _lastRecursiveChild = _last_descendant | |
| 365 | |
def insert(self, position, new_child):
    """Insert a new PageElement in the list of this PageElement's children.

    This works the same way as `list.insert`.

    :param position: The numeric position that should be occupied
       in `self.children` by the new PageElement.
    :param new_child: A PageElement.
    :raise ValueError: If `new_child` is None or is this element itself.
    """
    if new_child is None:
        raise ValueError("Cannot insert None into a tag.")
    if new_child is self:
        raise ValueError("Cannot insert a tag into itself.")
    # Promote a plain str to a NavigableString before linking it in.
    if (isinstance(new_child, str)
        and not isinstance(new_child, NavigableString)):
        new_child = NavigableString(new_child)

    # Imported here rather than at module level.
    # NOTE(review): presumably to avoid a circular import -- confirm.
    from bs4 import BeautifulSoup
    if isinstance(new_child, BeautifulSoup):
        # We don't want to end up with a situation where one BeautifulSoup
        # object contains another. Insert the children one at a time.
        for subchild in list(new_child.contents):
            self.insert(position, subchild)
            position += 1
        return
    # Positions past the end are clamped to "append".
    position = min(position, len(self.contents))
    if hasattr(new_child, 'parent') and new_child.parent is not None:
        # We're 'inserting' an element that's already one
        # of this object's children.
        if new_child.parent is self:
            current_index = self.index(new_child)
            if current_index < position:
                # We're moving this element further down the list
                # of this object's children. That means that when
                # we extract this element, our target index will
                # jump down one.
                position -= 1
        # Detach the child from wherever it currently lives.
        new_child.extract()

    new_child.parent = self
    previous_child = None
    if position == 0:
        # First child: nothing before it at this level, and the element
        # preceding it in document order is this element.
        new_child.previous_sibling = None
        new_child.previous_element = self
    else:
        previous_child = self.contents[position - 1]
        new_child.previous_sibling = previous_child
        new_child.previous_sibling.next_sibling = new_child
        # In document order the new child follows the last descendant
        # of the sibling that precedes it.
        new_child.previous_element = previous_child._last_descendant(False)
    if new_child.previous_element is not None:
        new_child.previous_element.next_element = new_child

    new_childs_last_element = new_child._last_descendant(False)

    if position >= len(self.contents):
        # Appending: there is no next sibling at this level.
        new_child.next_sibling = None

        # Climb towards the root until some ancestor (starting with
        # this element) has a next sibling; that sibling is what
        # follows the new child's subtree in document order.
        parent = self
        parents_next_sibling = None
        while parents_next_sibling is None and parent is not None:
            parents_next_sibling = parent.next_sibling
            parent = parent.parent
            if parents_next_sibling is not None:
                # We found the element that comes next in the document.
                break
        if parents_next_sibling is not None:
            new_childs_last_element.next_element = parents_next_sibling
        else:
            # The last element of this tag is the last element in
            # the document.
            new_childs_last_element.next_element = None
    else:
        # Inserting before an existing child.
        next_child = self.contents[position]
        new_child.next_sibling = next_child
        if new_child.next_sibling is not None:
            new_child.next_sibling.previous_sibling = new_child
        new_childs_last_element.next_element = next_child

    # Fix the back-link of whatever now follows the inserted subtree.
    if new_childs_last_element.next_element is not None:
        new_childs_last_element.next_element.previous_element = new_childs_last_element
    self.contents.insert(position, new_child)
| 447 | |
def append(self, tag):
    """Add the given PageElement at the end of this one's contents.

    :param tag: A PageElement.
    """
    end = len(self.contents)
    self.insert(end, tag)
| 454 | |
def extend(self, tags):
    """Append each of the given PageElements to this one's contents.

    :param tags: A list of PageElements. A Tag may also be given, in
        which case that Tag's contents are appended.
    """
    # Appending elements taken from another tag's contents changes that
    # list while it is being walked, so iterate over a snapshot of it.
    pending = list(tags.contents) if isinstance(tags, Tag) else tags
    for element in pending:
        self.append(element)
| 467 | |
def insert_before(self, *args):
    """Place the given element(s) immediately before this one.

    All the elements end up with the same parent as this one, directly
    in front of it and in the order given.

    :param args: One or more PageElements.
    :raise ValueError: If this element has no parent, or if it is itself
        one of the elements to insert.
    """
    # Do all error checking before modifying the tree.
    parent = self.parent
    if parent is None:
        raise ValueError(
            "Element has no parent, so 'before' has no meaning.")
    for candidate in args:
        if candidate is self:
            raise ValueError("Can't insert an element before itself.")

    for predecessor in args:
        # Pull the element out first, so that the index computed below
        # is still right when the two are siblings.
        if isinstance(predecessor, PageElement):
            predecessor.extract()
        where = parent.index(self)
        parent.insert(where, predecessor)
| 489 | |
def insert_after(self, *args):
    """Place the given element(s) immediately after this one.

    All the elements end up with the same parent as this one, directly
    behind it and in the order given.

    :param args: One or more PageElements.
    :raise ValueError: If this element has no parent, or if it is itself
        one of the elements to insert.
    """
    # Do all error checking before modifying the tree.
    parent = self.parent
    if parent is None:
        raise ValueError(
            "Element has no parent, so 'after' has no meaning.")
    for candidate in args:
        if candidate is self:
            raise ValueError("Can't insert an element after itself.")

    # `offset` counts the elements already placed, so each new one lands
    # behind the previous one.
    for offset, successor in enumerate(args):
        # Pull the element out first, so that the index computed below
        # is still right when the two are siblings.
        if isinstance(successor, PageElement):
            successor.extract()
        where = parent.index(self)
        parent.insert(where + 1 + offset, successor)
| 515 | |
def find_next(self, name=None, attrs={}, text=None, **kwargs):
    """Return the first PageElement after this one in the document
    that matches the given criteria.

    All find_* methods take a common set of arguments. See the online
    documentation for detailed explanations.

    :param name: A filter on tag name.
    :param attrs: A dictionary of filters on attribute values.
    :param text: A filter for a NavigableString with specific text.
    :kwargs: A dictionary of filters on attribute values.
    :return: A PageElement, or None if nothing matches.
    :rtype: bs4.element.Tag | bs4.element.NavigableString
    """
    # The first hit of find_all_next is the answer.
    first_match = self._find_one(
        self.find_all_next, name, attrs, text, **kwargs)
    return first_match
| 531 findNext = find_next # BS3 | |
| 532 | |
def find_all_next(self, name=None, attrs={}, text=None, limit=None,
                  **kwargs):
    """Return every PageElement after this one in the document that
    matches the given criteria.

    All find_* methods take a common set of arguments. See the online
    documentation for detailed explanations.

    :param name: A filter on tag name.
    :param attrs: A dictionary of filters on attribute values.
    :param text: A filter for a NavigableString with specific text.
    :param limit: Stop looking after finding this many results.
    :kwargs: A dictionary of filters on attribute values.
    :return: A ResultSet containing PageElements.
    """
    # Search over everything parsed after this element.
    matches = self._find_all(
        name, attrs, text, limit, self.next_elements, **kwargs)
    return matches
| 550 findAllNext = find_all_next # BS3 | |
| 551 | |
def find_next_sibling(self, name=None, attrs={}, text=None, **kwargs):
    """Return the nearest later sibling of this PageElement that
    matches the given criteria.

    All find_* methods take a common set of arguments. See the
    online documentation for detailed explanations.

    :param name: A filter on tag name.
    :param attrs: A dictionary of filters on attribute values.
    :param text: A filter for a NavigableString with specific text.
    :kwargs: A dictionary of filters on attribute values.
    :return: A PageElement, or None if nothing matches.
    :rtype: bs4.element.Tag | bs4.element.NavigableString
    """
    # The first hit of find_next_siblings is the answer.
    first_match = self._find_one(
        self.find_next_siblings, name, attrs, text, **kwargs)
    return first_match
| 568 findNextSibling = find_next_sibling # BS3 | |
| 569 | |
def find_next_siblings(self, name=None, attrs={}, text=None, limit=None,
                       **kwargs):
    """Return every later sibling of this PageElement that matches
    the given criteria.

    All find_* methods take a common set of arguments. See the online
    documentation for detailed explanations.

    :param name: A filter on tag name.
    :param attrs: A dictionary of filters on attribute values.
    :param text: A filter for a NavigableString with specific text.
    :param limit: Stop looking after finding this many results.
    :kwargs: A dictionary of filters on attribute values.
    :return: A ResultSet of PageElements.
    :rtype: bs4.element.ResultSet
    """
    # Search over the siblings that follow this element.
    matches = self._find_all(
        name, attrs, text, limit, self.next_siblings, **kwargs)
    return matches
| 588 findNextSiblings = find_next_siblings # BS3 | |
| 589 fetchNextSiblings = find_next_siblings # BS2 | |
| 590 | |
def find_previous(self, name=None, attrs={}, text=None, **kwargs):
    """Search backwards through the document from this PageElement
    and return the first PageElement that matches the given criteria.

    All find_* methods take a common set of arguments. See the online
    documentation for detailed explanations.

    :param name: A filter on tag name.
    :param attrs: A dictionary of filters on attribute values.
    :param text: A filter for a NavigableString with specific text.
    :kwargs: A dictionary of filters on attribute values.
    :return: A PageElement, or None if nothing matches.
    :rtype: bs4.element.Tag | bs4.element.NavigableString
    """
    # The first hit of find_all_previous is the answer.
    first_match = self._find_one(
        self.find_all_previous, name, attrs, text, **kwargs)
    return first_match
| 607 findPrevious = find_previous # BS3 | |
| 608 | |
def find_all_previous(self, name=None, attrs={}, text=None, limit=None,
                      **kwargs):
    """Search backwards through the document from this PageElement
    and return every PageElement that matches the given criteria.

    All find_* methods take a common set of arguments. See the online
    documentation for detailed explanations.

    :param name: A filter on tag name.
    :param attrs: A dictionary of filters on attribute values.
    :param text: A filter for a NavigableString with specific text.
    :param limit: Stop looking after finding this many results.
    :kwargs: A dictionary of filters on attribute values.
    :return: A ResultSet of PageElements.
    :rtype: bs4.element.ResultSet
    """
    # Search over everything parsed before this element.
    matches = self._find_all(
        name, attrs, text, limit, self.previous_elements, **kwargs)
    return matches
| 627 findAllPrevious = find_all_previous # BS3 | |
| 628 fetchPrevious = find_all_previous # BS2 | |
| 629 | |
def find_previous_sibling(self, name=None, attrs={}, text=None, **kwargs):
    """Return the nearest earlier sibling of this PageElement that
    matches the given criteria.

    All find_* methods take a common set of arguments. See the online
    documentation for detailed explanations.

    :param name: A filter on tag name.
    :param attrs: A dictionary of filters on attribute values.
    :param text: A filter for a NavigableString with specific text.
    :kwargs: A dictionary of filters on attribute values.
    :return: A PageElement, or None if nothing matches.
    :rtype: bs4.element.Tag | bs4.element.NavigableString
    """
    # The first hit of find_previous_siblings is the answer.
    first_match = self._find_one(
        self.find_previous_siblings, name, attrs, text, **kwargs)
    return first_match
| 646 findPreviousSibling = find_previous_sibling # BS3 | |
| 647 | |
def find_previous_siblings(self, name=None, attrs={}, text=None,
                           limit=None, **kwargs):
    """Return every earlier sibling of this PageElement that matches
    the given criteria.

    All find_* methods take a common set of arguments. See the online
    documentation for detailed explanations.

    :param name: A filter on tag name.
    :param attrs: A dictionary of filters on attribute values.
    :param text: A filter for a NavigableString with specific text.
    :param limit: Stop looking after finding this many results.
    :kwargs: A dictionary of filters on attribute values.
    :return: A ResultSet of PageElements.
    :rtype: bs4.element.ResultSet
    """
    # Search over the siblings that precede this element.
    matches = self._find_all(
        name, attrs, text, limit, self.previous_siblings, **kwargs)
    return matches
| 666 findPreviousSiblings = find_previous_siblings # BS3 | |
| 667 fetchPreviousSiblings = find_previous_siblings # BS2 | |
| 668 | |
def find_parent(self, name=None, attrs={}, **kwargs):
    """Return the nearest parent of this PageElement that matches the
    given criteria.

    All find_* methods take a common set of arguments. See the online
    documentation for detailed explanations.

    :param name: A filter on tag name.
    :param attrs: A dictionary of filters on attribute values.
    :kwargs: A dictionary of filters on attribute values.

    :return: A PageElement, or None if nothing matches.
    :rtype: bs4.element.Tag | bs4.element.NavigableString
    """
    # NOTE: _find_one can't be used here because find_parents takes a
    # different set of arguments (it has no `text` parameter).
    matches = self.find_parents(name, attrs, 1, **kwargs)
    return matches[0] if matches else None
| 690 findParent = find_parent # BS3 | |
| 691 | |
def find_parents(self, name=None, attrs={}, limit=None, **kwargs):
    """Return every parent of this PageElement that matches the given
    criteria.

    All find_* methods take a common set of arguments. See the online
    documentation for detailed explanations.

    :param name: A filter on tag name.
    :param attrs: A dictionary of filters on attribute values.
    :param limit: Stop looking after finding this many results.
    :kwargs: A dictionary of filters on attribute values.

    :return: Whatever `_find_all` produces for `self.parents`: a
        ResultSet of PageElements.
    """
    # No text filter applies here, so None is passed in its place.
    matches = self._find_all(
        name, attrs, None, limit, self.parents, **kwargs)
    return matches
| 708 findParents = find_parents # BS3 | |
| 709 fetchParents = find_parents # BS2 | |
| 710 | |
@property
def next(self):
    """The PageElement, if any, that was parsed just after this one.

    Read-only shorthand for the `next_element` attribute.

    :return: A PageElement.
    :rtype: bs4.element.Tag | bs4.element.NavigableString
    """
    return self.next_element
| 719 | |
@property
def previous(self):
    """The PageElement, if any, that was parsed just before this one.

    Read-only shorthand for the `previous_element` attribute.

    :return: A PageElement.
    :rtype: bs4.element.Tag | bs4.element.NavigableString
    """
    return self.previous_element
| 728 | |
| 729 #These methods do the real heavy lifting. | |
| 730 | |
| 731 def _find_one(self, method, name, attrs, text, **kwargs): | |
| 732 r = None | |
| 733 l = method(name, attrs, text, 1, **kwargs) | |
| 734 if l: | |
| 735 r = l[0] | |
| 736 return r | |
| 737 | |
def _find_all(self, name, attrs, text, limit, generator, **kwargs):
    """Iterate over a generator looking for things that match.

    :param name: A filter on tag name, or a ready-made SoupStrainer.
    :param attrs: A dictionary of filters on attribute values.
    :param text: A filter for a NavigableString with specific text.
    :param limit: Stop looking after finding this many results.
    :param generator: An iterable of the PageElements to examine.
    :kwargs: A dictionary of filters on attribute values. A 'string'
        entry is used as `text` when `text` is None.
    :return: A ResultSet of the matches.
    """
    if text is None and 'string' in kwargs:
        # 'string' is accepted as another spelling of 'text'.
        text = kwargs.pop('string')

    if isinstance(name, SoupStrainer):
        strainer = name
    else:
        strainer = SoupStrainer(name, attrs, text, **kwargs)

    if text is None and not limit and not attrs and not kwargs:
        if name is True or name is None:
            # Optimization to find all tags.
            result = (element for element in generator
                      if isinstance(element, Tag))
            return ResultSet(strainer, result)
        elif isinstance(name, str):
            # Optimization to find all tags with a given name.
            if name.count(':') == 1:
                # This is a name with a prefix. If this is a
                # namespace-aware document, we need to match the local
                # name against tag.name. If not, we need to match the
                # fully-qualified name against tag.name.
                prefix, local_name = name.split(':', 1)
            else:
                prefix = None
                local_name = name
            # Both name comparisons sit inside the isinstance() guard.
            # `and` binds tighter than `or`, so without the outer
            # parentheses the second comparison would also be applied to
            # elements that are not Tags.
            result = (element for element in generator
                      if isinstance(element, Tag)
                      and (
                          element.name == name
                          or (
                              element.name == local_name
                              and (prefix is None
                                   or element.prefix == prefix)
                          )
                      ))
            return ResultSet(strainer, result)

    # General case: run every truthy element through the strainer.
    results = ResultSet(strainer)
    for element in generator:
        if element:
            found = strainer.search(element)
            if found:
                results.append(found)
                if limit and len(results) >= limit:
                    break
    return results
| 789 | |
| 790 #These generators can be used to navigate starting from both | |
| 791 #NavigableStrings and Tags. | |
@property
def next_elements(self):
    """All PageElements that were parsed after this one.

    Follows the `next_element` links until one of them is None.

    :yield: A sequence of PageElements.
    """
    cursor = self.next_element
    while True:
        if cursor is None:
            return
        yield cursor
        cursor = cursor.next_element
| 802 | |
@property
def next_siblings(self):
    """All PageElements that are siblings of this one but were parsed
    later.

    Follows the `next_sibling` links until one of them is None.

    :yield: A sequence of PageElements.
    """
    cursor = self.next_sibling
    while True:
        if cursor is None:
            return
        yield cursor
        cursor = cursor.next_sibling
| 814 | |
@property
def previous_elements(self):
    """All PageElements that were parsed before this one.

    Follows the `previous_element` links until one of them is None.

    :yield: A sequence of PageElements.
    """
    cursor = self.previous_element
    while True:
        if cursor is None:
            return
        yield cursor
        cursor = cursor.previous_element
| 825 | |
@property
def previous_siblings(self):
    """All PageElements that are siblings of this one but were parsed
    earlier.

    Follows the `previous_sibling` links until one of them is None.

    :yield: A sequence of PageElements.
    """
    cursor = self.previous_sibling
    while True:
        if cursor is None:
            return
        yield cursor
        cursor = cursor.previous_sibling
| 837 | |
@property
def parents(self):
    """All PageElements that are parents of this PageElement.

    Follows the `parent` links upward until one of them is None.

    :yield: A sequence of PageElements.
    """
    ancestor = self.parent
    while True:
        if ancestor is None:
            return
        yield ancestor
        ancestor = ancestor.parent
| 848 | |
@property
def decomposed(self):
    """Check whether a PageElement has been decomposed.

    Reads the `_decomposed` attribute, treating a missing or falsy
    value as "not decomposed".

    :rtype: bool
    """
    flag = getattr(self, '_decomposed', False)
    if not flag:
        # Normalise None (or any other falsy value) to False.
        return False
    return flag
| 856 | |
| 857 # Old non-property versions of the generators, for backwards | |
| 858 # compatibility with BS3. | |
def nextGenerator(self):
    """BS3-compatible method form of the `next_elements` property."""
    following = self.next_elements
    return following
| 861 | |
def nextSiblingGenerator(self):
    """BS3-compatible method form of the `next_siblings` property."""
    following = self.next_siblings
    return following
| 864 | |
def previousGenerator(self):
    """BS3-compatible method form of the `previous_elements` property."""
    preceding = self.previous_elements
    return preceding
| 867 | |
def previousSiblingGenerator(self):
    """BS3-compatible method form of the `previous_siblings` property."""
    preceding = self.previous_siblings
    return preceding
| 870 | |
def parentGenerator(self):
    """BS3-compatible method form of the `parents` property."""
    ancestors = self.parents
    return ancestors
| 873 | |
| 874 | |
class NavigableString(str, PageElement):
    """A Python Unicode string that is part of a parse tree.

    When Beautiful Soup parses the markup <b>penguin</b>, it will
    create a NavigableString for the string "penguin".
    """

    # Text placed before/after the string by output_ready(). Subclasses
    # override these to add markup such as comment delimiters.
    PREFIX = ''
    SUFFIX = ''

    # We can't tell just by looking at a string whether it's contained
    # in an XML document or an HTML document.
    known_xml = None

    def __new__(cls, value):
        """Create a new NavigableString.

        When unpickling a NavigableString, this method is called with
        the string in DEFAULT_OUTPUT_ENCODING. That encoding needs to be
        passed in to the superclass's __new__ or the superclass won't
        know how to handle non-ASCII characters.

        :param value: A str, or a non-str value that is decoded using
            DEFAULT_OUTPUT_ENCODING.
        """
        if isinstance(value, str):
            str_args = (value,)
        else:
            str_args = (value, DEFAULT_OUTPUT_ENCODING)
        new_string = str.__new__(cls, *str_args)
        new_string.setup()
        return new_string

    def __copy__(self):
        """A copy of a NavigableString has the same contents and class
        as the original, but it is not connected to the parse tree.
        """
        same_class = type(self)
        return same_class(self)

    def __getnewargs__(self):
        """Return the constructor arguments: this string as a plain str."""
        plain = str(self)
        return (plain,)

    def __getattr__(self, attr):
        """text.string gives you text. This is for backwards
        compatibility for Navigable*String, but for CData* it lets you
        get the string without the CData wrapper.

        :raise AttributeError: For any attribute other than 'string'.
        """
        if attr != 'string':
            raise AttributeError(
                "'%s' object has no attribute '%s'" % (
                    self.__class__.__name__, attr))
        return self

    def output_ready(self, formatter="minimal"):
        """Run the string through the provided formatter.

        :param formatter: A Formatter object, or a string naming one of
            the standard formatters.
        :return: The formatted string wrapped in PREFIX and SUFFIX.
        """
        formatted = self.format_string(self, formatter)
        return self.PREFIX + formatted + self.SUFFIX

    @property
    def name(self):
        """Since a NavigableString is not a Tag, it has no .name.

        This property is implemented so that code like this doesn't
        crash when run on a mixture of Tag and NavigableString objects:
            [x.name for x in tag.children]
        """
        return None

    @name.setter
    def name(self, name):
        """Prevent NavigableString.name from ever being set."""
        raise AttributeError("A NavigableString cannot be given a name.")
| 947 | |
| 948 | |
class PreformattedString(NavigableString):
    """A NavigableString not subject to the normal formatting rules.

    This is an abstract class used for special kinds of strings such
    as comments (the Comment class) and CDATA blocks (the CData
    class).
    """

    PREFIX = ''
    SUFFIX = ''

    def output_ready(self, formatter=None):
        """Make this string ready for output by adding any
        subclass-specific prefix or suffix.

        :param formatter: A Formatter object, or a string naming one
            of the standard formatters. The string will be passed into
            the Formatter, but only to trigger any side effects: the
            return value is ignored.

        :return: The string, with any subclass-specific prefix and
            suffix added on.
        """
        if formatter is not None:
            # Called purely for its side effects; the result is
            # deliberately discarded.
            self.format_string(self, formatter)
        return self.PREFIX + self + self.SUFFIX
| 975 | |
class CData(PreformattedString):
    """A CDATA block, rendered between '<![CDATA[' and ']]>'."""

    PREFIX, SUFFIX = '<![CDATA[', ']]>'
| 980 | |
class ProcessingInstruction(PreformattedString):
    """An SGML processing instruction, rendered between '<?' and '>'."""

    PREFIX, SUFFIX = '<?', '>'
| 986 | |
class XMLProcessingInstruction(ProcessingInstruction):
    """An XML processing instruction, rendered between '<?' and '?>'."""

    PREFIX, SUFFIX = '<?', '?>'
| 991 | |
class Comment(PreformattedString):
    """An HTML or XML comment, rendered between '<!--' and '-->'."""

    PREFIX, SUFFIX = '<!--', '-->'
| 996 | |
| 997 | |
class Declaration(PreformattedString):
    """An XML declaration, rendered between '<?' and '?>'."""

    PREFIX, SUFFIX = '<?', '?>'
| 1002 | |
| 1003 | |
class Doctype(PreformattedString):
    """A document type declaration."""

    PREFIX = '<!DOCTYPE '
    SUFFIX = '>\n'

    @classmethod
    def for_name_and_ids(cls, name, pub_id, system_id):
        """Generate an appropriate document type declaration for a given
        public ID and system ID.

        :param name: The name of the document's root element, e.g. 'html'.
        :param pub_id: The Formal Public Identifier for this document type,
            e.g. '-//W3C//DTD XHTML 1.1//EN'
        :param system_id: The system identifier for this document type,
            e.g. 'http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd'

        :return: An instance of the class this method was called on
            (a Doctype when called as Doctype.for_name_and_ids).
        """
        value = name or ''
        if pub_id is not None:
            value += ' PUBLIC "%s"' % pub_id
            if system_id is not None:
                value += ' "%s"' % system_id
        elif system_id is not None:
            value += ' SYSTEM "%s"' % system_id

        # Build through `cls` rather than the hard-coded base class so
        # that a subclass calling this factory gets its own type back.
        return cls(value)
| 1031 | |
| 1032 | |
class Stylesheet(NavigableString):
    """A NavigableString representing a stylesheet (probably CSS).

    Adds no behavior of its own; the distinct class is what lets
    embedded stylesheets be told apart from textual content.
    """
| 1040 | |
| 1041 | |
class Script(NavigableString):
    """A NavigableString representing an executable script (probably
    Javascript).

    Adds no behavior of its own; the distinct class is what lets
    executable code be told apart from textual content.
    """
| 1049 | |
| 1050 | |
class TemplateString(NavigableString):
    """A NavigableString representing a string found inside an HTML
    template embedded in a larger document.

    Adds no behavior of its own; the distinct class is what lets such
    strings be told apart from the main body of the document.
    """
| 1058 | |
| 1059 | |
| 1060 class Tag(PageElement): | |
| 1061 """Represents an HTML or XML tag that is part of a parse tree, along | |
| 1062 with its attributes and contents. | |
| 1063 | |
| 1064 When Beautiful Soup parses the markup <b>penguin</b>, it will | |
| 1065 create a Tag object representing the <b> tag. | |
| 1066 """ | |
| 1067 | |
def __init__(self, parser=None, builder=None, name=None, namespace=None,
             prefix=None, attrs=None, parent=None, previous=None,
             is_xml=None, sourceline=None, sourcepos=None,
             can_be_empty_element=None, cdata_list_attributes=None,
             preserve_whitespace_tags=None):
    """Basic constructor.

    :param parser: A BeautifulSoup object.
    :param builder: A TreeBuilder.
    :param name: The name of the tag.
    :param namespace: The URI of this Tag's XML namespace, if any.
    :param prefix: The prefix for this Tag's XML namespace, if any.
    :param attrs: A dictionary of this Tag's attribute values.
    :param parent: The PageElement to use as this Tag's parent.
    :param previous: The PageElement that was parsed immediately before
        this tag.
    :param is_xml: If True, this is an XML tag. Otherwise, this is an
        HTML tag. Only consulted when no builder is given.
    :param sourceline: The line number where this tag was found in its
        source document.
    :param sourcepos: The character position within `sourceline` where
        this tag was found.
    :param can_be_empty_element: If True, this tag should be
        represented as <tag/>. If False, this tag should be represented
        as <tag></tag>. Only used when no builder is given.
    :param cdata_list_attributes: A list of attributes whose values
        should be treated as CDATA if they ever show up on this tag.
        Only used when no builder is given.
    :param preserve_whitespace_tags: A list of tag names whose contents
        should have their whitespace preserved. Only used when no
        builder is given.
    :raise ValueError: If `name` is None.
    """
    # We don't actually store the parser object: that lets extracted
    # chunks be garbage-collected.
    self.parser_class = None if parser is None else parser.__class__

    if name is None:
        raise ValueError("No value provided for new tag's name.")
    self.name = name
    self.namespace = namespace
    self.prefix = prefix

    # Position information is recorded only when some was supplied and
    # the builder (if there is one) keeps line numbers.
    may_store_position = not builder or builder.store_line_numbers
    if may_store_position and (
            sourceline is not None or sourcepos is not None):
        self.sourceline = sourceline
        self.sourcepos = sourcepos

    if attrs is None:
        attrs = {}
    elif attrs and builder is not None and builder.cdata_list_attributes:
        attrs = builder._replace_cdata_list_attribute_values(
            self.name, attrs)
    else:
        # Take a private copy so the caller's mapping is not shared.
        attrs = dict(attrs)

    # If possible, determine ahead of time whether this tag is an
    # XML tag.
    self.known_xml = builder.is_xml if builder else is_xml
    self.attrs = attrs
    self.contents = []
    self.setup(parent, previous)
    self.hidden = False

    if builder is not None:
        # Set up any substitutions for this tag, such as the charset
        # in a META tag.
        builder.set_up_substitutions(self)

        # Ask the TreeBuilder whether this tag might be an
        # empty-element tag.
        self.can_be_empty_element = builder.can_be_empty_element(name)

        # Keep track of the list of attributes of this tag that
        # might need to be treated as a list.
        #
        # For performance reasons, we store the whole data structure
        # rather than asking the question of every tag. Asking would
        # require building a new data structure every time, and
        # (unlike can_be_empty_element), we almost never need
        # to check this.
        self.cdata_list_attributes = builder.cdata_list_attributes

        # Keep track of the names that might cause this tag to be
        # treated as a whitespace-preserved tag.
        self.preserve_whitespace_tags = builder.preserve_whitespace_tags
    else:
        # In the absence of a TreeBuilder, use whatever values were
        # passed in here. They're probably None, unless this is a copy
        # of some other tag.
        self.can_be_empty_element = can_be_empty_element
        self.cdata_list_attributes = cdata_list_attributes
        self.preserve_whitespace_tags = preserve_whitespace_tags

parserClass = _alias("parser_class")  # BS3
| 1165 | |
| 1166 def __copy__(self): | |
| 1167 """A copy of a Tag is a new Tag, unconnected to the parse tree. | |
| 1168 Its contents are a copy of the old Tag's contents. | |
| 1169 """ | |
| 1170 clone = type(self)( | |
| 1171 None, self.builder, self.name, self.namespace, | |
| 1172 self.prefix, self.attrs, is_xml=self._is_xml, | |
| 1173 sourceline=self.sourceline, sourcepos=self.sourcepos, | |
| 1174 can_be_empty_element=self.can_be_empty_element, | |
| 1175 cdata_list_attributes=self.cdata_list_attributes, | |
| 1176 preserve_whitespace_tags=self.preserve_whitespace_tags | |
| 1177 ) | |
| 1178 for attr in ('can_be_empty_element', 'hidden'): | |
| 1179 setattr(clone, attr, getattr(self, attr)) | |
| 1180 for child in self.contents: | |
| 1181 clone.append(child.__copy__()) | |
| 1182 return clone | |
| 1183 | |
@property
def is_empty_element(self):
    """Is this tag an empty-element tag? (aka a self-closing tag)

    A tag that has contents is never an empty-element tag.

    A tag that has no contents may or may not be an empty-element
    tag. It depends on the builder used to create the tag. If the
    builder has a designated list of empty-element tags, then only
    a tag whose name shows up in that list is considered an
    empty-element tag.

    If the builder has no designated list of empty-element tags,
    then any tag with no contents is an empty-element tag.

    :return: False when the tag has contents; otherwise the value of
        `can_be_empty_element`.
    """
    if len(self.contents) != 0:
        return False
    return self.can_be_empty_element
isSelfClosing = is_empty_element  # BS3
| 1201 | |
@property
def string(self):
    """Convenience property to get the single string within this
    PageElement.

    TODO It might make sense to have NavigableString.string return
    itself.

    :return: If this element has a single string child, return
        value is that string. If this element has one child tag,
        return value is the 'string' attribute of the child tag,
        recursively. If this element is itself a string, has no
        children, or has more than one child, return value is None.
    """
    children = self.contents
    if len(children) == 1:
        only_child = children[0]
        if isinstance(only_child, NavigableString):
            return only_child
        # A lone non-string child: defer to its own .string.
        return only_child.string
    return None

@string.setter
def string(self, string):
    """Replace this PageElement's contents with `string`.

    The new child is built by calling the class of `string` on
    `string`, so the value's own type is kept.
    """
    self.clear()
    replacement = string.__class__(string)
    self.append(replacement)
| 1228 | |
def _all_strings(self, strip=False, types=(NavigableString, CData)):
    """Yield all strings of certain classes, possibly stripping them.

    :param strip: If True, all strings will be stripped before being
        yielded, and strings that are empty after stripping are
        dropped.

    :types: A tuple of NavigableString subclasses. Any strings of
        a subclass not found in this list will be ignored. By
        default, this means only NavigableString and CData objects
        will be considered. So no comments, processing instructions,
        etc. The match is on the exact type. Passing None accepts
        every NavigableString instance.

    :yield: A sequence of strings.
    """
    for candidate in self.descendants:
        if types is None:
            if not isinstance(candidate, NavigableString):
                continue
        elif type(candidate) not in types:
            continue
        if strip:
            candidate = candidate.strip()
            if len(candidate) == 0:
                continue
        yield candidate

strings = property(_all_strings)
| 1256 | |
@property
def stripped_strings(self):
    """Yield all strings in the document, stripping them first.

    Delegates to `_all_strings` with stripping switched on.

    :yield: A sequence of stripped strings.
    """
    yield from self._all_strings(True)
| 1265 | |
def get_text(self, separator="", strip=False,
             types=(NavigableString, CData)):
    """Get all child strings, concatenated using the given separator.

    :param separator: Strings will be concatenated using this separator.

    :param strip: If True, strings will be stripped before being
        concatenated.

    :types: A tuple of NavigableString subclasses. Any strings of
        a subclass not found in this list will be ignored. By
        default, this means only NavigableString and CData objects
        will be considered. So no comments, processing instructions,
        stylesheets, etc.

    :return: A string.
    """
    pieces = self._all_strings(strip, types=types)
    return separator.join(pieces)
getText = get_text
text = property(get_text)
| 1287 | |
def decompose(self):
    """Recursively destroys this PageElement and its children.

    This element will be removed from the tree and wiped out; so
    will everything beneath it.

    The behavior of a decomposed PageElement is undefined and you
    should never use one for anything, but if you need to _check_
    whether an element has been decomposed, you can use the
    `decomposed` property.
    """
    self.extract()
    doomed = self
    while doomed is not None:
        # Read the link first: wiping __dict__ destroys it.
        upcoming = doomed.next_element
        doomed.__dict__.clear()
        doomed.contents = []
        doomed._decomposed = True
        doomed = upcoming
| 1307 | |
def clear(self, decompose=False):
    """Wipe out all children of this PageElement by calling extract()
    on them.

    :param decompose: If this is True, decompose() (a more
        destructive method) will be called instead of extract() on
        every child that is a Tag; other children are still extracted.
    """
    # Iterate over a snapshot: each child is expected to leave
    # self.contents as it is removed.
    for child in self.contents[:]:
        if decompose and isinstance(child, Tag):
            child.decompose()
        else:
            child.extract()
| 1324 | |
def smooth(self):
    """Smooth out this element's children by consolidating consecutive
    strings.

    This makes pretty-printed output look more natural following a
    lot of operations that modified the tree.
    """
    def can_merge(node):
        # Ordinary strings may be merged; preformatted ones (and
        # tags) may not.
        return (isinstance(node, NavigableString)
                and not isinstance(node, PreformattedString))

    # Mark the first position of every pair of children that need
    # to be consolidated. Do this rather than making a copy of
    # self.contents, since in most cases very few strings will be
    # affected.
    merge_positions = []
    for position, current in enumerate(self.contents):
        if isinstance(current, Tag):
            # Recursively smooth children.
            current.smooth()
        if position == len(self.contents) - 1:
            # This is the last item in .contents; it has no
            # right-hand neighbour to merge with.
            continue
        neighbour = self.contents[position + 1]
        if can_merge(current) and can_merge(neighbour):
            merge_positions.append(position)

    # Go over the marked positions in reverse order, so that
    # removing items from .contents won't affect the remaining
    # positions.
    for position in merge_positions[::-1]:
        left = self.contents[position]
        right = self.contents[position + 1]
        right.extract()
        merged = NavigableString(left + right)
        left.replace_with(merged)
| 1362 | |
def index(self, element):
    """Find the index of a child by identity, not value.

    Avoids issues with tag.contents.index(element) getting the
    index of equal elements.

    :param element: Look for this PageElement in `self.contents`.
    :return: The position of the first child that `is` `element`.
    :raise ValueError: If no child is that exact object.
    """
    position = next(
        (i for i, child in enumerate(self.contents) if child is element),
        None,
    )
    if position is None:
        raise ValueError("Tag.index: element not in tag")
    return position
| 1375 | |
def get(self, key, default=None):
    """Return the value of the 'key' attribute for the tag.

    :param key: The attribute name to look up in `self.attrs`.
    :param default: Returned when the tag doesn't have that attribute.
    :return: The attribute's value, or `default`.
    """
    attributes = self.attrs
    return attributes.get(key, default)
| 1381 | |
def get_attribute_list(self, key, default=None):
    """The same as get(), but always returns a list.

    :param key: The attribute to look for.
    :param default: Use this value if the attribute is not present
        on this PageElement.
    :return: A list of values, probably containing only a single
        value. A value that is already a list is returned as-is;
        anything else is wrapped in a one-element list.
    """
    found = self.get(key, default)
    return found if isinstance(found, list) else [found]
| 1395 | |
def has_attr(self, key):
    """Does this PageElement have an attribute with the given name?

    :param key: The attribute name to test for in `self.attrs`.
    :rtype: bool
    """
    attributes = self.attrs
    return key in attributes
| 1399 | |
| 1400 def __hash__(self): | |
| 1401 return str(self).__hash__() | |
| 1402 | |
| 1403 def __getitem__(self, key): | |
| 1404 """tag[key] returns the value of the 'key' attribute for the Tag, | |
| 1405 and throws an exception if it's not there.""" | |
| 1406 return self.attrs[key] | |
| 1407 | |
| 1408 def __iter__(self): | |
| 1409 "Iterating over a Tag iterates over its contents." | |
| 1410 return iter(self.contents) | |
| 1411 | |
| 1412 def __len__(self): | |
| 1413 "The length of a Tag is the length of its list of contents." | |
| 1414 return len(self.contents) | |
| 1415 | |
| 1416 def __contains__(self, x): | |
| 1417 return x in self.contents | |
| 1418 | |
| 1419 def __bool__(self): | |
| 1420 "A tag is non-None even if it has no contents." | |
| 1421 return True | |
| 1422 | |
| 1423 def __setitem__(self, key, value): | |
| 1424 """Setting tag[key] sets the value of the 'key' attribute for the | |
| 1425 tag.""" | |
| 1426 self.attrs[key] = value | |
| 1427 | |
| 1428 def __delitem__(self, key): | |
| 1429 "Deleting tag[key] deletes all 'key' attributes for the tag." | |
| 1430 self.attrs.pop(key, None) | |
| 1431 | |
| 1432 def __call__(self, *args, **kwargs): | |
| 1433 """Calling a Tag like a function is the same as calling its | |
| 1434 find_all() method. Eg. tag('a') returns a list of all the A tags | |
| 1435 found within this tag.""" | |
| 1436 return self.find_all(*args, **kwargs) | |
| 1437 | |
| 1438 def __getattr__(self, tag): | |
| 1439 """Calling tag.subtag is the same as calling tag.find(name="subtag")""" | |
| 1440 #print("Getattr %s.%s" % (self.__class__, tag)) | |
| 1441 if len(tag) > 3 and tag.endswith('Tag'): | |
| 1442 # BS3: soup.aTag -> "soup.find("a") | |
| 1443 tag_name = tag[:-3] | |
| 1444 warnings.warn( | |
| 1445 '.%(name)sTag is deprecated, use .find("%(name)s") instead. If you really were looking for a tag called %(name)sTag, use .find("%(name)sTag")' % dict( | |
| 1446 name=tag_name | |
| 1447 ) | |
| 1448 ) | |
| 1449 return self.find(tag_name) | |
| 1450 # We special case contents to avoid recursion. | |
| 1451 elif not tag.startswith("__") and not tag == "contents": | |
| 1452 return self.find(tag) | |
| 1453 raise AttributeError( | |
| 1454 "'%s' object has no attribute '%s'" % (self.__class__, tag)) | |
| 1455 | |
| 1456 def __eq__(self, other): | |
| 1457 """Returns true iff this Tag has the same name, the same attributes, | |
| 1458 and the same contents (recursively) as `other`.""" | |
| 1459 if self is other: | |
| 1460 return True | |
| 1461 if (not hasattr(other, 'name') or | |
| 1462 not hasattr(other, 'attrs') or | |
| 1463 not hasattr(other, 'contents') or | |
| 1464 self.name != other.name or | |
| 1465 self.attrs != other.attrs or | |
| 1466 len(self) != len(other)): | |
| 1467 return False | |
| 1468 for i, my_child in enumerate(self.contents): | |
| 1469 if my_child != other.contents[i]: | |
| 1470 return False | |
| 1471 return True | |
| 1472 | |
| 1473 def __ne__(self, other): | |
| 1474 """Returns true iff this Tag is not identical to `other`, | |
| 1475 as defined in __eq__.""" | |
| 1476 return not self == other | |
| 1477 | |
def __repr__(self, encoding="unicode-escape"):
    """Renders this PageElement as a string.

    :param encoding: The encoding to use (Python 2 only).
    :return: Under Python 2, a bytestring; under Python 3,
        a Unicode string.
    """
    # __repr__ must return the interpreter's native string type:
    # Unicode on Python 3, a (by convention ASCII) bytestring on
    # Python 2.
    return self.decode() if PY3K else self.encode(encoding)
| 1493 | |
| 1494 def __unicode__(self): | |
| 1495 """Renders this PageElement as a Unicode string.""" | |
| 1496 return self.decode() | |
| 1497 | |
def __str__(self):
    """Renders this PageElement as a generic string.

    :return: Under Python 2, a UTF-8 bytestring; under Python 3,
        a Unicode string.
    """
    return self.decode() if PY3K else self.encode()

# On Python 3 all three renderings are the same Unicode output, so
# point them at one implementation.
if PY3K:
    __str__ = __repr__ = __unicode__
| 1511 | |
def encode(self, encoding=DEFAULT_OUTPUT_ENCODING,
           indent_level=None, formatter="minimal",
           errors="xmlcharrefreplace"):
    """Render a bytestring representation of this PageElement and its
    contents.

    :param encoding: The destination encoding.
    :param indent_level: Each line of the rendering will be
        indented this many spaces. Used internally in
        recursive calls while pretty-printing.
    :param formatter: A Formatter object, or a string naming one of
        the standard formatters.
    :param errors: An error handling strategy such as
        'xmlcharrefreplace'. This value is passed along into
        encode() and its value should be one of the constants
        defined by Python.
    :return: A bytestring.

    """
    # Turn the data structure into Unicode, then encode the
    # Unicode.
    return self.decode(indent_level, encoding, formatter).encode(
        encoding, errors)
| 1535 | |
def decode(self, indent_level=None,
           eventual_encoding=DEFAULT_OUTPUT_ENCODING,
           formatter="minimal"):
    """Render a Unicode representation of this PageElement and its
    contents.

    :param indent_level: Each line of the rendering will be
        indented this many spaces. Used internally in
        recursive calls while pretty-printing.
    :param eventual_encoding: The tag is destined to be
        encoded into this encoding. This method is _not_
        responsible for performing that encoding. This information
        is passed in so that it can be substituted in if the
        document contains a <META> tag that mentions the document's
        encoding.
    :param formatter: A Formatter object, or a string naming one of
        the standard formatters.
    :return: The rendered markup as a string.
    """
    # First off, turn a non-Formatter `formatter` into a Formatter
    # object. This will stop the lookup from happening over and
    # over again.
    if not isinstance(formatter, Formatter):
        formatter = self.formatter_for_name(formatter)

    # Render each attribute as key="value", or as a bare key when
    # its value is None.
    rendered_attrs = []
    for key, val in formatter.attributes(self):
        if val is None:
            rendered_attrs.append(key)
            continue
        if isinstance(val, (list, tuple)):
            val = ' '.join(val)
        elif not isinstance(val, str):
            val = str(val)
        elif (isinstance(val, AttributeValueWithCharsetSubstitution)
              and eventual_encoding is not None):
            val = val.encode(eventual_encoding)
        text = formatter.attribute_value(val)
        rendered_attrs.append(
            str(key) + '=' + formatter.quoted_attribute_value(text))

    prefix = (self.prefix + ":") if self.prefix else ''

    # An empty-element tag gets a close marker inside the opening
    # tag and no closing tag; every other tag gets a closing tag.
    close = ''
    closeTag = ''
    if self.is_empty_element:
        close = formatter.void_element_close_prefix or ''
    else:
        closeTag = '</%s%s>' % (prefix, self.name)

    pretty_print = self._should_pretty_print(indent_level)
    indent_space = '' if indent_level is None else ' ' * (indent_level - 1)
    space = indent_space if pretty_print else ''
    indent_contents = (indent_level + 1) if pretty_print else None
    contents = self.decode_contents(
        indent_contents, eventual_encoding, formatter
    )

    if self.hidden:
        # This is the 'document root' object.
        return contents

    attribute_string = (' ' + ' '.join(rendered_attrs)) if rendered_attrs else ''
    pieces = []
    if indent_level is not None:
        # Even if this particular tag is not pretty-printed,
        # we should indent up to the start of the tag.
        pieces.append(indent_space)
    pieces.append('<%s%s%s%s>' % (
        prefix, self.name, attribute_string, close))
    if pretty_print:
        pieces.append("\n")
    pieces.append(contents)
    if pretty_print and contents and contents[-1] != "\n":
        pieces.append("\n")
    if pretty_print and closeTag:
        pieces.append(space)
    pieces.append(closeTag)
    if indent_level is not None and closeTag and self.next_sibling:
        # Even if this particular tag is not pretty-printed,
        # we're now done with the tag, and we should add a
        # newline if appropriate.
        pieces.append("\n")
    return ''.join(pieces)
| 1636 | |
| 1637 def _should_pretty_print(self, indent_level): | |
| 1638 """Should this tag be pretty-printed? | |
| 1639 | |
| 1640 Most of them should, but some (such as <pre> in HTML | |
| 1641 documents) should not. | |
| 1642 """ | |
| 1643 return ( | |
| 1644 indent_level is not None | |
| 1645 and ( | |
| 1646 not self.preserve_whitespace_tags | |
| 1647 or self.name not in self.preserve_whitespace_tags | |
| 1648 ) | |
| 1649 ) | |
| 1650 | |
def prettify(self, encoding=None, formatter="minimal"):
    """Render this PageElement as pretty-printed markup.

    :param encoding: The eventual encoding of the output. When this is
        None the markup is produced with decode(); otherwise it is
        produced with encode() using this encoding.
    :param formatter: A Formatter object, or a string naming one of
        the standard formatters.
    :return: A Unicode string (if encoding is None) or a bytestring
        (otherwise).
    """
    if encoding is not None:
        return self.encode(encoding, True, formatter=formatter)
    return self.decode(True, formatter=formatter)
| 1665 | |
def decode_contents(self, indent_level=None,
                    eventual_encoding=DEFAULT_OUTPUT_ENCODING,
                    formatter="minimal"):
    """Render everything inside this tag as a Unicode string.

    :param indent_level: Each line of the rendering will be
        indented this many spaces. Used internally in
        recursive calls while pretty-printing.

    :param eventual_encoding: The encoding the output is destined
        for. No encoding is performed here; the value is only handed
        on to the decode() call made for each child tag.

    :param formatter: A Formatter object, or a string naming one of
        the standard Formatters.

    :return: The concatenated rendering of every child.
    """
    # Resolve a formatter name once so the recursive calls below
    # receive a ready-made Formatter object.
    if not isinstance(formatter, Formatter):
        formatter = self.formatter_for_name(formatter)

    indenting = indent_level is not None
    pieces = []
    for child in self:
        rendered_text = None
        if isinstance(child, NavigableString):
            rendered_text = child.output_ready(formatter)
        elif isinstance(child, Tag):
            # Child tags render (and indent) themselves.
            pieces.append(
                child.decode(indent_level, eventual_encoding, formatter))
        keep_whitespace = (
            self.preserve_whitespace_tags
            and self.name in self.preserve_whitespace_tags
        )
        if rendered_text and indent_level and not keep_whitespace:
            rendered_text = rendered_text.strip()
        if not rendered_text:
            # Nothing (left) to emit for this child.
            continue
        on_own_line = indenting and not keep_whitespace
        if on_own_line:
            pieces.append(" " * (indent_level - 1))
        pieces.append(rendered_text)
        if on_own_line:
            pieces.append("\n")
    return ''.join(pieces)
| 1711 | |
def encode_contents(
        self, indent_level=None, encoding=DEFAULT_OUTPUT_ENCODING,
        formatter="minimal"):
    """Render the contents of this PageElement as a bytestring.

    The work is delegated to decode_contents(); the resulting string
    is then encoded.

    :param indent_level: Each line of the rendering will be
        indented this many spaces. Used internally in
        recursive calls while pretty-printing.

    :param encoding: The bytestring will be in this encoding.

    :param formatter: A Formatter object, or a string naming one of
        the standard Formatters.

    :return: A bytestring.
    """
    return self.decode_contents(
        indent_level, encoding, formatter).encode(encoding)
| 1730 | |
| 1731 # Old method for BS3 compatibility | |
def renderContents(self, encoding=DEFAULT_OUTPUT_ENCODING,
                   prettyPrint=False, indentLevel=0):
    """Deprecated method for BS3 compatibility.

    Forwards to encode_contents(). `indentLevel` is only honoured when
    `prettyPrint` is true; otherwise None is passed as the indent level.
    """
    effective_indent = indentLevel if prettyPrint else None
    return self.encode_contents(
        indent_level=effective_indent, encoding=encoding)
| 1739 | |
| 1740 #Soup methods | |
| 1741 | |
def find(self, name=None, attrs={}, recursive=True, text=None,
         **kwargs):
    """Return the first PageElement below this one that matches the
    given criteria.

    This is find_all() with a limit of 1. All find_* methods take a
    common set of arguments; see the online documentation for detailed
    explanations.

    :param name: A filter on tag name.
    :param attrs: A dictionary of filters on attribute values. The
        default dictionary is only passed along, never modified here.
    :param recursive: If this is True, find() will perform a
        recursive search of this PageElement's children. Otherwise,
        only the direct children will be considered.
    :param text: A filter passed to find_all() as its `text` argument.
    :kwargs: A dictionary of filters on attribute values.
    :return: The first match, or None if nothing matched.
    :rtype: bs4.element.Tag | bs4.element.NavigableString
    """
    matches = self.find_all(name, attrs, recursive, text, 1, **kwargs)
    if not matches:
        return None
    return matches[0]
findChild = find #BS2
| 1766 | |
def find_all(self, name=None, attrs={}, recursive=True, text=None,
             limit=None, **kwargs):
    """Return every PageElement below this one that matches the given
    criteria.

    All find_* methods take a common set of arguments. See the online
    documentation for detailed explanations.

    :param name: A filter on tag name.
    :param attrs: A dictionary of filters on attribute values. The
        default dictionary is only passed along, never modified here.
    :param recursive: If this is True, the search runs over
        `self.descendants`; otherwise only `self.children` are
        considered.
    :param text: A filter passed to _find_all() as its `text` argument.
    :param limit: Stop looking after finding this many results.
    :kwargs: A dictionary of filters on attribute values.
    :return: A ResultSet of PageElements.
    :rtype: bs4.element.ResultSet
    """
    candidates = self.descendants if recursive else self.children
    return self._find_all(name, attrs, text, limit, candidates, **kwargs)
findAll = find_all       # BS3
findChildren = find_all  # BS2
| 1791 | |
| 1792 #Generator methods | |
@property
def children(self):
    """Iterate over all direct children of this PageElement.

    :return: A fresh iterator over `self.contents`.
    """
    # Hand back an iterator rather than the list itself, so the
    # property reads as "something to loop over".
    direct_children = self.contents
    return iter(direct_children)  # XXX This seems to be untested.
| 1801 | |
@property
def descendants(self):
    """Iterate over every element nested inside this PageElement.

    The walk starts at the first child and follows the `next_element`
    links until it reaches the element that comes after this
    element's last descendant. Elements are therefore produced in the
    order `next_element` chains them (document order, per the
    Beautiful Soup documentation), not breadth-first.

    :yield: A sequence of PageElements.
    """
    if not self.contents:
        # No children, so there is nothing to walk.
        return
    # The first element past our subtree marks where to stop; it is
    # None when the subtree ends the chain.
    stop_node = self._last_descendant().next_element
    current = self.contents[0]
    while current is not stop_node:
        yield current
        current = current.next_element
| 1816 | |
| 1817 # CSS selector code | |
def select_one(self, selector, namespaces=None, **kwargs):
    """Run a CSS selector against this element and return the first hit.

    This calls select() with a limit of 1.

    :param selector: A CSS selector.

    :param namespaces: A dictionary mapping namespace prefixes
        used in the CSS selector to namespace URIs. By default,
        Beautiful Soup will use the prefixes it encountered while
        parsing the document.

    :param kwargs: Keyword arguments forwarded unchanged to select().

    :return: The first matching Tag, or None if nothing matched.
    :rtype: bs4.element.Tag
    """
    matches = self.select(selector, namespaces, 1, **kwargs)
    return matches[0] if matches else None
| 1838 | |
def select(self, selector, namespaces=None, limit=None, **kwargs):
    """Run a CSS selector against this element.

    The matching itself is done by the SoupSieve library.

    :param selector: A string containing a CSS selector.

    :param namespaces: A dictionary mapping namespace prefixes
        used in the CSS selector to namespace URIs. When omitted,
        `self._namespaces` is used.

    :param limit: After finding this number of results, stop looking.
        None is handed to SoupSieve as 0.

    :param kwargs: Keyword arguments to be passed into SoupSieve's
        soupsieve.select() method.

    :return: A ResultSet of Tags.
    :rtype: bs4.element.ResultSet
    :raise NotImplementedError: If the soupsieve package could not
        be imported.
    """
    namespaces = self._namespaces if namespaces is None else namespaces
    limit = 0 if limit is None else limit
    if soupsieve is None:
        raise NotImplementedError(
            "Cannot execute CSS selectors because the soupsieve package is not installed."
        )

    matches = soupsieve.select(selector, self, namespaces, limit, **kwargs)

    # Wrapping in a ResultSet keeps the return type consistent with
    # find_all() and gives callers ResultSet.__getattr__'s helpful
    # error message.
    return ResultSet(None, matches)
| 1874 | |
| 1875 # Old names for backwards compatibility | |
def childGenerator(self):
    """Deprecated generator; returns the `children` property's value."""
    direct_children = self.children
    return direct_children
| 1879 | |
def recursiveChildGenerator(self):
    """Deprecated generator; returns the `descendants` property's value."""
    nested_elements = self.descendants
    return nested_elements
| 1883 | |
def has_key(self, key):
    """Deprecated method; warns, then answers via has_attr().

    This was kind of misleading because has_key() (attributes) was
    different from __in__ (contents). has_key() is gone in Python 3,
    anyway.

    :param key: The attribute name to look for.
    :return: Whatever `self.has_attr(key)` returns.
    """
    message = 'has_key is deprecated. Use has_attr("%s") instead.' % (
        key)
    warnings.warn(message)
    return self.has_attr(key)
| 1893 | |
| 1894 # Next, a couple classes to represent queries and their results. | |
class SoupStrainer(object):
    """Encapsulates a number of ways of matching a markup element (tag or
    string).

    This is primarily used to underpin the find_* methods, but you can
    create one yourself and pass it in as `parse_only` to the
    `BeautifulSoup` constructor, to parse a subset of a large
    document.
    """

    def __init__(self, name=None, attrs={}, text=None, **kwargs):
        """Constructor.

        The SoupStrainer constructor takes the same arguments passed
        into the find_* methods. See the online documentation for
        detailed explanations.

        :param name: A filter on tag name.
        :param attrs: A dictionary of filters on attribute values. Any
            non-dict value is treated as a filter on the 'class'
            attribute. The dictionary passed in is never modified.
        :param text: A filter for a NavigableString with specific text.
        :kwargs: A dictionary of filters on attribute values.
        """
        self.name = self._normalize_search_value(name)
        if not isinstance(attrs, dict):
            # Treat a non-dict value for attrs as a search for the 'class'
            # attribute.
            kwargs['class'] = attrs
            attrs = None

        if 'class_' in kwargs:
            # Treat class_="foo" as a search for the 'class'
            # attribute, overriding any non-dict value for attrs.
            kwargs['class'] = kwargs.pop('class_')

        if kwargs:
            if attrs:
                # Work on a copy so neither the caller's dictionary nor
                # the shared default argument is changed.
                attrs = attrs.copy()
                attrs.update(kwargs)
            else:
                attrs = kwargs

        self.attrs = {
            key: self._normalize_search_value(value)
            for key, value in attrs.items()
        }
        self.text = self._normalize_search_value(text)

    def _normalize_search_value(self, value):
        """Convert a search criterion into a form _matches() can use.

        :param value: A criterion supplied by the caller.
        :return: `value` itself if it is a string, a callable, a
            boolean, None, or has a `match` attribute (e.g. a compiled
            regular expression); a decoded string for bytes; a list of
            normalized items for other iterables; `str(value)` for
            anything else.
        """
        # Leave it alone if it's a Unicode string, a callable, a
        # regular expression, a boolean, or None.
        if (isinstance(value, (str, Callable, bool))
                or value is None
                or hasattr(value, 'match')):
            return value

        # If it's a bytestring, convert it to Unicode, treating it as UTF-8.
        if isinstance(value, bytes):
            return value.decode("utf8")

        # If it's listlike, convert it into a list of strings.
        if hasattr(value, '__iter__'):
            new_value = []
            for v in value:
                if (hasattr(v, '__iter__')
                        and not isinstance(v, (bytes, str))):
                    # This is almost certainly the user's mistake. In the
                    # interests of avoiding infinite loops, we'll let
                    # it through as-is rather than doing a recursive call.
                    new_value.append(v)
                else:
                    new_value.append(self._normalize_search_value(v))
            return new_value

        # Otherwise, convert it into a Unicode string.
        return str(value)

    def __str__(self):
        """A human-readable representation of this SoupStrainer.

        :return: The text filter rendered as a string when one is set
            (and truthy); otherwise "<name>|<attrs>".
        """
        if self.text:
            # The text filter may be a list, True, a compiled pattern or
            # a callable; __str__ must return a real string or Python
            # raises TypeError.
            return str(self.text)
        return "%s|%s" % (self.name, self.attrs)

    def search_tag(self, markup_name=None, markup_attrs={}):
        """Check whether a Tag with the given name and attributes would
        match this SoupStrainer.

        Used prospectively to decide whether to even bother creating a Tag
        object.

        :param markup_name: A tag name as found in some markup, or a
            Tag object (in which case the Tag also supplies the
            attributes).
        :param markup_attrs: The attributes as found in some markup:
            either an object with a `get` method (such as a dictionary)
            or an iterable of (key, value) pairs. Never modified here.

        :return: The Tag (or the tag name, when no Tag was given) if it
            matches; None if it does not; False when the quick
            name-mismatch check rejects a Tag.
        """
        found = None
        markup = None
        if isinstance(markup_name, Tag):
            markup = markup_name
            markup_attrs = markup

        if isinstance(self.name, str):
            # Optimization for a very common case where the user is
            # searching for a tag with one specific name, and we're
            # looking at a tag with a different name.
            if markup and not markup.prefix and self.name != markup.name:
                return False

        call_function_with_tag_data = (
            isinstance(self.name, Callable)
            and not isinstance(markup_name, Tag))

        if ((not self.name)
                or call_function_with_tag_data
                or (markup and self._matches(markup, self.name))
                or (not markup and self._matches(markup_name, self.name))):
            if call_function_with_tag_data:
                match = self.name(markup_name, markup_attrs)
            else:
                match = True
                # Built lazily: only needed if there are attribute
                # filters to check.
                markup_attr_map = None
                for attr, match_against in list(self.attrs.items()):
                    if markup_attr_map is None:
                        if hasattr(markup_attrs, 'get'):
                            markup_attr_map = markup_attrs
                        else:
                            markup_attr_map = {
                                k: v for k, v in markup_attrs}
                    attr_value = markup_attr_map.get(attr)
                    if not self._matches(attr_value, match_against):
                        match = False
                        break
            if match:
                if markup:
                    found = markup
                else:
                    found = markup_name
        if found and self.text and not self._matches(found.string, self.text):
            found = None
        return found

    # For BS3 compatibility.
    searchTag = search_tag

    def search(self, markup):
        """Find all items in `markup` that match this SoupStrainer.

        Used by the core _find_all() method, which is ultimately
        called by all find_* methods.

        :param markup: A PageElement or a list of them.
        :return: The matching element, or None if nothing matched.
        :raise Exception: If `markup` is neither iterable, a Tag, nor
            a string.
        """
        found = None
        # If given a list of items, scan it for a text element that
        # matches.
        if hasattr(markup, '__iter__') and not isinstance(markup, (Tag, str)):
            for element in markup:
                if isinstance(element, NavigableString) \
                        and self.search(element):
                    found = element
                    break
        # If it's a Tag, make sure its name or attributes match.
        # Don't bother with Tags if we're searching for text.
        elif isinstance(markup, Tag):
            if not self.text or self.name or self.attrs:
                found = self.search_tag(markup)
        # If it's text, make sure the text matches.
        elif isinstance(markup, (NavigableString, str)):
            if (not self.name and not self.attrs
                    and self._matches(markup, self.text)):
                found = markup
        else:
            raise Exception(
                "I don't know how to match against a %s" % markup.__class__)
        return found

    def _matches(self, markup, match_against, already_tried=None):
        """Test one piece of markup against one search criterion.

        :param markup: A Tag, a string, a list/tuple of attribute
            values, or None.
        :param match_against: A normalized criterion: True, a callable,
            a string, an object with a `search` method (e.g. a compiled
            regular expression), an iterable of those, or a falsy value.
        :param already_tried: Internal bookkeeping for the recursive
            calls made while walking an iterable criterion.
        :return: A truthy value on a match and a falsy one otherwise.
            The value is not always a bool: callables and regular
            expressions contribute whatever they return.
        """
        if isinstance(markup, (list, tuple)):
            # This should only happen when searching a multi-valued attribute
            # like 'class'.
            for item in markup:
                if self._matches(item, match_against):
                    return True
            # We didn't match any particular value of the multivalue
            # attribute, but maybe we match the attribute value when
            # considered as a string.
            if self._matches(' '.join(markup), match_against):
                return True
            return False

        if match_against is True:
            # True matches any non-None value.
            return markup is not None

        if isinstance(match_against, Callable):
            return match_against(markup)

        # Custom callables take the tag as an argument, but all
        # other ways of matching match the tag name as a string.
        original_markup = markup
        if isinstance(markup, Tag):
            markup = markup.name

        # Ensure that `markup` is either a Unicode string, or None.
        markup = self._normalize_search_value(markup)

        if markup is None:
            # None matches None, False, an empty string, an empty list, and so on.
            return not match_against

        if (hasattr(match_against, '__iter__')
                and not isinstance(match_against, str)):
            # We're asked to match against an iterable of items.
            # The markup must match at least one item in the
            # iterable. We'll try each one in turn.
            #
            # To avoid infinite recursion we need to keep track of
            # items we've already seen.
            if already_tried is None:
                already_tried = set()
            for item in match_against:
                # Unhashable items are tracked by identity instead.
                if item.__hash__:
                    key = item
                else:
                    key = id(item)
                if key in already_tried:
                    continue
                already_tried.add(key)
                if self._matches(original_markup, item, already_tried):
                    return True
            return False

        # Beyond this point we might need to run the test twice: once against
        # the tag's name and once against its prefixed name.
        match = False

        if isinstance(match_against, str):
            # Exact string match
            match = markup == match_against

        if not match and hasattr(match_against, 'search'):
            # Regexp match
            return match_against.search(markup)

        if (not match
                and isinstance(original_markup, Tag)
                and original_markup.prefix):
            # Try the whole thing again with the prefixed tag name.
            return self._matches(
                original_markup.prefix + ':' + original_markup.name, match_against
            )

        return match
| 2157 | |
| 2158 | |
class ResultSet(list):
    """A plain list of search results that also remembers, in `source`,
    the SoupStrainer that produced it."""

    def __init__(self, source, result=()):
        """Constructor.

        :param source: A SoupStrainer.
        :param result: A list of PageElements.
        """
        super().__init__(result)
        self.source = source

    def __getattr__(self, key):
        """Raise a helpful exception to explain a common code fix."""
        explanation = (
            "ResultSet object has no attribute '%s'. "
            "You're probably treating a list of elements like a single element. "
            "Did you call find_all() when you meant to call find()?"
        ) % key
        raise AttributeError(explanation)
