comparison env/lib/python3.9/site-packages/bs4/element.py @ 0:4f3585e2f14b draft default tip

"planemo upload commit 60cee0fc7c0cda8592644e1aad72851dec82c959"
author shellac
date Mon, 22 Mar 2021 18:12:50 +0000
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:4f3585e2f14b
1 # Use of this source code is governed by the MIT license.
2 __license__ = "MIT"
3
4 try:
5 from collections.abc import Callable # Python 3.6
6 except ImportError as e:
7 from collections import Callable
8 import re
9 import sys
10 import warnings
11 try:
12 import soupsieve
13 except ImportError as e:
14 soupsieve = None
15 warnings.warn(
16 'The soupsieve package is not installed. CSS selectors cannot be used.'
17 )
18
19 from bs4.formatter import (
20 Formatter,
21 HTMLFormatter,
22 XMLFormatter,
23 )
24
# Default encoding name used elsewhere in this module.
DEFAULT_OUTPUT_ENCODING = "utf-8"

# True when the running interpreter's major version is greater than 2.
PY3K = sys.version_info[0] >= 3

# Matches one maximal run of non-whitespace characters.
nonwhitespace_re = re.compile(r"\S+")

# NOTE: Unused as of 4.7.0. Kept for a while in case someone imported
# it for their own use.
whitespace_re = re.compile(r"\s+")
33
34 def _alias(attr):
35 """Alias one attribute name to another for backward compatibility"""
36 @property
37 def alias(self):
38 return getattr(self, attr)
39
40 @alias.setter
41 def alias(self):
42 return setattr(self, attr)
43 return alias
44
45
# Python recognizes these encodings (so PageElement.encode could
# theoretically support them), but XML and HTML do not, so they should
# never be advertised inside a document as that document's encoding.
#
# For an XML document encoded in one of these, the XML declaration
# mentions no encoding. For an HTML document encoded in one of these
# that has a <meta> tag mentioning an encoding, the encoding is given
# as the empty string.
#
# Source:
# https://docs.python.org/3/library/codecs.html#python-specific-encodings
PYTHON_SPECIFIC_ENCODINGS = {
    "idna",
    "mbcs",
    "oem",
    "palmos",
    "punycode",
    "raw_unicode_escape",
    "undefined",
    "unicode_escape",
    "raw-unicode-escape",
    "unicode-escape",
    "string-escape",
    "string_escape",
}
73
74
class NamespacedAttribute(str):
    """A string such as 'xml:lang' that also records the prefix ('xml'),
    the local name ('lang') and the namespace it was built from.
    """

    def __new__(cls, prefix, name=None, namespace=None):
        """Build the string value and attach its components.

        :param prefix: The namespace prefix; may be None.
        :param name: The local name. Any falsy value is treated as None.
        :param namespace: Stored unchanged on the new object.
        :return: A NamespacedAttribute with .prefix, .name and
            .namespace attributes set.
        """
        # A falsy name means the default namespace, whose name "has no
        # value" per https://www.w3.org/TR/xml-names/#defaulting
        name = name or None

        if name is None:
            text = prefix
        elif prefix is None:
            # Not really namespaced.
            text = name
        else:
            text = prefix + ":" + name

        instance = str.__new__(cls, text)
        instance.prefix = prefix
        instance.name = name
        instance.namespace = namespace
        return instance
97
class AttributeValueWithCharsetSubstitution(str):
    """Base class for string stand-ins that represent a character
    encoding specified in HTML.
    """
100
class CharsetMetaAttributeValue(AttributeValueWithCharsetSubstitution):
    """Stand-in for the value of a meta tag's 'charset' attribute.

    When Beautiful Soup parses the markup '<meta charset="utf8">', the
    value of the 'charset' attribute will be one of these objects.
    """

    def __new__(cls, original_value):
        """Create the string and remember the value it was built from.

        :param original_value: The attribute value as found in the markup.
        """
        instance = str.__new__(cls, original_value)
        instance.original_value = original_value
        return instance

    def encode(self, encoding):
        """Return the charset value to use when the document is encoded.

        When an HTML document is being encoded to a given encoding, the
        value of a meta tag's 'charset' is the name of that encoding.

        :param encoding: The name of the target encoding.
        :return: `encoding`, or the empty string if it is one of
            PYTHON_SPECIFIC_ENCODINGS.
        """
        return '' if encoding in PYTHON_SPECIFIC_ENCODINGS else encoding
120
121
class ContentMetaAttributeValue(AttributeValueWithCharsetSubstitution):
    """Stand-in for the value of a meta tag's 'content' attribute.

    When Beautiful Soup parses the markup:
     <meta http-equiv="content-type" content="text/html; charset=utf8">

    The value of the 'content' attribute will be one of these objects.
    """

    # Group 1 captures everything up to and including 'charset=';
    # group 3 captures the charset value itself.
    CHARSET_RE = re.compile(r"((^|;)\s*charset=)([^;]*)", re.M)

    def __new__(cls, original_value):
        """Create the stand-in, or a plain str if no charset is present.

        :param original_value: The attribute value as found in the markup.
        :return: An instance of this class when CHARSET_RE finds a
            charset in the value; otherwise an ordinary str.
        """
        if cls.CHARSET_RE.search(original_value) is None:
            # No substitution necessary.
            return str.__new__(str, original_value)

        instance = str.__new__(cls, original_value)
        instance.original_value = original_value
        return instance

    def encode(self, encoding):
        """Return the original value with its charset replaced.

        :param encoding: The name of the target encoding.
        :return: The empty string if `encoding` is one of
            PYTHON_SPECIFIC_ENCODINGS; otherwise the original value
            with each charset rewritten to `encoding`.
        """
        if encoding in PYTHON_SPECIFIC_ENCODINGS:
            return ''
        return self.CHARSET_RE.sub(
            lambda found: found.group(1) + encoding, self.original_value)
149
150
class PageElement(object):
    """Contains the navigational information for some part of the page:
    that is, its current location in the parse tree.

    NavigableString, Tag, etc. are all subclasses of PageElement.
    """

    def setup(self, parent=None, previous_element=None, next_element=None,
              previous_sibling=None, next_sibling=None):
        """Sets up the initial relations between this element and
        other elements.

        :param parent: The parent of this element.

        :param previous_element: The element parsed immediately before
            this one.

        :param next_element: The element parsed immediately after
            this one.

        :param previous_sibling: The most recently encountered element
            on the same level of the parse tree as this one. If omitted
            and the parent already has contents, the parent's last
            child is used.

        :param next_sibling: The next element to be encountered
            on the same level of the parse tree as this one.
        """
        self.parent = parent

        self.previous_element = previous_element
        if previous_element is not None:
            self.previous_element.next_element = self

        self.next_element = next_element
        if self.next_element is not None:
            self.next_element.previous_element = self

        self.next_sibling = next_sibling
        if self.next_sibling is not None:
            self.next_sibling.previous_sibling = self

        if (previous_sibling is None
            and self.parent is not None and self.parent.contents):
            previous_sibling = self.parent.contents[-1]

        self.previous_sibling = previous_sibling
        if previous_sibling is not None:
            self.previous_sibling.next_sibling = self

    def format_string(self, s, formatter):
        """Format the given string using the given formatter.

        :param s: A string.
        :param formatter: A Formatter object, or a string naming one of
            the standard formatters. If None, `s` is returned unchanged.
        :return: The formatted string.
        """
        if formatter is None:
            return s
        if not isinstance(formatter, Formatter):
            formatter = self.formatter_for_name(formatter)
        output = formatter.substitute(s)
        return output

    def formatter_for_name(self, formatter):
        """Look up or create a Formatter for the given identifier,
        if necessary.

        :param formatter: Can be a Formatter object (used as-is), a
            function (used as the entity substitution hook for an
            XMLFormatter or HTMLFormatter), or a string (used to look
            up an XMLFormatter or HTMLFormatter in the appropriate
            registry).
        :return: A Formatter object.
        """
        if isinstance(formatter, Formatter):
            return formatter
        if self._is_xml:
            c = XMLFormatter
        else:
            c = HTMLFormatter
        if isinstance(formatter, Callable):
            return c(entity_substitution=formatter)
        return c.REGISTRY[formatter]

    @property
    def _is_xml(self):
        """Is this element part of an XML tree or an HTML tree?

        This is used in formatter_for_name, when deciding whether an
        XMLFormatter or HTMLFormatter is more appropriate. It can be
        inefficient, but it should be called very rarely.
        """
        if self.known_xml is not None:
            # Most of the time we will have determined this when the
            # document is parsed.
            return self.known_xml

        # Otherwise, it's likely that this element was created by
        # direct invocation of the constructor from within the user's
        # Python code.
        if self.parent is None:
            # This is the top-level object. It should have .known_xml set
            # from tree creation. If not, take a guess--BS is usually
            # used on HTML markup.
            return getattr(self, 'is_xml', False)
        return self.parent._is_xml

    nextSibling = _alias("next_sibling")  # BS3
    previousSibling = _alias("previous_sibling")  # BS3

    def replace_with(self, replace_with):
        """Replace this PageElement with another one, keeping the rest of the
        tree the same.

        :param replace_with: A PageElement.
        :return: `self`, no longer part of the tree. If `replace_with`
            is `self`, nothing is changed and None is returned.
        :raise ValueError: If this element has no parent, or if
            `replace_with` is this element's parent.
        """
        if self.parent is None:
            raise ValueError(
                "Cannot replace one element with another when the "
                "element to be replaced is not part of a tree.")
        if replace_with is self:
            return
        if replace_with is self.parent:
            raise ValueError("Cannot replace a Tag with its parent.")
        old_parent = self.parent
        my_index = self.parent.index(self)
        self.extract(_self_index=my_index)
        old_parent.insert(my_index, replace_with)
        return self
    replaceWith = replace_with  # BS3

    def unwrap(self):
        """Replace this PageElement with its contents.

        :return: `self`, no longer part of the tree.
        :raise ValueError: If this element has no parent.
        """
        my_parent = self.parent
        if self.parent is None:
            raise ValueError(
                "Cannot replace an element with its contents when that "
                "element is not part of a tree.")
        my_index = self.parent.index(self)
        self.extract(_self_index=my_index)
        # Insert from last to first at the same index so the children
        # end up in their original order.
        for child in reversed(self.contents[:]):
            my_parent.insert(my_index, child)
        return self
    replace_with_children = unwrap
    replaceWithChildren = unwrap  # BS3

    def wrap(self, wrap_inside):
        """Wrap this PageElement inside another one.

        :param wrap_inside: A PageElement.
        :return: `wrap_inside`, occupying the position in the tree that used
           to be occupied by `self`, and with `self` inside it.
        """
        me = self.replace_with(wrap_inside)
        wrap_inside.append(me)
        return wrap_inside

    def extract(self, _self_index=None):
        """Destructively rips this element out of the tree.

        :param _self_index: The location of this element in its parent's
           .contents, if known. Passing this in allows for a performance
           optimization.

        :return: `self`, no longer part of the tree.
        """
        if self.parent is not None:
            if _self_index is None:
                _self_index = self.parent.index(self)
            del self.parent.contents[_self_index]

        # Find the two elements that would be next to each other if
        # this element (and any children) hadn't been parsed. Connect
        # the two.
        last_child = self._last_descendant()
        next_element = last_child.next_element

        if (self.previous_element is not None and
            self.previous_element is not next_element):
            self.previous_element.next_element = next_element
        if next_element is not None and next_element is not self.previous_element:
            next_element.previous_element = self.previous_element
        self.previous_element = None
        last_child.next_element = None

        self.parent = None
        if (self.previous_sibling is not None
            and self.previous_sibling is not self.next_sibling):
            self.previous_sibling.next_sibling = self.next_sibling
        if (self.next_sibling is not None
            and self.next_sibling is not self.previous_sibling):
            self.next_sibling.previous_sibling = self.previous_sibling
        self.previous_sibling = self.next_sibling = None
        return self

    def _last_descendant(self, is_initialized=True, accept_self=True):
        """Finds the last element beneath this object to be parsed.

        :param is_initialized: Has `setup` been called on this PageElement
            yet?
        :param accept_self: Is `self` an acceptable answer to the question?
        :return: The last descendant, or None if that would be `self`
            and `accept_self` is false.
        """
        if is_initialized and self.next_sibling is not None:
            last_child = self.next_sibling.previous_element
        else:
            last_child = self
            while isinstance(last_child, Tag) and last_child.contents:
                last_child = last_child.contents[-1]
        if not accept_self and last_child is self:
            last_child = None
        return last_child
    # BS3: Not part of the API!
    _lastRecursiveChild = _last_descendant

    def insert(self, position, new_child):
        """Insert a new PageElement in the list of this PageElement's children.

        This works the same way as `list.insert`.

        :param position: The numeric position that should be occupied
           in `self.contents` by the new PageElement.
        :param new_child: A PageElement. A plain string is converted to
           a NavigableString first.
        :raise ValueError: If `new_child` is None or is `self`.
        """
        if new_child is None:
            raise ValueError("Cannot insert None into a tag.")
        if new_child is self:
            raise ValueError("Cannot insert a tag into itself.")
        if (isinstance(new_child, str)
            and not isinstance(new_child, NavigableString)):
            new_child = NavigableString(new_child)

        from bs4 import BeautifulSoup
        if isinstance(new_child, BeautifulSoup):
            # We don't want to end up with a situation where one BeautifulSoup
            # object contains another. Insert the children one at a time.
            for subchild in list(new_child.contents):
                self.insert(position, subchild)
                position += 1
            return
        position = min(position, len(self.contents))
        if hasattr(new_child, 'parent') and new_child.parent is not None:
            # We're 'inserting' an element that's already one
            # of this object's children.
            if new_child.parent is self:
                current_index = self.index(new_child)
                if current_index < position:
                    # We're moving this element further down the list
                    # of this object's children. That means that when
                    # we extract this element, our target index will
                    # jump down one.
                    position -= 1
            new_child.extract()

        new_child.parent = self
        previous_child = None
        if position == 0:
            new_child.previous_sibling = None
            new_child.previous_element = self
        else:
            previous_child = self.contents[position - 1]
            new_child.previous_sibling = previous_child
            new_child.previous_sibling.next_sibling = new_child
            new_child.previous_element = previous_child._last_descendant(False)
        if new_child.previous_element is not None:
            new_child.previous_element.next_element = new_child

        new_childs_last_element = new_child._last_descendant(False)

        if position >= len(self.contents):
            new_child.next_sibling = None

            # Walk up the tree until some ancestor has a next sibling;
            # that sibling is the element that follows in the document.
            parent = self
            parents_next_sibling = None
            while parents_next_sibling is None and parent is not None:
                parents_next_sibling = parent.next_sibling
                parent = parent.parent
                if parents_next_sibling is not None:
                    # We found the element that comes next in the document.
                    break
            if parents_next_sibling is not None:
                new_childs_last_element.next_element = parents_next_sibling
            else:
                # The last element of this tag is the last element in
                # the document.
                new_childs_last_element.next_element = None
        else:
            next_child = self.contents[position]
            new_child.next_sibling = next_child
            if new_child.next_sibling is not None:
                new_child.next_sibling.previous_sibling = new_child
            new_childs_last_element.next_element = next_child

        if new_childs_last_element.next_element is not None:
            new_childs_last_element.next_element.previous_element = new_childs_last_element
        self.contents.insert(position, new_child)

    def append(self, tag):
        """Appends the given PageElement to the contents of this one.

        :param tag: A PageElement.
        """
        self.insert(len(self.contents), tag)

    def extend(self, tags):
        """Appends the given PageElements to this one's contents.

        :param tags: A list of PageElements. If a Tag is given, its
            contents are appended.
        """
        if isinstance(tags, Tag):
            # Calling self.append() on another tag's contents will change
            # the list we're iterating over. Make a list that won't
            # change.
            tags = list(tags.contents)
        for tag in tags:
            self.append(tag)

    def insert_before(self, *args):
        """Makes the given element(s) the immediate predecessor of this one.

        All the elements will have the same parent, and the given elements
        will be immediately before this one.

        :param args: One or more PageElements.
        :raise ValueError: If this element has no parent, or if `self`
            is one of `args`.
        """
        parent = self.parent
        if parent is None:
            raise ValueError(
                "Element has no parent, so 'before' has no meaning.")
        if any(x is self for x in args):
            raise ValueError("Can't insert an element before itself.")
        for predecessor in args:
            # Extract first so that the index won't be screwed up if they
            # are siblings.
            if isinstance(predecessor, PageElement):
                predecessor.extract()
            index = parent.index(self)
            parent.insert(index, predecessor)

    def insert_after(self, *args):
        """Makes the given element(s) the immediate successor of this one.

        The elements will have the same parent, and the given elements
        will be immediately after this one.

        :param args: One or more PageElements.
        :raise ValueError: If this element has no parent, or if `self`
            is one of `args`.
        """
        # Do all error checking before modifying the tree.
        parent = self.parent
        if parent is None:
            raise ValueError(
                "Element has no parent, so 'after' has no meaning.")
        if any(x is self for x in args):
            raise ValueError("Can't insert an element after itself.")

        offset = 0
        for successor in args:
            # Extract first so that the index won't be screwed up if they
            # are siblings.
            if isinstance(successor, PageElement):
                successor.extract()
            index = parent.index(self)
            parent.insert(index+1+offset, successor)
            offset += 1

    def find_next(self, name=None, attrs={}, text=None, **kwargs):
        """Find the first PageElement that matches the given criteria and
        appears later in the document than this PageElement.

        All find_* methods take a common set of arguments. See the online
        documentation for detailed explanations.

        :param name: A filter on tag name.
        :param attrs: A dictionary of filters on attribute values.
        :param text: A filter for a NavigableString with specific text.
        :kwargs: A dictionary of filters on attribute values.
        :return: A PageElement.
        :rtype: bs4.element.Tag | bs4.element.NavigableString
        """
        return self._find_one(self.find_all_next, name, attrs, text, **kwargs)
    findNext = find_next  # BS3

    def find_all_next(self, name=None, attrs={}, text=None, limit=None,
                      **kwargs):
        """Find all PageElements that match the given criteria and appear
        later in the document than this PageElement.

        All find_* methods take a common set of arguments. See the online
        documentation for detailed explanations.

        :param name: A filter on tag name.
        :param attrs: A dictionary of filters on attribute values.
        :param text: A filter for a NavigableString with specific text.
        :param limit: Stop looking after finding this many results.
        :kwargs: A dictionary of filters on attribute values.
        :return: A ResultSet containing PageElements.
        """
        return self._find_all(name, attrs, text, limit, self.next_elements,
                              **kwargs)
    findAllNext = find_all_next  # BS3

    def find_next_sibling(self, name=None, attrs={}, text=None, **kwargs):
        """Find the closest sibling to this PageElement that matches the
        given criteria and appears later in the document.

        All find_* methods take a common set of arguments. See the
        online documentation for detailed explanations.

        :param name: A filter on tag name.
        :param attrs: A dictionary of filters on attribute values.
        :param text: A filter for a NavigableString with specific text.
        :kwargs: A dictionary of filters on attribute values.
        :return: A PageElement.
        :rtype: bs4.element.Tag | bs4.element.NavigableString
        """
        return self._find_one(self.find_next_siblings, name, attrs, text,
                              **kwargs)
    findNextSibling = find_next_sibling  # BS3

    def find_next_siblings(self, name=None, attrs={}, text=None, limit=None,
                           **kwargs):
        """Find all siblings of this PageElement that match the given criteria
        and appear later in the document.

        All find_* methods take a common set of arguments. See the online
        documentation for detailed explanations.

        :param name: A filter on tag name.
        :param attrs: A dictionary of filters on attribute values.
        :param text: A filter for a NavigableString with specific text.
        :param limit: Stop looking after finding this many results.
        :kwargs: A dictionary of filters on attribute values.
        :return: A ResultSet of PageElements.
        :rtype: bs4.element.ResultSet
        """
        return self._find_all(name, attrs, text, limit,
                              self.next_siblings, **kwargs)
    findNextSiblings = find_next_siblings   # BS3
    fetchNextSiblings = find_next_siblings  # BS2

    def find_previous(self, name=None, attrs={}, text=None, **kwargs):
        """Look backwards in the document from this PageElement and find the
        first PageElement that matches the given criteria.

        All find_* methods take a common set of arguments. See the online
        documentation for detailed explanations.

        :param name: A filter on tag name.
        :param attrs: A dictionary of filters on attribute values.
        :param text: A filter for a NavigableString with specific text.
        :kwargs: A dictionary of filters on attribute values.
        :return: A PageElement.
        :rtype: bs4.element.Tag | bs4.element.NavigableString
        """
        return self._find_one(
            self.find_all_previous, name, attrs, text, **kwargs)
    findPrevious = find_previous  # BS3

    def find_all_previous(self, name=None, attrs={}, text=None, limit=None,
                          **kwargs):
        """Look backwards in the document from this PageElement and find all
        PageElements that match the given criteria.

        All find_* methods take a common set of arguments. See the online
        documentation for detailed explanations.

        :param name: A filter on tag name.
        :param attrs: A dictionary of filters on attribute values.
        :param text: A filter for a NavigableString with specific text.
        :param limit: Stop looking after finding this many results.
        :kwargs: A dictionary of filters on attribute values.
        :return: A ResultSet of PageElements.
        :rtype: bs4.element.ResultSet
        """
        return self._find_all(name, attrs, text, limit, self.previous_elements,
                              **kwargs)
    findAllPrevious = find_all_previous  # BS3
    fetchPrevious = find_all_previous    # BS2

    def find_previous_sibling(self, name=None, attrs={}, text=None, **kwargs):
        """Returns the closest sibling to this PageElement that matches the
        given criteria and appears earlier in the document.

        All find_* methods take a common set of arguments. See the online
        documentation for detailed explanations.

        :param name: A filter on tag name.
        :param attrs: A dictionary of filters on attribute values.
        :param text: A filter for a NavigableString with specific text.
        :kwargs: A dictionary of filters on attribute values.
        :return: A PageElement.
        :rtype: bs4.element.Tag | bs4.element.NavigableString
        """
        return self._find_one(self.find_previous_siblings, name, attrs, text,
                              **kwargs)
    findPreviousSibling = find_previous_sibling  # BS3

    def find_previous_siblings(self, name=None, attrs={}, text=None,
                               limit=None, **kwargs):
        """Returns all siblings to this PageElement that match the
        given criteria and appear earlier in the document.

        All find_* methods take a common set of arguments. See the online
        documentation for detailed explanations.

        :param name: A filter on tag name.
        :param attrs: A dictionary of filters on attribute values.
        :param text: A filter for a NavigableString with specific text.
        :param limit: Stop looking after finding this many results.
        :kwargs: A dictionary of filters on attribute values.
        :return: A ResultSet of PageElements.
        :rtype: bs4.element.ResultSet
        """
        return self._find_all(name, attrs, text, limit,
                              self.previous_siblings, **kwargs)
    findPreviousSiblings = find_previous_siblings   # BS3
    fetchPreviousSiblings = find_previous_siblings  # BS2

    def find_parent(self, name=None, attrs={}, **kwargs):
        """Find the closest parent of this PageElement that matches the given
        criteria.

        All find_* methods take a common set of arguments. See the online
        documentation for detailed explanations.

        :param name: A filter on tag name.
        :param attrs: A dictionary of filters on attribute values.
        :kwargs: A dictionary of filters on attribute values.

        :return: A PageElement, or None if no parent matches.
        :rtype: bs4.element.Tag | bs4.element.NavigableString
        """
        # NOTE: We can't use _find_one because findParents takes a different
        # set of arguments.
        r = None
        l = self.find_parents(name, attrs, 1, **kwargs)
        if l:
            r = l[0]
        return r
    findParent = find_parent  # BS3

    def find_parents(self, name=None, attrs={}, limit=None, **kwargs):
        """Find all parents of this PageElement that match the given criteria.

        All find_* methods take a common set of arguments. See the online
        documentation for detailed explanations.

        :param name: A filter on tag name.
        :param attrs: A dictionary of filters on attribute values.
        :param limit: Stop looking after finding this many results.
        :kwargs: A dictionary of filters on attribute values.

        :return: A ResultSet of PageElements.
        :rtype: bs4.element.ResultSet
        """
        return self._find_all(name, attrs, None, limit, self.parents,
                              **kwargs)
    findParents = find_parents   # BS3
    fetchParents = find_parents  # BS2

    @property
    def next(self):
        """The PageElement, if any, that was parsed just after this one.

        :return: A PageElement.
        :rtype: bs4.element.Tag | bs4.element.NavigableString
        """
        return self.next_element

    @property
    def previous(self):
        """The PageElement, if any, that was parsed just before this one.

        :return: A PageElement.
        :rtype: bs4.element.Tag | bs4.element.NavigableString
        """
        return self.previous_element

    # These methods do the real heavy lifting.

    def _find_one(self, method, name, attrs, text, **kwargs):
        """Call a find-all style method with a limit of 1 and return the
        single result.

        :param method: A bound find_all_* / find_*s method.
        :return: The first element of the method's result, or None if
            the result is empty.
        """
        r = None
        l = method(name, attrs, text, 1, **kwargs)
        if l:
            r = l[0]
        return r

    def _find_all(self, name, attrs, text, limit, generator, **kwargs):
        """Iterates over a generator looking for things that match.

        :param name: A filter on tag name, or a SoupStrainer to use as-is.
        :param attrs: A dictionary of filters on attribute values.
        :param text: A filter on text. If None, a 'string' keyword
            argument is used in its place when present.
        :param limit: Stop looking after finding this many results.
        :param generator: An iterator over the candidate PageElements.
        :kwargs: A dictionary of filters on attribute values.
        :return: A ResultSet of matches.
        """
        if text is None and 'string' in kwargs:
            text = kwargs.pop('string')

        if isinstance(name, SoupStrainer):
            strainer = name
        else:
            strainer = SoupStrainer(name, attrs, text, **kwargs)

        if text is None and not limit and not attrs and not kwargs:
            if name is True or name is None:
                # Optimization to find all tags.
                result = (element for element in generator
                          if isinstance(element, Tag))
                return ResultSet(strainer, result)
            elif isinstance(name, str):
                # Optimization to find all tags with a given name.
                if name.count(':') == 1:
                    # This is a name with a prefix. If this is a namespace-aware document,
                    # we need to match the local name against tag.name. If not,
                    # we need to match the fully-qualified name against tag.name.
                    prefix, local_name = name.split(':', 1)
                else:
                    prefix = None
                    local_name = name

                def name_matches(tag):
                    # Match either the fully-qualified name, or the
                    # local name together with the prefix (if any).
                    return (
                        tag.name == name
                        or (tag.name == local_name
                            and (prefix is None or tag.prefix == prefix))
                    )

                # The Tag check must guard the whole name test, so that
                # non-Tag elements never reach the .prefix comparison.
                result = (element for element in generator
                          if isinstance(element, Tag)
                          and name_matches(element))
                return ResultSet(strainer, result)
        results = ResultSet(strainer)
        for i in generator:
            if i:
                found = strainer.search(i)
                if found:
                    results.append(found)
                    if limit and len(results) >= limit:
                        break
        return results

    # These generators can be used to navigate starting from both
    # NavigableStrings and Tags.
    @property
    def next_elements(self):
        """All PageElements that were parsed after this one.

        :yield: A sequence of PageElements.
        """
        i = self.next_element
        while i is not None:
            yield i
            i = i.next_element

    @property
    def next_siblings(self):
        """All PageElements that are siblings of this one but were parsed
        later.

        :yield: A sequence of PageElements.
        """
        i = self.next_sibling
        while i is not None:
            yield i
            i = i.next_sibling

    @property
    def previous_elements(self):
        """All PageElements that were parsed before this one.

        :yield: A sequence of PageElements.
        """
        i = self.previous_element
        while i is not None:
            yield i
            i = i.previous_element

    @property
    def previous_siblings(self):
        """All PageElements that are siblings of this one but were parsed
        earlier.

        :yield: A sequence of PageElements.
        """
        i = self.previous_sibling
        while i is not None:
            yield i
            i = i.previous_sibling

    @property
    def parents(self):
        """All PageElements that are parents of this PageElement.

        :yield: A sequence of PageElements.
        """
        i = self.parent
        while i is not None:
            yield i
            i = i.parent

    @property
    def decomposed(self):
        """Check whether a PageElement has been decomposed.

        :rtype: bool
        """
        return getattr(self, '_decomposed', False) or False

    # Old non-property versions of the generators, for backwards
    # compatibility with BS3.
    def nextGenerator(self):
        return self.next_elements

    def nextSiblingGenerator(self):
        return self.next_siblings

    def previousGenerator(self):
        return self.previous_elements

    def previousSiblingGenerator(self):
        return self.previous_siblings

    def parentGenerator(self):
        return self.parents
873
874
class NavigableString(str, PageElement):
    """A Python Unicode string that is part of a parse tree.

    When Beautiful Soup parses the markup <b>penguin</b>, it will
    create a NavigableString for the string "penguin".
    """

    # Text placed before and after the string by output_ready().
    PREFIX = ''
    SUFFIX = ''

    # We can't tell just by looking at a string whether it's contained
    # in an XML document or an HTML document.
    known_xml = None

    def __new__(cls, value):
        """Create a new NavigableString.

        When unpickling a NavigableString, this method is called with
        the string in DEFAULT_OUTPUT_ENCODING. That encoding needs to be
        passed in to the superclass's __new__ or the superclass won't know
        how to handle non-ASCII characters.

        :param value: A str, or an object to be decoded using
            DEFAULT_OUTPUT_ENCODING.
        """
        if not isinstance(value, str):
            instance = str.__new__(cls, value, DEFAULT_OUTPUT_ENCODING)
        else:
            instance = str.__new__(cls, value)
        instance.setup()
        return instance

    def __copy__(self):
        """Return a new object of the same class with the same contents.

        The copy is not connected to the parse tree.
        """
        cls = type(self)
        return cls(self)

    def __getnewargs__(self):
        """Return the arguments to pass to __new__ when unpickling."""
        return (str(self),)

    def __getattr__(self, attr):
        """text.string gives you text. This is for backwards
        compatibility for Navigable*String, but for CData* it lets you
        get the string without the CData wrapper.

        :raise AttributeError: For any attribute other than 'string'.
        """
        if attr != 'string':
            raise AttributeError(
                "'%s' object has no attribute '%s'" % (
                    self.__class__.__name__, attr))
        return self

    def output_ready(self, formatter="minimal"):
        """Run the string through the provided formatter.

        :param formatter: A Formatter object, or a string naming one of
            the standard formatters.
        :return: The formatted string between PREFIX and SUFFIX.
        """
        formatted = self.format_string(self, formatter)
        return self.PREFIX + formatted + self.SUFFIX

    @property
    def name(self):
        """Since a NavigableString is not a Tag, it has no .name.

        This property is implemented so that code like this doesn't crash
        when run on a mixture of Tag and NavigableString objects:
            [x.name for x in tag.children]
        """
        return None

    @name.setter
    def name(self, name):
        """Prevent NavigableString.name from ever being set."""
        raise AttributeError("A NavigableString cannot be given a name.")
947
948
class PreformattedString(NavigableString):
    """A NavigableString not subject to the normal formatting rules.

    This is an abstract class used for special kinds of strings such
    as comments (the Comment class) and CDATA blocks (the CData
    class). Subclasses supply the PREFIX and SUFFIX that surround the
    text on output.
    """

    PREFIX = ''
    SUFFIX = ''

    def output_ready(self, formatter=None):
        """Make this string ready for output by adding any subclass-specific
        prefix or suffix.

        :param formatter: A Formatter object, or a string naming one
            of the standard formatters. The string will be passed into the
            Formatter, but only to trigger any side effects: the return
            value is ignored.

        :return: The string, with any subclass-specific prefix and
            suffix added on.
        """
        if formatter is not None:
            # Called only for its side effects; the formatted result is
            # deliberately discarded so the text is emitted unchanged.
            self.format_string(self, formatter)
        return self.PREFIX + self + self.SUFFIX
975
class CData(PreformattedString):
    """A CDATA block; rendered between '<![CDATA[' and ']]>'."""

    PREFIX = '<![CDATA['
    SUFFIX = ']]>'
980
class ProcessingInstruction(PreformattedString):
    """A SGML processing instruction; rendered between '<?' and '>'."""

    PREFIX = '<?'
    SUFFIX = '>'
986
class XMLProcessingInstruction(ProcessingInstruction):
    """An XML processing instruction; rendered between '<?' and '?>'."""

    PREFIX = '<?'
    SUFFIX = '?>'
991
class Comment(PreformattedString):
    """An HTML or XML comment; rendered between '<!--' and '-->'."""

    PREFIX = '<!--'
    SUFFIX = '-->'
996
997
class Declaration(PreformattedString):
    """An XML declaration; rendered between '<?' and '?>'."""

    PREFIX = '<?'
    SUFFIX = '?>'
1002
1003
class Doctype(PreformattedString):
    """A document type declaration; rendered between '<!DOCTYPE ' and
    '>' followed by a newline."""

    @classmethod
    def for_name_and_ids(cls, name, pub_id, system_id):
        """Generate an appropriate document type declaration for a given
        public ID and system ID.

        :param name: The name of the document's root element, e.g. 'html'.
            A false value is treated as the empty string.
        :param pub_id: The Formal Public Identifier for this document type,
            e.g. '-//W3C//DTD XHTML 1.1//EN'
        :param system_id: The system identifier for this document type,
            e.g. 'http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd'

        :return: An instance of `cls` (a Doctype, or the subclass the
            method was called on).
        """
        value = name or ''
        if pub_id is not None:
            value += ' PUBLIC "%s"' % pub_id
            # With a public ID, the system ID (if any) follows it
            # without its own keyword.
            if system_id is not None:
                value += ' "%s"' % system_id
        elif system_id is not None:
            value += ' SYSTEM "%s"' % system_id

        # Build through `cls` rather than a hard-coded Doctype so that
        # subclasses get instances of their own type.
        return cls(value)

    PREFIX = '<!DOCTYPE '
    SUFFIX = '>\n'
1031
1032
class Stylesheet(NavigableString):
    """A NavigableString holding a stylesheet (probably CSS).

    The separate class lets embedded stylesheets be told apart from
    textual content.
    """
1040
1041
class Script(NavigableString):
    """A NavigableString holding an executable script (probably
    Javascript).

    The separate class lets executable code be told apart from
    textual content.
    """
1049
1050
class TemplateString(NavigableString):
    """A NavigableString for a string found inside an HTML template
    that is embedded in a larger document.

    The separate class lets such strings be told apart from the main
    body of the document.
    """
1058
1059
1060 class Tag(PageElement):
1061 """Represents an HTML or XML tag that is part of a parse tree, along
1062 with its attributes and contents.
1063
1064 When Beautiful Soup parses the markup <b>penguin</b>, it will
1065 create a Tag object representing the <b> tag.
1066 """
1067
def __init__(self, parser=None, builder=None, name=None, namespace=None,
             prefix=None, attrs=None, parent=None, previous=None,
             is_xml=None, sourceline=None, sourcepos=None,
             can_be_empty_element=None, cdata_list_attributes=None,
             preserve_whitespace_tags=None
             ):
    """Basic constructor.

    :param parser: A BeautifulSoup object. Only its class is kept.
    :param builder: A TreeBuilder. When given, it decides the values of
        can_be_empty_element, cdata_list_attributes and
        preserve_whitespace_tags, and the same-named arguments are ignored.
    :param name: The name of the tag. Required.
    :param namespace: The URI of this Tag's XML namespace, if any.
    :param prefix: The prefix for this Tag's XML namespace, if any.
    :param attrs: A dictionary of this Tag's attribute values.
    :param parent: The PageElement to use as this Tag's parent.
    :param previous: The PageElement that was parsed immediately before
        this tag.
    :param is_xml: If True, this is an XML tag. Otherwise, this is an
        HTML tag. Only consulted when no builder is given.
    :param sourceline: The line number where this tag was found in its
        source document.
    :param sourcepos: The character position within `sourceline` where this
        tag was found.
    :param can_be_empty_element: If True, this tag should be
        represented as <tag/>. If False, this tag should be represented
        as <tag></tag>.
    :param cdata_list_attributes: A list of attributes whose values should
        be treated as CDATA if they ever show up on this tag.
    :param preserve_whitespace_tags: A list of tag names whose contents
        should have their whitespace preserved.
    :raise ValueError: If `name` is None.
    """
    # Only the parser's class is remembered, never the parser object
    # itself: that lets extracted chunks be garbage-collected.
    self.parser_class = None if parser is None else parser.__class__

    if name is None:
        raise ValueError("No value provided for new tag's name.")
    self.name = name
    self.namespace = namespace
    self.prefix = prefix

    # Position information is stored only when some was supplied and
    # the builder (if there is one) keeps line numbers.
    position_given = sourceline is not None or sourcepos is not None
    if (not builder or builder.store_line_numbers) and position_given:
        self.sourceline = sourceline
        self.sourcepos = sourcepos

    if attrs is None:
        attrs = {}
    elif attrs and builder is not None and builder.cdata_list_attributes:
        attrs = builder._replace_cdata_list_attribute_values(
            self.name, attrs)
    else:
        # Work on a private copy of the caller's mapping.
        attrs = dict(attrs)

    # If possible, determine ahead of time whether this tag is an
    # XML tag.
    self.known_xml = builder.is_xml if builder else is_xml
    self.attrs = attrs
    self.contents = []
    self.setup(parent, previous)
    self.hidden = False

    if builder is not None:
        # Set up any substitutions for this tag, such as the charset
        # in a META tag.
        builder.set_up_substitutions(self)

        # Ask the TreeBuilder whether this tag might be an
        # empty-element tag.
        self.can_be_empty_element = builder.can_be_empty_element(name)

        # The attributes of this tag that might need to be treated as
        # a list. For performance reasons the builder's whole data
        # structure is stored instead of asking the question of every
        # tag: asking would require building a new data structure
        # every time, and (unlike can_be_empty_element) this is almost
        # never needed.
        self.cdata_list_attributes = builder.cdata_list_attributes

        # The names that might cause this tag to be treated as a
        # whitespace-preserved tag.
        self.preserve_whitespace_tags = builder.preserve_whitespace_tags
    else:
        # Without a TreeBuilder, use whatever values were passed in.
        # They're probably None, unless this is a copy of some other
        # tag.
        self.can_be_empty_element = can_be_empty_element
        self.cdata_list_attributes = cdata_list_attributes
        self.preserve_whitespace_tags = preserve_whitespace_tags

parserClass = _alias("parser_class")  # BS3
1165
def __copy__(self):
    """A copy of a Tag is a new Tag, unconnected to the parse tree.
    Its contents are a copy of the old Tag's contents.

    :return: A new object of the same class as this one, with the same
        name, namespace, prefix and attributes, and with a copy of
        every child appended to it.
    """
    def real_attribute(attr_name):
        # `builder`, `sourceline` and `sourcepos` are not always set on
        # a Tag. Plain attribute access would then fall through to
        # Tag.__getattr__, which searches for a *child tag* of that
        # name, so a child named e.g. <builder> would be mistaken for
        # the tree builder. Look only at genuine attributes.
        try:
            return object.__getattribute__(self, attr_name)
        except AttributeError:
            return None

    clone = type(self)(
        None, real_attribute('builder'), self.name, self.namespace,
        self.prefix, self.attrs, is_xml=self._is_xml,
        sourceline=real_attribute('sourceline'),
        sourcepos=real_attribute('sourcepos'),
        can_be_empty_element=self.can_be_empty_element,
        cdata_list_attributes=self.cdata_list_attributes,
        preserve_whitespace_tags=self.preserve_whitespace_tags
    )
    # Carry over the values the constructor may have replaced or reset.
    for attr in ('can_be_empty_element', 'hidden'):
        setattr(clone, attr, getattr(self, attr))
    for child in self.contents:
        clone.append(child.__copy__())
    return clone
1183
@property
def is_empty_element(self):
    """Is this tag an empty-element tag? (aka a self-closing tag)

    A tag with contents is never an empty-element tag.

    For a tag without contents, the answer is the tag's
    can_be_empty_element value, which depends on the builder used to
    create the tag. If the builder has a designated list of
    empty-element tags, then only a tag whose name shows up in that
    list is considered an empty-element tag. If the builder has no
    such list, then any tag with no contents is an empty-element tag.
    """
    if len(self.contents) != 0:
        return False
    return self.can_be_empty_element
isSelfClosing = is_empty_element  # BS3
1201
@property
def string(self):
    """Convenience property to get the single string within this
    PageElement.

    TODO It might make sense to have NavigableString.string return
    itself.

    :return: None unless this element has exactly one child. If that
        child is a NavigableString, the child itself; otherwise the
        child's own 'string' attribute, recursively.
    """
    children = self.contents
    if len(children) != 1:
        return None
    only_child = children[0]
    return (only_child if isinstance(only_child, NavigableString)
            else only_child.string)

@string.setter
def string(self, string):
    """Replace this PageElement's contents with `string`.

    Every existing child is removed, then a new object of the same
    class as `string` is appended.
    """
    self.clear()
    replacement = string.__class__(string)
    self.append(replacement)
1228
def _all_strings(self, strip=False, types=(NavigableString, CData)):
    """Yield all strings of certain classes, possibly stripping them.

    :param strip: If True, all strings will be stripped before being
        yielded, and strings that become empty are skipped.

    :types: A tuple of NavigableString subclasses. Only descendants
        whose exact class appears in the tuple are yielded. By
        default, this means only NavigableString and CData objects
        will be considered. So no comments, processing instructions,
        etc. If None, every NavigableString descendant is yielded.

    :yield: A sequence of strings.
    """
    for node in self.descendants:
        if types is None:
            selected = isinstance(node, NavigableString)
        else:
            # An exact class match: subclasses of the listed classes
            # are not selected.
            selected = type(node) in types
        if not selected:
            continue
        if strip:
            node = node.strip()
            if len(node) == 0:
                continue
        yield node

strings = property(_all_strings)
1256
@property
def stripped_strings(self):
    """Yield every string produced by _all_strings() with stripping
    turned on.

    :yield: A sequence of stripped strings.
    """
    yield from self._all_strings(True)
1265
def get_text(self, separator="", strip=False,
             types=(NavigableString, CData)):
    """Concatenate the strings found beneath this element.

    :param separator: Strings will be concatenated using this separator.

    :param strip: If True, strings will be stripped before being
        concatenated.

    :types: A tuple of NavigableString subclasses. Any strings of
        a subclass not found in this list will be ignored. By
        default, this means only NavigableString and CData objects
        will be considered. So no comments, processing instructions,
        stylesheets, etc.

    :return: A string.
    """
    pieces = list(self._all_strings(strip, types=types))
    return separator.join(pieces)
getText = get_text
text = property(get_text)
1287
def decompose(self):
    """Recursively destroys this PageElement and its children.

    The element is first extracted from the tree. Then it, and every
    element reached from it by following next_element links, has its
    instance dictionary wiped and is left with only an empty
    `contents` list and a `_decomposed` flag set to True.

    The behavior of a decomposed PageElement is undefined and you
    should never use one for anything, but if you need to _check_
    whether an element has been decomposed, you can use the
    `decomposed` property.
    """
    self.extract()
    node = self
    while node is not None:
        # Remember the successor first: clearing __dict__ erases the link.
        following = node.next_element
        node.__dict__.clear()
        node.contents = []
        node._decomposed = True
        node = following
1307
def clear(self, decompose=False):
    """Wipe out all children of this PageElement by calling extract()
    on them.

    :param decompose: If this is True, decompose() (a more
        destructive method) will be called instead of extract() on
        every child that is a Tag. Other children are still extracted.
    """
    # Walk a snapshot, because removing a child changes self.contents.
    for element in self.contents[:]:
        if decompose and isinstance(element, Tag):
            element.decompose()
        else:
            element.extract()
1324
def smooth(self):
    """Smooth out this element's children by consolidating consecutive
    strings.

    This makes pretty-printed output look more natural following a
    lot of operations that modified the tree. Child tags are smoothed
    recursively.
    """
    def mergeable(node):
        # Ordinary strings only: PreformattedString objects (comments,
        # CDATA blocks and the like) are never merged.
        return (isinstance(node, NavigableString)
                and not isinstance(node, PreformattedString))

    # Record the first position of every pair of children that need
    # to be consolidated. This is done rather than copying
    # self.contents, since in most cases very few strings will be
    # affected.
    merge_points = []
    for position, current in enumerate(self.contents):
        if isinstance(current, Tag):
            # Recursively smooth children.
            current.smooth()
        if position == len(self.contents) - 1:
            # The last item in .contents has no successor to merge with.
            continue
        following = self.contents[position + 1]
        if mergeable(current) and mergeable(following):
            merge_points.append(position)

    # Process the recorded positions in reverse order, so that
    # removing items from .contents won't affect the remaining
    # positions.
    for position in reversed(merge_points):
        first = self.contents[position]
        second = self.contents[position + 1]
        second.extract()
        merged = NavigableString(first + second)
        first.replace_with(merged)
1362
def index(self, element):
    """Find the index of a child by identity, not value.

    Avoids issues with tag.contents.index(element) getting the
    index of equal elements.

    :param element: Look for this PageElement in `self.contents`.
    :return: The position of the first child that is `element`.
    :raise ValueError: If no child is that very object.
    """
    identical = (position for position, child in enumerate(self.contents)
                 if child is element)
    found = next(identical, None)
    if found is None:
        raise ValueError("Tag.index: element not in tag")
    return found
1375
def get(self, key, default=None):
    """Look up one attribute of this tag.

    :param key: The attribute to look for.
    :param default: Returned when the tag has no such attribute.
    :return: The attribute's value, or `default`.
    """
    attributes = self.attrs
    return attributes.get(key, default)
1381
def get_attribute_list(self, key, default=None):
    """The same as get(), but always returns a list.

    :param key: The attribute to look for.
    :param default: Use this value if the attribute is not present
        on this PageElement.
    :return: The value itself if it is already a list; otherwise a
        one-item list wrapping the value.
    """
    found = self.get(key, default)
    return found if isinstance(found, list) else [found]
1395
def has_attr(self, key):
    """Report whether `key` is one of this PageElement's attribute
    names."""
    attributes = self.attrs
    return key in attributes
1399
def __hash__(self):
    """Hash a Tag by hashing its string rendering."""
    rendered = str(self)
    return rendered.__hash__()
1402
def __getitem__(self, key):
    """tag[key] looks up the 'key' attribute of the Tag in its
    attribute dictionary; a missing attribute raises the dictionary's
    own exception."""
    attributes = self.attrs
    return attributes[key]
1407
def __iter__(self):
    "Iterating over a Tag walks its list of contents."
    children = self.contents
    return iter(children)
1411
def __len__(self):
    "A Tag is as long as its list of contents."
    children = self.contents
    return len(children)
1415
def __contains__(self, x):
    "`x in tag` tests membership in the Tag's list of contents."
    children = self.contents
    return x in children
1418
def __bool__(self):
    """A tag is always true, even when it has no contents (so that
    its length does not decide its truth value)."""
    return True
1422
def __setitem__(self, key, value):
    """tag[key] = value stores `value` as the 'key' attribute of the
    tag."""
    attributes = self.attrs
    attributes[key] = value
1427
def __delitem__(self, key):
    """del tag[key] removes the 'key' attribute from the tag. A
    missing attribute is silently ignored."""
    attributes = self.attrs
    attributes.pop(key, None)
1431
def __call__(self, *args, **kwargs):
    """Calling a Tag like a function forwards every argument to its
    find_all() method. Eg. tag('a') returns whatever find_all('a')
    returns: all the A tags found within this tag."""
    search = self.find_all
    return search(*args, **kwargs)
1437
def __getattr__(self, tag):
    """Calling tag.subtag is the same as calling tag.find(name="subtag")

    This is only reached for names that normal attribute lookup did
    not find. The BS3 spelling tag.subtagTag still works, but emits a
    deprecation warning.

    :param tag: The attribute name that was requested.
    :return: The result of self.find() for that name.
    :raise AttributeError: If `tag` starts with "__" or is "contents".
    """
    if len(tag) > 3 and tag.endswith('Tag'):
        # BS3: soup.aTag -> soup.find("a")
        tag_name = tag[:-3]
        warnings.warn(
            '.%(name)sTag is deprecated, use .find("%(name)s") instead. If you really were looking for a tag called %(name)sTag, use .find("%(name)sTag")' % dict(
                name=tag_name
            )
        )
        return self.find(tag_name)
    # We special case contents to avoid recursion.
    if not tag.startswith("__") and tag != "contents":
        return self.find(tag)
    # Report the bare class name, matching Python's own AttributeError
    # wording and the message used by NavigableString.__getattr__.
    raise AttributeError(
        "'%s' object has no attribute '%s'" % (
            self.__class__.__name__, tag))
1455
def __eq__(self, other):
    """Returns true iff this Tag has the same name, the same attributes,
    and the same contents (recursively) as `other`."""
    if self is other:
        return True
    # `other` must at least look like a Tag.
    if not all(hasattr(other, required)
               for required in ('name', 'attrs', 'contents')):
        return False
    if (self.name != other.name
            or self.attrs != other.attrs
            or len(self) != len(other)):
        return False
    # Compare the children pairwise, position by position.
    return not any(
        my_child != other.contents[position]
        for position, my_child in enumerate(self.contents))
1472
def __ne__(self, other):
    """The negation of __eq__: true iff this Tag is not equal to
    `other` as defined there."""
    same = self == other
    return not same
1477
def __repr__(self, encoding="unicode-escape"):
    """Renders this PageElement as a string.

    :param encoding: The encoding to use (Python 2 only).
    :return: Under Python 2, a bytestring; under Python 3,
        a Unicode string.
    """
    # "The return value must be a string object": Unicode under
    # Python 3, a bytestring under Python 2. By convention the
    # Python 2 value should also be an ASCII string, hence the
    # default encoding.
    return self.decode() if PY3K else self.encode(encoding)
1493
def __unicode__(self):
    """Return the Unicode rendering of this PageElement produced by
    decode()."""
    rendered = self.decode()
    return rendered
1497
def __str__(self):
    """Renders this PageElement as a generic string.

    :return: Under Python 2, a UTF-8 bytestring; under Python 3,
        a Unicode string.
    """
    return self.decode() if PY3K else self.encode()

# Under Python 3 all three conversions are the Unicode rendering.
if PY3K:
    __str__ = __repr__ = __unicode__
1511
def encode(self, encoding=DEFAULT_OUTPUT_ENCODING,
           indent_level=None, formatter="minimal",
           errors="xmlcharrefreplace"):
    """Render a bytestring representation of this PageElement and its
    contents.

    :param encoding: The destination encoding.
    :param indent_level: Each line of the rendering will be
        indented this many spaces. Used internally in
        recursive calls while pretty-printing.
    :param formatter: A Formatter object, or a string naming one of
        the standard formatters.
    :param errors: An error handling strategy such as
        'xmlcharrefreplace'. This value is passed along into
        encode() and its value should be one of the constants
        defined by Python.
    :return: A bytestring.

    """
    # Two steps: render the data structure as Unicode, then encode
    # that Unicode.
    unicode_markup = self.decode(indent_level, encoding, formatter)
    return unicode_markup.encode(encoding, errors)
1535
def decode(self, indent_level=None,
           eventual_encoding=DEFAULT_OUTPUT_ENCODING,
           formatter="minimal"):
    """Render a Unicode representation of this PageElement and its
    contents.

    :param indent_level: Each line of the rendering will be
        indented this many spaces. Used internally in
        recursive calls while pretty-printing.
    :param eventual_encoding: The tag is destined to be
        encoded into this encoding. This method is _not_
        responsible for performing that encoding. This information
        is passed in so that it can be substituted in if the
        document contains a <META> tag that mentions the document's
        encoding.
    :param formatter: A Formatter object, or a string naming one of
        the standard formatters.
    :return: The rendered markup. For a hidden element this is the
        rendering of its contents only.
    """

    # Resolve a non-Formatter `formatter` into a Formatter object once,
    # so the lookup is not repeated in the recursive calls.
    if not isinstance(formatter, Formatter):
        formatter = self.formatter_for_name(formatter)

    # Render each attribute as either a bare name or name="value".
    rendered_attrs = []
    for key, val in formatter.attributes(self):
        if val is None:
            rendered_attrs.append(key)
            continue
        if isinstance(val, (list, tuple)):
            val = ' '.join(val)
        elif not isinstance(val, str):
            val = str(val)
        elif (
                isinstance(val, AttributeValueWithCharsetSubstitution)
                and eventual_encoding is not None
        ):
            val = val.encode(eventual_encoding)

        escaped = formatter.attribute_value(val)
        rendered_attrs.append(
            str(key) + '='
            + formatter.quoted_attribute_value(escaped))

    namespace_prefix = self.prefix + ":" if self.prefix else ''

    # An empty-element tag gets a closing marker inside the opening
    # tag; any other tag gets a separate closing tag.
    if self.is_empty_element:
        void_close = formatter.void_element_close_prefix or ''
        closing_tag = ''
    else:
        void_close = ''
        closing_tag = '</%s%s>' % (namespace_prefix, self.name)

    pretty_print = self._should_pretty_print(indent_level)
    if indent_level is None:
        indent_space = ''
    else:
        indent_space = (' ' * (indent_level - 1))
    if pretty_print:
        closing_indent = indent_space
        indent_contents = indent_level + 1
    else:
        closing_indent = ''
        indent_contents = None
    contents = self.decode_contents(
        indent_contents, eventual_encoding, formatter
    )

    if self.hidden:
        # This is the 'document root' object.
        return contents

    pieces = []
    attribute_string = ''
    if rendered_attrs:
        attribute_string = ' ' + ' '.join(rendered_attrs)
    if indent_level is not None:
        # Even if this particular tag is not pretty-printed,
        # we should indent up to the start of the tag.
        pieces.append(indent_space)
    pieces.append('<%s%s%s%s>' % (
        namespace_prefix, self.name, attribute_string, void_close))
    if pretty_print:
        pieces.append("\n")
    pieces.append(contents)
    if pretty_print and contents and contents[-1] != "\n":
        pieces.append("\n")
    if pretty_print and closing_tag:
        pieces.append(closing_indent)
    pieces.append(closing_tag)
    if indent_level is not None and closing_tag and self.next_sibling:
        # Even if this particular tag is not pretty-printed,
        # we're now done with the tag, and we should add a
        # newline if appropriate.
        pieces.append("\n")
    return ''.join(pieces)
1636
def _should_pretty_print(self, indent_level):
    """Should this tag be pretty-printed?

    Most of them should, but some (such as <pre> in HTML
    documents) should not.

    :param indent_level: None means pretty-printing is off entirely.
    :return: False when `indent_level` is None or when this tag's name
        is listed in its preserve_whitespace_tags; True otherwise.
    """
    if indent_level is None:
        return False
    preserved = self.preserve_whitespace_tags
    return not preserved or self.name not in preserved
1650
def prettify(self, encoding=None, formatter="minimal"):
    """Pretty-print this PageElement as a string.

    :param encoding: The eventual encoding of the string. If this is None,
        a Unicode string will be returned.
    :param formatter: A Formatter object, or a string naming one of
        the standard formatters.
    :return: A Unicode string (if encoding==None) or a bytestring
        (otherwise).
    """
    # True is passed as the indent level to switch pretty-printing on.
    if encoding is not None:
        return self.encode(encoding, True, formatter=formatter)
    return self.decode(True, formatter=formatter)
1665
def decode_contents(self, indent_level=None,
                    eventual_encoding=DEFAULT_OUTPUT_ENCODING,
                    formatter="minimal"):
    """Renders the contents of this tag as a Unicode string.

    :param indent_level: Each line of the rendering will be
        indented this many spaces. Used internally in
        recursive calls while pretty-printing.

    :param eventual_encoding: The tag is destined to be
        encoded into this encoding. decode_contents() is _not_
        responsible for performing that encoding. This information
        is passed in so that it can be substituted in if the
        document contains a <META> tag that mentions the document's
        encoding.

    :param formatter: A Formatter object, or a string naming one of
        the standard Formatters.

    :return: The renderings of all children, joined together.
    """
    # Resolve a string formatter into a Formatter object once, so the
    # lookup is not repeated for every child.
    if not isinstance(formatter, Formatter):
        formatter = self.formatter_for_name(formatter)

    pretty_print = (indent_level is not None)
    chunks = []
    for child in self:
        text = None
        if isinstance(child, NavigableString):
            text = child.output_ready(formatter)
        elif isinstance(child, Tag):
            # A child tag renders itself, indentation included.
            chunks.append(child.decode(indent_level, eventual_encoding,
                                       formatter))
        preserve_whitespace = (
            self.preserve_whitespace_tags and self.name in self.preserve_whitespace_tags
        )
        if text and indent_level and not preserve_whitespace:
            text = text.strip()
        if not text:
            continue
        # A string child gets its own indented line, unless this tag
        # preserves whitespace.
        own_line = pretty_print and not preserve_whitespace
        if own_line:
            chunks.append(" " * (indent_level - 1))
        chunks.append(text)
        if own_line:
            chunks.append("\n")
    return ''.join(chunks)
1711
def encode_contents(
        self, indent_level=None, encoding=DEFAULT_OUTPUT_ENCODING,
        formatter="minimal"):
    """Renders the contents of this PageElement as a bytestring.

    :param indent_level: Each line of the rendering will be
        indented this many spaces. Used internally in
        recursive calls while pretty-printing.

    :param encoding: The bytestring will be in this encoding.

    :param formatter: A Formatter object, or a string naming one of
        the standard Formatters.

    :return: A bytestring.
    """
    unicode_contents = self.decode_contents(
        indent_level, encoding, formatter)
    return unicode_contents.encode(encoding)
1730
1731 # Old method for BS3 compatibility
def renderContents(self, encoding=DEFAULT_OUTPUT_ENCODING,
                   prettyPrint=False, indentLevel=0):
    """Deprecated method for BS3 compatibility.

    Delegates to encode_contents(). `indentLevel` is only honored
    when `prettyPrint` is true; otherwise no indent level is used.
    """
    effective_indent = indentLevel if prettyPrint else None
    return self.encode_contents(
        indent_level=effective_indent, encoding=encoding)
1739
1740 #Soup methods
1741
def find(self, name=None, attrs={}, recursive=True, text=None,
         **kwargs):
    """Look in the children of this PageElement and find the first
    PageElement that matches the given criteria.

    All find_* methods take a common set of arguments. See the online
    documentation for detailed explanations.

    :param name: A filter on tag name.
    :param attrs: A dictionary of filters on attribute values.
    :param recursive: If this is True, find() will perform a
        recursive search of this PageElement's children. Otherwise,
        only the direct children will be considered.
    :param text: Passed through to find_all() unchanged.
    :kwargs: A dictionary of filters on attribute values.
    :return: A PageElement, or None if nothing matched.
    :rtype: bs4.element.Tag | bs4.element.NavigableString
    """
    # Ask find_all() for at most one result.
    matches = self.find_all(name, attrs, recursive, text, 1, **kwargs)
    return matches[0] if matches else None
findChild = find  # BS2
1766
def find_all(self, name=None, attrs={}, recursive=True, text=None,
             limit=None, **kwargs):
    """Look in the children of this PageElement and find all
    PageElements that match the given criteria.

    All find_* methods take a common set of arguments. See the online
    documentation for detailed explanations.

    :param name: A filter on tag name.
    :param attrs: A dictionary of filters on attribute values.
    :param recursive: If this is True, find_all() will perform a
        recursive search of this PageElement's children. Otherwise,
        only the direct children will be considered.
    :param text: Passed through to _find_all() unchanged.
    :param limit: Stop looking after finding this many results.
    :kwargs: A dictionary of filters on attribute values.
    :return: A ResultSet of PageElements.
    :rtype: bs4.element.ResultSet
    """
    # Search every descendant, or just the direct children.
    candidates = self.descendants if recursive else self.children
    return self._find_all(name, attrs, text, limit, candidates, **kwargs)
findAll = find_all       # BS3
findChildren = find_all  # BS2
1791
1792 #Generator methods
@property
def children(self):
    """Iterate over all direct children of this PageElement.

    :yield: A sequence of PageElements.
    """
    # An explicit iterator makes the purpose of the property clear.
    direct_children = self.contents
    return iter(direct_children)  # XXX This seems to be untested.
1801
@property
def descendants(self):
    """Iterate over everything beneath this PageElement.

    Starting from the first child, elements are produced by following
    next_element links, and iteration stops before the element that
    comes after this element's last descendant.

    :yield: A sequence of PageElements.
    """
    if len(self.contents) == 0:
        return
    # The element just past the last descendant ends the walk.
    stop_node = self._last_descendant().next_element
    node = self.contents[0]
    while node is not stop_node:
        yield node
        node = node.next_element
1816
1817 # CSS selector code
def select_one(self, selector, namespaces=None, **kwargs):
    """Perform a CSS selection operation on the current element and
    keep only the first match.

    :param selector: A CSS selector.

    :param namespaces: A dictionary mapping namespace prefixes
        used in the CSS selector to namespace URIs. By default,
        Beautiful Soup will use the prefixes it encountered while
        parsing the document.

    :param kwargs: Keyword arguments to be passed into SoupSieve's
        soupsieve.select() method.

    :return: A Tag, or None if nothing matched.
    :rtype: bs4.element.Tag
    """
    # A limit of 1 is enough to decide whether anything matches.
    matches = self.select(selector, namespaces, 1, **kwargs)
    return matches[0] if matches else None
1838
def select(self, selector, namespaces=None, limit=None, **kwargs):
    """Perform a CSS selection operation on the current element.

    This uses the SoupSieve library.

    :param selector: A string containing a CSS selector.

    :param namespaces: A dictionary mapping namespace prefixes
        used in the CSS selector to namespace URIs. By default,
        Beautiful Soup will use the prefixes it encountered while
        parsing the document.

    :param limit: After finding this number of results, stop looking.
        None is passed to SoupSieve as 0.

    :param kwargs: Keyword arguments to be passed into SoupSieve's
        soupsieve.select() method.

    :return: A ResultSet of Tags.
    :rtype: bs4.element.ResultSet
    :raise NotImplementedError: If the soupsieve package is not
        installed.
    """
    namespaces = self._namespaces if namespaces is None else namespaces
    limit = 0 if limit is None else limit

    if soupsieve is None:
        raise NotImplementedError(
            "Cannot execute CSS selectors because the soupsieve package is not installed."
        )

    # Wrapping the results in a ResultSet is more consistent, and
    # ResultSet.__getattr__ has a helpful error message.
    return ResultSet(
        None,
        soupsieve.select(selector, self, namespaces, limit, **kwargs))
1874
1875 # Old names for backwards compatibility
def childGenerator(self):
    """Deprecated generator; returns the `children` property."""
    direct_children = self.children
    return direct_children
1879
def recursiveChildGenerator(self):
    """Deprecated generator; returns the `descendants` property."""
    everything_below = self.descendants
    return everything_below
1883
def has_key(self, key):
    """Deprecated method. This was kind of misleading because has_key()
    (attributes) was different from __contains__ (contents).

    has_key() is gone in Python 3, anyway.

    :param key: The attribute name to look for.
    :return: Whatever `self.has_attr(key)` returns.
    """
    # Format with an explicit one-element tuple so that a tuple-valued
    # `key` is rendered rather than being unpacked as format arguments.
    # stacklevel=2 attributes the warning to the code that called
    # has_key() instead of to this line.
    warnings.warn(
        'has_key is deprecated. Use has_attr("%s") instead.' % (key,),
        stacklevel=2,
    )
    return self.has_attr(key)
1893
1894 # Next, a couple classes to represent queries and their results.
class SoupStrainer(object):
    """Encapsulates a number of ways of matching a markup element (tag or
    string).

    This is primarily used to underpin the find_* methods, but you can
    create one yourself and pass it in as `parse_only` to the
    `BeautifulSoup` constructor, to parse a subset of a large
    document.
    """

    # NOTE: the `{}` defaults below are kept for interface compatibility.
    # They are safe because neither method ever mutates the default dict.
    def __init__(self, name=None, attrs={}, text=None, **kwargs):
        """Constructor.

        The SoupStrainer constructor takes the same arguments passed
        into the find_* methods. See the online documentation for
        detailed explanations.

        :param name: A filter on tag name.
        :param attrs: A dictionary of filters on attribute values. Any
            non-dict value is treated as a filter on the 'class'
            attribute.
        :param text: A filter for a NavigableString with specific text.
        :kwargs: A dictionary of filters on attribute values. The
            keyword `class_` is an alias for the 'class' attribute.
        """
        self.name = self._normalize_search_value(name)
        if not isinstance(attrs, dict):
            # Treat a non-dict value for attrs as a search for the 'class'
            # attribute.
            kwargs['class'] = attrs
            attrs = None

        if 'class_' in kwargs:
            # Treat class_="foo" as a search for the 'class'
            # attribute, overriding any non-dict value for attrs.
            kwargs['class'] = kwargs.pop('class_')

        if kwargs:
            if attrs:
                # Copy first so the caller's dictionary is left untouched.
                attrs = attrs.copy()
                attrs.update(kwargs)
            else:
                attrs = kwargs

        self.attrs = {
            key: self._normalize_search_value(value)
            for key, value in attrs.items()
        }
        self.text = self._normalize_search_value(text)

    def _normalize_search_value(self, value):
        """Convert a search value into a form `_matches` can work with.

        :param value: A filter value supplied by the user, or a piece
            of markup about to be compared.
        :return: `value` itself if it is a string, callable, object with
            a `match` attribute (e.g. a compiled regular expression),
            boolean or None; a decoded string for bytes; a list for any
            other iterable; `str(value)` for everything else.
        """
        # Leave it alone if it's a Unicode string, a callable, a
        # regular expression, a boolean, or None.
        if (isinstance(value, str) or isinstance(value, Callable)
                or hasattr(value, 'match')
                or isinstance(value, bool) or value is None):
            return value

        # If it's a bytestring, convert it to Unicode, treating it as UTF-8.
        if isinstance(value, bytes):
            return value.decode("utf8")

        # If it's listlike, convert it into a list of strings.
        if hasattr(value, '__iter__'):
            new_value = []
            for v in value:
                if (hasattr(v, '__iter__')
                        and not isinstance(v, (bytes, str))):
                    # This is almost certainly the user's mistake. In the
                    # interests of avoiding infinite loops, we'll let
                    # it through as-is rather than doing a recursive call.
                    new_value.append(v)
                else:
                    new_value.append(self._normalize_search_value(v))
            return new_value

        # Otherwise, convert it into a Unicode string.
        return str(value)

    def __str__(self):
        """A human-readable representation of this SoupStrainer."""
        if self.text:
            # The text filter may be a regular expression, a list, a
            # callable or True. __str__ must return a real string, so
            # convert explicitly instead of returning the filter itself.
            return str(self.text)
        return "%s|%s" % (self.name, self.attrs)

    def search_tag(self, markup_name=None, markup_attrs={}):
        """Check whether a Tag with the given name and attributes would
        match this SoupStrainer.

        Used prospectively to decide whether to even bother creating a Tag
        object.

        :param markup_name: A tag name as found in some markup, or an
            already-built Tag (in which case the Tag also supplies the
            attributes).
        :param markup_attrs: A dictionary of attributes as found in some
            markup, or an iterable of (name, value) pairs.

        :return: The matching markup (the Tag, or `markup_name` when no
            Tag was given) if the prospective tag would match this
            SoupStrainer; a false value (None or False) otherwise.
        """
        found = None
        markup = None
        if isinstance(markup_name, Tag):
            markup = markup_name
            markup_attrs = markup

        if isinstance(self.name, str):
            # Optimization for a very common case where the user is
            # searching for a tag with one specific name, and we're
            # looking at a tag with a different name.
            if markup and not markup.prefix and self.name != markup.name:
                return False

        # A callable name filter receives (name, attrs) only when we have
        # no Tag object yet; with a Tag it goes through _matches instead.
        call_function_with_tag_data = (
            isinstance(self.name, Callable)
            and not isinstance(markup_name, Tag))

        if ((not self.name)
                or call_function_with_tag_data
                or (markup and self._matches(markup, self.name))
                or (not markup and self._matches(markup_name, self.name))):
            if call_function_with_tag_data:
                match = self.name(markup_name, markup_attrs)
            else:
                match = True
                # Built lazily, and only once, the first time an
                # attribute filter needs it.
                markup_attr_map = None
                for attr, match_against in self.attrs.items():
                    if markup_attr_map is None:
                        if hasattr(markup_attrs, 'get'):
                            markup_attr_map = markup_attrs
                        else:
                            markup_attr_map = {}
                            for k, v in markup_attrs:
                                markup_attr_map[k] = v
                    attr_value = markup_attr_map.get(attr)
                    if not self._matches(attr_value, match_against):
                        match = False
                        break
            if match:
                if markup:
                    found = markup
                else:
                    found = markup_name
        # NOTE(review): when `found` is a plain tag-name string and a text
        # filter is set, `found.string` is read from that string; callers
        # appear to be expected not to combine the two -- confirm.
        if found and self.text and not self._matches(found.string, self.text):
            found = None
        return found

    # For BS3 compatibility.
    searchTag = search_tag

    def search(self, markup):
        """Find all items in `markup` that match this SoupStrainer.

        Used by the core _find_all() method, which is ultimately
        called by all find_* methods.

        :param markup: A PageElement or a list of them.
        :return: The matching element, or a false value if nothing
            matched.
        :raise Exception: If `markup` is not iterable, a Tag or a string.
        """
        found = None
        # If given a list of items, scan it for a text element that
        # matches.
        if hasattr(markup, '__iter__') and not isinstance(markup, (Tag, str)):
            for element in markup:
                if isinstance(element, NavigableString) \
                        and self.search(element):
                    found = element
                    break
        # If it's a Tag, make sure its name or attributes match.
        # Don't bother with Tags if we're searching for text.
        elif isinstance(markup, Tag):
            if not self.text or self.name or self.attrs:
                found = self.search_tag(markup)
        # If it's text, make sure the text matches.
        elif isinstance(markup, NavigableString) or \
                isinstance(markup, str):
            if (not self.name and not self.attrs
                    and self._matches(markup, self.text)):
                found = markup
        else:
            raise Exception(
                "I don't know how to match against a %s" % markup.__class__)
        return found

    def _matches(self, markup, match_against, already_tried=None):
        """Decide whether one piece of markup satisfies one filter.

        :param markup: A Tag, a string, None, or a list/tuple of strings
            (the value of a multi-valued attribute).
        :param match_against: The filter: True, a callable, a string,
            an object with a `search` method (e.g. a compiled regular
            expression), or an iterable of such filters.
        :param already_tried: Set of filter items already examined
            during this call chain; used internally to avoid infinite
            recursion.
        :return: A true value if `markup` matches, a false value
            otherwise. The result is not always a bool: callables and
            regular-expression searches return whatever they produce.
        """
        if isinstance(markup, list) or isinstance(markup, tuple):
            # This should only happen when searching a multi-valued attribute
            # like 'class'.
            for item in markup:
                if self._matches(item, match_against):
                    return True
            # We didn't match any particular value of the multivalue
            # attribute, but maybe we match the attribute value when
            # considered as a string.
            if self._matches(' '.join(markup), match_against):
                return True
            return False

        if match_against is True:
            # True matches any non-None value.
            return markup is not None

        if isinstance(match_against, Callable):
            return match_against(markup)

        # Custom callables take the tag as an argument, but all
        # other ways of matching match the tag name as a string.
        original_markup = markup
        if isinstance(markup, Tag):
            markup = markup.name

        # Ensure that `markup` is either a Unicode string, or None.
        markup = self._normalize_search_value(markup)

        if markup is None:
            # None matches None, False, an empty string, an empty list,
            # and so on.
            return not match_against

        if (hasattr(match_against, '__iter__')
                and not isinstance(match_against, str)):
            # We're asked to match against an iterable of items.
            # The markup must match at least one item in the
            # iterable. We'll try each one in turn.
            #
            # To avoid infinite recursion we need to keep track of
            # items we've already seen.
            if already_tried is None:
                already_tried = set()
            for item in match_against:
                # Unhashable items are tracked by identity instead.
                if item.__hash__:
                    key = item
                else:
                    key = id(item)
                if key in already_tried:
                    continue
                already_tried.add(key)
                if self._matches(original_markup, item, already_tried):
                    return True
            return False

        # Beyond this point we might need to run the test twice: once against
        # the tag's name and once against its prefixed name.
        match = False

        if isinstance(match_against, str):
            # Exact string match
            match = markup == match_against

        if not match and hasattr(match_against, 'search'):
            # Regexp match
            return match_against.search(markup)

        if (not match
                and isinstance(original_markup, Tag)
                and original_markup.prefix):
            # Try the whole thing again with the prefixed tag name.
            return self._matches(
                original_markup.prefix + ':' + original_markup.name,
                match_against
            )

        return match
2157
2158
class ResultSet(list):
    """A plain list of results that also remembers which SoupStrainer
    produced it."""

    def __init__(self, source, result=()):
        """Build the list and record where it came from.

        :param source: A SoupStrainer.
        :param result: A list of PageElements.
        """
        super().__init__(result)
        self.source = source

    def __getattr__(self, key):
        """Turn any failed attribute lookup into an explanatory error.

        Only reached when normal attribute lookup has already failed.

        :param key: Name of the attribute that was requested.
        :raise AttributeError: Always.
        """
        message = (
            "ResultSet object has no attribute '%s'. "
            "You're probably treating a list of elements like a single element. "
            "Did you call find_all() when you meant to call find()?"
        ) % key
        raise AttributeError(message)