comparison env/lib/python3.9/site-packages/bs4/__init__.py @ 0:4f3585e2f14b draft default tip

"planemo upload commit 60cee0fc7c0cda8592644e1aad72851dec82c959"
author shellac
date Mon, 22 Mar 2021 18:12:50 +0000
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:4f3585e2f14b
1 """Beautiful Soup Elixir and Tonic - "The Screen-Scraper's Friend".
2
3 http://www.crummy.com/software/BeautifulSoup/
4
5 Beautiful Soup uses a pluggable XML or HTML parser to parse a
6 (possibly invalid) document into a tree representation. Beautiful Soup
7 provides methods and Pythonic idioms that make it easy to navigate,
8 search, and modify the parse tree.
9
10 Beautiful Soup works with Python 2.7 and up. It works better if lxml
11 and/or html5lib is installed.
12
13 For more than you ever wanted to know about Beautiful Soup, see the
14 documentation: http://www.crummy.com/software/BeautifulSoup/bs4/doc/
15 """
16
17 __author__ = "Leonard Richardson (leonardr@segfault.org)"
18 __version__ = "4.9.3"
19 __copyright__ = "Copyright (c) 2004-2020 Leonard Richardson"
20 # Use of this source code is governed by the MIT license.
21 __license__ = "MIT"
22
23 __all__ = ['BeautifulSoup']
24
25 from collections import Counter
26 import os
27 import re
28 import sys
29 import traceback
30 import warnings
31
32 from .builder import builder_registry, ParserRejectedMarkup
33 from .dammit import UnicodeDammit
34 from .element import (
35 CData,
36 Comment,
37 DEFAULT_OUTPUT_ENCODING,
38 Declaration,
39 Doctype,
40 NavigableString,
41 PageElement,
42 ProcessingInstruction,
43 PYTHON_SPECIFIC_ENCODINGS,
44 ResultSet,
45 Script,
46 Stylesheet,
47 SoupStrainer,
48 Tag,
49 TemplateString,
50 )
51
52 # The very first thing we do is give a useful error if someone is running the
53 # unconverted Python 2 source under Python 3. In converted code the comparison below is a harmless no-op.
54 'You are trying to run the Python 2 version of Beautiful Soup under Python 3. This will not work.'!='You need to convert the code, either by installing it (`python setup.py install`) or by running 2to3 (`2to3 -w bs4`).'
55
56 # Define some custom warnings.
class GuessedAtParserWarning(UserWarning):
    """Warning category used when BeautifulSoup had to pick a parser itself.

    This normally means the caller did not name a parser when constructing
    the BeautifulSoup object.
    """
61
class MarkupResemblesLocatorWarning(UserWarning):
    """Warning category used when the 'markup' looks like a locator.

    A locator here is a URL or the path of a file on disk, i.e. something
    that points at a document rather than being the document itself.
    """
67
68
class BeautifulSoup(Tag):
    """The parsed form of an HTML or XML document.

    Nearly everything you call on a BeautifulSoup object comes from
    PageElement or Tag.

    The class also defines the small interface that tree builders use to
    turn a document into a tree, hiding the differences between the
    underlying parsers. Anyone writing a new tree builder needs to
    understand these methods together.

    Called by the BeautifulSoup constructor:
     * reset()
     * _feed()

    Callable by a tree builder from its feed() implementation:
     * handle_starttag(name, namespace, nsprefix, attrs,
                       sourceline=None, sourcepos=None)
       (returns None when the tag is rejected; see its docstring)
     * handle_endtag(name, nsprefix=None)
     * handle_data(data)          # Appends to the current data node
     * endData(containerClass)    # Ends the current data node

    However complex the parser, the tree can be built from just four
    kinds of event: 'start tag', 'end tag', 'data' and "done with data".

    For an empty-element tag (a self-closing tag such as HTML's <br>),
    call handle_starttag and then handle_endtag.
    """

    # BeautifulSoup is itself a Tag, so it has a .name. This value makes
    # it obvious that the object is the document, not a real markup tag.
    ROOT_TAG_NAME = '[document]'

    # Features to look for in a tree builder when the caller expressed
    # no preference.
    DEFAULT_BUILDER_FEATURES = ['html', 'fast']

    # Every ASCII whitespace character; endData() uses this to spot
    # data chunks that are effectively empty.
    ASCII_SPACES = '\x20\x0a\x09\x0c\x0d'

    NO_PARSER_SPECIFIED_WARNING = (
        "No parser was explicitly specified, so I'm using the best available "
        "%(markup_type)s parser for this system (\"%(parser)s\"). "
        "This usually isn't a problem, but if you run this code on another "
        "system, or in a different virtual environment, it may use a "
        "different parser and behave differently.\n\n"
        "The code that caused this warning is on line %(line_number)s of the "
        "file %(filename)s. To get rid of this warning, pass the additional "
        "argument 'features=\"%(parser)s\"' to the BeautifulSoup constructor.\n"
    )
114
def __init__(self, markup="", features=None, builder=None,
             parse_only=None, from_encoding=None, exclude_encodings=None,
             element_classes=None, **kwargs):
    """Constructor.

    :param markup: A string or a file-like object representing
     markup to be parsed.

    :param features: Desirable features of the parser to be
     used. This may be the name of a specific parser ("lxml",
     "lxml-xml", "html.parser", or "html5lib") or it may be the
     type of markup to be used ("html", "html5", "xml"). It's
     recommended that you name a specific parser, so that
     Beautiful Soup gives you the same results across platforms
     and virtual environments.

    :param builder: A TreeBuilder subclass to instantiate (or
     instance to use) instead of looking one up based on
     `features`. You only need to use this if you've implemented a
     custom TreeBuilder.

    :param parse_only: A SoupStrainer. Only parts of the document
     matching the SoupStrainer will be considered. This is useful
     when parsing part of a document that would otherwise be too
     large to fit into memory.

    :param from_encoding: A string indicating the encoding of the
     document to be parsed. Pass this in if Beautiful Soup is
     guessing wrongly about the document's encoding.

    :param exclude_encodings: A list of strings indicating
     encodings known to be wrong. Pass this in if you don't know
     the document's encoding but you know Beautiful Soup's guess is
     wrong.

    :param element_classes: A dictionary mapping BeautifulSoup
     classes like Tag and NavigableString, to other classes you'd
     like to be instantiated instead as the parse tree is
     built. This is useful for subclassing Tag or NavigableString
     to modify default behavior.

    :param kwargs: For backwards compatibility purposes, the
     constructor accepts certain keyword arguments used in
     Beautiful Soup 3. None of these arguments do anything in
     Beautiful Soup 4; they will result in a warning and then be
     ignored.

     Apart from this, any keyword arguments passed into the
     BeautifulSoup constructor are propagated to the TreeBuilder
     constructor. This makes it possible to configure a
     TreeBuilder by passing in arguments, not just by saying which
     one to use.

    :raises FeatureNotFound: If no tree builder offers the
     requested features.
    :raises ParserRejectedMarkup: If the builder rejects every
     candidate it produced from prepare_markup().
    """
    # Beautiful Soup 3 arguments that are accepted, reported and then
    # dropped. The order matches the order the warnings were always
    # issued in.
    ignored_bs3_arguments = (
        ('convertEntities',
         "BS4 does not respect the convertEntities argument to the "
         "BeautifulSoup constructor. Entities are always converted "
         "to Unicode characters."),
        ('markupMassage',
         "BS4 does not respect the markupMassage argument to the "
         "BeautifulSoup constructor. The tree builder is responsible "
         "for any necessary markup massage."),
        ('smartQuotesTo',
         "BS4 does not respect the smartQuotesTo argument to the "
         "BeautifulSoup constructor. Smart quotes are always converted "
         "to Unicode characters."),
        ('selfClosingTags',
         "BS4 does not respect the selfClosingTags argument to the "
         "BeautifulSoup constructor. The tree builder is responsible "
         "for understanding self-closing tags."),
        ('isHTML',
         "BS4 does not respect the isHTML argument to the "
         "BeautifulSoup constructor. Suggest you use "
         "features='lxml' for HTML and features='lxml-xml' for "
         "XML."),
    )
    for bs3_name, message in ignored_bs3_arguments:
        if bs3_name in kwargs:
            del kwargs[bs3_name]
            warnings.warn(message)

    def deprecated_argument(old_name, new_name):
        # Pop a renamed BS3 keyword out of kwargs, warning about the
        # rename; returns None when the old keyword was not passed.
        if old_name in kwargs:
            warnings.warn(
                'The "%s" argument to the BeautifulSoup constructor '
                'has been renamed to "%s."' % (old_name, new_name))
            value = kwargs[old_name]
            del kwargs[old_name]
            return value
        return None

    parse_only = parse_only or deprecated_argument(
        "parseOnlyThese", "parse_only")

    from_encoding = from_encoding or deprecated_argument(
        "fromEncoding", "from_encoding")

    if from_encoding and isinstance(markup, str):
        warnings.warn("You provided Unicode markup but also provided a value for from_encoding. Your from_encoding will be ignored.")
        from_encoding = None

    self.element_classes = element_classes or dict()

    # We need this information to track whether or not the builder
    # was specified well enough that we can omit the 'you need to
    # specify a parser' warning.
    original_builder = builder
    original_features = features

    if isinstance(builder, type):
        # A builder class was passed in; it needs to be instantiated.
        builder_class = builder
        builder = None
    elif builder is None:
        if isinstance(features, str):
            features = [features]
        if features is None or len(features) == 0:
            features = self.DEFAULT_BUILDER_FEATURES
        builder_class = builder_registry.lookup(*features)
        if builder_class is None:
            raise FeatureNotFound(
                "Couldn't find a tree builder with the features you "
                "requested: %s. Do you need to install a parser library?"
                % ",".join(features))

    # At this point either we have a TreeBuilder instance in
    # builder, or we have a builder_class that we can instantiate
    # with the remaining **kwargs.
    if builder is None:
        builder = builder_class(**kwargs)
        if not original_builder and not (
                original_features == builder.NAME or
                original_features in builder.ALTERNATE_NAMES
        ) and markup:
            # The user did not tell us which TreeBuilder to use,
            # and we had to guess. Issue a warning.
            if builder.is_xml:
                markup_type = "XML"
            else:
                markup_type = "HTML"

            # This code adapted from warnings.py so that we get the same line
            # of code as our warnings.warn() call gets, even if the answer is wrong
            # (as it may be in a multithreading situation).
            caller = None
            try:
                caller = sys._getframe(1)
            except ValueError:
                pass
            if caller:
                caller_globals = caller.f_globals
                line_number = caller.f_lineno
            else:
                caller_globals = sys.__dict__
                line_number = 1
            filename = caller_globals.get('__file__')
            if filename:
                fnl = filename.lower()
                if fnl.endswith((".pyc", ".pyo")):
                    # Report the source file rather than the bytecode file.
                    filename = filename[:-1]
            if filename:
                # If there is no filename at all, the user is most likely in a REPL,
                # and the warning is not necessary.
                values = dict(
                    filename=filename,
                    line_number=line_number,
                    parser=builder.NAME,
                    markup_type=markup_type
                )
                warnings.warn(
                    self.NO_PARSER_SPECIFIED_WARNING % values,
                    GuessedAtParserWarning, stacklevel=2
                )
    else:
        if kwargs:
            warnings.warn("Keyword arguments to the BeautifulSoup constructor will be ignored. These would normally be passed into the TreeBuilder constructor, but a TreeBuilder instance was passed in as `builder`.")

    self.builder = builder
    self.is_xml = builder.is_xml
    self.known_xml = self.is_xml
    self._namespaces = dict()
    self.parse_only = parse_only

    self.builder.initialize_soup(self)

    if hasattr(markup, 'read'):        # It's a file-type object.
        markup = markup.read()
    elif len(markup) <= 256 and (
            (isinstance(markup, bytes) and b'<' not in markup)
            or (isinstance(markup, str) and '<' not in markup)
    ):
        # Print out warnings for a couple beginner problems
        # involving passing non-markup to Beautiful Soup.
        # Beautiful Soup will still parse the input as markup,
        # just in case that's what the user really wants.
        if (isinstance(markup, str)
                and not os.path.supports_unicode_filenames):
            possible_filename = markup.encode("utf8")
        else:
            possible_filename = markup
        is_file = False
        try:
            is_file = os.path.exists(possible_filename)
        except Exception:
            # This is almost certainly a problem involving
            # characters not valid in filenames on this
            # system. Just let it go: the check is best-effort.
            pass
        if is_file:
            warnings.warn(
                '"%s" looks like a filename, not markup. You should'
                ' probably open this file and pass the filehandle into'
                ' Beautiful Soup.' % self._decode_markup(markup),
                MarkupResemblesLocatorWarning
            )
        self._check_markup_is_url(markup)

    # Try each candidate produced by the builder until one is parsed
    # without being rejected.
    rejections = []
    success = False
    for (self.markup, self.original_encoding, self.declared_html_encoding,
         self.contains_replacement_characters) in (
             self.builder.prepare_markup(
                 markup, from_encoding, exclude_encodings=exclude_encodings)):
        self.reset()
        try:
            self._feed()
            success = True
            break
        except ParserRejectedMarkup as e:
            rejections.append(e)

    if not success:
        other_exceptions = [str(e) for e in rejections]
        raise ParserRejectedMarkup(
            "The markup you provided was rejected by the parser. Trying a different parser or a different encoding may help.\n\nOriginal exception(s) from parser:\n " + "\n ".join(other_exceptions)
        )

    # Clear out the markup and remove the builder's circular
    # reference to this object.
    self.markup = None
    self.builder.soup = None
365
366 def __copy__(self):
367 """Copy a BeautifulSoup object by converting the document to a string and parsing it again."""
368 copy = type(self)(
369 self.encode('utf-8'), builder=self.builder, from_encoding='utf-8'
370 )
371
372 # Although we encoded the tree to UTF-8, that may not have
373 # been the encoding of the original markup. Set the copy's
374 # .original_encoding to reflect the original object's
375 # .original_encoding.
376 copy.original_encoding = self.original_encoding
377 return copy
378
379 def __getstate__(self):
380 # Frequently a tree builder can't be pickled.
381 d = dict(self.__dict__)
382 if 'builder' in d and not self.builder.picklable:
383 d['builder'] = None
384 return d
385
386 @classmethod
387 def _decode_markup(cls, markup):
388 """Ensure `markup` is bytes so it's safe to send into warnings.warn.
389
390 TODO: warnings.warn had this problem back in 2010 but it might not
391 anymore.
392 """
393 if isinstance(markup, bytes):
394 decoded = markup.decode('utf-8', 'replace')
395 else:
396 decoded = markup
397 return decoded
398
399 @classmethod
400 def _check_markup_is_url(cls, markup):
401 """Error-handling method to raise a warning if incoming markup looks
402 like a URL.
403
404 :param markup: A string.
405 """
406 if isinstance(markup, bytes):
407 space = b' '
408 cant_start_with = (b"http:", b"https:")
409 elif isinstance(markup, str):
410 space = ' '
411 cant_start_with = ("http:", "https:")
412 else:
413 return
414
415 if any(markup.startswith(prefix) for prefix in cant_start_with):
416 if not space in markup:
417 warnings.warn(
418 '"%s" looks like a URL. Beautiful Soup is not an'
419 ' HTTP client. You should probably use an HTTP client like'
420 ' requests to get the document behind the URL, and feed'
421 ' that document to Beautiful Soup.' % cls._decode_markup(
422 markup
423 ),
424 MarkupResemblesLocatorWarning
425 )
426
427 def _feed(self):
428 """Internal method that parses previously set markup, creating a large
429 number of Tag and NavigableString objects.
430 """
431 # Convert the document to Unicode.
432 self.builder.reset()
433
434 self.builder.feed(self.markup)
435 # Close out any unfinished strings and close all the open tags.
436 self.endData()
437 while self.currentTag.name != self.ROOT_TAG_NAME:
438 self.popTag()
439
def reset(self):
    """Return this object to the state of one that never parsed markup."""
    # Re-run the Tag initializer with this object as its own parser,
    # giving it the reserved root name.
    Tag.__init__(self, self, self.builder, self.ROOT_TAG_NAME)
    self.hidden = 1
    self.builder.reset()

    # Fresh parse-time bookkeeping.
    self.current_data = []
    self.currentTag = None
    self.tagStack = []
    self.open_tag_counter = Counter()
    self.preserve_whitespace_tag_stack = []
    self.string_container_stack = []

    # The document itself is always the bottom of the tag stack.
    self.pushTag(self)
454
def new_tag(self, name, namespace=None, nsprefix=None, attrs=None,
            sourceline=None, sourcepos=None, **kwattrs):
    """Create a new Tag associated with this BeautifulSoup object.

    :param name: The name of the new Tag.
    :param namespace: The URI of the new Tag's XML namespace, if any.
    :param nsprefix: The prefix for the new Tag's XML namespace, if any.
    :param attrs: A dictionary of this Tag's attribute values; can
     be used instead of `kwattrs` for attributes like 'class'
     that are reserved words in Python. Entries here override
     same-named entries in `kwattrs`.
    :param sourceline: The line number where this tag was
     (purportedly) found in its source document.
    :param sourcepos: The character position within `sourceline` where this
     tag was (purportedly) found.
    :param kwattrs: Keyword arguments for the new Tag's attribute values.
    :return: An instance of Tag, or of the class registered for Tag
     in `element_classes`.
    """
    # `attrs` defaults to None rather than a shared mutable dict.
    if attrs is not None:
        kwattrs.update(attrs)
    tag_class = self.element_classes.get(Tag, Tag)
    return tag_class(
        None, self.builder, name, namespace, nsprefix, kwattrs,
        sourceline=sourceline, sourcepos=sourcepos
    )
477
def string_container(self, base_class=None):
    """Pick the class to instantiate for a piece of string data.

    :param base_class: The requested string class; NavigableString is
     used when this is not given.
    :return: The class to use after applying, in order, the
     `element_classes` override and the builder's container class for
     the innermost open tag on `string_container_stack`.
    """
    chosen = base_class if base_class else NavigableString

    # A general override may have been registered for this class.
    chosen = self.element_classes.get(chosen, chosen)

    # Outside any special tag, that is the answer.
    open_containers = self.string_container_stack
    if not open_containers:
        return chosen

    # Inside one, the builder may map the tag's name to its own class.
    innermost = open_containers[-1]
    return self.builder.string_containers.get(innermost.name, chosen)
493
def new_string(self, s, subclass=None):
    """Create a new string object associated with this BeautifulSoup
    object.

    :param s: The text of the new string.
    :param subclass: The requested string class, passed to
     string_container() to choose the class actually instantiated.
    """
    return self.string_container(subclass)(s)
500
def insert_before(self, *args):
    """Always raise: part of the PageElement API that `BeautifulSoup`
    does not implement, because nothing comes before or after the
    document in the parse tree.

    :raises NotImplementedError: On every call.
    """
    message = "BeautifulSoup objects don't support insert_before()."
    raise NotImplementedError(message)
506
def insert_after(self, *args):
    """Always raise: part of the PageElement API that `BeautifulSoup`
    does not implement, because nothing comes before or after the
    document in the parse tree.

    :raises NotImplementedError: On every call.
    """
    message = "BeautifulSoup objects don't support insert_after()."
    raise NotImplementedError(message)
512
def popTag(self):
    """Internal method called by _popToTag when a tag is closed.

    :return: The tag that is current after the pop.
    """
    closed = self.tagStack.pop()

    # Keep the per-name count of open tags in step.
    counter = self.open_tag_counter
    if closed.name in counter:
        counter[closed.name] -= 1

    # If the closed tag was the innermost whitespace-preserving tag or
    # string-container tag, it no longer applies.
    for tracking_stack in (self.preserve_whitespace_tag_stack,
                           self.string_container_stack):
        if tracking_stack and closed == tracking_stack[-1]:
            tracking_stack.pop()

    # currentTag is left untouched if the stack is now empty.
    if self.tagStack:
        self.currentTag = self.tagStack[-1]
    return self.currentTag
526
def pushTag(self, tag):
    """Internal method called by handle_starttag when a tag is opened.

    :param tag: The tag to make current.
    """
    # Attach the new tag to whatever is currently open.
    enclosing = self.currentTag
    if enclosing is not None:
        enclosing.contents.append(tag)

    self.tagStack.append(tag)
    self.currentTag = self.tagStack[-1]

    tag_name = tag.name
    # The root object is never counted as an open tag.
    if tag_name != self.ROOT_TAG_NAME:
        self.open_tag_counter[tag_name] += 1

    # Remember tags the builder treats specially for whitespace or for
    # the class of the strings they contain.
    builder = self.builder
    if tag_name in builder.preserve_whitespace_tags:
        self.preserve_whitespace_tag_stack.append(tag)
    if tag_name in builder.string_containers:
        self.string_container_stack.append(tag)
540
def endData(self, containerClass=None):
    """Method called by the TreeBuilder when the end of a data segment
    occurs.

    :param containerClass: The requested class for the string object;
     string_container() decides the class actually used.
    """
    containerClass = self.string_container(containerClass)

    if not self.current_data:
        return

    text = ''.join(self.current_data)

    # Unless whitespace is being preserved, collapse a chunk made only
    # of ASCII spaces to a single newline (if it had one) or space.
    if not self.preserve_whitespace_tag_stack:
        if all(ch in self.ASCII_SPACES for ch in text):
            text = '\n' if '\n' in text else ' '

    # Reset the data collector.
    self.current_data = []

    # At the top level, a SoupStrainer decides whether the string is
    # kept at all.
    strainer = self.parse_only
    if strainer and len(self.tagStack) <= 1:
        if not strainer.text or not strainer.search(text):
            return

    self.object_was_parsed(containerClass(text))
575
def object_was_parsed(self, o, parent=None, most_recent_element=None):
    """Method called by the TreeBuilder to integrate an object into the parse tree.

    :param o: The object to add.
    :param parent: The element to append `o` to; the current tag is
     used when this is not given.
    :param most_recent_element: The element to treat as parsed just
     before `o`; the parser's own record is used when this is not given.
    """
    if parent is None:
        parent = self.currentTag

    previous_element = (
        self._most_recent_element if most_recent_element is None
        else most_recent_element
    )

    # A Tag keeps whatever links it already has; other objects start
    # with none.
    next_element = next_sibling = previous_sibling = None
    if isinstance(o, Tag):
        next_element, next_sibling, previous_sibling = (
            o.next_element, o.next_sibling, o.previous_sibling
        )
        if previous_element is None:
            previous_element = o.previous_element

    # Decide this before setup() and the append below run.
    needs_linkage_fix = parent.next_element is not None

    o.setup(parent, previous_element, next_element,
            previous_sibling, next_sibling)

    self._most_recent_element = o
    parent.contents.append(o)

    # Check if we are inserting into an already parsed node.
    if needs_linkage_fix:
        self._linkage_fixer(parent)
603
def _linkage_fixer(self, el):
    """Make sure linkage of this fragment is sound.

    :param el: An element whose last child was just appended.
    """
    children = el.contents
    first_child = children[0]
    last_child = children[-1]
    tail = last_child

    if last_child is first_child and el.parent is not None:
        # An only child: the parent's next element is that child.
        el.next_element = last_child
        # Whatever used to precede the child no longer leads to it.
        stale_predecessor = last_child.previous_element
        if stale_predecessor is not None and stale_predecessor is not el:
            stale_predecessor.next_element = None
        # The child follows its parent and has no earlier sibling.
        last_child.previous_element = el
        last_child.previous_sibling = None

    # Appended last, so nothing follows it among its siblings.
    last_child.next_sibling = None

    # If the child is a non-empty tag, the fragment really ends at its
    # last descendant.
    if isinstance(last_child, Tag) and last_child.contents:
        tail = last_child._last_descendant(False)

    # Finally link the end of the fragment to the next sibling of the
    # nearest ancestor (starting with `el`) that has one; with no such
    # ancestor it stays unlinked.
    tail.next_element = None
    tail.next_sibling = None
    ancestor = el
    while ancestor is not None:
        following = ancestor.next_sibling
        if following is not None:
            tail.next_element = following
            # NOTE(review): the back link is set to the last child, not
            # to `tail`; kept exactly as the original does it.
            following.previous_element = last_child
            break
        ancestor = ancestor.parent
643
644 def _popToTag(self, name, nsprefix=None, inclusivePop=True):
645 """Pops the tag stack up to and including the most recent
646 instance of the given tag.
647
648 If there are no open tags with the given name, nothing will be
649 popped.
650
651 :param name: Pop up to the most recent tag with this name.
652 :param nsprefix: The namespace prefix that goes with `name`.
653 :param inclusivePop: It this is false, pops the tag stack up
654 to but *not* including the most recent instqance of the
655 given tag.
656
657 """
658 #print("Popping to %s" % name)
659 if name == self.ROOT_TAG_NAME:
660 # The BeautifulSoup object itself can never be popped.
661 return
662
663 most_recently_popped = None
664
665 stack_size = len(self.tagStack)
666 for i in range(stack_size - 1, 0, -1):
667 if not self.open_tag_counter.get(name):
668 break
669 t = self.tagStack[i]
670 if (name == t.name and nsprefix == t.prefix):
671 if inclusivePop:
672 most_recently_popped = self.popTag()
673 break
674 most_recently_popped = self.popTag()
675
676 return most_recently_popped
677
def handle_starttag(self, name, namespace, nsprefix, attrs, sourceline=None,
                    sourcepos=None):
    """Called by the tree builder when a new tag is encountered.

    :param name: Name of the tag.
    :param namespace: Namespace of the tag, passed through to the Tag.
    :param nsprefix: Namespace prefix for the tag.
    :param attrs: A dictionary of attribute values.
    :param sourceline: The line number where this tag was found in its
        source document.
    :param sourcepos: The character position within `sourceline` where this
        tag was found.

    If this method returns None, the tag was rejected by an active
    SoupStrainer. You should proceed as if the tag had not occurred
    in the document. For instance, if this was a self-closing tag,
    don't call handle_endtag.
    """
    # Any text collected so far belongs before this tag.
    self.endData()

    # At the top level an active SoupStrainer may reject the tag.
    strainer = self.parse_only
    if strainer and len(self.tagStack) <= 1:
        if strainer.text or not strainer.search_tag(name, attrs):
            return None

    tag_class = self.element_classes.get(Tag, Tag)
    tag = tag_class(
        self, self.builder, name, namespace, nsprefix, attrs,
        self.currentTag, self._most_recent_element,
        sourceline=sourceline, sourcepos=sourcepos
    )
    if tag is None:
        return tag

    # Chain the new tag after the most recently parsed element.
    previous = self._most_recent_element
    if previous is not None:
        previous.next_element = tag
    self._most_recent_element = tag
    self.pushTag(tag)
    return tag
715
def handle_endtag(self, name, nsprefix=None):
    """Called by the tree builder when an ending tag is encountered.

    Pending text is flushed first, then the tag stack is popped back
    to the matching open tag.

    :param name: Name of the tag.
    :param nsprefix: Namespace prefix for the tag.
    """
    self.endData()
    self._popToTag(name, nsprefix)
725
def handle_data(self, data):
    """Called by the tree builder when a chunk of textual data is encountered.

    The chunk is only collected here; endData() turns the collected
    chunks into a string object.

    :param data: The chunk of text.
    """
    pending = self.current_data
    pending.append(data)
729
def decode(self, pretty_print=False,
           eventual_encoding=DEFAULT_OUTPUT_ENCODING,
           formatter="minimal"):
    """Returns a string or Unicode representation of the parse tree
        as an HTML or XML document.

    :param pretty_print: If this is True, indentation will be used to
        make the document more readable.
    :param eventual_encoding: The encoding of the final document.
        If this is None, the document will be a Unicode string.
    :param formatter: Passed through unchanged to the inherited
        decode().
    """
    if self.is_xml:
        # Print the XML declaration
        encoding_part = ''
        if eventual_encoding in PYTHON_SPECIFIC_ENCODINGS:
            # This is a special Python encoding; it can't actually
            # go into an XML document because it means nothing
            # outside of Python.
            eventual_encoding = None
        if eventual_encoding is not None:
            encoding_part = ' encoding="%s"' % eventual_encoding
        prefix = '<?xml version="1.0"%s?>\n' % encoding_part
    else:
        prefix = ''

    # The inherited decode() takes an indent level: None means no
    # pretty-printing, 0 means pretty-print from the left margin.
    indent_level = 0 if pretty_print else None
    return prefix + super(BeautifulSoup, self).decode(
        indent_level, eventual_encoding, formatter)
760
# Short aliases for getting started quickly, e.g. 'from bs4 import _soup'
_s = _soup = BeautifulSoup
764
class BeautifulStoneSoup(BeautifulSoup):
    """Deprecated interface to an XML parser."""

    def __init__(self, *args, **kwargs):
        """Warn about the deprecation, then build a BeautifulSoup with
        features forced to 'xml' (any caller-supplied value is replaced).
        """
        kwargs['features'] = 'xml'
        deprecation_notice = (
            'The BeautifulStoneSoup class is deprecated. Instead of using '
            'it, pass features="xml" into the BeautifulSoup constructor.'
        )
        warnings.warn(deprecation_notice)
        super().__init__(*args, **kwargs)
774
775
class StopParsing(Exception):
    """Raised by a TreeBuilder that is unable to continue parsing."""
779
class FeatureNotFound(ValueError):
    """Raised by the BeautifulSoup constructor when no parser with the
    requested features can be found.
    """
785
786
# When run as a script, act as an HTML pretty-printer: read a document
# from standard input and print it prettified.
if __name__ == '__main__':
    import sys
    document = BeautifulSoup(sys.stdin)
    print(document.prettify())