Mercurial > repos > shellac > sam_consensus_v3
comparison env/lib/python3.9/site-packages/bs4/__init__.py @ 0:4f3585e2f14b draft default tip
"planemo upload commit 60cee0fc7c0cda8592644e1aad72851dec82c959"
author | shellac |
---|---|
date | Mon, 22 Mar 2021 18:12:50 +0000 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:4f3585e2f14b |
---|---|
1 """Beautiful Soup Elixir and Tonic - "The Screen-Scraper's Friend". | |
2 | |
3 http://www.crummy.com/software/BeautifulSoup/ | |
4 | |
5 Beautiful Soup uses a pluggable XML or HTML parser to parse a | |
6 (possibly invalid) document into a tree representation. Beautiful Soup | |
7 provides methods and Pythonic idioms that make it easy to navigate, | |
8 search, and modify the parse tree. | |
9 | |
10 Beautiful Soup works with Python 2.7 and up. It works better if lxml | |
11 and/or html5lib is installed. | |
12 | |
13 For more than you ever wanted to know about Beautiful Soup, see the | |
14 documentation: http://www.crummy.com/software/BeautifulSoup/bs4/doc/ | |
15 """ | |
16 | |
17 __author__ = "Leonard Richardson (leonardr@segfault.org)" | |
18 __version__ = "4.9.3" | |
19 __copyright__ = "Copyright (c) 2004-2020 Leonard Richardson" | |
20 # Use of this source code is governed by the MIT license. | |
21 __license__ = "MIT" | |
22 | |
23 __all__ = ['BeautifulSoup'] | |
24 | |
25 from collections import Counter | |
26 import os | |
27 import re | |
28 import sys | |
29 import traceback | |
30 import warnings | |
31 | |
32 from .builder import builder_registry, ParserRejectedMarkup | |
33 from .dammit import UnicodeDammit | |
34 from .element import ( | |
35 CData, | |
36 Comment, | |
37 DEFAULT_OUTPUT_ENCODING, | |
38 Declaration, | |
39 Doctype, | |
40 NavigableString, | |
41 PageElement, | |
42 ProcessingInstruction, | |
43 PYTHON_SPECIFIC_ENCODINGS, | |
44 ResultSet, | |
45 Script, | |
46 Stylesheet, | |
47 SoupStrainer, | |
48 Tag, | |
49 TemplateString, | |
50 ) | |
51 | |
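52 # Historical guard: presumably the unconverted Python 2 source used a Python 2-only operator here so that Python 3 would fail on this line and show the message. | |
53 # In this converted copy the statement below only compares two string literals and discards the result, so it has no effect at runtime. | |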
54 'You are trying to run the Python 2 version of Beautiful Soup under Python 3. This will not work.'!='You need to convert the code, either by installing it (`python setup.py install`) or by running 2to3 (`2to3 -w bs4`).' | |
55 | |
56 # Define some custom warnings. | |
class GuessedAtParserWarning(UserWarning):
    """Warning category used when BeautifulSoup had to pick a parser itself.

    This normally happens because the caller did not name a parser when
    constructing the BeautifulSoup object.
    """
61 | |
class MarkupResemblesLocatorWarning(UserWarning):
    """Warning category used when the 'markup' looks like a resource locator.

    That is, the value handed to BeautifulSoup resembles a URL or the path
    of a file on disk rather than an actual document.
    """
67 | |
68 | |
class BeautifulSoup(Tag):
    """A data structure representing a parsed HTML or XML document.

    Most of the methods you'll call on a BeautifulSoup object are inherited from
    PageElement or Tag.

    Internally, this class defines the basic interface called by the
    tree builders when converting an HTML/XML document into a data
    structure. The interface abstracts away the differences between
    parsers. To write a new tree builder, you'll need to understand
    these methods as a whole.

    These methods will be called by the BeautifulSoup constructor:
      * reset()
      * feed(markup)

    The tree builder may call these methods from its feed() implementation:
      * handle_starttag(name, attrs) # See note about return value
      * handle_endtag(name)
      * handle_data(data) # Appends to the current data node
      * endData(containerClass) # Ends the current data node

    No matter how complicated the underlying parser is, you should be
    able to build a tree using 'start tag' events, 'end tag' events,
    'data' events, and "done with data" events.

    If you encounter an empty-element tag (aka a self-closing tag,
    like HTML's <br> tag), call handle_starttag and then
    handle_endtag.
    """

    # Since BeautifulSoup subclasses Tag, it's possible to treat it as
    # a Tag with a .name. This name makes it clear the BeautifulSoup
    # object isn't a real markup tag.
    ROOT_TAG_NAME = '[document]'

    # If the end-user gives no indication which tree builder they
    # want, look for one with these features.
    DEFAULT_BUILDER_FEATURES = ['html', 'fast']

    # A string containing all ASCII whitespace characters, used in
    # endData() to detect data chunks that seem 'empty'.
    ASCII_SPACES = '\x20\x0a\x09\x0c\x0d'

    NO_PARSER_SPECIFIED_WARNING = "No parser was explicitly specified, so I'm using the best available %(markup_type)s parser for this system (\"%(parser)s\"). This usually isn't a problem, but if you run this code on another system, or in a different virtual environment, it may use a different parser and behave differently.\n\nThe code that caused this warning is on line %(line_number)s of the file %(filename)s. To get rid of this warning, pass the additional argument 'features=\"%(parser)s\"' to the BeautifulSoup constructor.\n"

    def __init__(self, markup="", features=None, builder=None,
                 parse_only=None, from_encoding=None, exclude_encodings=None,
                 element_classes=None, **kwargs):
        """Constructor.

        :param markup: A string or a file-like object representing
         markup to be parsed.

        :param features: Desirable features of the parser to be
         used. This may be the name of a specific parser ("lxml",
         "lxml-xml", "html.parser", or "html5lib") or it may be the
         type of markup to be used ("html", "html5", "xml"). It's
         recommended that you name a specific parser, so that
         Beautiful Soup gives you the same results across platforms
         and virtual environments.

        :param builder: A TreeBuilder subclass to instantiate (or
         instance to use) instead of looking one up based on
         `features`. You only need to use this if you've implemented a
         custom TreeBuilder.

        :param parse_only: A SoupStrainer. Only parts of the document
         matching the SoupStrainer will be considered. This is useful
         when parsing part of a document that would otherwise be too
         large to fit into memory.

        :param from_encoding: A string indicating the encoding of the
         document to be parsed. Pass this in if Beautiful Soup is
         guessing wrongly about the document's encoding.

        :param exclude_encodings: A list of strings indicating
         encodings known to be wrong. Pass this in if you don't know
         the document's encoding but you know Beautiful Soup's guess is
         wrong.

        :param element_classes: A dictionary mapping BeautifulSoup
         classes like Tag and NavigableString, to other classes you'd
         like to be instantiated instead as the parse tree is
         built. This is useful for subclassing Tag or NavigableString
         to modify default behavior.

        :param kwargs: For backwards compatibility purposes, the
         constructor accepts certain keyword arguments used in
         Beautiful Soup 3. None of these arguments do anything in
         Beautiful Soup 4; they will result in a warning and then be
         ignored.

         Apart from this, any keyword arguments passed into the
         BeautifulSoup constructor are propagated to the TreeBuilder
         constructor. This makes it possible to configure a
         TreeBuilder by passing in arguments, not just by saying which
         one to use.

        :raises FeatureNotFound: If no tree builder matches `features`.
        :raises ParserRejectedMarkup: If every markup/encoding candidate
         offered by the builder was rejected by the parser.
        """
        # Beautiful Soup 3 arguments: warn, then drop them so they are
        # not forwarded to the TreeBuilder constructor.
        if 'convertEntities' in kwargs:
            del kwargs['convertEntities']
            warnings.warn(
                "BS4 does not respect the convertEntities argument to the "
                "BeautifulSoup constructor. Entities are always converted "
                "to Unicode characters.")

        if 'markupMassage' in kwargs:
            del kwargs['markupMassage']
            warnings.warn(
                "BS4 does not respect the markupMassage argument to the "
                "BeautifulSoup constructor. The tree builder is responsible "
                "for any necessary markup massage.")

        if 'smartQuotesTo' in kwargs:
            del kwargs['smartQuotesTo']
            warnings.warn(
                "BS4 does not respect the smartQuotesTo argument to the "
                "BeautifulSoup constructor. Smart quotes are always converted "
                "to Unicode characters.")

        if 'selfClosingTags' in kwargs:
            del kwargs['selfClosingTags']
            warnings.warn(
                "BS4 does not respect the selfClosingTags argument to the "
                "BeautifulSoup constructor. The tree builder is responsible "
                "for understanding self-closing tags.")

        if 'isHTML' in kwargs:
            del kwargs['isHTML']
            warnings.warn(
                "BS4 does not respect the isHTML argument to the "
                "BeautifulSoup constructor. Suggest you use "
                "features='lxml' for HTML and features='lxml-xml' for "
                "XML.")

        def deprecated_argument(old_name, new_name):
            """Pop a renamed keyword argument from kwargs, warning if present."""
            if old_name in kwargs:
                warnings.warn(
                    'The "%s" argument to the BeautifulSoup constructor '
                    'has been renamed to "%s."' % (old_name, new_name))
                value = kwargs[old_name]
                del kwargs[old_name]
                return value
            return None

        parse_only = parse_only or deprecated_argument(
            "parseOnlyThese", "parse_only")

        from_encoding = from_encoding or deprecated_argument(
            "fromEncoding", "from_encoding")

        if from_encoding and isinstance(markup, str):
            warnings.warn("You provided Unicode markup but also provided a value for from_encoding. Your from_encoding will be ignored.")
            from_encoding = None

        self.element_classes = element_classes or dict()

        # We need this information to track whether or not the builder
        # was specified well enough that we can omit the 'you need to
        # specify a parser' warning.
        original_builder = builder
        original_features = features

        if isinstance(builder, type):
            # A builder class was passed in; it needs to be instantiated.
            builder_class = builder
            builder = None
        elif builder is None:
            if isinstance(features, str):
                features = [features]
            if features is None or len(features) == 0:
                features = self.DEFAULT_BUILDER_FEATURES
            builder_class = builder_registry.lookup(*features)
            if builder_class is None:
                raise FeatureNotFound(
                    "Couldn't find a tree builder with the features you "
                    "requested: %s. Do you need to install a parser library?"
                    % ",".join(features))

        # At this point either we have a TreeBuilder instance in
        # builder, or we have a builder_class that we can instantiate
        # with the remaining **kwargs.
        if builder is None:
            builder = builder_class(**kwargs)
            if not original_builder and not (
                    original_features == builder.NAME or
                    original_features in builder.ALTERNATE_NAMES
            ) and markup:
                # The user did not tell us which TreeBuilder to use,
                # and we had to guess. Issue a warning.
                if builder.is_xml:
                    markup_type = "XML"
                else:
                    markup_type = "HTML"

                # This code adapted from warnings.py so that we get the same line
                # of code as our warnings.warn() call gets, even if the answer is wrong
                # (as it may be in a multithreading situation).
                # It must stay inline: both the frame depth below and the
                # stacklevel passed to warnings.warn() assume that the
                # caller is exactly one frame above this constructor.
                caller = None
                try:
                    caller = sys._getframe(1)
                except ValueError:
                    pass
                if caller:
                    caller_globals = caller.f_globals
                    line_number = caller.f_lineno
                else:
                    caller_globals = sys.__dict__
                    line_number = 1
                filename = caller_globals.get('__file__')
                if filename:
                    fnl = filename.lower()
                    if fnl.endswith((".pyc", ".pyo")):
                        # Report the source file rather than the compiled one.
                        filename = filename[:-1]
                if filename:
                    # If there is no filename at all, the user is most likely in a REPL,
                    # and the warning is not necessary.
                    values = dict(
                        filename=filename,
                        line_number=line_number,
                        parser=builder.NAME,
                        markup_type=markup_type
                    )
                    warnings.warn(
                        self.NO_PARSER_SPECIFIED_WARNING % values,
                        GuessedAtParserWarning, stacklevel=2
                    )
        else:
            if kwargs:
                warnings.warn("Keyword arguments to the BeautifulSoup constructor will be ignored. These would normally be passed into the TreeBuilder constructor, but a TreeBuilder instance was passed in as `builder`.")

        self.builder = builder
        self.is_xml = builder.is_xml
        self.known_xml = self.is_xml
        self._namespaces = dict()
        self.parse_only = parse_only

        self.builder.initialize_soup(self)

        if hasattr(markup, 'read'):        # It's a file-type object.
            markup = markup.read()
        elif len(markup) <= 256 and (
                (isinstance(markup, bytes) and b'<' not in markup)
                or (isinstance(markup, str) and '<' not in markup)
        ):
            # Print out warnings for a couple beginner problems
            # involving passing non-markup to Beautiful Soup.
            # Beautiful Soup will still parse the input as markup,
            # just in case that's what the user really wants.
            if (isinstance(markup, str)
                    and not os.path.supports_unicode_filenames):
                possible_filename = markup.encode("utf8")
            else:
                possible_filename = markup
            is_file = False
            try:
                is_file = os.path.exists(possible_filename)
            except Exception:
                # This is almost certainly a problem involving
                # characters not valid in filenames on this
                # system. Just let it go.
                pass
            if is_file:
                warnings.warn(
                    '"%s" looks like a filename, not markup. You should'
                    ' probably open this file and pass the filehandle into'
                    ' Beautiful Soup.' % self._decode_markup(markup),
                    MarkupResemblesLocatorWarning
                )
            self._check_markup_is_url(markup)

        # Try each (markup, encoding) candidate offered by the builder
        # until one of them parses without being rejected.
        rejections = []
        success = False
        for (self.markup, self.original_encoding, self.declared_html_encoding,
             self.contains_replacement_characters) in (
                 self.builder.prepare_markup(
                     markup, from_encoding, exclude_encodings=exclude_encodings)):
            self.reset()
            try:
                self._feed()
                success = True
                break
            except ParserRejectedMarkup as e:
                rejections.append(e)

        if not success:
            other_exceptions = [str(e) for e in rejections]
            raise ParserRejectedMarkup(
                "The markup you provided was rejected by the parser. Trying a different parser or a different encoding may help.\n\nOriginal exception(s) from parser:\n " + "\n ".join(other_exceptions)
            )

        # Clear out the markup and remove the builder's circular
        # reference to this object.
        self.markup = None
        self.builder.soup = None

    def __copy__(self):
        """Copy a BeautifulSoup object by converting the document to a string and parsing it again."""
        copy = type(self)(
            self.encode('utf-8'), builder=self.builder, from_encoding='utf-8'
        )

        # Although we encoded the tree to UTF-8, that may not have
        # been the encoding of the original markup. Set the copy's
        # .original_encoding to reflect the original object's
        # .original_encoding.
        copy.original_encoding = self.original_encoding
        return copy

    def __getstate__(self):
        """Return a copy of the instance dict to be used as pickle state.

        The 'builder' entry is replaced with None when the builder
        reports that it is not picklable.
        """
        # Frequently a tree builder can't be pickled.
        d = dict(self.__dict__)
        if 'builder' in d and not self.builder.picklable:
            d['builder'] = None
        return d

    @classmethod
    def _decode_markup(cls, markup):
        """Ensure `markup` is a Unicode string so it's safe to send into
        warnings.warn.

        Bytes are decoded as UTF-8, with undecodable sequences
        replaced; any other value is returned unchanged.

        TODO: warnings.warn had this problem back in 2010 but it might not
        anymore.

        :param markup: The markup, as bytes or a string.
        :return: The decoded markup.
        """
        if isinstance(markup, bytes):
            decoded = markup.decode('utf-8', 'replace')
        else:
            decoded = markup
        return decoded

    @classmethod
    def _check_markup_is_url(cls, markup):
        """Error-handling method to raise a warning if incoming markup looks
        like a URL.

        A MarkupResemblesLocatorWarning is issued when the markup starts
        with "http:" or "https:" and contains no space. Values that are
        neither bytes nor str are ignored.

        :param markup: A string.
        """
        if isinstance(markup, bytes):
            space = b' '
            cant_start_with = (b"http:", b"https:")
        elif isinstance(markup, str):
            space = ' '
            cant_start_with = ("http:", "https:")
        else:
            return

        # startswith() accepts a tuple of candidate prefixes.
        if markup.startswith(cant_start_with) and space not in markup:
            warnings.warn(
                '"%s" looks like a URL. Beautiful Soup is not an'
                ' HTTP client. You should probably use an HTTP client like'
                ' requests to get the document behind the URL, and feed'
                ' that document to Beautiful Soup.' % cls._decode_markup(
                    markup
                ),
                MarkupResemblesLocatorWarning
            )

    def _feed(self):
        """Internal method that parses previously set markup, creating a large
        number of Tag and NavigableString objects.
        """
        # Start the builder from a clean state before feeding it.
        self.builder.reset()

        self.builder.feed(self.markup)
        # Close out any unfinished strings and close all the open tags.
        self.endData()
        while self.currentTag.name != self.ROOT_TAG_NAME:
            self.popTag()

    def reset(self):
        """Reset this object to a state as though it had never parsed any
        markup.
        """
        Tag.__init__(self, self, self.builder, self.ROOT_TAG_NAME)
        self.hidden = 1
        self.builder.reset()
        self.current_data = []
        self.currentTag = None
        self.tagStack = []
        self.open_tag_counter = Counter()
        self.preserve_whitespace_tag_stack = []
        self.string_container_stack = []
        # The soup object itself is the bottom of the tag stack.
        self.pushTag(self)

    def new_tag(self, name, namespace=None, nsprefix=None, attrs=None,
                sourceline=None, sourcepos=None, **kwattrs):
        """Create a new Tag associated with this BeautifulSoup object.

        :param name: The name of the new Tag.
        :param namespace: The URI of the new Tag's XML namespace, if any.
        :param nsprefix: The prefix for the new Tag's XML namespace, if any.
        :param attrs: A dictionary of this Tag's attribute values; can
            be used instead of `kwattrs` for attributes like 'class'
            that are reserved words in Python. Omitting it (or passing
            None) means no extra attributes.
        :param sourceline: The line number where this tag was
            (purportedly) found in its source document.
        :param sourcepos: The character position within `sourceline` where this
            tag was (purportedly) found.
        :param kwattrs: Keyword arguments for the new Tag's attribute values.
        :return: The newly created tag object.
        """
        # `attrs` defaults to None rather than a shared mutable dict.
        # Its entries take precedence over same-named keyword arguments.
        if attrs is not None:
            kwattrs.update(attrs)
        return self.element_classes.get(Tag, Tag)(
            None, self.builder, name, namespace, nsprefix, kwattrs,
            sourceline=sourceline, sourcepos=sourcepos
        )

    def string_container(self, base_class=None):
        """Choose the class used to instantiate a new string node.

        :param base_class: The preferred class; NavigableString is used
            when this is omitted.
        :return: The class to instantiate.
        """
        container = base_class or NavigableString

        # There may be a general override of NavigableString.
        container = self.element_classes.get(
            container, container
        )

        # On top of that, we may be inside a tag that needs a special
        # container class.
        if self.string_container_stack:
            container = self.builder.string_containers.get(
                self.string_container_stack[-1].name, container
            )
        return container

    def new_string(self, s, subclass=None):
        """Create a new NavigableString associated with this BeautifulSoup
        object.

        :param s: The text of the new string.
        :param subclass: Preferred string class, resolved through
            string_container().
        """
        container = self.string_container(subclass)
        return container(s)

    def insert_before(self, *args):
        """This method is part of the PageElement API, but `BeautifulSoup` doesn't implement
        it because there is nothing before or after it in the parse tree.

        :raises NotImplementedError: Always.
        """
        raise NotImplementedError("BeautifulSoup objects don't support insert_before().")

    def insert_after(self, *args):
        """This method is part of the PageElement API, but `BeautifulSoup` doesn't implement
        it because there is nothing before or after it in the parse tree.

        :raises NotImplementedError: Always.
        """
        raise NotImplementedError("BeautifulSoup objects don't support insert_after().")

    def popTag(self):
        """Internal method called by _popToTag when a tag is closed.

        :return: `self.currentTag` as it stands after the pop, i.e. the
            new top of the tag stack when the stack is not empty.
        """
        tag = self.tagStack.pop()
        if tag.name in self.open_tag_counter:
            self.open_tag_counter[tag.name] -= 1
        if self.preserve_whitespace_tag_stack and tag == self.preserve_whitespace_tag_stack[-1]:
            self.preserve_whitespace_tag_stack.pop()
        if self.string_container_stack and tag == self.string_container_stack[-1]:
            self.string_container_stack.pop()
        #print("Pop", tag.name)
        if self.tagStack:
            self.currentTag = self.tagStack[-1]
        return self.currentTag

    def pushTag(self, tag):
        """Internal method called by handle_starttag when a tag is opened.

        :param tag: The tag to make the current tag.
        """
        #print("Push", tag.name)
        if self.currentTag is not None:
            self.currentTag.contents.append(tag)
        self.tagStack.append(tag)
        self.currentTag = self.tagStack[-1]
        # The root pseudo-tag is never counted as an open tag.
        if tag.name != self.ROOT_TAG_NAME:
            self.open_tag_counter[tag.name] += 1
        if tag.name in self.builder.preserve_whitespace_tags:
            self.preserve_whitespace_tag_stack.append(tag)
        if tag.name in self.builder.string_containers:
            self.string_container_stack.append(tag)

    def endData(self, containerClass=None):
        """Method called by the TreeBuilder when the end of a data segment
        occurs.

        The collected data chunks are joined into one string node and
        added to the tree, unless an active SoupStrainer rejects them.

        :param containerClass: Preferred class for the string node,
            resolved through string_container().
        """
        containerClass = self.string_container(containerClass)

        if self.current_data:
            current_data = ''.join(self.current_data)
            # If whitespace is not preserved, and this string contains
            # nothing but ASCII spaces, replace it with a single space
            # or newline.
            if not self.preserve_whitespace_tag_stack:
                if all(char in self.ASCII_SPACES for char in current_data):
                    current_data = '\n' if '\n' in current_data else ' '

            # Reset the data collector.
            self.current_data = []

            # Should we add this string to the tree at all?
            if self.parse_only and len(self.tagStack) <= 1 and \
                   (not self.parse_only.text or
                    not self.parse_only.search(current_data)):
                return

            o = containerClass(current_data)
            self.object_was_parsed(o)

    def object_was_parsed(self, o, parent=None, most_recent_element=None):
        """Method called by the TreeBuilder to integrate an object into the parse tree.

        :param o: The object to add.
        :param parent: The tag to append `o` to; defaults to the
            current tag.
        :param most_recent_element: The element to record as coming
            before `o`; defaults to the most recently parsed element.
        """
        if parent is None:
            parent = self.currentTag
        if most_recent_element is not None:
            previous_element = most_recent_element
        else:
            previous_element = self._most_recent_element

        next_element = previous_sibling = next_sibling = None
        if isinstance(o, Tag):
            # Keep whatever linkage the tag already carries.
            next_element = o.next_element
            next_sibling = o.next_sibling
            previous_sibling = o.previous_sibling
            if previous_element is None:
                previous_element = o.previous_element

        # Must be computed before setup() and the append below.
        fix = parent.next_element is not None

        o.setup(parent, previous_element, next_element, previous_sibling, next_sibling)

        self._most_recent_element = o
        parent.contents.append(o)

        # Check if we are inserting into an already parsed node.
        if fix:
            self._linkage_fixer(parent)

    def _linkage_fixer(self, el):
        """Make sure linkage of this fragment is sound.

        :param el: The element whose last child was just appended.
        """

        first = el.contents[0]
        child = el.contents[-1]
        descendant = child

        if child is first and el.parent is not None:
            # Parent should be linked to first child
            el.next_element = child
            # We are no longer linked to whatever this element is
            prev_el = child.previous_element
            if prev_el is not None and prev_el is not el:
                prev_el.next_element = None
            # First child should be linked to the parent, and no previous siblings.
            child.previous_element = el
            child.previous_sibling = None

        # We have no sibling as we've been appended as the last.
        child.next_sibling = None

        # This index is a tag, dig deeper for a "last descendant"
        if isinstance(child, Tag) and child.contents:
            descendant = child._last_descendant(False)

        # As the final step, link last descendant. It should be linked
        # to the parent's next sibling (if found), else walk up the chain
        # and find a parent with a sibling. It should have no next sibling.
        descendant.next_element = None
        descendant.next_sibling = None
        target = el
        while True:
            if target is None:
                break
            elif target.next_sibling is not None:
                descendant.next_element = target.next_sibling
                # NOTE(review): the back-link points at `child`, not at
                # `descendant`; the two differ when `child` has
                # descendants -- confirm this is intended.
                target.next_sibling.previous_element = child
                break
            target = target.parent

    def _popToTag(self, name, nsprefix=None, inclusivePop=True):
        """Pops the tag stack up to and including the most recent
        instance of the given tag.

        If there are no open tags with the given name, nothing will be
        popped.

        :param name: Pop up to the most recent tag with this name.
        :param nsprefix: The namespace prefix that goes with `name`.
        :param inclusivePop: If this is false, pops the tag stack up
          to but *not* including the most recent instance of the
          given tag.
        :return: The value returned by the last popTag() call, or None
          if nothing was popped.
        """
        #print("Popping to %s" % name)
        if name == self.ROOT_TAG_NAME:
            # The BeautifulSoup object itself can never be popped.
            return

        most_recently_popped = None

        stack_size = len(self.tagStack)
        # Index 0 is the BeautifulSoup object itself, so stop above it.
        for i in range(stack_size - 1, 0, -1):
            if not self.open_tag_counter.get(name):
                break
            t = self.tagStack[i]
            if (name == t.name and nsprefix == t.prefix):
                if inclusivePop:
                    most_recently_popped = self.popTag()
                break
            most_recently_popped = self.popTag()

        return most_recently_popped

    def handle_starttag(self, name, namespace, nsprefix, attrs, sourceline=None,
                        sourcepos=None):
        """Called by the tree builder when a new tag is encountered.

        :param name: Name of the tag.
        :param namespace: Namespace of the tag; passed to the Tag
            constructor.
        :param nsprefix: Namespace prefix for the tag.
        :param attrs: A dictionary of attribute values.
        :param sourceline: The line number where this tag was found in its
            source document.
        :param sourcepos: The character position within `sourceline` where this
            tag was found.

        If this method returns None, the tag was rejected by an active
        SoupStrainer. You should proceed as if the tag had not occurred
        in the document. For instance, if this was a self-closing tag,
        don't call handle_endtag.
        """
        # print("Start tag %s: %s" % (name, attrs))
        self.endData()

        if (self.parse_only and len(self.tagStack) <= 1
                and (self.parse_only.text
                     or not self.parse_only.search_tag(name, attrs))):
            return None

        tag = self.element_classes.get(Tag, Tag)(
            self, self.builder, name, namespace, nsprefix, attrs,
            self.currentTag, self._most_recent_element,
            sourceline=sourceline, sourcepos=sourcepos
        )
        if tag is None:
            return tag
        if self._most_recent_element is not None:
            self._most_recent_element.next_element = tag
        self._most_recent_element = tag
        self.pushTag(tag)
        return tag

    def handle_endtag(self, name, nsprefix=None):
        """Called by the tree builder when an ending tag is encountered.

        :param name: Name of the tag.
        :param nsprefix: Namespace prefix for the tag.
        """
        #print("End tag: " + name)
        self.endData()
        self._popToTag(name, nsprefix)

    def handle_data(self, data):
        """Called by the tree builder when a chunk of textual data is encountered.

        :param data: The chunk, buffered until endData() is called.
        """
        self.current_data.append(data)

    def decode(self, pretty_print=False,
               eventual_encoding=DEFAULT_OUTPUT_ENCODING,
               formatter="minimal"):
        """Returns a string or Unicode representation of the parse tree
            as an HTML or XML document.

        :param pretty_print: If this is True, indentation will be used to
            make the document more readable.
        :param eventual_encoding: The encoding of the final document.
            If this is None, the document will be a Unicode string.
        :param formatter: Passed through unchanged to Tag.decode().
        """
        if self.is_xml:
            # Print the XML declaration
            encoding_part = ''
            if eventual_encoding in PYTHON_SPECIFIC_ENCODINGS:
                # This is a special Python encoding; it can't actually
                # go into an XML document because it means nothing
                # outside of Python.
                eventual_encoding = None
            if eventual_encoding is not None:
                encoding_part = ' encoding="%s"' % eventual_encoding
            prefix = '<?xml version="1.0"%s?>\n' % encoding_part
        else:
            prefix = ''
        if not pretty_print:
            indent_level = None
        else:
            indent_level = 0
        return prefix + super(BeautifulSoup, self).decode(
            indent_level, eventual_encoding, formatter)
760 | |
# Short aliases so that getting started is quick, e.g. 'from bs4 import _soup'
_soup = BeautifulSoup
_s = BeautifulSoup
764 | |
class BeautifulStoneSoup(BeautifulSoup):
    """Deprecated interface to an XML parser."""

    def __init__(self, *args, **kwargs):
        """Force the 'xml' feature, warn about the deprecation, and delegate.

        Any `features` keyword supplied by the caller is overwritten.
        """
        kwargs['features'] = 'xml'
        deprecation_notice = (
            'The BeautifulStoneSoup class is deprecated. Instead of using '
            'it, pass features="xml" into the BeautifulSoup constructor.')
        warnings.warn(deprecation_notice)
        super().__init__(*args, **kwargs)
774 | |
775 | |
class StopParsing(Exception):
    """Raised by a TreeBuilder when it cannot go on parsing."""
779 | |
class FeatureNotFound(ValueError):
    """Raised by the BeautifulSoup constructor when no parser offering the
    requested features can be found.
    """
785 | |
786 | |
# When this file is executed as a script, it pretty-prints the HTML
# document read from standard input.
if __name__ == '__main__':
    import sys
    soup = BeautifulSoup(sys.stdin)
    print(soup.prettify())