Mercurial > repos > shellac > guppy_basecaller
comparison env/lib/python3.7/site-packages/bs4/__init__.py @ 0:26e78fe6e8c4 draft
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
| author | shellac |
|---|---|
| date | Sat, 02 May 2020 07:14:21 -0400 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:26e78fe6e8c4 |
|---|---|
| 1 """Beautiful Soup Elixir and Tonic - "The Screen-Scraper's Friend". | |
| 2 | |
| 3 http://www.crummy.com/software/BeautifulSoup/ | |
| 4 | |
| 5 Beautiful Soup uses a pluggable XML or HTML parser to parse a | |
| 6 (possibly invalid) document into a tree representation. Beautiful Soup | |
| 7 provides methods and Pythonic idioms that make it easy to navigate, | |
| 8 search, and modify the parse tree. | |
| 9 | |
| 10 Beautiful Soup works with Python 2.7 and up. It works better if lxml | |
| 11 and/or html5lib is installed. | |
| 12 | |
| 13 For more than you ever wanted to know about Beautiful Soup, see the | |
| 14 documentation: http://www.crummy.com/software/BeautifulSoup/bs4/doc/ | |
| 15 """ | |
| 16 | |
# Package metadata exposed as module-level dunder attributes.
__author__ = "Leonard Richardson (leonardr@segfault.org)"
__version__ = "4.9.0"
__copyright__ = "Copyright (c) 2004-2020 Leonard Richardson"
# Use of this source code is governed by the MIT license.
__license__ = "MIT"

# Names exported by `from bs4 import *`; only the main class is listed.
__all__ = ['BeautifulSoup']
| 24 | |
| 25 import os | |
| 26 import re | |
| 27 import sys | |
| 28 import traceback | |
| 29 import warnings | |
| 30 | |
| 31 from .builder import builder_registry, ParserRejectedMarkup | |
| 32 from .dammit import UnicodeDammit | |
| 33 from .element import ( | |
| 34 CData, | |
| 35 Comment, | |
| 36 DEFAULT_OUTPUT_ENCODING, | |
| 37 Declaration, | |
| 38 Doctype, | |
| 39 NavigableString, | |
| 40 PageElement, | |
| 41 ProcessingInstruction, | |
| 42 ResultSet, | |
| 43 SoupStrainer, | |
| 44 Tag, | |
| 45 ) | |
| 46 | |
# The very first thing we do is give a useful error if someone is
# running this code under Python 3 without converting it.
#
# NOTE(review): as written here this statement only compares two string
# literals with `!=` and discards the result, so it has no runtime effect.
# Presumably the unconverted Python 2 source used an operator that is a
# syntax error under Python 3, so the message appears in the traceback;
# confirm against the unconverted source.
'You are trying to run the Python 2 version of Beautiful Soup under Python 3. This will not work.'!='You need to convert the code, either by installing it (`python setup.py install`) or by running 2to3 (`2to3 -w bs4`).'
| 50 | |
class BeautifulSoup(Tag):
    """A data structure representing a parsed HTML or XML document.

    Most of the methods you'll call on a BeautifulSoup object are inherited from
    PageElement or Tag.

    Internally, this class defines the basic interface called by the
    tree builders when converting an HTML/XML document into a data
    structure. The interface abstracts away the differences between
    parsers. To write a new tree builder, you'll need to understand
    these methods as a whole.

    These methods will be called by the BeautifulSoup constructor:
      * reset()
      * _feed()

    The tree builder may call these methods from its feed() implementation:
      * handle_starttag(name, namespace, nsprefix, attrs) # See note about return value
      * handle_endtag(name)
      * handle_data(data) # Appends to the current data node
      * endData(containerClass) # Ends the current data node

    No matter how complicated the underlying parser is, you should be
    able to build a tree using 'start tag' events, 'end tag' events,
    'data' events, and "done with data" events.

    If you encounter an empty-element tag (aka a self-closing tag,
    like HTML's <br> tag), call handle_starttag and then
    handle_endtag.
    """

    # Since BeautifulSoup subclasses Tag, it's possible to treat it as
    # a Tag with a .name. This name makes it clear the BeautifulSoup
    # object isn't a real markup tag.
    ROOT_TAG_NAME = '[document]'

    # If the end-user gives no indication which tree builder they
    # want, look for one with these features.
    DEFAULT_BUILDER_FEATURES = ['html', 'fast']

    # A string containing all ASCII whitespace characters, used in
    # endData() to detect data chunks that seem 'empty'.
    ASCII_SPACES = '\x20\x0a\x09\x0c\x0d'

    NO_PARSER_SPECIFIED_WARNING = "No parser was explicitly specified, so I'm using the best available %(markup_type)s parser for this system (\"%(parser)s\"). This usually isn't a problem, but if you run this code on another system, or in a different virtual environment, it may use a different parser and behave differently.\n\nThe code that caused this warning is on line %(line_number)s of the file %(filename)s. To get rid of this warning, pass the additional argument 'features=\"%(parser)s\"' to the BeautifulSoup constructor.\n"

    def __init__(self, markup="", features=None, builder=None,
                 parse_only=None, from_encoding=None, exclude_encodings=None,
                 element_classes=None, **kwargs):
        """Constructor.

        :param markup: A string or a file-like object representing
         markup to be parsed.

        :param features: Desirable features of the parser to be
         used. This may be the name of a specific parser ("lxml",
         "lxml-xml", "html.parser", or "html5lib") or it may be the
         type of markup to be used ("html", "html5", "xml"). It's
         recommended that you name a specific parser, so that
         Beautiful Soup gives you the same results across platforms
         and virtual environments.

        :param builder: A TreeBuilder subclass to instantiate (or
         instance to use) instead of looking one up based on
         `features`. You only need to use this if you've implemented a
         custom TreeBuilder.

        :param parse_only: A SoupStrainer. Only parts of the document
         matching the SoupStrainer will be considered. This is useful
         when parsing part of a document that would otherwise be too
         large to fit into memory.

        :param from_encoding: A string indicating the encoding of the
         document to be parsed. Pass this in if Beautiful Soup is
         guessing wrongly about the document's encoding.

        :param exclude_encodings: A list of strings indicating
         encodings known to be wrong. Pass this in if you don't know
         the document's encoding but you know Beautiful Soup's guess is
         wrong.

        :param element_classes: A dictionary mapping BeautifulSoup
         classes like Tag and NavigableString, to other classes you'd
         like to be instantiated instead as the parse tree is
         built. This is useful for subclassing Tag or NavigableString
         to modify default behavior.

        :param kwargs: For backwards compatibility purposes, the
         constructor accepts certain keyword arguments used in
         Beautiful Soup 3. None of these arguments do anything in
         Beautiful Soup 4; they will result in a warning and then be
         ignored.

         Apart from this, any keyword arguments passed into the
         BeautifulSoup constructor are propagated to the TreeBuilder
         constructor. This makes it possible to configure a
         TreeBuilder by passing in arguments, not just by saying which
         one to use.

        :raises FeatureNotFound: If no registered tree builder offers
         the requested features.
        :raises ParserRejectedMarkup: If the builder rejects every
         candidate it produced from prepare_markup().
        """
        # Beautiful Soup 3 arguments: warn, then drop them so they are
        # not forwarded to the TreeBuilder constructor.
        if 'convertEntities' in kwargs:
            del kwargs['convertEntities']
            warnings.warn(
                "BS4 does not respect the convertEntities argument to the "
                "BeautifulSoup constructor. Entities are always converted "
                "to Unicode characters.")

        if 'markupMassage' in kwargs:
            del kwargs['markupMassage']
            warnings.warn(
                "BS4 does not respect the markupMassage argument to the "
                "BeautifulSoup constructor. The tree builder is responsible "
                "for any necessary markup massage.")

        if 'smartQuotesTo' in kwargs:
            del kwargs['smartQuotesTo']
            warnings.warn(
                "BS4 does not respect the smartQuotesTo argument to the "
                "BeautifulSoup constructor. Smart quotes are always converted "
                "to Unicode characters.")

        if 'selfClosingTags' in kwargs:
            del kwargs['selfClosingTags']
            warnings.warn(
                "BS4 does not respect the selfClosingTags argument to the "
                "BeautifulSoup constructor. The tree builder is responsible "
                "for understanding self-closing tags.")

        if 'isHTML' in kwargs:
            del kwargs['isHTML']
            warnings.warn(
                "BS4 does not respect the isHTML argument to the "
                "BeautifulSoup constructor. Suggest you use "
                "features='lxml' for HTML and features='lxml-xml' for "
                "XML.")

        def deprecated_argument(old_name, new_name):
            """Pop a renamed keyword argument, warning about the rename."""
            if old_name in kwargs:
                warnings.warn(
                    'The "%s" argument to the BeautifulSoup constructor '
                    'has been renamed to "%s."' % (old_name, new_name))
                return kwargs.pop(old_name)
            return None

        # The old spelling is only consulted when the new one is falsy.
        parse_only = parse_only or deprecated_argument(
            "parseOnlyThese", "parse_only")

        from_encoding = from_encoding or deprecated_argument(
            "fromEncoding", "from_encoding")

        if from_encoding and isinstance(markup, str):
            warnings.warn("You provided Unicode markup but also provided a value for from_encoding. Your from_encoding will be ignored.")
            from_encoding = None

        self.element_classes = element_classes or dict()

        # We need this information to track whether or not the builder
        # was specified well enough that we can omit the 'you need to
        # specify a parser' warning.
        original_builder = builder
        original_features = features

        if isinstance(builder, type):
            # A builder class was passed in; it needs to be instantiated.
            builder_class = builder
            builder = None
        elif builder is None:
            if isinstance(features, str):
                features = [features]
            if features is None or len(features) == 0:
                features = self.DEFAULT_BUILDER_FEATURES
            builder_class = builder_registry.lookup(*features)
            if builder_class is None:
                raise FeatureNotFound(
                    "Couldn't find a tree builder with the features you "
                    "requested: %s. Do you need to install a parser library?"
                    % ",".join(features))

        # At this point either we have a TreeBuilder instance in
        # builder, or we have a builder_class that we can instantiate
        # with the remaining **kwargs.
        if builder is None:
            builder = builder_class(**kwargs)
            if not original_builder and not (
                    original_features == builder.NAME or
                    original_features in builder.ALTERNATE_NAMES
            ):
                if builder.is_xml:
                    markup_type = "XML"
                else:
                    markup_type = "HTML"

                # This code adapted from warnings.py so that we get the same line
                # of code as our warnings.warn() call gets, even if the answer is wrong
                # (as it may be in a multithreading situation).
                caller = None
                try:
                    caller = sys._getframe(1)
                except ValueError:
                    pass
                if caller:
                    caller_globals = caller.f_globals
                    line_number = caller.f_lineno
                else:
                    caller_globals = sys.__dict__
                    line_number = 1
                filename = caller_globals.get('__file__')
                if filename:
                    fnl = filename.lower()
                    if fnl.endswith((".pyc", ".pyo")):
                        # Report the source file rather than the
                        # compiled one by dropping the final character.
                        filename = filename[:-1]
                if filename:
                    # If there is no filename at all, the user is most likely in a REPL,
                    # and the warning is not necessary.
                    values = dict(
                        filename=filename,
                        line_number=line_number,
                        parser=builder.NAME,
                        markup_type=markup_type
                    )
                    warnings.warn(self.NO_PARSER_SPECIFIED_WARNING % values, stacklevel=2)
        else:
            if kwargs:
                warnings.warn("Keyword arguments to the BeautifulSoup constructor will be ignored. These would normally be passed into the TreeBuilder constructor, but a TreeBuilder instance was passed in as `builder`.")

        self.builder = builder
        self.is_xml = builder.is_xml
        self.known_xml = self.is_xml
        self._namespaces = dict()
        self.parse_only = parse_only

        self.builder.initialize_soup(self)

        # NOTE(review): len(markup) raises TypeError for markup that has
        # neither .read() nor a length (e.g. None).
        if hasattr(markup, 'read'):        # It's a file-type object.
            markup = markup.read()
        elif len(markup) <= 256 and (
                (isinstance(markup, bytes) and b'<' not in markup)
                or (isinstance(markup, str) and '<' not in markup)
        ):
            # Print out warnings for a couple beginner problems
            # involving passing non-markup to Beautiful Soup.
            # Beautiful Soup will still parse the input as markup,
            # just in case that's what the user really wants.
            if (isinstance(markup, str)
                    and not os.path.supports_unicode_filenames):
                possible_filename = markup.encode("utf8")
            else:
                possible_filename = markup
            is_file = False
            try:
                is_file = os.path.exists(possible_filename)
            except Exception:
                # This is almost certainly a problem involving
                # characters not valid in filenames on this
                # system. Just let it go: this check is best-effort.
                pass
            if is_file:
                warnings.warn(
                    '"%s" looks like a filename, not markup. You should'
                    ' probably open this file and pass the filehandle into'
                    ' Beautiful Soup.' % self._decode_markup(markup)
                )
            self._check_markup_is_url(markup)

        # Try each candidate the builder offers until one parses.
        rejections = []
        success = False
        for (self.markup, self.original_encoding, self.declared_html_encoding,
             self.contains_replacement_characters) in (
                 self.builder.prepare_markup(
                     markup, from_encoding, exclude_encodings=exclude_encodings)):
            self.reset()
            try:
                self._feed()
                success = True
                break
            except ParserRejectedMarkup as e:
                rejections.append(e)

        if not success:
            other_exceptions = [str(e) for e in rejections]
            raise ParserRejectedMarkup(
                "The markup you provided was rejected by the parser. Trying a different parser or a different encoding may help.\n\nOriginal exception(s) from parser:\n " + "\n ".join(other_exceptions)
            )

        # Clear out the markup and remove the builder's circular
        # reference to this object.
        self.markup = None
        self.builder.soup = None

    def __copy__(self):
        """Copy a BeautifulSoup object by converting the document to a string and parsing it again."""
        copy = type(self)(
            self.encode('utf-8'), builder=self.builder, from_encoding='utf-8'
        )

        # Although we encoded the tree to UTF-8, that may not have
        # been the encoding of the original markup. Set the copy's
        # .original_encoding to reflect the original object's
        # .original_encoding.
        copy.original_encoding = self.original_encoding
        return copy

    def __getstate__(self):
        """Return a copy of the instance dict for pickling.

        The 'builder' entry is replaced with None when the builder
        reports itself as not picklable.
        """
        # Frequently a tree builder can't be pickled.
        d = dict(self.__dict__)
        if 'builder' in d and not self.builder.picklable:
            d['builder'] = None
        return d

    @classmethod
    def _decode_markup(cls, markup):
        """Return `markup` as a str so it's safe to send into warnings.warn.

        Bytes are decoded as UTF-8 with undecodable sequences replaced;
        any other value is returned unchanged.

        TODO: warnings.warn had this problem back in 2010 but it might not
        anymore.
        """
        if isinstance(markup, bytes):
            decoded = markup.decode('utf-8', 'replace')
        else:
            decoded = markup
        return decoded

    @classmethod
    def _check_markup_is_url(cls, markup):
        """Error-handling method to raise a warning if incoming markup looks
        like a URL.

        Markup "looks like a URL" when it starts with "http:" or
        "https:" and contains no space character. Values that are
        neither bytes nor str are ignored.

        :param markup: A string.
        """
        if isinstance(markup, bytes):
            space = b' '
            cant_start_with = (b"http:", b"https:")
        elif isinstance(markup, str):
            space = ' '
            cant_start_with = ("http:", "https:")
        else:
            return

        # startswith() accepts a tuple of candidate prefixes.
        if markup.startswith(cant_start_with) and space not in markup:
            warnings.warn(
                '"%s" looks like a URL. Beautiful Soup is not an'
                ' HTTP client. You should probably use an HTTP client like'
                ' requests to get the document behind the URL, and feed'
                ' that document to Beautiful Soup.' % cls._decode_markup(
                    markup
                )
            )

    def _feed(self):
        """Internal method that parses previously set markup, creating a large
        number of Tag and NavigableString objects.
        """
        # Start the builder from a clean state, then hand it the markup.
        self.builder.reset()

        self.builder.feed(self.markup)
        # Close out any unfinished strings and close all the open tags.
        self.endData()
        while self.currentTag.name != self.ROOT_TAG_NAME:
            self.popTag()

    def reset(self):
        """Reset this object to a state as though it had never parsed any
        markup.
        """
        Tag.__init__(self, self, self.builder, self.ROOT_TAG_NAME)
        self.hidden = 1
        self.builder.reset()
        self.current_data = []
        self.currentTag = None
        self.tagStack = []
        self.preserve_whitespace_tag_stack = []
        self.string_container_stack = []
        # The soup object itself is the bottom of the tag stack.
        self.pushTag(self)

    def new_tag(self, name, namespace=None, nsprefix=None, attrs={},
                sourceline=None, sourcepos=None, **kwattrs):
        """Create a new Tag associated with this BeautifulSoup object.

        Attributes may be given through `attrs`, through extra keyword
        arguments, or both; entries in `attrs` win on a name clash.
        The Tag class is looked up in `element_classes` first.
        """
        # `attrs` is only read, never mutated, so the shared default
        # dict is harmless here.
        kwattrs.update(attrs)
        return self.element_classes.get(Tag, Tag)(
            None, self.builder, name, namespace, nsprefix, kwattrs,
            sourceline=sourceline, sourcepos=sourcepos
        )

    def string_container(self, base_class=None):
        """Pick the class used to hold a string at the current position.

        :param base_class: The preferred class; defaults to
         NavigableString.
        :return: The class to instantiate.
        """
        container = base_class or NavigableString

        # There may be a general override of NavigableString.
        container = self.element_classes.get(
            container, container
        )

        # On top of that, we may be inside a tag that needs a special
        # container class.
        if self.string_container_stack:
            container = self.builder.string_containers.get(
                self.string_container_stack[-1].name, container
            )
        return container

    def new_string(self, s, subclass=None):
        """Create a new NavigableString associated with this BeautifulSoup
        object.
        """
        container = self.string_container(subclass)
        return container(s)

    def insert_before(self, successor):
        """This method is part of the PageElement API, but `BeautifulSoup` doesn't implement
        it because there is nothing before or after it in the parse tree.
        """
        raise NotImplementedError("BeautifulSoup objects don't support insert_before().")

    def insert_after(self, successor):
        """This method is part of the PageElement API, but `BeautifulSoup` doesn't implement
        it because there is nothing before or after it in the parse tree.
        """
        raise NotImplementedError("BeautifulSoup objects don't support insert_after().")

    def popTag(self):
        """Internal method called by _popToTag when a tag is closed.

        :return: The tag that is current after the pop.
        """
        tag = self.tagStack.pop()
        if self.preserve_whitespace_tag_stack and tag == self.preserve_whitespace_tag_stack[-1]:
            self.preserve_whitespace_tag_stack.pop()
        if self.string_container_stack and tag == self.string_container_stack[-1]:
            self.string_container_stack.pop()
        if self.tagStack:
            self.currentTag = self.tagStack[-1]
        return self.currentTag

    def pushTag(self, tag):
        """Internal method called by handle_starttag when a tag is opened."""
        if self.currentTag is not None:
            self.currentTag.contents.append(tag)
        self.tagStack.append(tag)
        self.currentTag = self.tagStack[-1]
        if tag.name in self.builder.preserve_whitespace_tags:
            self.preserve_whitespace_tag_stack.append(tag)
        if tag.name in self.builder.string_containers:
            self.string_container_stack.append(tag)

    def endData(self, containerClass=None):
        """Method called by the TreeBuilder when the end of a data segment
        occurs.

        :param containerClass: Preferred class for the resulting string
         object; resolved through string_container().
        """
        containerClass = self.string_container(containerClass)

        if self.current_data:
            current_data = ''.join(self.current_data)
            # If whitespace is not preserved, and this string contains
            # nothing but ASCII spaces, replace it with a single space
            # or newline.
            if not self.preserve_whitespace_tag_stack:
                strippable = all(
                    char in self.ASCII_SPACES for char in current_data
                )
                if strippable:
                    if '\n' in current_data:
                        current_data = '\n'
                    else:
                        current_data = ' '

            # Reset the data collector.
            self.current_data = []

            # Should we add this string to the tree at all?
            if (self.parse_only and len(self.tagStack) <= 1
                    and (not self.parse_only.text
                         or not self.parse_only.search(current_data))):
                return

            o = containerClass(current_data)
            self.object_was_parsed(o)

    def object_was_parsed(self, o, parent=None, most_recent_element=None):
        """Method called by the TreeBuilder to integrate an object into the parse tree.

        :param o: The object to add.
        :param parent: Its parent; defaults to the current tag.
        :param most_recent_element: The element to treat as parsed
         immediately before `o`; defaults to the one recorded on this
         object.
        """
        if parent is None:
            parent = self.currentTag
        if most_recent_element is not None:
            previous_element = most_recent_element
        else:
            previous_element = self._most_recent_element

        next_element = previous_sibling = next_sibling = None
        if isinstance(o, Tag):
            # A Tag may already carry linkage; keep it.
            next_element = o.next_element
            next_sibling = o.next_sibling
            previous_sibling = o.previous_sibling
            if previous_element is None:
                previous_element = o.previous_element

        # Must be computed before the append below.
        fix = parent.next_element is not None

        o.setup(parent, previous_element, next_element, previous_sibling, next_sibling)

        self._most_recent_element = o
        parent.contents.append(o)

        # Check if we are inserting into an already parsed node.
        if fix:
            self._linkage_fixer(parent)

    def _linkage_fixer(self, el):
        """Make sure linkage of this fragment is sound.

        :param el: The element whose last child was just appended.
        """

        first = el.contents[0]
        child = el.contents[-1]
        descendant = child

        if child is first and el.parent is not None:
            # Parent should be linked to first child
            el.next_element = child
            # We are no longer linked to whatever this element is
            prev_el = child.previous_element
            if prev_el is not None and prev_el is not el:
                prev_el.next_element = None
            # First child should be linked to the parent, and no previous siblings.
            child.previous_element = el
            child.previous_sibling = None

        # We have no sibling as we've been appended as the last.
        child.next_sibling = None

        # This index is a tag, dig deeper for a "last descendant"
        if isinstance(child, Tag) and child.contents:
            descendant = child._last_descendant(False)

        # As the final step, link last descendant. It should be linked
        # to the parent's next sibling (if found), else walk up the chain
        # and find a parent with a sibling. It should have no next sibling.
        descendant.next_element = None
        descendant.next_sibling = None
        target = el
        while True:
            if target is None:
                break
            elif target.next_sibling is not None:
                descendant.next_element = target.next_sibling
                # NOTE(review): this stores `child`, not `descendant`,
                # as the sibling's previous_element; confirm intended.
                target.next_sibling.previous_element = child
                break
            target = target.parent

    def _popToTag(self, name, nsprefix=None, inclusivePop=True):
        """Pops the tag stack up to and including the most recent
        instance of the given tag.

        :param name: Pop up to the most recent tag with this name.
        :param nsprefix: The namespace prefix that goes with `name`.
        :param inclusivePop: If this is false, pops the tag stack up
          to but *not* including the most recent instance of the
          given tag.
        :return: The value returned by the last popTag() call, or None
          if nothing was popped.
        """
        if name == self.ROOT_TAG_NAME:
            # The BeautifulSoup object itself can never be popped.
            return

        most_recently_popped = None

        # Walk from the top of the stack down to, but excluding, index
        # 0, which holds the BeautifulSoup object itself.
        stack_size = len(self.tagStack)
        for i in range(stack_size - 1, 0, -1):
            t = self.tagStack[i]
            if (name == t.name and nsprefix == t.prefix):
                if inclusivePop:
                    most_recently_popped = self.popTag()
                break
            most_recently_popped = self.popTag()

        return most_recently_popped

    def handle_starttag(self, name, namespace, nsprefix, attrs, sourceline=None,
                        sourcepos=None):
        """Called by the tree builder when a new tag is encountered.

        :param name: Name of the tag.
        :param namespace: Namespace of the tag; passed through to the
         Tag constructor.
        :param nsprefix: Namespace prefix for the tag.
        :param attrs: A dictionary of attribute values.
        :param sourceline: The line number where this tag was found in its
         source document.
        :param sourcepos: The character position within `sourceline` where this
         tag was found.

        If this method returns None, the tag was rejected by an active
        SoupStrainer. You should proceed as if the tag had not occurred
        in the document. For instance, if this was a self-closing tag,
        don't call handle_endtag.
        """
        self.endData()

        if (self.parse_only and len(self.tagStack) <= 1
                and (self.parse_only.text
                     or not self.parse_only.search_tag(name, attrs))):
            return None

        tag = self.element_classes.get(Tag, Tag)(
            self, self.builder, name, namespace, nsprefix, attrs,
            self.currentTag, self._most_recent_element,
            sourceline=sourceline, sourcepos=sourcepos
        )
        if tag is None:
            return tag
        if self._most_recent_element is not None:
            self._most_recent_element.next_element = tag
        self._most_recent_element = tag
        self.pushTag(tag)
        return tag

    def handle_endtag(self, name, nsprefix=None):
        """Called by the tree builder when an ending tag is encountered.

        :param name: Name of the tag.
        :param nsprefix: Namespace prefix for the tag.
        """
        self.endData()
        self._popToTag(name, nsprefix)

    def handle_data(self, data):
        """Called by the tree builder when a chunk of textual data is encountered."""
        self.current_data.append(data)

    def decode(self, pretty_print=False,
               eventual_encoding=DEFAULT_OUTPUT_ENCODING,
               formatter="minimal"):
        """Returns a string or Unicode representation of the parse tree
        as an HTML or XML document.

        :param pretty_print: If this is True, indentation will be used to
         make the document more readable.
        :param eventual_encoding: The encoding of the final document.
         If this is None, the document will be a Unicode string.
        :param formatter: Passed through unchanged to Tag.decode().
        """
        if self.is_xml:
            # Print the XML declaration
            encoding_part = ''
            if eventual_encoding is not None:
                encoding_part = ' encoding="%s"' % eventual_encoding
            prefix = '<?xml version="1.0"%s?>\n' % encoding_part
        else:
            prefix = ''
        if not pretty_print:
            indent_level = None
        else:
            indent_level = 0
        return prefix + super(BeautifulSoup, self).decode(
            indent_level, eventual_encoding, formatter)
| 705 | |
# Aliases to make it easier to get started quickly, e.g. 'from bs4 import _soup'
# Both names are bound to the BeautifulSoup class itself.
_s = BeautifulSoup
_soup = BeautifulSoup
| 709 | |
class BeautifulStoneSoup(BeautifulSoup):
    """Deprecated interface to an XML parser."""

    def __init__(self, *args, **kwargs):
        """Force features='xml', warn about the deprecation, then
        delegate to the BeautifulSoup constructor.
        """
        # Any caller-supplied 'features' value is overwritten.
        kwargs.update(features='xml')
        deprecation_message = (
            'The BeautifulStoneSoup class is deprecated. Instead of using '
            'it, pass features="xml" into the BeautifulSoup constructor.')
        warnings.warn(deprecation_message)
        super().__init__(*args, **kwargs)
| 719 | |
| 720 | |
class StopParsing(Exception):
    """Exception raised by a TreeBuilder if it's unable to continue parsing."""
| 724 | |
class FeatureNotFound(ValueError):
    """Exception raised by the BeautifulSoup constructor if no parser with the
    requested features is found.
    """
| 730 | |
| 731 | |
# When run as a script, act as an HTML pretty-printer: parse the
# markup on standard input and print the prettified document.
if __name__ == '__main__':
    import sys
    print(BeautifulSoup(sys.stdin).prettify())
