comparison env/lib/python3.9/site-packages/bs4/builder/_htmlparser.py @ 0:4f3585e2f14b draft default tip

"planemo upload commit 60cee0fc7c0cda8592644e1aad72851dec82c959"
author shellac
date Mon, 22 Mar 2021 18:12:50 +0000
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:4f3585e2f14b
1 # encoding: utf-8
2 """Use the HTMLParser library to parse HTML files that aren't too bad."""
3
4 # Use of this source code is governed by the MIT license.
5 __license__ = "MIT"
6
7 __all__ = [
8 'HTMLParserTreeBuilder',
9 ]
10
11 from html.parser import HTMLParser
12
try:
    from html.parser import HTMLParseError
except ImportError:
    # HTMLParseError was removed in Python 3.5. Nothing can raise it on
    # those versions, so a stand-in class is enough to keep the
    # ``except HTMLParseError`` clauses elsewhere in this module valid.
    class HTMLParseError(Exception):
        """Placeholder for the exception removed from html.parser."""
20
21 import sys
22 import warnings
23
# Feature flags describing the HTMLParser constructor on the running
# interpreter.
#
# From Python 3.2 on, the constructor accepts a 'strict' argument that
# we would like to set to False. Because of
# http://bugs.python.org/issue13273, strict=True is the better choice
# on releases older than 3.2.3.
#
# The monkeypatch at the end of this file makes strict=True behave
# well on Python 3.2.2.
major, minor, release = sys.version_info[:3]
CONSTRUCTOR_TAKES_STRICT = (major, minor) == (3, 2) and release >= 3
CONSTRUCTOR_STRICT_IS_DEPRECATED = (major, minor) == (3, 3)
CONSTRUCTOR_TAKES_CONVERT_CHARREFS = (major == 3) and (minor > 3)
35
36
37 from bs4.element import (
38 CData,
39 Comment,
40 Declaration,
41 Doctype,
42 ProcessingInstruction,
43 )
44 from bs4.dammit import EntitySubstitution, UnicodeDammit
45
46 from bs4.builder import (
47 HTML,
48 HTMLTreeBuilder,
49 STRICT,
50 )
51
52
53 HTMLPARSER = 'html.parser'
54
class BeautifulSoupHTMLParser(HTMLParser):
    """A subclass of the Python standard library's HTMLParser class, which
    listens for HTMLParser events and translates them into calls
    to Beautiful Soup's tree construction API.

    The object that receives those calls is read from ``self.soup``. The
    constructor does not set that attribute; it must be assigned before
    any markup is fed to the parser.
    """

    # Strategies for handling duplicate attributes
    IGNORE = 'ignore'
    REPLACE = 'replace'

    def __init__(self, *args, **kwargs):
        """Constructor.

        :param on_duplicate_attribute: A strategy for what to do if a
         tag includes the same attribute more than once. Accepted
         values are: REPLACE (replace earlier values with later
         ones, the default), IGNORE (keep the earliest value
         encountered), or a callable. A callable must take three
         arguments: the dictionary of attributes already processed,
         the name of the duplicate attribute, and the most recent value
         encountered.

        All other positional and keyword arguments are passed through
        to the HTMLParser constructor.
        """
        self.on_duplicate_attribute = kwargs.pop(
            'on_duplicate_attribute', self.REPLACE
        )
        HTMLParser.__init__(self, *args, **kwargs)

        # Keep a list of empty-element tags that were encountered
        # without an explicit closing tag. If we encounter a closing tag
        # of this type, we'll associate it with one of those entries.
        #
        # This isn't a stack because we don't care about the
        # order. It's a list of closing tags we've already handled and
        # will ignore, assuming they ever show up.
        self.already_closed_empty_element = []

    def error(self, msg):
        """Report a parser error as a warning instead of an exception.

        In Python 3, HTMLParser subclasses must implement error(), although
        this requirement doesn't appear to be documented.

        In Python 2, HTMLParser implements error() by raising an exception,
        which we don't want to do.

        In any event, this method is called only on very strange
        markup and our best strategy is to pretend it didn't happen
        and keep going.

        :param msg: The error message; it is issued through
         warnings.warn().
        """
        warnings.warn(msg)

    def handle_startendtag(self, name, attrs):
        """Handle an incoming empty-element tag.

        This is only called when the markup looks like <tag/>.

        :param name: Name of the tag.
        :param attrs: The tag's attributes, as an iterable of
         (name, value) pairs.
        """
        # handle_empty_element=False tells handle_starttag not to close
        # the tag just because its name matches a known empty-element
        # tag. We know that this is an empty-element tag and we want to
        # call handle_endtag ourselves.
        self.handle_starttag(name, attrs, handle_empty_element=False)
        self.handle_endtag(name)

    def handle_starttag(self, name, attrs, handle_empty_element=True):
        """Handle an opening tag, e.g. '<tag>'

        :param name: Name of the tag.
        :param attrs: The tag's attributes, as an iterable of
         (name, value) pairs. A value of None is converted to the
         empty string.
        :param handle_empty_element: True if this method should itself
         close the tag when the tree builder reports it as an
         empty-element tag (i.e. there is not expected to be any
         closing tag). handle_startendtag() passes False because it
         closes the tag on its own.
        """
        # XXX namespace
        attr_dict = {}
        on_dupe = self.on_duplicate_attribute
        for key, value in attrs:
            # Change None attribute values to the empty string
            # for consistency with the other tree builders.
            if value is None:
                value = ''
            if key not in attr_dict:
                attr_dict[key] = value
            # From here on, a single attribute shows up multiple times
            # in this tag. How to handle it depends on the
            # on_duplicate_attribute setting.
            elif on_dupe == self.IGNORE:
                # Keep the value that was seen first.
                pass
            elif on_dupe in (None, self.REPLACE):
                attr_dict[key] = value
            else:
                # Anything else is treated as a user-supplied callable.
                on_dupe(attr_dict, key, value)

        sourceline, sourcepos = self.getpos()
        tag = self.soup.handle_starttag(
            name, None, None, attr_dict, sourceline=sourceline,
            sourcepos=sourcepos
        )
        if tag and tag.is_empty_element and handle_empty_element:
            # Unlike other parsers, html.parser doesn't send separate end tag
            # events for empty-element tags. (It's handled in
            # handle_startendtag, but only if the original markup looked like
            # <tag/>.)
            #
            # So we need to call handle_endtag() ourselves. Since we
            # know the start event is identical to the end event, we
            # don't want handle_endtag() to cross off any previous end
            # events for tags of this name.
            self.handle_endtag(name, check_already_closed=False)

            # But we might encounter an explicit closing tag for this tag
            # later on. If so, we want to ignore it.
            self.already_closed_empty_element.append(name)

    def handle_endtag(self, name, check_already_closed=True):
        """Handle a closing tag, e.g. '</tag>'

        :param name: A tag name.
        :param check_already_closed: True if this tag is expected to
           be the closing portion of an empty-element tag,
           e.g. '<tag></tag>'. When True and `name` is recorded in
           already_closed_empty_element, one recorded entry is removed
           and the event is not forwarded to the soup.
        """
        if check_already_closed and name in self.already_closed_empty_element:
            # This is a redundant end tag for an empty-element tag.
            # We've already called handle_endtag() for it, so just
            # check it off the list.
            self.already_closed_empty_element.remove(name)
        else:
            self.soup.handle_endtag(name)

    def handle_data(self, data):
        """Handle some textual data that shows up between tags.

        :param data: The text, forwarded unchanged to the soup.
        """
        self.soup.handle_data(data)

    def handle_charref(self, name):
        """Handle a numeric character reference by converting it to the
        corresponding Unicode character and treating it as textual
        data.

        :param name: Character number, possibly in hexadecimal (in
         which case it starts with 'x' or 'X').
        """
        # XXX workaround for a bug in HTMLParser. Remove this once
        # it's fixed in all supported versions.
        # http://bugs.python.org/issue13633
        if name.startswith('x'):
            real_name = int(name.lstrip('x'), 16)
        elif name.startswith('X'):
            real_name = int(name.lstrip('X'), 16)
        else:
            real_name = int(name)

        data = None
        if real_name < 256:
            # HTML numeric entities are supposed to reference Unicode
            # code points, but sometimes they reference code points in
            # some other encoding (ahem, Windows-1252). E.g. &#147;
            # instead of &#8220; for LEFT DOUBLE QUOTATION MARK. This
            # code tries to detect this situation and compensate.
            #
            # There is deliberately no 'break': every usable encoding is
            # tried and the last one that decodes the byte wins, so a
            # successful windows-1252 decode overrides the result from
            # the document's own encoding.
            for encoding in (self.soup.original_encoding, 'windows-1252'):
                if not encoding:
                    continue
                try:
                    data = bytearray([real_name]).decode(encoding)
                except UnicodeDecodeError:
                    pass
        if not data:
            try:
                data = chr(real_name)
            except (ValueError, OverflowError):
                # Not a valid code point; the replacement character
                # is used below.
                pass
        data = data or "\N{REPLACEMENT CHARACTER}"
        self.handle_data(data)

    def handle_entityref(self, name):
        """Handle a named entity reference by converting it to the
        corresponding Unicode character and treating it as textual
        data.

        :param name: Name of the entity reference.
        """
        character = EntitySubstitution.HTML_ENTITY_TO_CHARACTER.get(name)
        if character is not None:
            data = character
        else:
            # If this were XML, it would be ambiguous whether "&foo"
            # was a character entity reference with a missing
            # semicolon or the literal string "&foo". Since this is
            # HTML, we have a complete list of all character entity references,
            # and this one wasn't found, so assume it's the literal string "&foo".
            data = "&%s" % name
        self.handle_data(data)

    def handle_comment(self, data):
        """Handle an HTML comment.

        Calls soup.endData() first, then hands `data` to the soup and
        calls soup.endData(Comment).

        :param data: The text of the comment.
        """
        self.soup.endData()
        self.soup.handle_data(data)
        self.soup.endData(Comment)

    def handle_decl(self, data):
        """Handle a DOCTYPE declaration.

        :param data: The text of the declaration. It is assumed to
         begin with the keyword "DOCTYPE" (in any letter case); the
         keyword is removed by position, not by matching.
        """
        self.soup.endData()
        # Strip the keyword, plus the one character after it only when
        # that character is whitespace. Unconditionally dropping
        # len("DOCTYPE ") characters would eat the first character of
        # the doctype name in markup such as '<!DOCTYPEhtml>'.
        prefix_length = len("DOCTYPE")
        if data[prefix_length:prefix_length + 1].isspace():
            prefix_length += 1
        data = data[prefix_length:]
        self.soup.handle_data(data)
        self.soup.endData(Doctype)

    def unknown_decl(self, data):
        """Handle a declaration of unknown type -- probably a CDATA block.

        Text starting with 'CDATA[' (in any letter case) has that prefix
        removed and is ended as CData; anything else is passed through
        unchanged and ended as Declaration.

        :param data: The text of the declaration.
        """
        if data.upper().startswith('CDATA['):
            cls = CData
            data = data[len('CDATA['):]
        else:
            cls = Declaration
        self.soup.endData()
        self.soup.handle_data(data)
        self.soup.endData(cls)

    def handle_pi(self, data):
        """Handle a processing instruction.

        :param data: The text of the instruction.
        """
        self.soup.endData()
        self.soup.handle_data(data)
        self.soup.endData(ProcessingInstruction)
293
class HTMLParserTreeBuilder(HTMLTreeBuilder):
    """A Beautiful soup `TreeBuilder` that uses the `HTMLParser` parser,
    found in the Python standard library.
    """
    is_xml = False
    picklable = True
    NAME = HTMLPARSER
    features = [NAME, HTML, STRICT]

    # The html.parser knows which line number and position in the
    # original file is the source of an element.
    TRACKS_LINE_NUMBERS = True

    def __init__(self, parser_args=None, parser_kwargs=None, **kwargs):
        """Constructor.

        :param parser_args: Positional arguments to pass into
            the BeautifulSoupHTMLParser constructor, once it's
            invoked.
        :param parser_kwargs: Keyword arguments to pass into
            the BeautifulSoupHTMLParser constructor, once it's
            invoked.
        :param kwargs: Keyword arguments for the superclass constructor.
            The 'on_duplicate_attribute' keyword, if present, is
            removed and forwarded to the parser instead.
        """
        # Some keyword arguments will be pulled out of kwargs and placed
        # into parser_kwargs.
        extra_parser_kwargs = dict()
        for arg in ('on_duplicate_attribute',):
            if arg in kwargs:
                value = kwargs.pop(arg)
                extra_parser_kwargs[arg] = value
        super(HTMLParserTreeBuilder, self).__init__(**kwargs)

        # Work on private copies so that the entries added below never
        # leak into a list or dict owned by the caller (who may reuse it
        # for another builder).
        parser_args = list(parser_args) if parser_args else []
        parser_kwargs = dict(parser_kwargs) if parser_kwargs else {}
        parser_kwargs.update(extra_parser_kwargs)
        if CONSTRUCTOR_TAKES_STRICT and not CONSTRUCTOR_STRICT_IS_DEPRECATED:
            parser_kwargs['strict'] = False
        if CONSTRUCTOR_TAKES_CONVERT_CHARREFS:
            # The parser class handles character references itself in
            # handle_charref()/handle_entityref().
            parser_kwargs['convert_charrefs'] = False
        self.parser_args = (parser_args, parser_kwargs)

    def prepare_markup(self, markup, user_specified_encoding=None,
                       document_declared_encoding=None, exclude_encodings=None):

        """Run any preliminary steps necessary to make incoming markup
        acceptable to the parser.

        :param markup: Some markup -- probably a bytestring.
        :param user_specified_encoding: The user asked to try this encoding.
        :param document_declared_encoding: The markup itself claims to be
            in this encoding.
        :param exclude_encodings: The user asked _not_ to try any of
            these encodings.

        :yield: A series of 4-tuples:
         (markup, encoding, declared encoding,
          has undergone character replacement)

         Each 4-tuple represents a strategy for converting the
         document to Unicode and parsing it. Each strategy will be tried
         in turn. This implementation yields exactly one tuple.
        """
        if isinstance(markup, str):
            # Parse Unicode as-is.
            yield (markup, None, None, False)
            return

        # Ask UnicodeDammit to sniff the most likely encoding.
        try_encodings = [user_specified_encoding, document_declared_encoding]
        dammit = UnicodeDammit(markup, try_encodings, is_html=True,
                               exclude_encodings=exclude_encodings)
        yield (dammit.markup, dammit.original_encoding,
               dammit.declared_html_encoding,
               dammit.contains_replacement_characters)

    def feed(self, markup):
        """Run some incoming markup through some parsing process,
        populating the `BeautifulSoup` object in self.soup.

        A fresh BeautifulSoupHTMLParser is built for every call from the
        arguments stored by the constructor.

        :param markup: The markup to parse.
        :raise HTMLParseError: Re-raised, after a RuntimeWarning has
            been issued, if the underlying parser gives up.
        """
        args, kwargs = self.parser_args
        parser = BeautifulSoupHTMLParser(*args, **kwargs)
        parser.soup = self.soup
        try:
            parser.feed(markup)
            parser.close()
        except HTMLParseError:
            warnings.warn(RuntimeWarning(
                "Python's built-in HTMLParser cannot parse the given document. This is not a bug in Beautiful Soup. The best solution is to install an external parser (lxml or html5lib), and use Beautiful Soup with that parser. See http://www.crummy.com/software/BeautifulSoup/bs4/doc/#installing-a-parser for help."))
            # Bare raise keeps the original traceback intact.
            raise
        parser.already_closed_empty_element = []
384
# Patch 3.2 versions of HTMLParser earlier than 3.2.3 to use some
# 3.2.3 code. This ensures they don't treat markup like <p></p> as a
# string.
#
# This branch only runs when the interpreter is Python 3.2 and the
# CONSTRUCTOR_TAKES_STRICT flag computed earlier in this file is false
# (i.e. the release is older than 3.2.3). It replaces parse_starttag and
# set_cdata_mode on BeautifulSoupHTMLParser and then forces
# CONSTRUCTOR_TAKES_STRICT to True.
#
# XXX This code can be removed once most Python 3 users are on 3.2.3.
if major == 3 and minor == 2 and not CONSTRUCTOR_TAKES_STRICT:
    import re
    # Tolerant attribute matcher: group 1 is the attribute name, group 2
    # the optional "= value" part, group 3 the value itself (quoted or
    # bare).
    attrfind_tolerant = re.compile(
        r'\s*((?<=[\'"\s])[^\s/>][^\s/=>]*)(\s*=+\s*'
        r'(\'[^\']*\'|"[^"]*"|(?![\'"])[^>\s]*))?')
    # NOTE(review): this is attached to the tree builder class, while the
    # other patches below go on BeautifulSoupHTMLParser. parse_starttag
    # below uses the module-level name directly, so the attribute does
    # not appear to be read anywhere in this file -- confirm whether the
    # target class is intentional.
    HTMLParserTreeBuilder.attrfind_tolerant = attrfind_tolerant

    # Matches a start tag from '<' through its attributes and trailing
    # whitespace (the closing '>' or '/>' is not part of the pattern).
    locatestarttagend = re.compile(r"""
  <[a-zA-Z][-.a-zA-Z0-9:_]* # tag name
  (?:\s+ # whitespace before attribute name
    (?:[a-zA-Z_][-.:a-zA-Z0-9_]* # attribute name
      (?:\s*=\s* # value indicator
        (?:'[^']*' # LITA-enclosed value
          |\"[^\"]*\" # LIT-enclosed value
          |[^'\">\s]+ # bare value
         )
       )?
     )
   )*
  \s* # trailing whitespace
""", re.VERBOSE)
    BeautifulSoupHTMLParser.locatestarttagend = locatestarttagend

    from html.parser import tagfind, attrfind

    def parse_starttag(self, i):
        """Parse the start tag beginning at index `i` of self.rawdata.

        Installed below as BeautifulSoupHTMLParser.parse_starttag.

        :param i: Index into self.rawdata where the tag starts.
        :return: The value of check_for_whole_start_tag(i): returned
            immediately when negative, otherwise returned after the
            tag has been dispatched to handle_startendtag(),
            handle_starttag() or handle_data().
        """
        # NOTE(review): this function is defined at module level, so
        # Python does not name-mangle `__starttag_text` here; the
        # attribute written is literally `__starttag_text`. If HTMLParser
        # reads a class-mangled attribute elsewhere (presumably in
        # get_starttag_text()), it would not see this value -- confirm.
        self.__starttag_text = None
        endpos = self.check_for_whole_start_tag(i)
        if endpos < 0:
            return endpos
        rawdata = self.rawdata
        self.__starttag_text = rawdata[i:endpos]

        # Now parse the data between i+1 and endpos into a tag and attrs
        attrs = []
        match = tagfind.match(rawdata, i+1)
        assert match, 'unexpected call to parse_starttag()'
        k = match.end()
        self.lasttag = tag = rawdata[i+1:k].lower()
        # Consume attributes one at a time until none matches or the end
        # of the tag is reached.
        while k < endpos:
            if self.strict:
                m = attrfind.match(rawdata, k)
            else:
                m = attrfind_tolerant.match(rawdata, k)
            if not m:
                break
            attrname, rest, attrvalue = m.group(1, 2, 3)
            if not rest:
                # Attribute without a value, e.g. <input disabled>.
                attrvalue = None
            elif attrvalue[:1] == '\'' == attrvalue[-1:] or \
                 attrvalue[:1] == '"' == attrvalue[-1:]:
                # Strip the matching pair of surrounding quotes.
                attrvalue = attrvalue[1:-1]
            if attrvalue:
                attrvalue = self.unescape(attrvalue)
            attrs.append((attrname.lower(), attrvalue))
            k = m.end()

        end = rawdata[k:endpos].strip()
        if end not in (">", "/>"):
            # Something other than the tag terminator is left over.
            # NOTE(review): lineno and offset are computed here but not
            # used afterwards within this function.
            lineno, offset = self.getpos()
            if "\n" in self.__starttag_text:
                lineno = lineno + self.__starttag_text.count("\n")
                offset = len(self.__starttag_text) \
                         - self.__starttag_text.rfind("\n")
            else:
                offset = offset + len(self.__starttag_text)
            if self.strict:
                self.error("junk characters in start tag: %r"
                           % (rawdata[k:endpos][:20],))
            # Pass the whole would-be tag through as plain text.
            self.handle_data(rawdata[i:endpos])
            return endpos
        if end.endswith('/>'):
            # XHTML-style empty tag: <span attr="value" />
            self.handle_startendtag(tag, attrs)
        else:
            self.handle_starttag(tag, attrs)
            if tag in self.CDATA_CONTENT_ELEMENTS:
                self.set_cdata_mode(tag)
        return endpos

    def set_cdata_mode(self, elem):
        """Record `elem` (lower-cased) as self.cdata_elem and set
        self.interesting to a case-insensitive pattern matching that
        element's closing tag.

        Installed below as BeautifulSoupHTMLParser.set_cdata_mode.

        :param elem: Name of the element whose content is CDATA.
        """
        self.cdata_elem = elem.lower()
        self.interesting = re.compile(r'</\s*%s\s*>' % self.cdata_elem, re.I)

    BeautifulSoupHTMLParser.parse_starttag = parse_starttag
    BeautifulSoupHTMLParser.set_cdata_mode = set_cdata_mode

    # With the patches in place, later code may treat the constructor as
    # accepting 'strict'.
    CONSTRUCTOR_TAKES_STRICT = True