comparison env/lib/python3.9/site-packages/lxml/html/html5parser.py @ 0:4f3585e2f14b draft default tip

"planemo upload commit 60cee0fc7c0cda8592644e1aad72851dec82c959"
author shellac
date Mon, 22 Mar 2021 18:12:50 +0000
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:4f3585e2f14b
1 """
2 An interface to html5lib that mimics the lxml.html interface.
3 """
4 import sys
5 import string
6
7 from html5lib import HTMLParser as _HTMLParser
8 from html5lib.treebuilders.etree_lxml import TreeBuilder
9 from lxml import etree
10 from lxml.html import Element, XHTML_NAMESPACE, _contains_block_level_tag
11
12 # python3 compatibility
13 try:
14 _strings = basestring
15 except NameError:
16 _strings = (bytes, str)
17 try:
18 from urllib2 import urlopen
19 except ImportError:
20 from urllib.request import urlopen
21 try:
22 from urlparse import urlparse
23 except ImportError:
24 from urllib.parse import urlparse
25
26
class HTMLParser(_HTMLParser):
    """html5lib ``HTMLParser`` preconfigured to build lxml trees."""

    def __init__(self, strict=False, **kwargs):
        """Initialise the underlying html5lib parser.

        ``strict`` and any extra keyword arguments are forwarded to
        html5lib unchanged; the tree builder is always the lxml
        ``TreeBuilder``.
        """
        _HTMLParser.__init__(
            self, tree=TreeBuilder, strict=strict, **kwargs)
32
33
try:
    from html5lib import XHTMLParser as _XHTMLParser
except ImportError:
    # The installed html5lib does not provide XHTMLParser: neither the
    # XHTMLParser class nor ``xhtml_parser`` is defined in that case.
    pass
else:
    class XHTMLParser(_XHTMLParser):
        """html5lib ``XHTMLParser`` preconfigured to build lxml trees."""

        def __init__(self, strict=False, **kwargs):
            """Initialise the underlying html5lib XHTML parser.

            ``strict`` and any extra keyword arguments are forwarded
            unchanged; the tree builder is always the lxml
            ``TreeBuilder``.
            """
            _XHTMLParser.__init__(
                self, tree=TreeBuilder, strict=strict, **kwargs)

    # Module-level XHTML parser instance, created once at import time.
    xhtml_parser = XHTMLParser()
46
47
48 def _find_tag(tree, tag):
49 elem = tree.find(tag)
50 if elem is not None:
51 return elem
52 return tree.find('{%s}%s' % (XHTML_NAMESPACE, tag))
53
54
def document_fromstring(html, guess_charset=None, parser=None):
    """
    Parse a string containing a whole document and return its root element.

    `html` must be a text or byte string, otherwise ``TypeError`` is
    raised.  `parser` defaults to the module-level ``html_parser``.

    If `guess_charset` is given it is passed to the parser as the
    ``useChardet`` option.  If it is left as ``None`` and the input is a
    byte string, charset guessing is switched on.
    """
    if not isinstance(html, _strings):
        raise TypeError('string required')

    active_parser = html_parser if parser is None else parser

    # Byte input without an explicit preference: enable charset guessing.
    if guess_charset is None and isinstance(html, bytes):
        guess_charset = True

    # html5lib does not accept useChardet as an argument if it detected
    # that the html argument would produce unicode objects, so the option
    # is only passed when there is an actual value for it.
    if guess_charset is None:
        parse_kwargs = {}
    else:
        parse_kwargs = {'useChardet': guess_charset}

    return active_parser.parse(html, **parse_kwargs).getroot()
77
78
def fragments_fromstring(html, no_leading_text=False,
                         guess_charset=None, parser=None):
    """Parse several HTML elements and return them as a list.

    The first item of the list may be a string (text preceding the first
    element).  With `no_leading_text` set, non-whitespace leading text
    raises ``etree.ParserError`` and a whitespace-only leading string is
    dropped, so the result contains elements only.

    `html` must be a text or byte string, otherwise ``TypeError`` is
    raised.  `parser` defaults to the module-level ``html_parser``.

    If `guess_charset` is given it is passed to the parser as the
    ``useChardet`` option.  If it is left as ``None`` and the input is a
    byte string, charset guessing is switched off.
    """
    if not isinstance(html, _strings):
        raise TypeError('string required')

    active_parser = html_parser if parser is None else parser

    # Byte input without an explicit preference: disable charset guessing.
    if guess_charset is None and isinstance(html, bytes):
        guess_charset = False

    # html5lib does not accept useChardet as an argument if it detected
    # that the html argument would produce unicode objects, so the option
    # is only passed when there is an actual value for it.
    if guess_charset is None:
        parse_kwargs = {}
    else:
        parse_kwargs = {'useChardet': guess_charset}

    # Fragments are parsed in the context of a <div> container.
    children = active_parser.parseFragment(html, 'div', **parse_kwargs)

    starts_with_text = bool(children) and isinstance(children[0], _strings)
    if starts_with_text and no_leading_text:
        leading = children[0]
        if leading.strip():
            raise etree.ParserError('There is leading text: %r' % leading)
        # Whitespace-only leading text is silently discarded.
        del children[0]
    return children
111
112
def fragment_fromstring(html, create_parent=False,
                        guess_charset=None, parser=None):
    """Parse a single HTML element and return it.

    Without `create_parent`, the input must contain exactly one element
    surrounded by nothing but whitespace; anything else raises
    ``etree.ParserError``.

    If `create_parent` is true, all parsed content is wrapped in a newly
    created parent element, and leading or trailing text is allowed.  A
    string value is used as the parent's tag name; any other true value
    yields a ``div``.

    `guess_charset` and `parser` are handed on to
    ``fragments_fromstring``.  `html` must be a text or byte string,
    otherwise ``TypeError`` is raised.
    """
    if not isinstance(html, _strings):
        raise TypeError('string required')

    wrap_in_parent = bool(create_parent)

    # Leading text is only tolerated when it can be stored on the wrapper.
    elements = fragments_fromstring(
        html, no_leading_text=not wrap_in_parent,
        guess_charset=guess_charset, parser=parser)

    if wrap_in_parent:
        if isinstance(create_parent, _strings):
            parent_tag = create_parent
        else:
            parent_tag = 'div'
        container = Element(parent_tag)
        if elements:
            # A leading string becomes the text of the wrapper element.
            if isinstance(elements[0], _strings):
                container.text = elements[0]
                del elements[0]
            container.extend(elements)
        return container

    if not elements:
        raise etree.ParserError('No elements found')
    if len(elements) > 1:
        raise etree.ParserError('Multiple elements found')

    element = elements[0]
    trailing = element.tail
    if trailing and trailing.strip():
        raise etree.ParserError('Element followed by text: %r' % trailing)
    # Drop whitespace-only trailing text.
    element.tail = None
    return element
155
156
def fromstring(html, guess_charset=None, parser=None):
    """Parse `html` and return a single element or document root.

    The input is always parsed as a full document first; heuristics then
    decide what to return, since it is not known whether the input was a
    fragment or a document:

    * input starting with ``<html`` or ``<!doctype``: the document root
    * a non-empty ``<head>``: the document root
    * a ``<body>`` holding exactly one element and no surrounding
      non-whitespace text: that element
    * otherwise: the ``<body>`` element itself, renamed to ``div`` if it
      contains a block-level tag and to ``span`` if not

    `guess_charset` and `parser` are handed on to
    ``document_fromstring``.  `html` must be a text or byte string,
    otherwise ``TypeError`` is raised.
    """
    if not isinstance(html, _strings):
        raise TypeError('string required')

    def _is_blank(text):
        # True for None, the empty string and whitespace-only strings.
        return not text or not text.strip()

    doc = document_fromstring(
        html, guess_charset=guess_charset, parser=parser)

    # Only the beginning of the input is inspected for a doctype / <html>.
    prefix = html[:50]
    if isinstance(prefix, bytes):
        # Decode so the comparison below works on text in Python 3.
        # ASCII is enough here: it also covers latin-1 and utf-8 for the
        # characters being matched.
        prefix = prefix.decode('ascii', 'replace')
    prefix = prefix.lstrip().lower()

    if prefix.startswith(('<html', '<!doctype')):
        # Explicit document markup: hand back the full document.
        return doc

    # A head with any children also means a full document.
    if len(_find_tag(doc, 'head')):
        return doc

    body = _find_tag(doc, 'body')

    # A body wrapping exactly one element, with no real text around it,
    # means a single element was probably passed in.
    if len(body) == 1 and _is_blank(body.text) and _is_blank(body[-1].tail):
        return body[0]

    # Otherwise the body holds several pieces of content.  It is reused
    # as a neutral container, because <body> implies too much structure.
    body.tag = 'div' if _contains_block_level_tag(body) else 'span'
    return body
209
210
def parse(filename_url_or_file, guess_charset=None, parser=None):
    """Parse a filename, URL, or file-like object into an HTML document
    tree.  Note: this returns a tree, not an element.  Use
    ``parse(...).getroot()`` to get the document root.

    If ``guess_charset`` is true, the ``useChardet`` option is passed into
    html5lib to enable character detection.  This option is on by default
    when parsing from URLs, off by default when parsing from file(-like)
    objects (which tend to return Unicode more often than not), and on by
    default when parsing from a file path (which is read in binary mode).

    ``parser`` defaults to the module-level ``html_parser``.

    A file or URL connection that this function opens itself is closed
    again before returning, also when parsing fails.  A file-like object
    supplied by the caller is left open; closing it remains the caller's
    responsibility.
    """
    if parser is None:
        parser = html_parser

    # Tracks whether the stream was opened here and must be closed here.
    opened_here = True
    if not isinstance(filename_url_or_file, _strings):
        fp = filename_url_or_file
        opened_here = False
        # assume that file-like objects return Unicode more often than bytes
        default_guess = False
    elif _looks_like_url(filename_url_or_file):
        fp = urlopen(filename_url_or_file)
        # assume that URLs return bytes
        default_guess = True
    else:
        fp = open(filename_url_or_file, 'rb')
        default_guess = True

    try:
        if guess_charset is None:
            guess_charset = default_guess

        options = {}
        # html5lib does not accept useChardet as an argument, if it
        # detected the html argument would produce unicode objects.
        if guess_charset:
            options['useChardet'] = guess_charset
        return parser.parse(fp, **options)
    finally:
        # try/finally rather than ``with``: not every stream object
        # returned by urlopen is guaranteed to be a context manager on
        # the Python 2 code path this module still supports.
        if opened_here:
            fp.close()
245
246
247 def _looks_like_url(str):
248 scheme = urlparse(str)[0]
249 if not scheme:
250 return False
251 elif (sys.platform == 'win32' and
252 scheme in string.ascii_letters
253 and len(scheme) == 1):
254 # looks like a 'normal' absolute path
255 return False
256 else:
257 return True
258
259
# Shared default parser instance, created once at import time.  The
# parsing functions in this module use it whenever the caller does not
# pass an explicit ``parser``.
html_parser = HTMLParser()