comparison env/lib/python3.9/site-packages/lxml/doctestcompare.py @ 0:4f3585e2f14b draft default tip

"planemo upload commit 60cee0fc7c0cda8592644e1aad72851dec82c959"
author shellac
date Mon, 22 Mar 2021 18:12:50 +0000
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:4f3585e2f14b
1 """
2 lxml-based doctest output comparison.
3
4 Note: normally, you should just import the `lxml.usedoctest` and
5 `lxml.html.usedoctest` modules from within a doctest, instead of this
6 one::
7
8 >>> import lxml.usedoctest # for XML output
9
10 >>> import lxml.html.usedoctest # for HTML output
11
12 To use this module directly, you must call ``lxmldoctest.install()``,
13 which will cause doctest to use this in all subsequent calls.
14
15 This changes the way output is checked and comparisons are made for
16 XML or HTML-like content.
17
18 XML or HTML content is noticed because the example starts with ``<``
19 (it's HTML if it starts with ``<html``). You can also use the
20 ``PARSE_HTML`` and ``PARSE_XML`` flags to force parsing.
21
22 Some rough wildcard-like things are allowed. Whitespace is generally
23 ignored (except in attributes). In text (attributes and text in the
24 body) you can use ``...`` as a wildcard. In an example it also
25 matches any trailing tags in the element, though it does not match
26 leading tags. You may create a tag ``<any>`` or include an ``any``
27 attribute in the tag. An ``any`` tag matches any tag, while the
28 attribute matches any and all attributes.
29
30 When a match fails, the reformatted example and gotten text are
31 displayed (indented), and a rough diff-like output is given. Anything
32 marked with ``+`` is in the output but wasn't supposed to be, and
33 similarly ``-`` means it's in the example but wasn't in the output.
34
35 You can disable parsing on one line with ``# doctest:+NOPARSE_MARKUP``
36 """
37
38 from lxml import etree
39 import sys
40 import re
41 import doctest
42 try:
43 from html import escape as html_escape
44 except ImportError:
45 from cgi import escape as html_escape
46
# Public names exported by ``from lxml.doctestcompare import *``.
__all__ = ['PARSE_HTML', 'PARSE_XML', 'NOPARSE_MARKUP', 'LXMLOutputChecker',
           'LHTMLOutputChecker', 'install', 'temp_install']

# ``basestring`` only exists on Python 2; fall back to a tuple usable with
# isinstance() when the name is missing.
try:
    _basestring = basestring
except NameError:
    _basestring = (str, bytes)

# Selects between ``__func__``/``__code__`` and ``im_func``/``func_code``
# in the temp_install() machinery further down.
_IS_PYTHON_3 = sys.version_info[0] >= 3

# Extra doctest option flags understood by the checkers in this module;
# get_parser() tests them against the ``optionflags`` bitmask.
PARSE_HTML = doctest.register_optionflag('PARSE_HTML')
PARSE_XML = doctest.register_optionflag('PARSE_XML')
NOPARSE_MARKUP = doctest.register_optionflag('NOPARSE_MARKUP')

# Captured at import time, i.e. before install() may rebind
# ``doctest.OutputChecker`` to one of the subclasses defined here.
OutputChecker = doctest.OutputChecker
62
def strip(v):
    """Return ``v`` with surrounding whitespace removed; ``None`` stays ``None``."""
    return None if v is None else v.strip()
68
def norm_whitespace(v):
    """Collapse each run matched by ``_norm_whitespace_re`` into one space."""
    collapsed = _norm_whitespace_re.sub(' ', v)
    return collapsed
71
# Shared HTML parser used by html_fromstring(): error recovery is switched
# off and blank text removal is switched on.
_html_parser = etree.HTMLParser(recover=False, remove_blank_text=True)

def html_fromstring(html):
    """Parse ``html`` with the module-level ``_html_parser`` and return
    whatever ``etree.fromstring`` yields for it."""
    return etree.fromstring(html, _html_parser)
76
# We use this to distinguish repr()s from elements:
# it matches text such as ``<Foo object at 0x...>`` or ``<Element x at ...>``,
# i.e. ``<``, some non-``>`` characters, a space, then ``at `` or ``object ``.
_repr_re = re.compile(r'^<[^>]+ (at|object) ')
# Matches runs of two or more spaces/tabs/newlines (a single whitespace
# character is left alone by norm_whitespace()).
_norm_whitespace_re = re.compile(r'[ \t\n][ \t\n]+')
80
class LXMLOutputChecker(OutputChecker):
    """doctest output checker that compares XML/HTML output structurally.

    When both the expected and the actual output look like markup, they are
    parsed and compared element by element (see ``compare_docs``); otherwise
    the stock doctest comparison is used.
    """

    # Tags rendered without content or end tag when formatting HTML.
    empty_tags = (
        'param', 'img', 'area', 'br', 'basefont', 'input',
        'base', 'meta', 'link', 'col')

    def get_default_parser(self):
        """Return the parser used for markup that is not recognised as HTML."""
        return etree.XML

    def check_output(self, want, got, optionflags):
        """Return True if ``got`` matches ``want``.

        NOTE: temp_install() copies this method's code object into doctest's
        own ``check_output`` function, where it runs with doctest's globals.
        Keep the body free of closures (no ``super()``) and of globals other
        than ``OutputChecker`` and ``etree``.
        """
        alt_self = getattr(self, '_temp_override_self', None)
        if alt_self is not None:
            # Running as the transplanted clone: ``self`` is the original
            # doctest checker, so switch to the lxml checker it points to.
            super_method = self._temp_call_super_check_output
            self = alt_self
        else:
            super_method = OutputChecker.check_output
        parser = self.get_parser(want, got, optionflags)
        if not parser:
            return super_method(
                self, want, got, optionflags)
        try:
            want_doc = parser(want)
        except etree.XMLSyntaxError:
            return False
        try:
            got_doc = parser(got)
        except etree.XMLSyntaxError:
            return False
        return self.compare_docs(want_doc, got_doc)

    def get_parser(self, want, got, optionflags):
        """Pick the parser callable for this example, or None for plain text.

        ``NOPARSE_MARKUP`` disables parsing, ``PARSE_HTML`` / ``PARSE_XML``
        force a parser; otherwise the choice is sniffed from the text.
        """
        parser = None
        if NOPARSE_MARKUP & optionflags:
            return None
        if PARSE_HTML & optionflags:
            parser = html_fromstring
        elif PARSE_XML & optionflags:
            parser = etree.XML
        elif (want.strip().lower().startswith('<html')
              # Lower-case both sides so ``<HTML>`` output is detected the
              # same way as an ``<HTML>`` example.
              and got.strip().lower().startswith('<html')):
            parser = html_fromstring
        elif (self._looks_like_markup(want)
              and self._looks_like_markup(got)):
            parser = self.get_default_parser()
        return parser

    def _looks_like_markup(self, s):
        """True if ``s`` starts with ``<`` and is not an object repr()."""
        s = s.strip()
        return (s.startswith('<')
                and not _repr_re.search(s))

    def compare_docs(self, want, got):
        """Recursively compare two parsed elements; return True on a match.

        Tag, text, tail and attributes must match (wildcards permitting),
        then the children are compared pairwise.
        """
        if not self.tag_compare(want.tag, got.tag):
            return False
        if not self.text_compare(want.text, got.text, True):
            return False
        if not self.text_compare(want.tail, got.tail, True):
            return False
        if 'any' not in want.attrib:
            # An ``any`` attribute on the example waives attribute checks.
            want_keys = sorted(want.attrib.keys())
            got_keys = sorted(got.attrib.keys())
            if want_keys != got_keys:
                return False
            for key in want_keys:
                if not self.text_compare(want.attrib[key], got.attrib[key], False):
                    return False
        if want.text != '...' or len(want):
            # A childless example whose text is ``...`` matches any content.
            want_children = list(want)
            got_children = list(got)
            while want_children or got_children:
                if not want_children or not got_children:
                    return False
                want_first = want_children.pop(0)
                got_first = got_children.pop(0)
                if not self.compare_docs(want_first, got_first):
                    return False
                if not got_children and want_first.tail == '...':
                    # Output exhausted and the example ends in a wildcard.
                    break
        return True

    def text_compare(self, want, got, strip):
        """Compare two text values, treating ``...`` in ``want`` as a wildcard.

        ``None`` is treated as the empty string.  When ``strip`` is true both
        values are whitespace-normalised and stripped first.
        """
        want = want or ''
        got = got or ''
        if strip:
            want = norm_whitespace(want).strip()
            got = norm_whitespace(got).strip()
        pattern = '^%s$' % re.escape(want)
        pattern = pattern.replace(r'\.\.\.', '.*')
        # DOTALL: the wildcard must also span line breaks, otherwise ``...``
        # can never match text that contains a newline.
        return re.search(pattern, got, re.DOTALL) is not None

    def tag_compare(self, want, got):
        """Compare two tags; ``any`` matches everything and a ``{...}``
        prefix on ``want`` ignores the namespace."""
        if want == 'any':
            return True
        if (not isinstance(want, _basestring)
            or not isinstance(got, _basestring)):
            # Non-string tags (e.g. for comments) are compared directly.
            return want == got
        want = want or ''
        got = got or ''
        if want.startswith('{...}'):
            # Ellipsis on the namespace
            return want.split('}')[-1] == got.split('}')[-1]
        else:
            return want == got

    def output_difference(self, example, got, optionflags):
        """Return the failure report for ``example`` versus ``got``.

        For markup this is the reformatted expected/actual documents plus a
        rough diff; parse errors are reported ahead of the stock doctest
        difference.
        """
        want = example.want
        parser = self.get_parser(want, got, optionflags)
        errors = []
        if parser is not None:
            try:
                want_doc = parser(want)
            except etree.XMLSyntaxError as e:
                errors.append('In example: %s' % e)
            try:
                got_doc = parser(got)
            except etree.XMLSyntaxError as e:
                errors.append('In actual output: %s' % e)
        if parser is None or errors:
            value = OutputChecker.output_difference(
                self, example, got, optionflags)
            if errors:
                errors.append(value)
                return '\n'.join(errors)
            else:
                return value
        html = parser is html_fromstring
        diff_parts = ['Expected:',
                      self.format_doc(want_doc, html, 2),
                      'Got:',
                      self.format_doc(got_doc, html, 2),
                      'Diff:',
                      self.collect_diff(want_doc, got_doc, html, 2)]
        return '\n'.join(diff_parts)

    def html_empty_tag(self, el, html=True):
        """True if ``el`` should be rendered as a content-less HTML tag."""
        if not html:
            return False
        if el.tag not in self.empty_tags:
            return False
        if el.text or len(el):
            # This shouldn't happen (contents in an empty tag)
            return False
        return True

    def format_doc(self, doc, html, indent, prefix=''):
        """Render ``doc`` as indented markup, one element per line.

        ``prefix`` is inserted before the start tag of ``doc`` only (it is
        not passed on to children).
        """
        parts = []
        if not len(doc):
            # No children...
            parts.append(' '*indent)
            parts.append(prefix)
            parts.append(self.format_tag(doc))
            if not self.html_empty_tag(doc, html):
                if strip(doc.text):
                    parts.append(self.format_text(doc.text))
                parts.append(self.format_end_tag(doc))
            if strip(doc.tail):
                parts.append(self.format_text(doc.tail))
            parts.append('\n')
            return ''.join(parts)
        parts.append(' '*indent)
        parts.append(prefix)
        parts.append(self.format_tag(doc))
        if not self.html_empty_tag(doc, html):
            parts.append('\n')
            if strip(doc.text):
                parts.append(' '*indent)
                parts.append(self.format_text(doc.text))
                parts.append('\n')
            for el in doc:
                parts.append(self.format_doc(el, html, indent+2))
            parts.append(' '*indent)
            parts.append(self.format_end_tag(doc))
            parts.append('\n')
        if strip(doc.tail):
            parts.append(' '*indent)
            parts.append(self.format_text(doc.tail))
            parts.append('\n')
        return ''.join(parts)

    def format_text(self, text, strip=True):
        """HTML-escape ``text`` (quotes included); ``None`` becomes ``''``."""
        if text is None:
            return ''
        if strip:
            text = text.strip()
        return html_escape(text, 1)

    def format_tag(self, el):
        """Render the start tag of ``el`` with its attributes sorted by name."""
        attrs = []
        if isinstance(el, etree.CommentBase):
            # FIXME: probably PIs should be handled specially too?
            return '<!--'
        for name, value in sorted(el.attrib.items()):
            attrs.append('%s="%s"' % (name, self.format_text(value, False)))
        if not attrs:
            return '<%s>' % el.tag
        return '<%s %s>' % (el.tag, ' '.join(attrs))

    def format_end_tag(self, el):
        """Render the end tag of ``el`` (``-->`` for comments)."""
        if isinstance(el, etree.CommentBase):
            # FIXME: probably PIs should be handled specially too?
            return '-->'
        return '</%s>' % el.tag

    def collect_diff(self, want, got, html, indent):
        """Render a rough diff of two elements.

        Children present only in the output are prefixed with ``+``, children
        present only in the example with ``-``.
        """
        parts = []
        if not len(want) and not len(got):
            parts.append(' '*indent)
            parts.append(self.collect_diff_tag(want, got))
            if not self.html_empty_tag(got, html):
                parts.append(self.collect_diff_text(want.text, got.text))
                parts.append(self.collect_diff_end_tag(want, got))
            parts.append(self.collect_diff_text(want.tail, got.tail))
            parts.append('\n')
            return ''.join(parts)
        parts.append(' '*indent)
        parts.append(self.collect_diff_tag(want, got))
        parts.append('\n')
        if strip(want.text) or strip(got.text):
            parts.append(' '*indent)
            parts.append(self.collect_diff_text(want.text, got.text))
            parts.append('\n')
        want_children = list(want)
        got_children = list(got)
        while want_children or got_children:
            if not want_children:
                parts.append(self.format_doc(got_children.pop(0), html, indent+2, '+'))
                continue
            if not got_children:
                parts.append(self.format_doc(want_children.pop(0), html, indent+2, '-'))
                continue
            parts.append(self.collect_diff(
                want_children.pop(0), got_children.pop(0), html, indent+2))
        parts.append(' '*indent)
        parts.append(self.collect_diff_end_tag(want, got))
        parts.append('\n')
        if strip(want.tail) or strip(got.tail):
            parts.append(' '*indent)
            parts.append(self.collect_diff_text(want.tail, got.tail))
            parts.append('\n')
        return ''.join(parts)

    def collect_diff_tag(self, want, got):
        """Render the start tag for the diff, flagging tag and attribute
        differences (``+attr`` only in output, ``-attr`` only in example)."""
        if not self.tag_compare(want.tag, got.tag):
            tag = '%s (got: %s)' % (want.tag, got.tag)
        else:
            tag = got.tag
        attrs = []
        # Named ``match_any`` so the ``any`` builtin is not shadowed.
        match_any = want.tag == 'any' or 'any' in want.attrib
        for name, value in sorted(got.attrib.items()):
            if name not in want.attrib and not match_any:
                attrs.append('+%s="%s"' % (name, self.format_text(value, False)))
            else:
                if name in want.attrib:
                    text = self.collect_diff_text(want.attrib[name], value, False)
                else:
                    text = self.format_text(value, False)
                attrs.append('%s="%s"' % (name, text))
        if not match_any:
            for name, value in sorted(want.attrib.items()):
                if name in got.attrib:
                    continue
                attrs.append('-%s="%s"' % (name, self.format_text(value, False)))
        if attrs:
            tag = '<%s %s>' % (tag, ' '.join(attrs))
        else:
            tag = '<%s>' % tag
        return tag

    def collect_diff_end_tag(self, want, got):
        """Render the end tag for the diff, flagging a tag mismatch."""
        # Use tag_compare() like collect_diff_tag() does, so that the ``any``
        # and ``{...}`` wildcards are not reported as mismatches here.
        if not self.tag_compare(want.tag, got.tag):
            tag = '%s (got: %s)' % (want.tag, got.tag)
        else:
            tag = got.tag
        return '</%s>' % tag

    def collect_diff_text(self, want, got, strip=True):
        """Render text for the diff: the output text when it matches, else
        ``want (got: got)``."""
        if self.text_compare(want, got, strip):
            if not got:
                return ''
            return self.format_text(got, strip)
        text = '%s (got: %s)' % (want, got)
        return self.format_text(text, strip)
368
class LHTMLOutputChecker(LXMLOutputChecker):
    """Checker variant whose fallback parser is the HTML parser."""

    def get_default_parser(self):
        """Return ``html_fromstring`` as the default markup parser."""
        default_parser = html_fromstring
        return default_parser
372
def install(html=False):
    """
    Install doctestcompare for all future doctests.

    With a true ``html`` the HTML parser becomes the default parser;
    otherwise the XML parser is the default.
    """
    doctest.OutputChecker = LHTMLOutputChecker if html else LXMLOutputChecker
384
def temp_install(html=False, del_module=None):
    """
    Use this *inside* a doctest to enable this checker for this
    doctest only.

    If html is true, then by default the HTML parser will be used;
    otherwise the XML parser is used.

    ``del_module`` is handed to ``_RestoreChecker``: if set, that module
    name is removed from ``sys.modules`` (and from its parent package, for
    dotted names) when the checker is restored.
    """
    if html:
        Checker = LHTMLOutputChecker
    else:
        Checker = LXMLOutputChecker
    # Locate the running doctest frame and the runner object in it.
    frame = _find_doctest_frame()
    dt_self = frame.f_locals['self']
    checker = Checker()
    old_checker = dt_self._checker
    dt_self._checker = checker
    # The unfortunate thing is that there is a local variable 'check'
    # in the function that runs the doctests, that is a bound method
    # into the output checker.  We have to update that.  We can't
    # modify the frame, so we have to modify the object in place.  The
    # only way to do this is to actually change the func_code
    # attribute of the method.  We change it, and then wait for
    # __record_outcome to be run, which signals the end of the __run
    # method, at which point we restore the previous check_output
    # implementation.
    if _IS_PYTHON_3:
        check_func = frame.f_locals['check'].__func__
        checker_check_func = checker.check_output.__func__
    else:
        check_func = frame.f_locals['check'].im_func
        checker_check_func = checker.check_output.im_func
    # Because we can't patch up func_globals, this is the only global
    # in check_output that we care about:
    doctest.etree = etree
    # The instance is not kept here: its constructor registers it on the
    # runner (as __record_outcome) and on the old checker.
    _RestoreChecker(dt_self, old_checker, checker,
                    check_func, checker_check_func,
                    del_module)
423
class _RestoreChecker(object):
    """Installs the temporary checker hooks and undoes them again.

    Constructing an instance swaps the code of ``check_func`` for that of
    ``clone_func`` and replaces the runner's ``__record_outcome`` with the
    instance itself; calling the instance reverts both.
    """
    def __init__(self, dt_self, old_checker, new_checker, check_func, clone_func,
                 del_module):
        self.dt_self = dt_self
        self.checker = old_checker
        # These two attributes are what LXMLOutputChecker.check_output looks
        # for when its code runs on the old checker instance.
        self.checker._temp_call_super_check_output = self.call_super
        self.checker._temp_override_self = new_checker
        self.check_func = check_func
        self.clone_func = clone_func
        self.del_module = del_module
        self.install_clone()
        self.install_dt_self()
    def install_clone(self):
        """Remember check_func's code/globals, then overwrite its code with
        clone_func's code."""
        if _IS_PYTHON_3:
            self.func_code = self.check_func.__code__
            self.func_globals = self.check_func.__globals__
            self.check_func.__code__ = self.clone_func.__code__
        else:
            self.func_code = self.check_func.func_code
            self.func_globals = self.check_func.func_globals
            self.check_func.func_code = self.clone_func.func_code
    def uninstall_clone(self):
        """Put the remembered original code object back on check_func."""
        if _IS_PYTHON_3:
            self.check_func.__code__ = self.func_code
        else:
            self.check_func.func_code = self.func_code
    def install_dt_self(self):
        """Replace the runner's (name-mangled) __record_outcome with self."""
        self.prev_func = self.dt_self._DocTestRunner__record_outcome
        self.dt_self._DocTestRunner__record_outcome = self
    def uninstall_dt_self(self):
        """Restore the runner's original __record_outcome."""
        self.dt_self._DocTestRunner__record_outcome = self.prev_func
    def uninstall_module(self):
        """Drop ``del_module`` from sys.modules and, for a dotted name, from
        its parent package's attributes."""
        if self.del_module:
            import sys
            del sys.modules[self.del_module]
            if '.' in self.del_module:
                package, module = self.del_module.rsplit('.', 1)
                package_mod = sys.modules[package]
                delattr(package_mod, module)
    def __call__(self, *args, **kw):
        """Stand-in for __record_outcome: undo every hook, then delegate to
        the original function and return its result."""
        self.uninstall_clone()
        self.uninstall_dt_self()
        del self.checker._temp_override_self
        del self.checker._temp_call_super_check_output
        result = self.prev_func(*args, **kw)
        self.uninstall_module()
        return result
    def call_super(self, *args, **kw):
        """Call check_func with its original code, re-installing the clone
        afterwards even if the call raises."""
        self.uninstall_clone()
        try:
            return self.check_func(*args, **kw)
        finally:
            self.install_clone()
477
478 def _find_doctest_frame():
479 import sys
480 frame = sys._getframe(1)
481 while frame:
482 l = frame.f_locals
483 if 'BOOM' in l:
484 # Sign of doctest
485 return frame
486 frame = frame.f_back
487 raise LookupError(
488 "Could not find doctest (only use this function *inside* a doctest)")
489
490 __test__ = {
491 'basic': '''
492 >>> temp_install()
493 >>> print """<xml a="1" b="2">stuff</xml>"""
494 <xml b="2" a="1">...</xml>
495 >>> print """<xml xmlns="http://example.com"><tag attr="bar" /></xml>"""
496 <xml xmlns="...">
497 <tag attr="..." />
498 </xml>
499 >>> print """<xml>blahblahblah<foo /></xml>""" # doctest: +NOPARSE_MARKUP, +ELLIPSIS
500 <xml>...foo /></xml>
501 '''}
502
if __name__ == '__main__':
    # Run the module's own doctests (including ``__test__``) when executed
    # as a script.
    import doctest
    doctest.testmod()
506
507