Mercurial > repos > guerler > springsuite
comparison planemo/lib/python3.7/site-packages/humanfriendly/terminal/html.py @ 1:56ad4e20f292 draft
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
| author | guerler |
|---|---|
| date | Fri, 31 Jul 2020 00:32:28 -0400 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| 0:d30785e31577 | 1:56ad4e20f292 |
|---|---|
| 1 # Human friendly input/output in Python. | |
| 2 # | |
| 3 # Author: Peter Odding <peter@peterodding.com> | |
| 4 # Last Change: February 29, 2020 | |
| 5 # URL: https://humanfriendly.readthedocs.io | |
| 6 | |
| 7 """Convert HTML with simple text formatting to text with ANSI escape sequences.""" | |
| 8 | |
| 9 # Standard library modules. | |
| 10 import re | |
| 11 | |
| 12 # Modules included in our package. | |
| 13 from humanfriendly.compat import HTMLParser, StringIO, name2codepoint, unichr | |
| 14 from humanfriendly.text import compact_empty_lines | |
| 15 from humanfriendly.terminal import ANSI_COLOR_CODES, ANSI_RESET, ansi_style | |
| 16 | |
| 17 # Public identifiers that require documentation. | |
| 18 __all__ = ('HTMLConverter', 'html_to_ansi') | |
| 19 | |
| 20 | |
def html_to_ansi(data, callback=None):
    """
    Render HTML with simple text formatting as text with ANSI escape sequences.

    :param data: The HTML to convert (a string).
    :param callback: Optional callback that is handed to :class:`HTMLConverter`.
    :returns: Text with ANSI escape sequences (a string).

    This is a convenience wrapper that creates a one-off
    :class:`HTMLConverter` object and calls it with `data`. Refer to the
    documentation of that class for details about the conversion process
    (like which tags are supported).
    """
    return HTMLConverter(callback=callback)(data)
| 35 | |
| 36 | |
| 37 class HTMLConverter(HTMLParser): | |
| 38 | |
| 39 """ | |
| 40 Convert HTML with simple text formatting to text with ANSI escape sequences. | |
| 41 | |
| 42 The following text styles are supported: | |
| 43 | |
| 44 - Bold: ``<b>``, ``<strong>`` and ``<span style="font-weight: bold;">`` | |
| 45 - Italic: ``<i>``, ``<em>`` and ``<span style="font-style: italic;">`` | |
| 46 - Strike-through: ``<del>``, ``<s>`` and ``<span style="text-decoration: line-through;">`` | |
| 47 - Underline: ``<ins>``, ``<u>`` and ``<span style="text-decoration: underline">`` | |
| 48 | |
| 49 Colors can be specified as follows: | |
| 50 | |
| 51 - Foreground color: ``<span style="color: #RRGGBB;">`` | |
| 52 - Background color: ``<span style="background-color: #RRGGBB;">`` | |
| 53 | |
| 54 Here's a small demonstration: | |
| 55 | |
| 56 .. code-block:: python | |
| 57 | |
| 58 from humanfriendly.text import dedent | |
| 59 from humanfriendly.terminal import html_to_ansi | |
| 60 | |
| 61 print(html_to_ansi(dedent(''' | |
| 62 <b>Hello world!</b> | |
| 63 <i>Is this thing on?</i> | |
| 64 I guess I can <u>underline</u> or <s>strike-through</s> text? | |
| 65 And what about <span style="color: red">color</span>? | |
| 66 '''))) | |
| 67 | |
| 68 rainbow_colors = [ | |
| 69 '#FF0000', '#E2571E', '#FF7F00', '#FFFF00', '#00FF00', | |
| 70 '#96BF33', '#0000FF', '#4B0082', '#8B00FF', '#FFFFFF', | |
| 71 ] | |
| 72 html_rainbow = "".join('<span style="color: %s">o</span>' % c for c in rainbow_colors) | |
| 73 print(html_to_ansi("Let's try a rainbow: %s" % html_rainbow)) | |
| 74 | |
| 75 Here's what the results look like: | |
| 76 | |
| 77 .. image:: images/html-to-ansi.png | |
| 78 | |
| 79 Some more details: | |
| 80 | |
| 81 - Nested tags are supported, within reasonable limits. | |
| 82 | |
| 83 - Text in ``<code>`` and ``<pre>`` tags will be highlighted in a | |
| 84 different color from the main text (currently this is yellow). | |
| 85 | |
| 86 - ``<a href="URL">TEXT</a>`` is converted to the format "TEXT (URL)" where | |
| 87 the uppercase symbols are highlighted in light blue with an underline. | |
| 88 | |
| 89 - ``<div>``, ``<p>`` and ``<pre>`` tags are considered block level tags | |
| 90 and are wrapped in vertical whitespace to prevent their content from | |
| 91 "running into" surrounding text. This may cause runs of multiple empty | |
| 92 lines to be emitted. As a *workaround* the :func:`__call__()` method | |
| 93 will automatically call :func:`.compact_empty_lines()` on the generated | |
| 94 output before returning it to the caller. Of course this won't work | |
| 95 when `output` is set to something like :data:`sys.stdout`. | |
| 96 | |
| 97 - ``<br>`` is converted to a single plain text line break. | |
| 98 | |
| 99 Implementation notes: | |
| 100 | |
| 101 - A list of dictionaries with style information is used as a stack where | |
| 102 new styling can be pushed and a pop will restore the previous styling. | |
| 103 When new styling is pushed, it is merged with (but overrides) the current | |
| 104 styling. | |
| 105 | |
| 106 - If you're going to be converting a lot of HTML it might be useful from | |
| 107 a performance standpoint to re-use an existing :class:`HTMLConverter` | |
| 108 object for unrelated HTML fragments, in this case take a look at the | |
| 109 :func:`__call__()` method (it makes this use case very easy). | |
| 110 | |
| 111 .. versionadded:: 4.15 | |
| 112 :class:`humanfriendly.terminal.HTMLConverter` was added to the | |
| 113 `humanfriendly` package during the initial development of my new | |
| 114 `chat-archive <https://chat-archive.readthedocs.io/>`_ project, whose | |
| 115 command line interface makes for a great demonstration of the | |
| 116 flexibility that this feature provides (hint: check out how the search | |
| 117 keyword highlighting combines with the regular highlighting). | |
| 118 """ | |
| 119 | |
| 120 BLOCK_TAGS = ('div', 'p', 'pre') | |
| 121 """The names of tags that are padded with vertical whitespace.""" | |
| 122 | |
def __init__(self, *args, **kw):
    """
    Initialize an :class:`HTMLConverter` object.

    :param callback: Optional keyword argument to specify a function that
                     will be called to process text fragments before they
                     are emitted on the output stream. Note that link text
                     and preformatted text fragments are not processed by
                     this callback.
    :param output: Optional keyword argument to redirect the output to the
                   given file-like object. If this is not given a new
                   :class:`~python3:io.StringIO` object is created.

    All other positional and keyword arguments are passed on unchanged to
    the initializer of the superclass.
    """
    # Hide our optional keyword arguments from the superclass.
    # NOTE(review): both attributes are assigned *before* the superclass is
    # initialized. Our reset() override reads self.output, so this ordering
    # presumably matters because the superclass initializer is expected to
    # call reset() -- confirm before reordering these statements.
    self.callback = kw.pop("callback", None)
    self.output = kw.pop("output", None)
    # Initialize the superclass.
    HTMLParser.__init__(self, *args, **kw)
| 141 | |
def __call__(self, data):
    """
    Reset the parser, convert some HTML and get the text with ANSI escape sequences.

    :param data: The HTML to convert to text (a string).
    :returns: The converted text with runs of empty lines compacted, but
              only when `output` is a :class:`~python3:io.StringIO`
              object. For any other kind of output stream the text has
              already been written to that stream and :data:`None` is
              returned.
    """
    # Run one complete parse cycle on a clean slate.
    self.reset()
    self.feed(data)
    self.close()
    # Output that went to a caller supplied stream can't be collected here.
    if not isinstance(self.output, StringIO):
        return None
    return compact_empty_lines(self.output.getvalue())
| 155 | |
@property
def current_style(self):
    """
    Get the current style from the top of the stack (a dictionary).

    When the stack is empty a new, empty dictionary is returned.
    """
    if not self.stack:
        return {}
    return self.stack[-1]
| 160 | |
def close(self):
    """
    Close previously opened ANSI escape sequences.

    This override emits an :data:`.ANSI_RESET` code and empties the style
    stack when the end of the input is reached while the stack still holds
    at least one non-empty style, then delegates to the superclass. The
    intention is to prevent malformed HTML (unclosed tags) from messing up
    terminal output.
    """
    # any() is only true when at least one style dictionary is non-empty.
    style_still_active = any(self.stack)
    if style_still_active:
        self.output.write(ANSI_RESET)
        self.stack = []
    HTMLParser.close(self)
| 174 | |
def emit_style(self, style=None):
    """
    Emit an ANSI escape sequence for the given or current style to the output stream.

    :param style: A dictionary with arguments for :func:`.ansi_style()` or
                  :data:`None`, in which case the style at the top of the
                  stack is emitted.

    An :data:`.ANSI_RESET` code is always written first; the escape
    sequence for the style only follows when the style is non-empty.
    """
    write = self.output.write
    # Start from a clean slate by clearing the active text styles.
    write(ANSI_RESET)
    # Fall back to the top of the stack when no explicit style was given.
    if style is None:
        style = self.current_style
    # An empty style needs no escape sequence of its own.
    if style:
        write(ansi_style(**style))
| 189 | |
def handle_charref(self, value):
    """
    Process a decimal or hexadecimal numeric character reference.

    :param value: The decimal or hexadecimal value (a string). Decimal
                  values consist of digits only (``62``), hexadecimal
                  values carry an ``x`` or ``X`` prefix (``x3E``).

    The character that the reference stands for is written to the output
    stream.
    """
    # HTML allows the hexadecimal marker in either case ('&#x3E;' as well
    # as '&#X3E;'), so both prefixes have to be recognized here.
    if value[:1] in ('x', 'X'):
        codepoint = int(value[1:], 16)
    else:
        codepoint = int(value)
    self.output.write(unichr(codepoint))
| 197 | |
def handle_data(self, data):
    """
    Process textual data.

    :param data: The decoded text (a string).

    Text inside a link is remembered as the link text, other text is passed
    through the user defined callback (if any) unless it is preformatted.
    In all cases the resulting text is written to the output stream.
    """
    if self.link_url:
        # Link text is captured literally so that we can reliably check
        # whether the text and the URL of the link are the same string.
        self.link_text = data
    elif self.callback:
        # Text that is not part of a link and not preformatted text is
        # passed to the user defined callback to allow for arbitrary
        # pre-processing.
        if self.preformatted_text_level == 0:
            data = self.callback(data)
    # Whatever we ended up with is emitted on the output stream.
    self.output.write(data)
| 215 | |
def handle_endtag(self, tag):
    """
    Process the end of an HTML tag.

    :param tag: The name of the tag (a string).

    For the supported styling tags the top of the style stack is removed
    and the previous style is restored on the output stream. The end of a
    link additionally renders the link's URL as " (URL)" after the link
    text, unless the link has no URL or the link text already is that URL.
    Block level tags are followed by an empty line.

    Malformed or unusual markup is tolerated: a stray ``</a>``, an anchor
    without ``href`` and a link without text don't raise exceptions, and a
    stray ``</code>`` or ``</pre>`` doesn't push the preformatted text
    level below zero.
    """
    if tag in ('a', 'b', 'code', 'del', 'em', 'i', 'ins', 'pre', 's', 'strong', 'span', 'u'):
        old_style = self.current_style
        # The following conditional isn't necessary for well formed
        # HTML but prevents raising exceptions on malformed HTML.
        if self.stack:
            self.stack.pop()
        new_style = self.current_style
        if tag == 'a':
            link_text, link_url = self.link_text, self.link_url
            # Forget about the link now that it has ended, otherwise text
            # following the link would still be treated as link text (and
            # bypass the user defined callback).
            self.link_text = None
            self.link_url = None
            # Only render the URL when there is one and it's not already
            # visible as the link text. The link text is None when the
            # link contained no text at all, in which case there's nothing
            # to compare the URL against.
            show_url = bool(link_url) and (
                link_text is None or not self.urls_match(link_text, link_url)
            )
            self.emit_style(new_style)
            if show_url:
                self.output.write(' (')
                self.emit_style(old_style)
                self.output.write(self.render_url(link_url))
                self.emit_style(new_style)
                self.output.write(')')
        else:
            self.emit_style(new_style)
        # Never drop below zero on an unbalanced end tag, because a negative
        # level would disable the user defined callback for all later text.
        if tag in ('code', 'pre') and self.preformatted_text_level > 0:
            self.preformatted_text_level -= 1
    if tag in self.BLOCK_TAGS:
        # Emit an empty line after block level tags.
        self.output.write('\n\n')
| 247 | |
def handle_entityref(self, name):
    """
    Process a named character reference.

    :param name: The name of the character reference (a string).

    Known names are converted to the character they stand for. Unknown
    names (which can occur in malformed HTML) are written to the output
    stream literally, as ``&name;``, instead of raising :exc:`KeyError`.
    """
    codepoint = name2codepoint.get(name)
    if codepoint is None:
        # Not a character reference we know about: preserve the text.
        self.output.write('&%s;' % name)
    else:
        self.output.write(unichr(codepoint))
| 255 | |
def handle_starttag(self, tag, attrs):
    """
    Process the start of an HTML tag.

    :param tag: The name of the tag (a string).
    :param attrs: A list of tuples with two values each: the name of the
                  attribute (a string) and its value (a string, or
                  :data:`None` for an attribute without a value).

    Supported styling tags push a new style onto the stack, ``<br>`` emits
    a line break and block level tags are preceded by an empty line.
    Attributes without a value (``<a href>``, ``<span style>``) are treated
    like empty attributes.
    """
    if tag in self.BLOCK_TAGS:
        # Emit an empty line before block level tags.
        self.output.write('\n\n')
    if tag == 'a':
        self.push_styles(color='blue', bright=True, underline=True)
        # Store the URL that the link points to for later use, so that we
        # can render the link text before the URL (with the reasoning that
        # this is the most intuitive way to present a link in a plain text
        # interface). A valueless href attribute is reported as None, which
        # is normalized to an empty string here.
        self.link_url = next((v for n, v in attrs if n == 'href'), None) or ''
        # Make sure the text of an earlier link can't be mistaken for the
        # text of this link.
        self.link_text = None
    elif tag in ('b', 'strong'):
        self.push_styles(bold=True)
    elif tag == 'br':
        self.output.write('\n')
    elif tag in ('code', 'pre'):
        self.push_styles(color='yellow')
        self.preformatted_text_level += 1
    elif tag in ('del', 's'):
        self.push_styles(strike_through=True)
    elif tag in ('em', 'i'):
        self.push_styles(italic=True)
    elif tag in ('ins', 'u'):
        self.push_styles(underline=True)
    elif tag == 'span':
        styles = {}
        # A valueless style attribute is reported as None (not a string).
        css = next((v for n, v in attrs if n == 'style'), None) or ''
        for rule in css.split(';'):
            name, _, value = rule.partition(':')
            name = name.strip()
            value = value.strip()
            if name == 'background-color':
                styles['background'] = self.parse_color(value)
            elif name == 'color':
                styles['color'] = self.parse_color(value)
            elif name == 'font-style' and value == 'italic':
                styles['italic'] = True
            elif name == 'font-weight' and value == 'bold':
                styles['bold'] = True
            elif name == 'text-decoration' and value == 'line-through':
                styles['strike_through'] = True
            elif name == 'text-decoration' and value == 'underline':
                styles['underline'] = True
        self.push_styles(**styles)
| 306 | |
def normalize_url(self, url):
    """
    Normalize a URL to enable string equality comparison.

    :param url: The URL to normalize (a string).
    :returns: The normalized URL (a string).

    The only normalization applied is the removal of a leading
    ``mailto:`` prefix.
    """
    mailto_prefix = re.compile('^mailto:')
    return mailto_prefix.sub('', url)
| 315 | |
def parse_color(self, value):
    """
    Convert a CSS color to something that :func:`.ansi_style()` understands.

    :param value: A string like ``rgb(1,2,3)``, ``#AABBCC``, ``#ABC`` or
                  ``yellow``.
    :returns: A tuple of three integers (red, green, blue), the lowercased
              name of a color in :data:`.ANSI_COLOR_CODES`, or :data:`None`
              when the value isn't recognized.

    The three digit shorthand ``#RGB`` is expanded to ``#RRGGBB`` as
    defined by CSS, so ``#F00`` means ``(255, 0, 0)``.
    """
    # Parse an 'rgb(N,N,N)' expression.
    if value.startswith('rgb'):
        tokens = re.findall(r'\d+', value)
        if len(tokens) == 3:
            return tuple(map(int, tokens))
    # Parse an '#XXXXXX' expression.
    elif value.startswith('#'):
        value = value[1:]
        try:
            if len(value) == 6:
                # Six hex digits (proper notation).
                return tuple(int(value[i:i + 2], 16) for i in (0, 2, 4))
            elif len(value) == 3:
                # Three hex digits (shorthand): each digit is doubled, so
                # that 'F' becomes 'FF' (255) instead of just 15.
                return tuple(int(digit * 2, 16) for digit in value)
        except ValueError:
            # Not a valid hexadecimal number: treat the value like any
            # other unrecognized color instead of raising an exception.
            pass
    # Try to recognize a named color.
    value = value.lower()
    if value in ANSI_COLOR_CODES:
        return value
    return None
| 350 | |
def push_styles(self, **changes):
    """
    Push new style information onto the stack.

    :param changes: Any keyword arguments are passed on to :func:`.ansi_style()`.

    This helper for :func:`handle_starttag()` derives a new style from the
    style at the top of the stack (the given `changes` override what was
    inherited, the inherited dictionary itself is left untouched), adds the
    new style to the stack and emits the matching ANSI escape sequence to
    the output stream.
    """
    inherited = self.current_style
    # Without anything to inherit the changes are the complete new style.
    combined = dict(inherited, **changes) if inherited else changes
    self.stack.append(combined)
    self.emit_style(combined)
| 373 | |
def render_url(self, url):
    """
    Prepare a URL for rendering on the terminal.

    :param url: The URL to simplify (a string).
    :returns: The simplified URL (a string).

    The URL is pre-processed by applying the following substitutions, in
    this order:

    - The ``mailto:`` prefix is stripped.
    - Spaces are converted to ``%20``.
    - A trailing parenthesis is converted to ``%29``.
    """
    substitutions = (
        ('^mailto:', ''),
        (' ', '%20'),
        (r'\)$', '%29'),
    )
    for pattern, replacement in substitutions:
        url = re.sub(pattern, replacement, url)
    return url
| 392 | |
def reset(self):
    """
    Reset the state of the HTML parser and ANSI converter.

    The link state, the preformatted text level and the style stack are
    cleared. When `output` is unset or a :class:`~python3:io.StringIO`
    object a new :class:`~python3:io.StringIO` instance is created (and
    the old one garbage collected); any other output stream is kept.
    """
    # Reset the state of the superclass.
    HTMLParser.reset(self)
    # Reset our instance variables.
    self.link_text = self.link_url = None
    self.preformatted_text_level = 0
    self.stack = []
    # If the caller specified something like output=sys.stdout then it
    # doesn't make much sense to negate that choice here in reset().
    ours_to_replace = self.output is None or isinstance(self.output, StringIO)
    if ours_to_replace:
        self.output = StringIO()
| 411 | |
def urls_match(self, a, b):
    """
    Compare two URLs for equality using :func:`normalize_url()`.

    :param a: A string containing a URL.
    :param b: A string containing a URL.
    :returns: :data:`True` if the normalized URLs are the same string,
              :data:`False` otherwise.

    :func:`handle_endtag()` uses this method to omit the URL of a
    hyperlink (``<a href="...">``) when the link text is that same URL.
    """
    left = self.normalize_url(a)
    right = self.normalize_url(b)
    return left == right
