planemo/lib/python3.7/site-packages/bleach/linkifier.py @ 0:d30785e31577 (draft)

"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"

| author | guerler |
|---|---|
| date | Fri, 31 Jul 2020 00:18:57 -0400 |
| parents | |
| children | |
from __future__ import unicode_literals
import re
import six

from bleach import callbacks as linkify_callbacks
from bleach import html5lib_shim
from bleach.utils import alphabetize_attributes, force_unicode


#: List of default callbacks
DEFAULT_CALLBACKS = [linkify_callbacks.nofollow]


TLDS = """ac ad ae aero af ag ai al am an ao aq ar arpa as asia at au aw ax az
       ba bb bd be bf bg bh bi biz bj bm bn bo br bs bt bv bw by bz ca cat
       cc cd cf cg ch ci ck cl cm cn co com coop cr cu cv cx cy cz de dj dk
       dm do dz ec edu ee eg er es et eu fi fj fk fm fo fr ga gb gd ge gf gg
       gh gi gl gm gn gov gp gq gr gs gt gu gw gy hk hm hn hr ht hu id ie il
       im in info int io iq ir is it je jm jo jobs jp ke kg kh ki km kn kp
       kr kw ky kz la lb lc li lk lr ls lt lu lv ly ma mc md me mg mh mil mk
       ml mm mn mo mobi mp mq mr ms mt mu museum mv mw mx my mz na name nc ne
       net nf ng ni nl no np nr nu nz om org pa pe pf pg ph pk pl pm pn post
       pr pro ps pt pw py qa re ro rs ru rw sa sb sc sd se sg sh si sj sk sl
       sm sn so sr ss st su sv sx sy sz tc td tel tf tg th tj tk tl tm tn to
       tp tr travel tt tv tw tz ua ug uk us uy uz va vc ve vg vi vn vu wf ws
       xn xxx ye yt yu za zm zw""".split()

# Make sure that .com doesn't get matched by .co first
TLDS.reverse()


def build_url_re(tlds=TLDS, protocols=html5lib_shim.allowed_protocols):
    """Builds the url regex used by linkifier

    If you want a different set of tlds or allowed protocols, pass those in
    and stomp on the existing ``url_re``::

        from bleach import linkifier

        my_url_re = linkifier.build_url_re(my_tlds_list, my_protocols)

        linker = LinkifyFilter(url_re=my_url_re)

    """
    return re.compile(
        r"""\(*  # Match any opening parentheses.
        \b(?<![@.])(?:(?:{0}):/{{0,3}}(?:(?:\w+:)?\w+@)?)?  # http://
        ([\w-]+\.)+(?:{1})(?:\:[0-9]+)?(?!\.\w)\b   # xx.yy.tld(:##)?
        (?:[/?][^\s\{{\}}\|\\\^\[\]`<>"]*)?
            # /path/zz (excluding "unsafe" chars from RFC 1738,
            # except for # and ~, which happen in practice)
        """.format('|'.join(sorted(protocols)), '|'.join(sorted(tlds))),
        re.IGNORECASE | re.VERBOSE | re.UNICODE)


URL_RE = build_url_re()
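
# A hedged sketch (not part of the original module): URL_RE can be swapped for
# a narrower regex, e.g. one that only recognizes a couple of TLDs and the
# http/https protocols. The values below are illustrative only.
#
#     my_url_re = build_url_re(tlds=['com', 'org'], protocols=['http', 'https'])
#     linker = Linker(url_re=my_url_re)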


PROTO_RE = re.compile(r'^[\w-]+:/{0,3}', re.IGNORECASE)


def build_email_re(tlds=TLDS):
    """Builds the email regex used by linkifier

    If you want a different set of tlds, pass those in and stomp on the existing ``email_re``::

        from bleach import linkifier

        my_email_re = linkifier.build_email_re(my_tlds_list)

        linker = LinkifyFilter(email_re=my_email_re)

    """
    # open and closing braces doubled below for format string
    return re.compile(
        r"""(?<!//)
        (([-!#$%&'*+/=?^_`{{}}|~0-9A-Z]+
            (\.[-!#$%&'*+/=?^_`{{}}|~0-9A-Z]+)*  # dot-atom
        |^"([\001-\010\013\014\016-\037!#-\[\]-\177]
            |\\[\001-\011\013\014\016-\177])*"  # quoted-string
        )@(?:[A-Z0-9](?:[A-Z0-9-]{{0,61}}[A-Z0-9])?\.)+(?:{0}))  # domain
        """.format('|'.join(tlds)),
        re.IGNORECASE | re.MULTILINE | re.VERBOSE)


EMAIL_RE = build_email_re()
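
# Similarly (illustrative values, not part of the original module): a custom
# email regex can be built from a restricted TLD list and passed to Linker
# along with parse_email=True.
#
#     my_email_re = build_email_re(tlds=['com', 'org'])
#     linker = Linker(parse_email=True, email_re=my_email_re)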


class Linker(object):
    """Convert URL-like strings in an HTML fragment to links

    This class converts strings that look like URLs, domain names, and email
    addresses in text that may be an HTML fragment into links, while preserving:

    1. links already in the string
    2. urls found in attributes
    3. email addresses

    linkify takes a best-effort approach and tries to recover from
    malformed text.

    """
    def __init__(self, callbacks=DEFAULT_CALLBACKS, skip_tags=None, parse_email=False,
                 url_re=URL_RE, email_re=EMAIL_RE, recognized_tags=html5lib_shim.HTML_TAGS):
        """Creates a Linker instance

        :arg list callbacks: list of callbacks to run when adjusting tag attributes;
            defaults to ``bleach.linkifier.DEFAULT_CALLBACKS``

        :arg list skip_tags: list of tags that you don't want to linkify the
            contents of; for example, you could set this to ``['pre']`` to skip
            linkifying contents of ``pre`` tags

        :arg bool parse_email: whether or not to linkify email addresses

        :arg re url_re: url matching regex

        :arg re email_re: email matching regex

        :arg list-of-strings recognized_tags: the list of tags that linkify knows about;
            everything else gets escaped

        """
        self.callbacks = callbacks
        self.skip_tags = skip_tags
        self.parse_email = parse_email
        self.url_re = url_re
        self.email_re = email_re

        # Create a parser/tokenizer that allows all HTML tags and escapes
        # anything not in that list.
        self.parser = html5lib_shim.BleachHTMLParser(
            tags=recognized_tags,
            strip=False,
            consume_entities=True,
            namespaceHTMLElements=False,
        )
        self.walker = html5lib_shim.getTreeWalker('etree')
        self.serializer = html5lib_shim.BleachHTMLSerializer(
            quote_attr_values='always',
            omit_optional_tags=False,

            # linkify does not sanitize
            sanitize=False,

            # linkify alphabetizes attributes itself (via alphabetize_attributes),
            # so the serializer doesn't need to
            alphabetical_attributes=False,
        )

    def linkify(self, text):
        """Linkify specified text

        :arg str text: the text to add links to

        :returns: linkified text as unicode

        :raises TypeError: if ``text`` is not a text type

        """
        if not isinstance(text, six.string_types):
            raise TypeError('argument must be of text type')

        text = force_unicode(text)

        if not text:
            return ''

        dom = self.parser.parseFragment(text)
        filtered = LinkifyFilter(
            source=self.walker(dom),
            callbacks=self.callbacks,
            skip_tags=self.skip_tags,
            parse_email=self.parse_email,
            url_re=self.url_re,
            email_re=self.email_re,
        )
        return self.serializer.render(filtered)
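
    # Usage sketch (not part of the original module; values are illustrative):
    #
    #     linker = Linker(skip_tags=['pre'], parse_email=True)
    #     linker.linkify('mail me@example.com or see example.com')
    #
    # returns the fragment with the email address and the bare domain each
    # wrapped in an "a" tag; the default nofollow callback adds rel="nofollow"
    # to the url link.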


class LinkifyFilter(html5lib_shim.Filter):
    """html5lib filter that linkifies text

    This will do the following:

    * convert email addresses into links
    * convert urls into links
    * edit existing links by running them through callbacks--the default is to
      add a ``rel="nofollow"``

    This filter can be used anywhere html5lib filters can be used.

    """
    def __init__(self, source, callbacks=None, skip_tags=None, parse_email=False,
                 url_re=URL_RE, email_re=EMAIL_RE):
        """Creates a LinkifyFilter instance

        :arg TreeWalker source: stream

        :arg list callbacks: list of callbacks to run when adjusting tag attributes;
            defaults to ``bleach.linkifier.DEFAULT_CALLBACKS``

        :arg list skip_tags: list of tags that you don't want to linkify the
            contents of; for example, you could set this to ``['pre']`` to skip
            linkifying contents of ``pre`` tags

        :arg bool parse_email: whether or not to linkify email addresses

        :arg re url_re: url matching regex

        :arg re email_re: email matching regex

        """
        super(LinkifyFilter, self).__init__(source)

        self.callbacks = callbacks or []
        self.skip_tags = skip_tags or []
        self.parse_email = parse_email

        self.url_re = url_re
        self.email_re = email_re

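    # Sketch of standalone use (an assumption that mirrors what Linker.linkify
    # does above; not part of the original module):
    #
    #     parser = html5lib_shim.BleachHTMLParser(
    #         tags=html5lib_shim.HTML_TAGS, strip=False,
    #         consume_entities=True, namespaceHTMLElements=False)
    #     walker = html5lib_shim.getTreeWalker('etree')
    #     serializer = html5lib_shim.BleachHTMLSerializer(
    #         quote_attr_values='always', omit_optional_tags=False,
    #         sanitize=False, alphabetical_attributes=False)
    #     dom = parser.parseFragment('see http://example.com')
    #     html = serializer.render(LinkifyFilter(source=walker(dom)))
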
    def apply_callbacks(self, attrs, is_new):
        """Given an attrs dict and an is_new bool, runs through callbacks

        Callbacks can return an adjusted attrs dict or ``None``. In the case
        of ``None``, we stop going through the callbacks, return ``None``, and
        the link gets dropped.

        :arg dict attrs: map of ``(namespace, name)`` -> ``value``

        :arg bool is_new: whether or not this link was added by linkify

        :returns: adjusted attrs dict or ``None``

        """
        for cb in self.callbacks:
            attrs = cb(attrs, is_new)
            if attrs is None:
                return None
        return attrs

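    # Callback sketch (illustrative, not part of the original module): each
    # callback receives the attrs dict and the is_new flag and returns an
    # adjusted dict, or None to drop the link entirely, e.g.
    #
    #     def add_target_blank(attrs, new=False):
    #         attrs[(None, 'target')] = '_blank'
    #         return attrs
    #
    # which would be passed as callbacks=[add_target_blank] to Linker or
    # LinkifyFilter.
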
    def extract_character_data(self, token_list):
        """Extracts and squashes character sequences in a token stream"""
        # FIXME(willkg): This is a terrible idea. What it does is drop all the
        # tags from the token list and merge the Characters and SpaceCharacters
        # tokens into a single text.
        #
        # So something like this::
        #
        #     "<span>" "<b>" "some text" "</b>" "</span>"
        #
        # gets converted to "some text".
        #
        # This gets used to figure out the ``_text`` fauxttribute value for
        # linkify callables.
        #
        # I'm not really sure how else to support that ``_text`` fauxttribute and
        # maintain some modicum of backwards compatibility with previous versions
        # of Bleach.

        out = []
        for token in token_list:
            token_type = token['type']
            if token_type in ['Characters', 'SpaceCharacters']:
                out.append(token['data'])

        return ''.join(out)

    def handle_email_addresses(self, src_iter):
        """Handle email addresses in character tokens"""
        for token in src_iter:
            if token['type'] == 'Characters':
                text = token['data']
                new_tokens = []
                end = 0

                # For each email address we find in the text
                for match in self.email_re.finditer(text):
                    if match.start() > end:
                        new_tokens.append(
                            {'type': 'Characters', 'data': text[end:match.start()]}
                        )

                    # Run attributes through the callbacks to see what we
                    # should do with this match
                    attrs = {
                        (None, 'href'): 'mailto:%s' % match.group(0),
                        '_text': match.group(0)
                    }
                    attrs = self.apply_callbacks(attrs, True)

                    if attrs is None:
                        # Just add the text--but not as a link
                        new_tokens.append(
                            {'type': 'Characters', 'data': match.group(0)}
                        )

                    else:
                        # Add an "a" tag for the new link
                        _text = attrs.pop('_text', '')
                        attrs = alphabetize_attributes(attrs)
                        new_tokens.extend([
                            {'type': 'StartTag', 'name': 'a', 'data': attrs},
                            {'type': 'Characters', 'data': force_unicode(_text)},
                            {'type': 'EndTag', 'name': 'a'}
                        ])
                    end = match.end()

                if new_tokens:
                    # Yield the adjusted set of tokens and then continue
                    # through the loop
                    if end < len(text):
                        new_tokens.append({'type': 'Characters', 'data': text[end:]})

                    for new_token in new_tokens:
                        yield new_token

                    continue

            yield token

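    # Sketch of the transformation (illustrative): with no callback dropping
    # the link, a Characters token with data "mail me@example.com" becomes
    # roughly
    #
    #     {'type': 'Characters', 'data': 'mail '}
    #     {'type': 'StartTag', 'name': 'a',
    #      'data': {(None, 'href'): 'mailto:me@example.com'}}
    #     {'type': 'Characters', 'data': 'me@example.com'}
    #     {'type': 'EndTag', 'name': 'a'}
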
    def strip_non_url_bits(self, fragment):
        """Strips non-url bits from the url

        This accounts for over-eager matching by the regex.

        """
        prefix = suffix = ''

        while fragment:
            # Try removing ( from the beginning and, if it's balanced, from the
            # end, too
            if fragment.startswith('('):
                prefix = prefix + '('
                fragment = fragment[1:]

                if fragment.endswith(')'):
                    suffix = ')' + suffix
                    fragment = fragment[:-1]
                continue

            # Now try extraneous things from the end. For example, sometimes we
            # pick up ) at the end of a url, but the url is in a parenthesized
            # phrase like:
            #
            #     "i looked at the site (at http://example.com)"

            if fragment.endswith(')') and '(' not in fragment:
                fragment = fragment[:-1]
                suffix = ')' + suffix
                continue

            # Handle commas
            if fragment.endswith(','):
                fragment = fragment[:-1]
                suffix = ',' + suffix
                continue

            # Handle periods
            if fragment.endswith('.'):
                fragment = fragment[:-1]
                suffix = '.' + suffix
                continue

            # Nothing matched, so we're done
            break

        return fragment, prefix, suffix

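    # Worked example (illustrative): for a match like '(http://example.com/foo),'
    # this peels off the leading '(' and the trailing '),', returning
    # ('http://example.com/foo', '(', '),'), so handle_links wraps only the bare
    # url in the "a" tag and re-emits the punctuation as plain Characters tokens.
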
    def handle_links(self, src_iter):
        """Handle links in character tokens"""
        in_a = False  # becomes True if parse_email=True already linkified an email address
        for token in src_iter:
            if in_a:
                if token['type'] == 'EndTag' and token['name'] == 'a':
                    in_a = False
                yield token
                continue
            elif token['type'] == 'StartTag' and token['name'] == 'a':
                in_a = True
                yield token
                continue
            if token['type'] == 'Characters':
                text = token['data']
                new_tokens = []
                end = 0

                for match in self.url_re.finditer(text):
                    if match.start() > end:
                        new_tokens.append(
                            {'type': 'Characters', 'data': text[end:match.start()]}
                        )

                    url = match.group(0)
                    prefix = suffix = ''

                    # Sometimes we pick up too much in the url match, so look for
                    # bits we should drop and remove them from the match
                    url, prefix, suffix = self.strip_non_url_bits(url)

                    # If there's no protocol, add one
                    if PROTO_RE.search(url):
                        href = url
                    else:
                        href = 'http://%s' % url

                    attrs = {
                        (None, 'href'): href,
                        '_text': url
                    }
                    attrs = self.apply_callbacks(attrs, True)

                    if attrs is None:
                        # Just add the text
                        new_tokens.append(
                            {'type': 'Characters', 'data': prefix + url + suffix}
                        )

                    else:
                        # Add the "a" tag!
                        if prefix:
                            new_tokens.append(
                                {'type': 'Characters', 'data': prefix}
                            )

                        _text = attrs.pop('_text', '')
                        attrs = alphabetize_attributes(attrs)

                        new_tokens.extend([
                            {'type': 'StartTag', 'name': 'a', 'data': attrs},
                            {'type': 'Characters', 'data': force_unicode(_text)},
                            {'type': 'EndTag', 'name': 'a'},
                        ])

                        if suffix:
                            new_tokens.append(
                                {'type': 'Characters', 'data': suffix}
                            )

                    end = match.end()

                if new_tokens:
                    # Yield the adjusted set of tokens and then continue
                    # through the loop
                    if end < len(text):
                        new_tokens.append({'type': 'Characters', 'data': text[end:]})

                    for new_token in new_tokens:
                        yield new_token

                    continue

            yield token

    def handle_a_tag(self, token_buffer):
        """Handle the "a" tag

        This could adjust the link or drop it altogether depending on what the
        callbacks return.

        This yields the new set of tokens.

        """
        a_token = token_buffer[0]
        if a_token['data']:
            attrs = a_token['data']
        else:
            attrs = {}
        text = self.extract_character_data(token_buffer)
        attrs['_text'] = text

        attrs = self.apply_callbacks(attrs, False)

        if attrs is None:
            # We're dropping the "a" tag and everything else and replacing
            # it with character data. So emit that token.
            yield {'type': 'Characters', 'data': text}

        else:
            new_text = attrs.pop('_text', '')
            a_token['data'] = alphabetize_attributes(attrs)

            if text == new_text:
                # The callbacks didn't change the text, so we yield the new "a"
                # token, then whatever else was there, then the end "a" token
                yield a_token
                for mem in token_buffer[1:]:
                    yield mem

            else:
                # If the callbacks changed the text, then we're going to drop
                # all the tokens between the start and end "a" tags and replace
                # it with the new text
                yield a_token
                yield {'type': 'Characters', 'data': force_unicode(new_text)}
                yield token_buffer[-1]

    def __iter__(self):
        in_a = False
        in_skip_tag = None

        token_buffer = []

        for token in super(LinkifyFilter, self).__iter__():
            if in_a:
                # Handle the case where we're in an "a" tag--we want to buffer tokens
                # until we hit an end "a" tag.
                if token['type'] == 'EndTag' and token['name'] == 'a':
                    # Add the end tag to the token buffer and then handle them
                    # and yield anything returned
                    token_buffer.append(token)
                    for new_token in self.handle_a_tag(token_buffer):
                        yield new_token

                    # Clear "a" related state and continue since we've yielded all
                    # the tokens we're going to yield
                    in_a = False
                    token_buffer = []
                else:
                    token_buffer.append(token)
                continue

            if token['type'] in ['StartTag', 'EmptyTag']:
                if token['name'] in self.skip_tags:
                    # Skip tags start a "special mode" where we don't linkify
                    # anything until the end tag.
                    in_skip_tag = token['name']

                elif token['name'] == 'a':
                    # The "a" tag is special--we switch to a slurp mode and
                    # slurp all the tokens until the end "a" tag and then
                    # figure out what to do with them there.
                    in_a = True
                    token_buffer.append(token)

                    # We buffer the start tag, so we don't want to yield it,
                    # yet
                    continue

            elif in_skip_tag and self.skip_tags:
                # NOTE(willkg): We put this clause here since in_a and
                # switching in and out of in_a takes precedence.
                if token['type'] == 'EndTag' and token['name'] == in_skip_tag:
                    in_skip_tag = None

            elif not in_a and not in_skip_tag and token['type'] == 'Characters':
                new_stream = iter([token])
                if self.parse_email:
                    new_stream = self.handle_email_addresses(new_stream)

                new_stream = self.handle_links(new_stream)

                for token in new_stream:
                    yield token

                # We've already yielded this token, so continue
                continue

            yield token
