comparison: env/lib/python3.9/site-packages/bleach/linkifier.py @ 0:4f3585e2f14b (draft, default, tip)

Commit message: "planemo upload commit 60cee0fc7c0cda8592644e1aad72851dec82c959"

| field | value |
|---|---|
| repository | shellac / sam_consensus_v3 |
| author | shellac |
| date | Mon, 22 Mar 2021 18:12:50 +0000 |
| parents | (none) |
| children | (none) |
This comparison is against the null revision (-1:000000000000), so the file is newly added in 0:4f3585e2f14b and its full contents appear below.
from __future__ import unicode_literals
import re
import six

from bleach import callbacks as linkify_callbacks
from bleach import html5lib_shim
from bleach.utils import alphabetize_attributes, force_unicode


#: List of default callbacks
DEFAULT_CALLBACKS = [linkify_callbacks.nofollow]


TLDS = """ac ad ae aero af ag ai al am an ao aq ar arpa as asia at au aw ax az
       ba bb bd be bf bg bh bi biz bj bm bn bo br bs bt bv bw by bz ca cat
       cc cd cf cg ch ci ck cl cm cn co com coop cr cu cv cx cy cz de dj dk
       dm do dz ec edu ee eg er es et eu fi fj fk fm fo fr ga gb gd ge gf gg
       gh gi gl gm gn gov gp gq gr gs gt gu gw gy hk hm hn hr ht hu id ie il
       im in info int io iq ir is it je jm jo jobs jp ke kg kh ki km kn kp
       kr kw ky kz la lb lc li lk lr ls lt lu lv ly ma mc md me mg mh mil mk
       ml mm mn mo mobi mp mq mr ms mt mu museum mv mw mx my mz na name nc ne
       net nf ng ni nl no np nr nu nz om org pa pe pf pg ph pk pl pm pn post
       pr pro ps pt pw py qa re ro rs ru rw sa sb sc sd se sg sh si sj sk sl
       sm sn so sr ss st su sv sx sy sz tc td tel tf tg th tj tk tl tm tn to
       tp tr travel tt tv tw tz ua ug uk us uy uz va vc ve vg vi vn vu wf ws
       xn xxx ye yt yu za zm zw""".split()

# Make sure that .com doesn't get matched by .co first
TLDS.reverse()
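A brief editorial aside (not part of linkifier.py): in a regular-expression alternation the leftmost branch that matches wins, so a short TLD listed before a longer one can truncate a match. The snippet below is a minimal, self-contained illustration of the concern behind the comment above; the `re` patterns and strings are illustrative only.

```python
import re

# With "co" first, the alternation stops after two characters.
print(re.match(r"co|com", "com").group(0))   # -> "co"

# With "com" first (as after TLDS.reverse()), the full TLD matches.
print(re.match(r"com|co", "com").group(0))   # -> "com"
```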

def build_url_re(tlds=TLDS, protocols=html5lib_shim.allowed_protocols):
    """Builds the url regex used by linkifier

    If you want a different set of tlds or allowed protocols, pass those in
    and stomp on the existing ``url_re``::

        from bleach import linkifier

        my_url_re = linkifier.build_url_re(my_tlds_list, my_protocols)

        linker = LinkifyFilter(url_re=my_url_re)

    """
    return re.compile(
        r"""\(*  # Match any opening parentheses.
        \b(?<![@.])(?:(?:{0}):/{{0,3}}(?:(?:\w+:)?\w+@)?)?  # http://
        ([\w-]+\.)+(?:{1})(?:\:[0-9]+)?(?!\.\w)\b  # xx.yy.tld(:##)?
        (?:[/?][^\s\{{\}}\|\\\^\[\]`<>"]*)?
            # /path/zz (excluding "unsafe" chars from RFC 1738,
            # except for # and ~, which happen in practice)
        """.format(
            "|".join(sorted(protocols)), "|".join(sorted(tlds))
        ),
        re.IGNORECASE | re.VERBOSE | re.UNICODE,
    )


URL_RE = build_url_re()


PROTO_RE = re.compile(r"^[\w-]+:/{0,3}", re.IGNORECASE)
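A usage sketch (editorial, not part of this file): `build_url_re()` can narrow linkification to a custom set of TLDs and protocols, and the resulting pattern can be passed to `Linker` (defined later in this file) via its `url_re` argument. The TLD and protocol values and the example domains below are arbitrary.

```python
from bleach import linkifier

# Only recognize .com/.org domains over http(s); everything else is left as text.
custom_url_re = linkifier.build_url_re(tlds=["com", "org"], protocols=["http", "https"])

linker = linkifier.Linker(url_re=custom_url_re)
print(linker.linkify("see example.com and example.io"))
# example.com should become a link; example.io should be left untouched.
```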

def build_email_re(tlds=TLDS):
    """Builds the email regex used by linkifier

    If you want a different set of tlds, pass those in and stomp on the existing ``email_re``::

        from bleach import linkifier

        my_email_re = linkifier.build_email_re(my_tlds_list)

        linker = LinkifyFilter(email_re=my_email_re)

    """
    # open and closing braces doubled below for format string
    return re.compile(
        r"""(?<!//)
        (([-!#$%&'*+/=?^_`{{}}|~0-9A-Z]+
            (\.[-!#$%&'*+/=?^_`{{}}|~0-9A-Z]+)*  # dot-atom
        |^"([\001-\010\013\014\016-\037!#-\[\]-\177]
            |\\[\001-\011\013\014\016-\177])*"  # quoted-string
        )@(?:[A-Z0-9](?:[A-Z0-9-]{{0,61}}[A-Z0-9])?\.)+(?:{0}))  # domain
        """.format(
            "|".join(tlds)
        ),
        re.IGNORECASE | re.MULTILINE | re.VERBOSE,
    )


EMAIL_RE = build_email_re()
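Similarly, an editorial sketch (not part of the file): `EMAIL_RE` is only consulted when email parsing is switched on, so a custom pattern built with `build_email_re()` is passed together with `parse_email=True`. The TLD restriction and the addresses below are arbitrary examples.

```python
from bleach.linkifier import Linker, build_email_re

# Restrict email linkification to .org addresses (an arbitrary example).
linker = Linker(parse_email=True, email_re=build_email_re(tlds=["org"]))
print(linker.linkify("write to admin@example.org or root@example.com"))
# admin@example.org should become a mailto: link; root@example.com should not.
```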

class Linker(object):
    """Convert URL-like strings in an HTML fragment to links

    This class converts strings that look like URLs, domain names and email
    addresses in text that may be an HTML fragment to links, while preserving:

    1. links already in the string
    2. urls found in attributes
    3. email addresses

    linkify takes a best-effort approach and tries to recover from bad
    situations caused by malformed text.

    """

    def __init__(
        self,
        callbacks=DEFAULT_CALLBACKS,
        skip_tags=None,
        parse_email=False,
        url_re=URL_RE,
        email_re=EMAIL_RE,
        recognized_tags=html5lib_shim.HTML_TAGS,
    ):
        """Creates a Linker instance

        :arg list callbacks: list of callbacks to run when adjusting tag attributes;
            defaults to ``bleach.linkifier.DEFAULT_CALLBACKS``

        :arg list skip_tags: list of tags that you don't want to linkify the
            contents of; for example, you could set this to ``['pre']`` to skip
            linkifying contents of ``pre`` tags

        :arg bool parse_email: whether or not to linkify email addresses

        :arg re url_re: url matching regex

        :arg re email_re: email matching regex

        :arg list-of-strings recognized_tags: the list of tags that linkify knows about;
            everything else gets escaped

        """
        self.callbacks = callbacks
        self.skip_tags = skip_tags
        self.parse_email = parse_email
        self.url_re = url_re
        self.email_re = email_re

        # Create a parser/tokenizer that allows all HTML tags and escapes
        # anything not in that list.
        self.parser = html5lib_shim.BleachHTMLParser(
            tags=recognized_tags,
            strip=False,
            consume_entities=True,
            namespaceHTMLElements=False,
        )
        self.walker = html5lib_shim.getTreeWalker("etree")
        self.serializer = html5lib_shim.BleachHTMLSerializer(
            quote_attr_values="always",
            omit_optional_tags=False,
            # linkify does not sanitize
            sanitize=False,
            # linkify alphabetizes the attributes itself (via alphabetize_attributes)
            alphabetical_attributes=False,
        )

    def linkify(self, text):
        """Linkify specified text

        :arg str text: the text to add links to

        :returns: linkified text as unicode

        :raises TypeError: if ``text`` is not a text type

        """
        if not isinstance(text, six.string_types):
            raise TypeError("argument must be of text type")

        text = force_unicode(text)

        if not text:
            return ""

        dom = self.parser.parseFragment(text)
        filtered = LinkifyFilter(
            source=self.walker(dom),
            callbacks=self.callbacks,
            skip_tags=self.skip_tags,
            parse_email=self.parse_email,
            url_re=self.url_re,
            email_re=self.email_re,
        )
        return self.serializer.render(filtered)
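An editorial usage sketch of the Linker API defined above (not part of linkifier.py): callbacks receive the attribute dict and a flag for whether the link is new, and return an adjusted dict, so extra attributes can be layered on top of the default `nofollow` callback while `skip_tags` keeps selected elements untouched. The `target_blank` callback and the example strings are hypothetical.

```python
from bleach.linkifier import DEFAULT_CALLBACKS, Linker

def target_blank(attrs, new=False):
    # Add target="_blank" to every link; attrs keys are (namespace, name) tuples.
    attrs[(None, "target")] = "_blank"
    return attrs

linker = Linker(callbacks=DEFAULT_CALLBACKS + [target_blank], skip_tags=["pre"])
print(linker.linkify('docs at example.com <pre>ignored.example.com</pre>'))
# example.com should gain rel="nofollow" and target="_blank";
# the domain inside <pre> should be left alone.
```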

class LinkifyFilter(html5lib_shim.Filter):
    """html5lib filter that linkifies text

    This will do the following:

    * convert email addresses into links
    * convert urls into links
    * edit existing links by running them through callbacks--the default is to
      add a ``rel="nofollow"``

    This filter can be used anywhere html5lib filters can be used.

    """

    def __init__(
        self,
        source,
        callbacks=DEFAULT_CALLBACKS,
        skip_tags=None,
        parse_email=False,
        url_re=URL_RE,
        email_re=EMAIL_RE,
    ):
        """Creates a LinkifyFilter instance

        :arg TreeWalker source: stream

        :arg list callbacks: list of callbacks to run when adjusting tag attributes;
            defaults to ``bleach.linkifier.DEFAULT_CALLBACKS``

        :arg list skip_tags: list of tags that you don't want to linkify the
            contents of; for example, you could set this to ``['pre']`` to skip
            linkifying contents of ``pre`` tags

        :arg bool parse_email: whether or not to linkify email addresses

        :arg re url_re: url matching regex

        :arg re email_re: email matching regex

        """
        super(LinkifyFilter, self).__init__(source)

        self.callbacks = callbacks or []
        self.skip_tags = skip_tags or []
        self.parse_email = parse_email

        self.url_re = url_re
        self.email_re = email_re

    def apply_callbacks(self, attrs, is_new):
        """Given an attrs dict and an is_new bool, runs through callbacks

        Callbacks can return an adjusted attrs dict or ``None``. In the case of
        ``None``, we stop going through callbacks and return that and the link
        gets dropped.

        :arg dict attrs: map of ``(namespace, name)`` -> ``value``

        :arg bool is_new: whether or not this link was added by linkify

        :returns: adjusted attrs dict or ``None``

        """
        for cb in self.callbacks:
            attrs = cb(attrs, is_new)
            if attrs is None:
                return None
        return attrs
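Editorial note on `apply_callbacks()`: a callback that returns `None` short-circuits the chain, and the calling methods below (`handle_email_addresses`, `handle_links`, `handle_a_tag`) then emit plain text instead of a link. A hypothetical sketch of such a callback (the name and the URL prefix are made up for illustration):

```python
def drop_external_links(attrs, new=False):
    # Returning None tells LinkifyFilter to drop the link entirely
    # (the text is kept, but no <a> tag is emitted).
    href = attrs.get((None, "href"), "")
    if not href.startswith("https://example.com"):
        return None
    return attrs
```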
    def extract_character_data(self, token_list):
        """Extracts and squashes character sequences in a token stream"""
        # FIXME(willkg): This is a terrible idea. What it does is drop all the
        # tags from the token list and merge the Characters and SpaceCharacters
        # tokens into a single text.
        #
        # So something like this::
        #
        #     "<span>" "<b>" "some text" "</b>" "</span>"
        #
        # gets converted to "some text".
        #
        # This gets used to figure out the ``_text`` fauxttribute value for
        # linkify callables.
        #
        # I'm not really sure how else to support that ``_text`` fauxttribute and
        # maintain some modicum of backwards compatibility with previous versions
        # of Bleach.

        out = []
        for token in token_list:
            token_type = token["type"]
            if token_type in ["Characters", "SpaceCharacters"]:
                out.append(token["data"])

        return "".join(out)
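A small editorial illustration of what `extract_character_data()` does with a buffered token list; the token dicts below are shaped like those used throughout this filter and are purely illustrative.

```python
tokens = [
    {"type": "StartTag", "name": "span", "data": {}},
    {"type": "StartTag", "name": "b", "data": {}},
    {"type": "Characters", "data": "some text"},
    {"type": "EndTag", "name": "b"},
    {"type": "EndTag", "name": "span"},
]
# extract_character_data(tokens) returns "some text": tags are dropped and
# Characters/SpaceCharacters data is concatenated.
```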
    def handle_email_addresses(self, src_iter):
        """Handle email addresses in character tokens"""
        for token in src_iter:
            if token["type"] == "Characters":
                text = token["data"]
                new_tokens = []
                end = 0

                # For each email address we find in the text
                for match in self.email_re.finditer(text):
                    if match.start() > end:
                        new_tokens.append(
                            {"type": "Characters", "data": text[end : match.start()]}
                        )

                    # Run attributes through the callbacks to see what we
                    # should do with this match
                    attrs = {
                        (None, "href"): "mailto:%s" % match.group(0),
                        "_text": match.group(0),
                    }
                    attrs = self.apply_callbacks(attrs, True)

                    if attrs is None:
                        # Just add the text--but not as a link
                        new_tokens.append(
                            {"type": "Characters", "data": match.group(0)}
                        )

                    else:
                        # Add an "a" tag for the new link
                        _text = attrs.pop("_text", "")
                        attrs = alphabetize_attributes(attrs)
                        new_tokens.extend(
                            [
                                {"type": "StartTag", "name": "a", "data": attrs},
                                {"type": "Characters", "data": force_unicode(_text)},
                                {"type": "EndTag", "name": "a"},
                            ]
                        )
                    end = match.end()

                if new_tokens:
                    # Yield the adjusted set of tokens and then continue
                    # through the loop
                    if end < len(text):
                        new_tokens.append({"type": "Characters", "data": text[end:]})

                    for new_token in new_tokens:
                        yield new_token

                    continue

            yield token
    def strip_non_url_bits(self, fragment):
        """Strips non-url bits from the url

        This accounts for over-eager matching by the regex.

        """
        prefix = suffix = ""

        while fragment:
            # Try removing ( from the beginning and, if it's balanced, from the
            # end, too
            if fragment.startswith("("):
                prefix = prefix + "("
                fragment = fragment[1:]

                if fragment.endswith(")"):
                    suffix = ")" + suffix
                    fragment = fragment[:-1]
                continue

            # Now try extraneous things from the end. For example, sometimes we
            # pick up ) at the end of a url, but the url is in a parenthesized
            # phrase like:
            #
            #     "i looked at the site (at http://example.com)"

            if fragment.endswith(")") and "(" not in fragment:
                fragment = fragment[:-1]
                suffix = ")" + suffix
                continue

            # Handle commas
            if fragment.endswith(","):
                fragment = fragment[:-1]
                suffix = "," + suffix
                continue

            # Handle periods
            if fragment.endswith("."):
                fragment = fragment[:-1]
                suffix = "." + suffix
                continue

            # Nothing matched, so we're done
            break

        return fragment, prefix, suffix
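A worked editorial example of `strip_non_url_bits()`: for a match like `"(example.com),"` the leading parenthesis and the trailing `),` are peeled off and handed back so they can be re-emitted as plain text around the link. Since `self` is unused in the method, the unbound function can be exercised directly; that is an editorial shortcut, not typical usage.

```python
from bleach.linkifier import LinkifyFilter

fragment, prefix, suffix = LinkifyFilter.strip_non_url_bits(None, "(example.com),")
print(fragment, prefix, suffix)   # expected: example.com ( ),
```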
    def handle_links(self, src_iter):
        """Handle links in character tokens"""
        in_a = False  # happens if parse_email=True and an email address was found
        for token in src_iter:
            if in_a:
                if token["type"] == "EndTag" and token["name"] == "a":
                    in_a = False
                yield token
                continue
            elif token["type"] == "StartTag" and token["name"] == "a":
                in_a = True
                yield token
                continue
            if token["type"] == "Characters":
                text = token["data"]
                new_tokens = []
                end = 0

                for match in self.url_re.finditer(text):
                    if match.start() > end:
                        new_tokens.append(
                            {"type": "Characters", "data": text[end : match.start()]}
                        )

                    url = match.group(0)
                    prefix = suffix = ""

                    # Sometimes we pick up too much in the url match, so look for
                    # bits we should drop and remove them from the match
                    url, prefix, suffix = self.strip_non_url_bits(url)

                    # If there's no protocol, add one
                    if PROTO_RE.search(url):
                        href = url
                    else:
                        href = "http://%s" % url

                    attrs = {(None, "href"): href, "_text": url}
                    attrs = self.apply_callbacks(attrs, True)

                    if attrs is None:
                        # Just add the text
                        new_tokens.append(
                            {"type": "Characters", "data": prefix + url + suffix}
                        )

                    else:
                        # Add the "a" tag!
                        if prefix:
                            new_tokens.append({"type": "Characters", "data": prefix})

                        _text = attrs.pop("_text", "")
                        attrs = alphabetize_attributes(attrs)

                        new_tokens.extend(
                            [
                                {"type": "StartTag", "name": "a", "data": attrs},
                                {"type": "Characters", "data": force_unicode(_text)},
                                {"type": "EndTag", "name": "a"},
                            ]
                        )

                        if suffix:
                            new_tokens.append({"type": "Characters", "data": suffix})

                    end = match.end()

                if new_tokens:
                    # Yield the adjusted set of tokens and then continue
                    # through the loop
                    if end < len(text):
                        new_tokens.append({"type": "Characters", "data": text[end:]})

                    for new_token in new_tokens:
                        yield new_token

                    continue

            yield token
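Editorial sketch of the token-level rewrite performed by `handle_links()`: a single Characters token containing a URL is split into surrounding text plus a StartTag/Characters/EndTag triple. The attribute keys are `(namespace, name)` tuples as elsewhere in this file; the example input and the exact output assume the default `nofollow` callback.

```python
# Input token:
#   {"type": "Characters", "data": "read https://example.com today"}
#
# Output tokens (assuming the default nofollow callback):
#   {"type": "Characters", "data": "read "}
#   {"type": "StartTag", "name": "a",
#    "data": {(None, "href"): "https://example.com", (None, "rel"): "nofollow"}}
#   {"type": "Characters", "data": "https://example.com"}
#   {"type": "EndTag", "name": "a"}
#   {"type": "Characters", "data": " today"}
```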
    def handle_a_tag(self, token_buffer):
        """Handle the "a" tag

        This could adjust the link or drop it altogether depending on what the
        callbacks return.

        This yields the new set of tokens.

        """
        a_token = token_buffer[0]
        if a_token["data"]:
            attrs = a_token["data"]
        else:
            attrs = {}
        text = self.extract_character_data(token_buffer)
        attrs["_text"] = text

        attrs = self.apply_callbacks(attrs, False)

        if attrs is None:
            # We're dropping the "a" tag and everything else and replacing
            # it with character data. So emit that token.
            yield {"type": "Characters", "data": text}

        else:
            new_text = attrs.pop("_text", "")
            a_token["data"] = alphabetize_attributes(attrs)

            if text == new_text:
                # The callbacks didn't change the text, so we yield the new "a"
                # token, then whatever else was there, then the end "a" token
                yield a_token
                for mem in token_buffer[1:]:
                    yield mem

            else:
                # If the callbacks changed the text, then we're going to drop
                # all the tokens between the start and end "a" tags and replace
                # it with the new text
                yield a_token
                yield {"type": "Characters", "data": force_unicode(new_text)}
                yield token_buffer[-1]
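Editorial sketch tied to `handle_a_tag()` above: because `_text` is passed through the callbacks as a faux attribute, a callback can rewrite the displayed text of an existing link, and the branch above that compares `text` with `new_text` then swaps the link's contents. The `shorten_link_text` name and the length cutoff are hypothetical.

```python
def shorten_link_text(attrs, new=False):
    # Truncate long link text; handle_a_tag() replaces the original contents
    # of the <a> element when "_text" comes back changed.
    text = attrs.get("_text", "")
    if len(text) > 30:
        attrs["_text"] = text[:30] + "..."
    return attrs

# Usage: Linker(callbacks=DEFAULT_CALLBACKS + [shorten_link_text])
```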
    def __iter__(self):
        in_a = False
        in_skip_tag = None

        token_buffer = []

        for token in super(LinkifyFilter, self).__iter__():
            if in_a:
                # Handle the case where we're in an "a" tag--we want to buffer tokens
                # until we hit an end "a" tag.
                if token["type"] == "EndTag" and token["name"] == "a":
                    # Add the end tag to the token buffer and then handle them
                    # and yield anything returned
                    token_buffer.append(token)
                    for new_token in self.handle_a_tag(token_buffer):
                        yield new_token

                    # Clear "a" related state and continue since we've yielded all
                    # the tokens we're going to yield
                    in_a = False
                    token_buffer = []
                else:
                    token_buffer.append(token)
                continue

            if token["type"] in ["StartTag", "EmptyTag"]:
                if token["name"] in self.skip_tags:
                    # Skip tags start a "special mode" where we don't linkify
                    # anything until the end tag.
                    in_skip_tag = token["name"]

                elif token["name"] == "a":
                    # The "a" tag is special--we switch to a slurp mode and
                    # slurp all the tokens until the end "a" tag and then
                    # figure out what to do with them there.
                    in_a = True
                    token_buffer.append(token)

                    # We buffer the start tag, so we don't want to yield it,
                    # yet
                    continue

            elif in_skip_tag and self.skip_tags:
                # NOTE(willkg): We put this clause here since in_a and
                # switching in and out of in_a takes precedence.
                if token["type"] == "EndTag" and token["name"] == in_skip_tag:
                    in_skip_tag = None

            elif not in_a and not in_skip_tag and token["type"] == "Characters":
                new_stream = iter([token])
                if self.parse_email:
                    new_stream = self.handle_email_addresses(new_stream)

                new_stream = self.handle_links(new_stream)

                for token in new_stream:
                    yield token

                # We've already yielded this token, so continue
                continue

            yield token
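Finally, an editorial end-to-end sketch (not part of linkifier.py) showing `LinkifyFilter` wired into an html5lib-style pipeline by hand, mirroring what `Linker.linkify()` does above with the same `html5lib_shim` helpers; the input string is arbitrary.

```python
from bleach import html5lib_shim
from bleach.linkifier import LinkifyFilter

parser = html5lib_shim.BleachHTMLParser(
    tags=html5lib_shim.HTML_TAGS,
    strip=False,
    consume_entities=True,
    namespaceHTMLElements=False,
)
walker = html5lib_shim.getTreeWalker("etree")
serializer = html5lib_shim.BleachHTMLSerializer(
    quote_attr_values="always",
    omit_optional_tags=False,
    sanitize=False,
    alphabetical_attributes=False,
)

dom = parser.parseFragment("ping admin@example.com or see https://example.com")
filtered = LinkifyFilter(source=walker(dom), parse_email=True)
print(serializer.render(filtered))
# Both the email address and the URL should come back wrapped in <a> tags.
```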
